livekit-plugins-munsit 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,33 @@
1
+ """Munsit plugin for LiveKit Agents.
2
+
3
+ See https://docs.munsit.com for the upstream API documentation.
4
+ """
5
+
6
+ from ._utils import AudioEnergyFilter
7
+ from .models import MunsitModels
8
+ from .stt import STT, SpeechStream
9
+ from .version import __version__
10
+
11
+ __all__ = ["STT", "SpeechStream", "AudioEnergyFilter", "MunsitModels", "__version__"]
12
+
13
+
14
+ from livekit.agents import Plugin
15
+
16
+ from .log import logger
17
+
18
+
19
class MunsitPlugin(Plugin):
    """LiveKit Agents plugin descriptor for the Munsit integration."""

    def __init__(self) -> None:
        """Hand this package's identity to the base ``Plugin`` constructor.

        The positional arguments are, in order: the importing module's name,
        the plugin version string, the package name and the plugin logger.
        """
        super().__init__(
            __name__,
            __version__,
            __package__,
            logger,
        )
22
+
23
+
24
# Runs at import time: importing the package is what registers the plugin.
Plugin.register_plugin(MunsitPlugin())

# Cleanup docs of unexported modules
_module = dir()
NOT_IN_ALL = [m for m in _module if m not in __all__]

# pdoc leaves out every name mapped to False. The mapping is built in one
# expression rather than a module-level ``for`` loop so that no loop variable
# is left behind as an extra, unexported attribute of the package.
__pdoc__ = dict.fromkeys(NOT_IN_ALL, False)
@@ -0,0 +1,129 @@
1
+ # Copyright 2026 LiveKit, Inc.
2
+ from __future__ import annotations
3
+
4
+ import struct
5
+ import time
6
+ from collections.abc import Callable
7
+ from enum import Enum
8
+ from typing import Generic, TypeVar
9
+
10
+ import numpy as np
11
+
12
+ from livekit import rtc
13
+
14
+
15
def build_wav_header(
    *, sample_rate: int, num_channels: int = 1, bits_per_sample: int = 16
) -> bytes:
    """Return a 44-byte PCM WAV (RIFF) header for a stream of unknown length.

    Because the audio is streamed, the total size is not known up front. The
    ``data`` chunk size is therefore written as the sentinel
    ``0xFFFFFFFF - 44`` and the RIFF size as that value plus 36. Munsit only
    validates the format fields.

    Args:
        sample_rate: Samples per second; must be positive.
        num_channels: Channel count; must be positive.
        bits_per_sample: Sample width in bits; one of 8, 16, 24 or 32.

    Returns:
        The little-endian header bytes (RIFF, ``fmt `` and ``data`` chunk headers).

    Raises:
        ValueError: If any argument fails the checks above.
    """
    # Checked in the same order as the parameters so the first offending
    # argument is the one reported.
    for label, amount in (("sample_rate", sample_rate), ("num_channels", num_channels)):
        if amount <= 0:
            raise ValueError(f"{label} must be positive")
    if bits_per_sample not in (8, 16, 24, 32):
        raise ValueError("bits_per_sample must be 8, 16, 24, or 32")

    bytes_per_frame = num_channels * bits_per_sample // 8
    bytes_per_second = sample_rate * num_channels * bits_per_sample // 8
    payload_size = 0xFFFFFFFF - 44  # streaming sentinel, see docstring

    # One little-endian, unpadded layout for the whole header:
    # RIFF id, RIFF size, WAVE id, fmt id, fmt size, format tag, channels,
    # sample rate, byte rate, block align, bits per sample, data id, data size.
    return struct.pack(
        "<4sI4s4sIHHIIHH4sI",
        b"RIFF",
        payload_size + 36,
        b"WAVE",
        b"fmt ",
        16,  # fmt chunk size
        1,  # PCM format
        num_channels,
        sample_rate,
        bytes_per_second,
        bytes_per_frame,
        bits_per_sample,
        b"data",
        payload_size,
    )
50
+
51
+
52
def pcm_to_audiobuffer(data: bytes) -> list[int]:
    """Turn raw PCM bytes into the int array used for Munsit's ``audioBuffer``.

    Each input byte becomes one list element in the range 0-255, in order.
    """
    return [*data]
55
+
56
+
57
T = TypeVar("T")


class PeriodicCollector(Generic[T]):
    """Accumulate values and call ``callback`` once per ``duration`` seconds.

    Used by SpeechStream to batch audio-duration reports so RECOGNITION_USAGE
    events don't fire on every push.

    Values are combined with ``+``, so ``T`` must support addition with itself.
    The interval is only checked inside :meth:`push`; nothing fires while no
    values arrive unless :meth:`flush` is called explicitly.
    """

    def __init__(self, callback: Callable[[T], None], *, duration: float) -> None:
        """Create an empty collector.

        Args:
            callback: Receives the accumulated total on every flush.
            duration: Minimum number of seconds between automatic flushes.
        """
        self._duration = duration
        self._callback = callback
        # Monotonic clock: unaffected by wall-clock adjustments.
        self._last_flush_time = time.monotonic()
        self._total: T | None = None

    def push(self, value: T) -> None:
        """Add ``value`` to the running total and flush if the interval elapsed."""
        if self._total is None:
            self._total = value
        else:
            # Deliberately not ``+=``: the running total may still be the very
            # object the caller passed to the first push, and in-place addition
            # would silently mutate it for mutable types such as lists.
            self._total = self._total + value  # type: ignore[operator]
        if time.monotonic() - self._last_flush_time >= self._duration:
            self.flush()

    def flush(self) -> None:
        """Report the pending total, if any, and restart the interval timer."""
        if self._total is not None:
            self._callback(self._total)
            self._total = None
        # The timer restarts even when there was nothing to report.
        self._last_flush_time = time.monotonic()
86
+
87
+
88
# ``update`` compares the *mean square* of the normalised samples (no square
# root is taken) with the threshold, hence the default is an amplitude squared.
_DEFAULT_RMS_THRESHOLD = 0.004**2


class AudioEnergyFilter:
    """Simple energy-based VAD copied from the Gladia plugin pattern.

    Each frame is classified as loud or quiet by comparing the mean of its
    squared, normalised int16 samples with ``rms_threshold``. A cooldown of
    ``min_silence`` seconds keeps the filter in ``SPEAKING`` across short
    pauses before it reports ``END`` and then ``SILENCE``.
    """

    class State(Enum):
        START = 0  # loud frame arriving while in SILENCE or END
        SPEAKING = 1  # loud frame otherwise, or quiet frame with cooldown left
        SILENCE = 2  # initial state; also the quiet frame that follows END
        END = 3  # first quiet frame after the cooldown ran out while speaking

    def __init__(
        self, *, min_silence: float = 1.5, rms_threshold: float = _DEFAULT_RMS_THRESHOLD
    ) -> None:
        """Create a filter that starts in ``SILENCE`` with a full cooldown.

        Args:
            min_silence: Seconds of quiet audio tolerated before ``END``.
            rms_threshold: Mean-square energy above which a frame is loud.
        """
        self._cooldown_seconds = min_silence
        self._cooldown = min_silence
        self._state = self.State.SILENCE
        self._rms_threshold = rms_threshold

    def update(self, frame: rtc.AudioFrame) -> AudioEnergyFilter.State:
        """Feed one frame and return the resulting state.

        ``frame.data`` is interpreted as int16 samples and ``frame.duration``
        is subtracted from the cooldown on quiet frames.
        """
        samples = np.frombuffer(frame.data, dtype=np.int16)
        if samples.size == 0:
            # np.mean of an empty array yields NaN and a RuntimeWarning. NaN
            # never compares greater than the threshold, so an empty frame was
            # already handled as quiet; say so explicitly and skip the warning.
            is_loud = False
        else:
            normalised = samples.astype(np.float32) / 32768.0
            mean_square = float(np.mean(np.square(normalised)))
            is_loud = mean_square > self._rms_threshold

        if is_loud:
            # Any loud frame re-arms the full cooldown.
            self._cooldown = self._cooldown_seconds
            if self._state in (self.State.SILENCE, self.State.END):
                self._state = self.State.START
            else:
                self._state = self.State.SPEAKING
        else:
            if self._cooldown <= 0:
                if self._state in (self.State.SPEAKING, self.State.START):
                    self._state = self.State.END
                elif self._state == self.State.END:
                    self._state = self.State.SILENCE
            else:
                # NOTE(review): quiet frames received before any speech also
                # land here and are reported as SPEAKING until the initial
                # cooldown is used up - confirm this is intended.
                self._cooldown -= frame.duration
                self._state = self.State.SPEAKING

        return self._state
@@ -0,0 +1,3 @@
1
+ import logging
2
+
3
# Logger shared by the plugin's modules, under the "livekit.plugins.munsit" name.
logger = logging.getLogger("livekit.plugins.munsit")
@@ -0,0 +1,3 @@
1
+ from typing import Literal
2
+
3
# Literal type restricted to the two model identifier strings below.
# NOTE(review): presumably used to annotate the STT model option - verify in stt.py.
MunsitModels = Literal["munsit", "munsit-en-ar"]
File without changes