livekit-plugins-munsit 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- livekit/plugins/munsit/__init__.py +33 -0
- livekit/plugins/munsit/_utils.py +129 -0
- livekit/plugins/munsit/log.py +3 -0
- livekit/plugins/munsit/models.py +3 -0
- livekit/plugins/munsit/py.typed +0 -0
- livekit/plugins/munsit/stt.py +914 -0
- livekit/plugins/munsit/version.py +1 -0
- livekit_plugins_munsit-0.3.0.dist-info/METADATA +102 -0
- livekit_plugins_munsit-0.3.0.dist-info/RECORD +11 -0
- livekit_plugins_munsit-0.3.0.dist-info/WHEEL +4 -0
- livekit_plugins_munsit-0.3.0.dist-info/licenses/LICENSE +190 -0
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""Munsit plugin for LiveKit Agents.
|
|
2
|
+
|
|
3
|
+
See https://docs.munsit.com for the upstream API documentation.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from ._utils import AudioEnergyFilter
|
|
7
|
+
from .models import MunsitModels
|
|
8
|
+
from .stt import STT, SpeechStream
|
|
9
|
+
from .version import __version__
|
|
10
|
+
|
|
11
|
+
__all__ = ["STT", "SpeechStream", "AudioEnergyFilter", "MunsitModels", "__version__"]
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
from livekit.agents import Plugin
|
|
15
|
+
|
|
16
|
+
from .log import logger
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class MunsitPlugin(Plugin):
    """LiveKit Agents plugin descriptor for the Munsit integration."""

    def __init__(self) -> None:
        """Initialise the base ``Plugin`` with this package's identity and logger."""
        super().__init__(
            __name__,
            __version__,
            __package__,
            logger,
        )
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# Register one plugin instance with LiveKit Agents as a side effect of import.
Plugin.register_plugin(MunsitPlugin())

# Hide every module-level name that is not listed in ``__all__`` from pdoc.
# ``dir()`` is snapshotted first so the helper names created below are not
# themselves part of the snapshot.
_module = dir()
NOT_IN_ALL = [name for name in _module if name not in __all__]

__pdoc__ = {}

# Setting an entry to False tells pdoc to skip that name.
for n in NOT_IN_ALL:
    __pdoc__[n] = False
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
# Copyright 2026 LiveKit, Inc.
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import struct
|
|
5
|
+
import time
|
|
6
|
+
from collections.abc import Callable
|
|
7
|
+
from enum import Enum
|
|
8
|
+
from typing import Generic, TypeVar
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
|
|
12
|
+
from livekit import rtc
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def build_wav_header(
    *, sample_rate: int, num_channels: int = 1, bits_per_sample: int = 16
) -> bytes:
    """Build a 44-byte PCM WAV (RIFF) header for a stream of unknown length.

    The data chunk size is set to a sentinel max value (0xFFFFFFFF - 44) because
    we are streaming and don't know the total size in advance. Munsit only
    validates the format fields.

    Args:
        sample_rate: Samples per second per channel; must be positive and fit
            in the header's unsigned 32-bit field.
        num_channels: Number of interleaved channels; must be positive and fit
            in the header's unsigned 16-bit field.
        bits_per_sample: Sample width in bits; one of 8, 16, 24 or 32.

    Returns:
        The 44 header bytes (little-endian), ready to be prepended to raw PCM.

    Raises:
        ValueError: If any argument is out of range, including values that
            would overflow a fixed-width header field (these previously
            surfaced as an opaque ``struct.error``).
    """
    if sample_rate <= 0:
        raise ValueError("sample_rate must be positive")
    if num_channels <= 0:
        raise ValueError("num_channels must be positive")
    if bits_per_sample not in (8, 16, 24, 32):
        raise ValueError("bits_per_sample must be 8, 16, 24, or 32")

    # bits_per_sample is a multiple of 8, so both divisions are exact.
    block_align = num_channels * bits_per_sample // 8
    byte_rate = sample_rate * block_align

    # The header stores these values in fixed-width unsigned fields; reject
    # anything that cannot be represented instead of failing inside struct.
    if num_channels > 0xFFFF or block_align > 0xFFFF:
        raise ValueError("num_channels is too large for a WAV header")
    if sample_rate > 0xFFFFFFFF or byte_rate > 0xFFFFFFFF:
        raise ValueError("sample_rate is too large for a WAV header")

    data_size = 0xFFFFFFFF - 44  # streaming sentinel: total length unknown
    riff_size = data_size + 36  # RIFF chunk covers everything after its own 8 bytes

    return struct.pack(
        "<4sI4s4sIHHIIHH4sI",
        b"RIFF",
        riff_size,
        b"WAVE",
        b"fmt ",
        16,  # fmt chunk size
        1,  # PCM format
        num_channels,
        sample_rate,
        byte_rate,
        block_align,
        bits_per_sample,
        b"data",
        data_size,
    )
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def pcm_to_audiobuffer(data: bytes | bytearray | memoryview) -> list[int]:
    """Encode raw PCM bytes as Munsit's `audioBuffer` JSON int array (0-255 per byte).

    Args:
        data: The PCM payload. ``bytes`` and ``bytearray`` are used as-is; any
            other buffer (for example a ``memoryview`` whose item format is
            16-bit samples) is first flattened to its raw bytes so that the
            result always has one 0-255 entry per byte rather than one entry
            per multi-byte item.

    Returns:
        A list with one integer in ``range(256)`` for every byte of ``data``.

    Raises:
        TypeError: If ``data`` does not support the buffer protocol.
    """
    if not isinstance(data, (bytes, bytearray)):
        # Iterating a typed memoryview yields items (e.g. signed int16
        # samples), not bytes; normalise to the underlying byte string.
        data = memoryview(data).tobytes()
    return list(data)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
T = TypeVar("T")


class PeriodicCollector(Generic[T]):
    """Accumulate values and call ``callback`` once per ``duration`` seconds.

    Used by SpeechStream to batch audio-duration reports so RECOGNITION_USAGE
    events don't fire on every push.

    Values are combined with ``+``, so ``T`` must support addition with itself.
    The elapsed-time check only happens inside :meth:`push`; nothing is flushed
    in the background.
    """

    def __init__(self, callback: Callable[[T], None], *, duration: float) -> None:
        """Create a collector.

        Args:
            callback: Called with the accumulated total on every flush that
                has something to report.
            duration: Minimum number of seconds between automatic flushes.
        """
        self._duration = duration
        self._callback = callback
        self._last_flush_time = time.monotonic()
        # None means "nothing pushed since the last flush".
        self._total: T | None = None

    def push(self, value: T) -> None:
        """Add ``value`` to the running total and flush if ``duration`` has elapsed."""
        if self._total is None:
            self._total = value
        else:
            # Use a non-mutating ``+`` rather than ``+=``: ``_total`` may still
            # be the very object the caller pushed first, and an in-place add
            # on a mutable type would silently modify the caller's value.
            self._total = self._total + value  # type: ignore[operator]
        if time.monotonic() - self._last_flush_time >= self._duration:
            self.flush()

    def flush(self) -> None:
        """Report the pending total (if any) and restart the flush timer."""
        if self._total is not None:
            self._callback(self._total)
            self._total = None
        self._last_flush_time = time.monotonic()
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
# Default energy threshold. The filter compares the *mean square* of the
# normalised samples against this value, hence the squared amplitude.
_DEFAULT_RMS_THRESHOLD = 0.004**2


class AudioEnergyFilter:
    """Simple RMS-based VAD copied from the Gladia plugin pattern.

    Each call to :meth:`update` classifies one audio frame by its energy and
    advances a small state machine:

    * ``SILENCE`` -> ``START`` on the first frame above the threshold.
    * ``START``/``SPEAKING`` -> ``SPEAKING`` while frames stay loud, and also
      during quiet frames until ``min_silence`` seconds of quiet accumulate.
    * ``START``/``SPEAKING`` -> ``END`` on the first quiet frame after the
      cooldown has run out.
    * ``END`` -> ``SILENCE`` on the next quiet frame, or ``START`` on a loud one.
    """

    class State(Enum):
        START = 0
        SPEAKING = 1
        SILENCE = 2
        END = 3

    def __init__(
        self, *, min_silence: float = 1.5, rms_threshold: float = _DEFAULT_RMS_THRESHOLD
    ) -> None:
        """Create a filter.

        Args:
            min_silence: Seconds of consecutive quiet audio tolerated inside an
                utterance before it is considered finished.
            rms_threshold: Mean-square energy (samples normalised to [-1, 1))
                above which a frame counts as speech.
        """
        self._cooldown_seconds = min_silence
        self._cooldown = min_silence
        self._state = self.State.SILENCE
        self._rms_threshold = rms_threshold

    def update(self, frame: rtc.AudioFrame) -> AudioEnergyFilter.State:
        """Classify ``frame`` and return the filter's new state.

        Args:
            frame: Audio frame whose ``data`` buffer is read as 16-bit signed
                samples and whose ``duration`` is subtracted from the silence
                cooldown (so it is expected to be in seconds, like
                ``min_silence``).

        Returns:
            The state after processing this frame.
        """
        samples = np.frombuffer(frame.data, dtype=np.int16)
        if samples.size:
            normalized = samples.astype(np.float32) / 32768.0
            energy = float(np.mean(np.square(normalized)))
        else:
            # The mean of an empty array is NaN (plus a RuntimeWarning); an
            # empty frame carries no energy, so treat it as quiet explicitly.
            energy = 0.0

        in_utterance = self._state in (self.State.START, self.State.SPEAKING)

        if energy > self._rms_threshold:
            # Any loud frame re-arms the silence cooldown.
            self._cooldown = self._cooldown_seconds
            self._state = self.State.SPEAKING if in_utterance else self.State.START
        elif not in_utterance:
            # Quiet frame outside an utterance: END decays to SILENCE and
            # SILENCE stays SILENCE. This also covers quiet frames received
            # before any speech, which used to be reported as SPEAKING
            # (followed by a spurious END) because the cooldown starts armed.
            self._state = self.State.SILENCE
        elif self._cooldown <= 0:
            # Quiet for at least min_silence: the utterance is over.
            self._state = self.State.END
        else:
            # Short pause inside an utterance: keep speaking, burn cooldown.
            self._cooldown -= frame.duration
            self._state = self.State.SPEAKING

        return self._state
|
|
File without changes
|