converse-framework 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- converse_framework/__init__.py +108 -0
- converse_framework/audio_utils.py +412 -0
- converse_framework/cuda_utils.py +176 -0
- converse_framework/events.py +94 -0
- converse_framework/examples/__init__.py +20 -0
- converse_framework/examples/subprocess_provider.py +439 -0
- converse_framework/examples/text_chat.py +308 -0
- converse_framework/examples/voice_chat.py +223 -0
- converse_framework/examples/websocket_voice_chat.py +174 -0
- converse_framework/js/browser-voice-client.js +248 -0
- converse_framework/js/mic-frame-sender.js +445 -0
- converse_framework/js/speaker-echo-guard.js +308 -0
- converse_framework/js/tts-audio-player.js +237 -0
- converse_framework/pipeline.py +620 -0
- converse_framework/protocols.py +382 -0
- converse_framework/provider_events.py +159 -0
- converse_framework/providers/__init__.py +28 -0
- converse_framework/providers/faster_whisper.py +290 -0
- converse_framework/providers/kokoro_onnx.py +391 -0
- converse_framework/providers/llamacpp.py +264 -0
- converse_framework/providers/mock.py +171 -0
- converse_framework/providers/pocket_tts.py +409 -0
- converse_framework/providers/silero.py +161 -0
- converse_framework/providers/unavailable.py +137 -0
- converse_framework/providers/whisper_cpp.py +322 -0
- converse_framework/registry.py +397 -0
- converse_framework/session.py +315 -0
- converse_framework/transport.py +54 -0
- converse_framework/utterance_collector.py +336 -0
- converse_framework-0.2.0.dist-info/METADATA +992 -0
- converse_framework-0.2.0.dist-info/RECORD +33 -0
- converse_framework-0.2.0.dist-info/WHEEL +4 -0
- converse_framework-0.2.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,336 @@
|
|
|
1
|
+
"""VAD-driven audio utterance collector.
|
|
2
|
+
|
|
3
|
+
Encapsulates the state machine that turns a stream of parsed
|
|
4
|
+
:class:`AudioFrame` objects into complete utterance byte buffers ready
|
|
5
|
+
for ASR, while emitting compatible input-level, VAD, and rejection
|
|
6
|
+
events through an :class:`EventSink`.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import logging
|
|
12
|
+
from collections import deque
|
|
13
|
+
from collections.abc import Awaitable, Callable
|
|
14
|
+
from dataclasses import dataclass, field, fields, replace
|
|
15
|
+
|
|
16
|
+
from converse_framework.audio_utils import (
|
|
17
|
+
AudioFrame,
|
|
18
|
+
AudioFrameStats,
|
|
19
|
+
compute_pcm16_level,
|
|
20
|
+
trim_pcm16_silence,
|
|
21
|
+
)
|
|
22
|
+
from converse_framework.events import EventSink
|
|
23
|
+
from converse_framework.protocols import VADProvider
|
|
24
|
+
|
|
25
|
+
logger = logging.getLogger(__name__)


# Async hook awaited when the VAD reports speech start, before the cancel
# callback runs; it is called with the triggering frame and the turn's mode.
PreSpeechStartHook = Callable[[AudioFrame, str], Awaitable[None]]
# Async callback that receives an accepted utterance; it is called with the
# PCM bytes, the configured sample rate, and the mode recorded at speech start.
UtteranceCallback = Callable[[bytes, int, str], Awaitable[None]]
# Async callback asked to cancel the active turn; it is called with a reason
# string (the collector itself passes "vad_barge_in" on speech start).
CancelCallback = Callable[[str], Awaitable[None]]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass(frozen=True)
class UtteranceCollectorConfig:
    """Tuning knobs for the VAD utterance collector.

    The first group of fields is caller-configurable. The second group
    (``init=False``) is derived in :meth:`__post_init__` so callers can
    read frame counts and byte sizes after construction. Because the
    derived fields are recomputed on every construction,
    :func:`dataclasses.replace` always yields a consistent instance.

    Raises:
        ValueError: If ``frame_ms``, ``channels`` or ``sample_rate`` is
            not strictly positive.
    """

    sample_rate: int = 16000
    channels: int = 1
    frame_ms: int = 30
    pre_speech_ms: int = 450
    max_utterance_ms: int = 30000
    min_speech_duration_ms: int = 300
    reject_low_energy_rms: float = 0.003
    reject_low_energy_max_duration_ms: int = 900
    reject_utterance_rms: float = 0.002
    trim_silence_rms: float = 0.003
    trim_silence_frame_ms: int = 30

    # Derived values; never passed by callers.
    pre_speech_frames: int = field(init=False)
    max_utterance_frames: int = field(init=False)
    bytes_per_ms: int = field(init=False)
    expected_frame_bytes: int = field(init=False)

    def __post_init__(self) -> None:
        """Validate the core audio parameters and compute derived fields."""
        if self.frame_ms <= 0:
            raise ValueError("frame_ms must be > 0")
        if self.channels <= 0:
            raise ValueError("channels must be > 0")
        if self.sample_rate <= 0:
            raise ValueError("sample_rate must be > 0")

        # 2 bytes per sample: the collector works on 16-bit PCM.
        bytes_per_sample_frame = self.channels * 2
        # Derive the frame size from whole samples per frame instead of the
        # truncated ``bytes_per_ms``; otherwise rates that are not a multiple
        # of 500 Hz (e.g. 44100) under-count every frame.
        samples_per_frame = self.sample_rate * self.frame_ms // 1000
        derived = {
            "pre_speech_frames": max(1, self.pre_speech_ms // self.frame_ms),
            "max_utterance_frames": max(1, self.max_utterance_ms // self.frame_ms),
            "bytes_per_ms": self.sample_rate * bytes_per_sample_frame // 1000,
            "expected_frame_bytes": samples_per_frame * bytes_per_sample_frame,
        }
        # The dataclass is frozen, so bypass its __setattr__ guard.
        for name, value in derived.items():
            object.__setattr__(self, name, value)

    def to_dict(self) -> dict[str, int | float]:
        """Return only caller-configurable fields for persistence."""
        return {
            item.name: getattr(self, item.name) for item in fields(self) if item.init
        }
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class AudioUtteranceCollector:
    """VAD-driven utterance collector.

    Accepts parsed :class:`AudioFrame` objects, runs the configured VAD
    provider, applies the rejection gates, and dispatches the resulting
    utterance bytes to a caller callback.  Input-level, VAD, and
    rejection events are emitted through the configured
    :class:`EventSink` so transport layers can forward them to clients
    without coupling to the collector itself.
    """

    def __init__(
        self,
        vad_provider: VADProvider,
        event_sink: EventSink,
        utterance_callback: UtteranceCallback,
        config: UtteranceCollectorConfig | None = None,
        cancel_callback: CancelCallback | None = None,
        pre_speech_start_hook: PreSpeechStartHook | None = None,
    ) -> None:
        """Wire up the collaborators and initialise empty buffers.

        Args:
            vad_provider: Provider whose ``process_frame`` yields VAD events.
            event_sink: Sink that receives every emitted event.
            utterance_callback: Awaited with ``(pcm, sample_rate, mode)``
                for each utterance that passes the rejection gates.
            config: Tuning knobs; defaults are used when omitted.
            cancel_callback: Optional callback awaited with a reason string.
            pre_speech_start_hook: Optional default hook awaited on speech
                start; can be overridden per :meth:`ingest_frame` call.
        """
        self._vad = vad_provider
        self._sink = event_sink
        self._utterance_callback = utterance_callback
        self._cancel_callback = cancel_callback
        self._pre_speech_start_hook = pre_speech_start_hook
        self.config = config if config is not None else UtteranceCollectorConfig()
        self._audio_stats = self._build_audio_stats()
        # Rolling window of the most recent frames, prepended to an utterance
        # when speech starts.
        self._pre_buffer: deque[bytes] = deque(maxlen=self.config.pre_speech_frames)
        self._utterance_buffer = bytearray()
        self._recording = False
        self._recording_mode = "chat"

    def _build_audio_stats(self) -> AudioFrameStats:
        """Create frame statistics that match the current config."""
        return AudioFrameStats(
            expected_sample_rate=self.config.sample_rate,
            expected_channels=self.config.channels,
            expected_frame_ms=self.config.frame_ms,
        )

    @property
    def is_recording(self) -> bool:
        """Whether frames are currently being appended to an utterance."""
        return self._recording

    @property
    def current_mode(self) -> str:
        """Mode captured at the most recent speech start (initially "chat")."""
        return self._recording_mode

    def update_vad_provider(self, vad_provider: VADProvider) -> None:
        """Swap the VAD provider at runtime.

        Rejected with :class:`RuntimeError` while an utterance is
        being recorded to avoid corrupting the active utterance's
        VAD state.  The pre-speech buffer is cleared on success.

        Args:
            vad_provider: The new VAD provider to use for
                subsequent :meth:`ingest_frame` calls.

        Raises:
            RuntimeError: If an utterance is currently being recorded.
        """
        if self._recording:
            raise RuntimeError("cannot swap VAD provider while recording is active")
        self._vad = vad_provider
        self._pre_buffer.clear()

    def serialize_config(self) -> dict[str, int | float]:
        """Return the current collector configuration for app persistence."""
        return self.config.to_dict()

    def update_config(self, **overrides: int | float) -> UtteranceCollectorConfig:
        """Update collector tuning knobs and rebuild derived state.

        The update is rejected while recording so an in-flight utterance
        cannot be interpreted with mixed frame sizes, sample rates, or
        rejection gates.  On success the frame statistics and pre-speech
        buffer are rebuilt and the utterance buffer is cleared.

        Args:
            **overrides: Caller-configurable config fields to replace.

        Returns:
            The new active configuration.

        Raises:
            RuntimeError: If an utterance is currently being recorded.
            ValueError: If an override names an unknown field (config
                validation errors propagate from the config constructor).
        """
        if self._recording:
            raise RuntimeError("cannot update collector config while recording")
        unknown = set(overrides) - set(self.config.to_dict())
        if unknown:
            names = ", ".join(sorted(unknown))
            raise ValueError(f"unknown collector config field(s): {names}")
        # replace() re-runs validation; self.config is untouched if it raises.
        self.config = replace(self.config, **overrides)
        self._audio_stats = self._build_audio_stats()
        self._pre_buffer = deque(maxlen=self.config.pre_speech_frames)
        self._utterance_buffer.clear()
        return self.config

    async def cancel_active_turn(self, reason: str) -> None:
        """Forward ``reason`` to the cancel callback, if one is configured."""
        if self._cancel_callback is not None:
            await self._cancel_callback(reason)

    async def ingest_frame(
        self,
        frame: AudioFrame,
        *,
        mode: str = "chat",
        pre_speech_start_hook: PreSpeechStartHook | None = None,
    ) -> None:
        """Feed one audio frame through buffering, stats, and the VAD.

        Args:
            frame: Parsed audio frame; ``frame.data`` is buffered.
            mode: Mode label attached to events; latched as the turn's
                mode when the VAD reports speech start.
            pre_speech_start_hook: Per-call override of the hook passed
                to the constructor.
        """
        config = self.config
        self._pre_buffer.append(frame.data)

        if self._recording:
            self._utterance_buffer.extend(frame.data)
            max_bytes = config.max_utterance_frames * config.expected_frame_bytes
            if len(self._utterance_buffer) > max_bytes:
                await self._sink.emit(
                    "asr.buffer_warning",
                    message=(
                        "Maximum utterance length reached; closing current utterance."
                    ),
                )
                # Stop appending only. The buffered audio is kept and is
                # dispatched by the next "vad.speech_end" event.
                self._recording = False

        metrics = self._audio_stats.update(frame)
        if metrics is not None:
            await self._sink.emit("audio.input_level", **metrics)

        try:
            vad_events = await self._vad.process_frame(frame)
        except ValueError as exc:
            # Report the failure and skip VAD handling for this frame.
            await self._sink.emit("vad.error", message=str(exc))
            return

        hook = (
            pre_speech_start_hook
            if pre_speech_start_hook is not None
            else self._pre_speech_start_hook
        )

        for vad_event in vad_events:
            kind = vad_event.type
            if kind == "vad.speech_start":
                await self._on_speech_start(vad_event, frame, mode, hook)
            elif kind == "vad.speech_end":
                await self._on_speech_end(vad_event, config)
            elif kind == "vad.probability":
                await self._sink.emit(
                    "vad.probability",
                    mode=mode,
                    probability=vad_event.probability,
                    audio_ms=vad_event.audio_ms,
                )

    async def _on_speech_start(self, vad_event, frame, mode, hook) -> None:
        """Begin a new utterance seeded with the pre-speech buffer."""
        self._recording_mode = mode
        if hook is not None:
            await hook(frame, self._recording_mode)
        if self._cancel_callback is not None:
            await self._cancel_callback("vad_barge_in")
        # Read the pre-buffer only after the awaits above, then reseed.
        self._utterance_buffer.clear()
        self._utterance_buffer.extend(b"".join(self._pre_buffer))
        self._recording = True
        await self._sink.emit(
            "vad.speech_start",
            mode=self._recording_mode,
            probability=vad_event.probability,
            audio_ms=vad_event.audio_ms,
        )

    async def _on_speech_end(self, vad_event, config) -> None:
        """Close the utterance, run the gates, and dispatch what survives."""
        self._recording = False
        pcm = bytes(self._utterance_buffer)
        self._utterance_buffer.clear()
        await self._sink.emit(
            "vad.speech_end",
            mode=self._recording_mode,
            probability=vad_event.probability,
            audio_ms=vad_event.audio_ms,
        )
        pcm = await self._apply_rejection_gates(pcm, mode=self._recording_mode)
        if pcm:
            await self._utterance_callback(
                pcm, config.sample_rate, self._recording_mode
            )

    async def _apply_rejection_gates(self, pcm: bytes, *, mode: str) -> bytes:
        """Reject or trim an utterance according to the active config.

        Gates run in order: minimum duration, short low-energy, whole
        utterance low-energy, then silence trimming.  A gate whose
        threshold is not positive is skipped.

        Args:
            pcm: Raw utterance bytes.
            mode: Mode label attached to emitted events.

        Returns:
            ``b""`` when a gate rejects the utterance, otherwise the
            (possibly trimmed) audio.
        """
        if not pcm:
            return pcm

        config = self.config
        # Guard against a zero divisor for very low sample rates.
        bytes_per_ms = max(config.bytes_per_ms, 1)
        duration_ms = len(pcm) // bytes_per_ms

        min_duration_ms = config.min_speech_duration_ms
        if min_duration_ms > 0 and duration_ms < min_duration_ms:
            await self._sink.emit(
                "vad.speech_rejected",
                mode=mode,
                duration_ms=duration_ms,
                min_duration_ms=min_duration_ms,
            )
            return b""

        short_gate = (
            config.reject_low_energy_rms > 0
            and config.reject_low_energy_max_duration_ms > 0
        )
        utterance_gate = config.reject_utterance_rms > 0
        if short_gate or utterance_gate:
            # Measure the level once and share it between both energy gates.
            rms = compute_pcm16_level(pcm)["rms"]
            if (
                short_gate
                and duration_ms <= config.reject_low_energy_max_duration_ms
                and rms < config.reject_low_energy_rms
            ):
                await self._sink.emit(
                    "vad.speech_rejected",
                    mode=mode,
                    duration_ms=duration_ms,
                    rms=rms,
                    min_rms=config.reject_low_energy_rms,
                    reason="low_energy",
                )
                return b""
            if utterance_gate and rms < config.reject_utterance_rms:
                await self._sink.emit(
                    "vad.speech_rejected",
                    mode=mode,
                    duration_ms=duration_ms,
                    rms=rms,
                    min_rms=config.reject_utterance_rms,
                    reason="utterance_low_energy",
                )
                return b""

        if config.trim_silence_rms > 0:
            trimmed = trim_pcm16_silence(
                pcm,
                frame_ms=config.trim_silence_frame_ms,
                sample_rate=config.sample_rate,
                rms_threshold=config.trim_silence_rms,
            )
            trimmed_duration_ms = len(trimmed) // bytes_per_ms
            if trimmed_duration_ms != duration_ms:
                await self._sink.emit(
                    "asr.audio_trimmed",
                    mode=mode,
                    original_duration_ms=duration_ms,
                    trimmed_duration_ms=trimmed_duration_ms,
                    rms_threshold=config.trim_silence_rms,
                )
            pcm = trimmed

        return pcm
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
# Public API of this module. ``CancelCallback`` is exported alongside the
# other callback aliases because it appears in the public constructor
# signature of ``AudioUtteranceCollector``.
__all__ = [
    "AudioUtteranceCollector",
    "CancelCallback",
    "PreSpeechStartHook",
    "UtteranceCallback",
    "UtteranceCollectorConfig",
]
|