converse-framework 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. converse_framework/__init__.py +108 -0
  2. converse_framework/audio_utils.py +412 -0
  3. converse_framework/cuda_utils.py +176 -0
  4. converse_framework/events.py +94 -0
  5. converse_framework/examples/__init__.py +20 -0
  6. converse_framework/examples/subprocess_provider.py +439 -0
  7. converse_framework/examples/text_chat.py +308 -0
  8. converse_framework/examples/voice_chat.py +223 -0
  9. converse_framework/examples/websocket_voice_chat.py +174 -0
  10. converse_framework/js/browser-voice-client.js +248 -0
  11. converse_framework/js/mic-frame-sender.js +445 -0
  12. converse_framework/js/speaker-echo-guard.js +308 -0
  13. converse_framework/js/tts-audio-player.js +237 -0
  14. converse_framework/pipeline.py +620 -0
  15. converse_framework/protocols.py +382 -0
  16. converse_framework/provider_events.py +159 -0
  17. converse_framework/providers/__init__.py +28 -0
  18. converse_framework/providers/faster_whisper.py +290 -0
  19. converse_framework/providers/kokoro_onnx.py +391 -0
  20. converse_framework/providers/llamacpp.py +264 -0
  21. converse_framework/providers/mock.py +171 -0
  22. converse_framework/providers/pocket_tts.py +409 -0
  23. converse_framework/providers/silero.py +161 -0
  24. converse_framework/providers/unavailable.py +137 -0
  25. converse_framework/providers/whisper_cpp.py +322 -0
  26. converse_framework/registry.py +397 -0
  27. converse_framework/session.py +315 -0
  28. converse_framework/transport.py +54 -0
  29. converse_framework/utterance_collector.py +336 -0
  30. converse_framework-0.2.0.dist-info/METADATA +992 -0
  31. converse_framework-0.2.0.dist-info/RECORD +33 -0
  32. converse_framework-0.2.0.dist-info/WHEEL +4 -0
  33. converse_framework-0.2.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,336 @@
1
+ """VAD-driven audio utterance collector.
2
+
3
+ Encapsulates the state machine that turns a stream of parsed
4
+ :class:`AudioFrame` objects into complete utterance byte buffers ready
5
+ for ASR, while emitting compatible input-level, VAD, and rejection
6
+ events through an :class:`EventSink`.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import logging
12
+ from collections import deque
13
+ from collections.abc import Awaitable, Callable
14
+ from dataclasses import dataclass, field, fields, replace
15
+
16
+ from converse_framework.audio_utils import (
17
+ AudioFrame,
18
+ AudioFrameStats,
19
+ compute_pcm16_level,
20
+ trim_pcm16_silence,
21
+ )
22
+ from converse_framework.events import EventSink
23
+ from converse_framework.protocols import VADProvider
24
+
25
# Module-level logger named after this module's import path.
logger = logging.getLogger(__name__)


# Async hook awaited when the VAD reports a speech start. The collector calls
# it with the frame that triggered the event and the recording mode string.
PreSpeechStartHook = Callable[[AudioFrame, str], Awaitable[None]]

# Async callback awaited with a finished utterance. The collector calls it
# with the utterance PCM bytes, the configured sample rate, and the mode.
UtteranceCallback = Callable[[bytes, int, str], Awaitable[None]]

# Async callback awaited with a short reason string when the active turn is
# cancelled (the collector passes "vad_barge_in" on a VAD speech start).
CancelCallback = Callable[[str], Awaitable[None]]
31
+
32
+
33
+ @dataclass(frozen=True)
34
+ class UtteranceCollectorConfig:
35
+ """Tuning knobs for the VAD utterance collector.
36
+
37
+ Derived frame counts and byte sizes are computed in
38
+ :meth:`__post_init__` so callers can read them after construction.
39
+ """
40
+
41
+ sample_rate: int = 16000
42
+ channels: int = 1
43
+ frame_ms: int = 30
44
+ pre_speech_ms: int = 450
45
+ max_utterance_ms: int = 30000
46
+ min_speech_duration_ms: int = 300
47
+ reject_low_energy_rms: float = 0.003
48
+ reject_low_energy_max_duration_ms: int = 900
49
+ reject_utterance_rms: float = 0.002
50
+ trim_silence_rms: float = 0.003
51
+ trim_silence_frame_ms: int = 30
52
+
53
+ pre_speech_frames: int = field(init=False)
54
+ max_utterance_frames: int = field(init=False)
55
+ bytes_per_ms: int = field(init=False)
56
+ expected_frame_bytes: int = field(init=False)
57
+
58
+ def __post_init__(self) -> None:
59
+ if self.frame_ms <= 0:
60
+ raise ValueError("frame_ms must be > 0")
61
+ if self.channels <= 0:
62
+ raise ValueError("channels must be > 0")
63
+ if self.sample_rate <= 0:
64
+ raise ValueError("sample_rate must be > 0")
65
+ object.__setattr__(
66
+ self, "pre_speech_frames", max(1, self.pre_speech_ms // self.frame_ms)
67
+ )
68
+ object.__setattr__(
69
+ self,
70
+ "max_utterance_frames",
71
+ max(1, self.max_utterance_ms // self.frame_ms),
72
+ )
73
+ object.__setattr__(
74
+ self,
75
+ "bytes_per_ms",
76
+ self.sample_rate * self.channels * 2 // 1000,
77
+ )
78
+ object.__setattr__(
79
+ self, "expected_frame_bytes", self.bytes_per_ms * self.frame_ms
80
+ )
81
+
82
+ def to_dict(self) -> dict[str, int | float]:
83
+ """Return only caller-configurable fields for persistence."""
84
+ return {
85
+ item.name: getattr(self, item.name) for item in fields(self) if item.init
86
+ }
87
+
88
+
89
class AudioUtteranceCollector:
    """VAD-driven utterance collector.

    Accepts parsed :class:`AudioFrame` objects, runs the configured VAD
    provider, applies the rejection gates, and dispatches the resulting
    utterance bytes to a caller callback. Input-level, VAD, and
    rejection events are emitted through the configured
    :class:`EventSink` so transport layers can forward them to clients
    without coupling to the collector itself.
    """

    def __init__(
        self,
        vad_provider: VADProvider,
        event_sink: EventSink,
        utterance_callback: UtteranceCallback,
        config: UtteranceCollectorConfig | None = None,
        cancel_callback: CancelCallback | None = None,
        pre_speech_start_hook: PreSpeechStartHook | None = None,
    ) -> None:
        """Create a collector.

        Args:
            vad_provider: Provider whose ``process_frame`` is awaited for
                every ingested frame.
            event_sink: Sink whose ``emit`` receives all collector events.
            utterance_callback: Awaited with ``(pcm, sample_rate, mode)``
                for every utterance that passes the rejection gates.
            config: Tuning knobs; a default config is used when omitted.
            cancel_callback: Optional callback awaited with a reason string
                on VAD speech start and from :meth:`cancel_active_turn`.
            pre_speech_start_hook: Optional default hook awaited on VAD
                speech start, before the cancel callback.
        """
        self._vad = vad_provider
        self._sink = event_sink
        self._utterance_callback = utterance_callback
        self._cancel_callback = cancel_callback
        self._pre_speech_start_hook = pre_speech_start_hook
        self.config = config or UtteranceCollectorConfig()
        self._audio_stats = self._build_audio_stats()
        # Rolling window of the most recent frames, used to prepend audio
        # from just before the VAD speech-start event.
        self._pre_buffer: deque[bytes] = deque(maxlen=self.config.pre_speech_frames)
        self._utterance_buffer = bytearray()
        self._recording = False
        self._recording_mode = "chat"

    def _build_audio_stats(self) -> AudioFrameStats:
        """Build a fresh frame-stats tracker matching the current config."""
        return AudioFrameStats(
            expected_sample_rate=self.config.sample_rate,
            expected_channels=self.config.channels,
            expected_frame_ms=self.config.frame_ms,
        )

    @property
    def is_recording(self) -> bool:
        """Whether frames are currently being appended to an utterance."""
        return self._recording

    @property
    def current_mode(self) -> str:
        """Mode captured at the most recent VAD speech start."""
        return self._recording_mode

    def update_vad_provider(self, vad_provider: VADProvider) -> None:
        """Swap the VAD provider at runtime.

        Rejected with :class:`RuntimeError` while an utterance is
        being recorded to avoid corrupting the active utterance's
        VAD state. The pre-speech buffer is cleared on a successful swap.

        Args:
            vad_provider: The new VAD provider to use for
                subsequent :meth:`ingest_frame` calls.

        Raises:
            RuntimeError: If an utterance is currently being recorded.
        """
        if self._recording:
            raise RuntimeError("cannot swap VAD provider while recording is active")
        self._vad = vad_provider
        self._pre_buffer.clear()

    def serialize_config(self) -> dict[str, int | float]:
        """Return the current collector configuration for app persistence."""
        return self.config.to_dict()

    def update_config(self, **overrides: int | float) -> UtteranceCollectorConfig:
        """Update collector tuning knobs and rebuild derived state.

        The update is rejected while recording so an in-flight utterance
        cannot be interpreted with mixed frame sizes, sample rates, or
        rejection gates.

        Args:
            **overrides: New values keyed by caller-configurable config
                field name (the keys of :meth:`serialize_config`).

        Returns:
            The new, active configuration.

        Raises:
            RuntimeError: If an utterance is currently being recorded.
            ValueError: If an override names an unknown field.
        """
        if self._recording:
            raise RuntimeError("cannot update collector config while recording")
        unknown = set(overrides) - set(self.config.to_dict())
        if unknown:
            names = ", ".join(sorted(unknown))
            raise ValueError(f"unknown collector config field(s): {names}")
        # ``replace`` re-runs ``__post_init__``, so derived fields follow.
        self.config = replace(self.config, **overrides)
        self._audio_stats = self._build_audio_stats()
        self._pre_buffer = deque(maxlen=self.config.pre_speech_frames)
        self._utterance_buffer.clear()
        return self.config

    async def cancel_active_turn(self, reason: str) -> None:
        """Await the cancel callback with ``reason``; no-op when none is set."""
        if self._cancel_callback is not None:
            await self._cancel_callback(reason)

    async def ingest_frame(
        self,
        frame: AudioFrame,
        *,
        mode: str = "chat",
        pre_speech_start_hook: PreSpeechStartHook | None = None,
    ) -> None:
        """Feed one audio frame through buffering, stats, and the VAD.

        Args:
            frame: The frame to ingest; ``frame.data`` is buffered.
            mode: Mode label recorded at speech start and attached to
                emitted events and the utterance callback.
            pre_speech_start_hook: Per-call override for the hook given to
                the constructor.
        """
        config = self.config
        self._pre_buffer.append(frame.data)

        if self._recording:
            self._utterance_buffer.extend(frame.data)
            max_bytes = config.max_utterance_frames * config.expected_frame_bytes
            if len(self._utterance_buffer) > max_bytes:
                await self._sink.emit(
                    "asr.buffer_warning",
                    message=(
                        "Maximum utterance length reached; closing current utterance."
                    ),
                )
                # Stop appending; the buffered audio is kept and is picked
                # up by the next VAD speech-end event.
                self._recording = False

        metrics = self._audio_stats.update(frame)
        if metrics is not None:
            await self._sink.emit("audio.input_level", **metrics)

        try:
            vad_events = await self._vad.process_frame(frame)
        except ValueError as exc:
            await self._sink.emit("vad.error", message=str(exc))
            return

        hook = (
            pre_speech_start_hook
            if pre_speech_start_hook is not None
            else self._pre_speech_start_hook
        )

        for vad_event in vad_events:
            if vad_event.type == "vad.speech_start":
                await self._on_speech_start(frame, vad_event, mode, hook)
            elif vad_event.type == "vad.speech_end":
                await self._on_speech_end(vad_event, config.sample_rate)
            elif vad_event.type == "vad.probability":
                await self._sink.emit(
                    "vad.probability",
                    mode=mode,
                    probability=vad_event.probability,
                    audio_ms=vad_event.audio_ms,
                )

    async def _on_speech_start(
        self,
        frame: AudioFrame,
        vad_event,
        mode: str,
        hook: PreSpeechStartHook | None,
    ) -> None:
        """Start a new utterance seeded with the pre-speech buffer."""
        self._recording_mode = mode
        if hook is not None:
            await hook(frame, self._recording_mode)
        if self._cancel_callback is not None:
            await self._cancel_callback("vad_barge_in")
        # The pre-buffer already contains the current frame (appended at the
        # top of ``ingest_frame``), so the utterance starts with it included.
        self._utterance_buffer.clear()
        self._utterance_buffer.extend(b"".join(self._pre_buffer))
        self._recording = True
        await self._sink.emit(
            "vad.speech_start",
            mode=self._recording_mode,
            probability=vad_event.probability,
            audio_ms=vad_event.audio_ms,
        )

    async def _on_speech_end(self, vad_event, sample_rate: int) -> None:
        """Close the utterance, gate it, and dispatch it if it survives."""
        self._recording = False
        pcm = bytes(self._utterance_buffer)
        self._utterance_buffer.clear()
        await self._sink.emit(
            "vad.speech_end",
            mode=self._recording_mode,
            probability=vad_event.probability,
            audio_ms=vad_event.audio_ms,
        )
        pcm = await self._apply_rejection_gates(pcm, mode=self._recording_mode)
        if pcm:
            await self._utterance_callback(pcm, sample_rate, self._recording_mode)

    async def _apply_rejection_gates(self, pcm: bytes, *, mode: str) -> bytes:
        """Run the duration/energy gates and silence trimming over ``pcm``.

        Gates run in order: minimum duration, short low-energy, overall
        low-energy, then silence trimming. Each gate is skipped when its
        configured threshold is not ``> 0``.

        Args:
            pcm: Raw utterance bytes.
            mode: Mode label attached to emitted events.

        Returns:
            The (possibly trimmed) utterance, or ``b""`` when a gate
            rejected it. Empty input is returned unchanged.
        """
        if not pcm:
            return pcm

        config = self.config
        # Guard against a zero divisor for very low sample rates.
        bytes_per_ms = max(config.bytes_per_ms, 1)
        duration_ms = len(pcm) // bytes_per_ms

        if 0 < config.min_speech_duration_ms > duration_ms:
            await self._sink.emit(
                "vad.speech_rejected",
                mode=mode,
                duration_ms=duration_ms,
                min_duration_ms=config.min_speech_duration_ms,
            )
            return b""

        # RMS of the whole utterance; computed at most once and shared by
        # both energy gates instead of rescanning the buffer per gate.
        rms = None

        if (
            config.reject_low_energy_rms > 0
            and config.reject_low_energy_max_duration_ms > 0
        ):
            rms = compute_pcm16_level(pcm)["rms"]
            if (
                duration_ms <= config.reject_low_energy_max_duration_ms
                and rms < config.reject_low_energy_rms
            ):
                await self._sink.emit(
                    "vad.speech_rejected",
                    mode=mode,
                    duration_ms=duration_ms,
                    rms=rms,
                    min_rms=config.reject_low_energy_rms,
                    reason="low_energy",
                )
                return b""

        if config.reject_utterance_rms > 0:
            if rms is None:
                rms = compute_pcm16_level(pcm)["rms"]
            if rms < config.reject_utterance_rms:
                await self._sink.emit(
                    "vad.speech_rejected",
                    mode=mode,
                    duration_ms=duration_ms,
                    rms=rms,
                    min_rms=config.reject_utterance_rms,
                    reason="utterance_low_energy",
                )
                return b""

        if config.trim_silence_rms > 0:
            trimmed = trim_pcm16_silence(
                pcm,
                frame_ms=config.trim_silence_frame_ms,
                sample_rate=config.sample_rate,
                rms_threshold=config.trim_silence_rms,
            )
            trimmed_duration_ms = len(trimmed) // bytes_per_ms
            # Only report trimming that changes the whole-millisecond length.
            if trimmed_duration_ms != duration_ms:
                await self._sink.emit(
                    "asr.audio_trimmed",
                    mode=mode,
                    original_duration_ms=duration_ms,
                    trimmed_duration_ms=trimmed_duration_ms,
                    rms_threshold=config.trim_silence_rms,
                )
            pcm = trimmed

        return pcm
329
+
330
+
331
# Public API of this module. ``CancelCallback`` is listed alongside the other
# callback aliases because it appears in ``AudioUtteranceCollector.__init__``'s
# public signature.
__all__ = [
    "AudioUtteranceCollector",
    "CancelCallback",
    "PreSpeechStartHook",
    "UtteranceCallback",
    "UtteranceCollectorConfig",
]