converse-framework 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. converse_framework/__init__.py +108 -0
  2. converse_framework/audio_utils.py +412 -0
  3. converse_framework/cuda_utils.py +176 -0
  4. converse_framework/events.py +94 -0
  5. converse_framework/examples/__init__.py +20 -0
  6. converse_framework/examples/subprocess_provider.py +439 -0
  7. converse_framework/examples/text_chat.py +308 -0
  8. converse_framework/examples/voice_chat.py +223 -0
  9. converse_framework/examples/websocket_voice_chat.py +174 -0
  10. converse_framework/js/browser-voice-client.js +248 -0
  11. converse_framework/js/mic-frame-sender.js +445 -0
  12. converse_framework/js/speaker-echo-guard.js +308 -0
  13. converse_framework/js/tts-audio-player.js +237 -0
  14. converse_framework/pipeline.py +620 -0
  15. converse_framework/protocols.py +382 -0
  16. converse_framework/provider_events.py +159 -0
  17. converse_framework/providers/__init__.py +28 -0
  18. converse_framework/providers/faster_whisper.py +290 -0
  19. converse_framework/providers/kokoro_onnx.py +391 -0
  20. converse_framework/providers/llamacpp.py +264 -0
  21. converse_framework/providers/mock.py +171 -0
  22. converse_framework/providers/pocket_tts.py +409 -0
  23. converse_framework/providers/silero.py +161 -0
  24. converse_framework/providers/unavailable.py +137 -0
  25. converse_framework/providers/whisper_cpp.py +322 -0
  26. converse_framework/registry.py +397 -0
  27. converse_framework/session.py +315 -0
  28. converse_framework/transport.py +54 -0
  29. converse_framework/utterance_collector.py +336 -0
  30. converse_framework-0.2.0.dist-info/METADATA +992 -0
  31. converse_framework-0.2.0.dist-info/RECORD +33 -0
  32. converse_framework-0.2.0.dist-info/WHEEL +4 -0
  33. converse_framework-0.2.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,382 @@
1
+ """Provider interfaces and shared dataclasses for the speech stack."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import AsyncIterator, Awaitable, Callable
6
+ from dataclasses import dataclass
7
+ from typing import Protocol, runtime_checkable
8
+
9
+
10
@dataclass(frozen=True)
class ProviderCapabilities:
    """Immutable feature flags declared by a provider implementation.

    Pipeline and transport code read these flags to find out which
    optional features a provider offers (partial transcripts, streaming
    TTS, barge-in, GPU requirement, supported languages) without having
    to instantiate the provider first.

    Every field has a default, so ``ProviderCapabilities()`` describes
    an English-only provider with all optional features switched off.

    Attributes:
        supports_partials: The provider can emit non-final transcript
            chunks while audio is still arriving.
        supports_streaming_tts: The TTS can begin streaming audio
            chunks before the complete text is known.
        supports_barge_in: The provider can detect user speech while
            TTS playback is still running and signal cancellation.
        requires_gpu: The provider needs a GPU at runtime; UI layers
            use the flag to warn the user before selection.
        languages: ISO language codes the provider can handle.
    """

    # Optional features: all disabled unless a provider opts in.
    supports_partials: bool = False
    supports_streaming_tts: bool = False
    supports_barge_in: bool = False

    # Hardware requirement.
    requires_gpu: bool = False

    # Language coverage, English-only by default.
    languages: tuple[str, ...] = ("en",)
36
+
37
+
38
@dataclass(frozen=True)
class ProviderStatus:
    """Immutable point-in-time description of a single provider.

    Provider protocols hand this object back from their ``status``
    property and their ``check_status`` coroutine. ``ready`` is the
    headline flag the UI uses to enable or disable a provider row;
    ``message`` carries the human-readable reason (missing dependency,
    model not loaded, GPU absent, ...).

    Attributes:
        name: Provider name as registered in :mod:`registry`, e.g.
            ``"mock"``, ``"silero"``, ``"faster-whisper"``.
        kind: Provider category: ``"vad"``, ``"asr"``, ``"llm"`` or
            ``"tts"``.
        ready: True when the provider can be used right now.
        message: Human-readable status, shown verbatim in the UI.
        capabilities: Static feature flags for this provider.
        install_hint: Optional package spec to install when an
            optional dependency is missing, e.g.
            ``"converse-framework[silero]"``.
        missing_extra: Optional name of the missing optional
            dependency group, for UI display.
        provider_id: Stable identifier for UI selection when the
            registered ``name`` is aliased.
        selected: True when this provider is the one currently bound
            into the active :class:`ProviderBundle`.
        loaded: True when the heavy backend has been initialised.
            Defaults to True.
        managed_externally: The provider lifecycle belongs to another
            runtime (e.g. a TTS preset manager), so the framework
            should not call :meth:`load` / :meth:`unload` on it.
        supports_model_management: The provider exposes model
            hot-swap.
        supports_voice_selection: The provider exposes voice
            selection.
        voices: Tuple of voice metadata dicts (``id``, ``label``, ...)
            supported by this provider, if discoverable.
        active_voice: Identifier of the currently selected voice, if
            any.
        models: Tuple of model metadata dicts (``id``, ``label``, ...)
            supported by this provider, if discoverable.
        active_model: Identifier of the currently selected model, if
            any.
        status_level: Categorical readiness level, one of ``"ready"``,
            ``"configured"``, ``"loading"``, ``"error"`` or
            ``"unavailable"``. Defaults to ``"ready"``.
    """

    # --- Required identity and readiness fields ----------------------
    name: str
    kind: str
    ready: bool
    message: str
    capabilities: ProviderCapabilities

    # --- Installation guidance ---------------------------------------
    install_hint: str | None = None
    missing_extra: str | None = None

    # --- Selection and lifecycle -------------------------------------
    provider_id: str | None = None
    selected: bool = False
    loaded: bool = True
    managed_externally: bool = False

    # --- Voice / model management ------------------------------------
    supports_model_management: bool = False
    supports_voice_selection: bool = False
    voices: tuple[dict[str, str], ...] = ()
    active_voice: str | None = None
    models: tuple[dict[str, str], ...] = ()
    active_model: str | None = None

    # --- Categorical readiness level ---------------------------------
    status_level: str = "ready"
100
+
101
+
102
@dataclass(frozen=True)
class VoiceInfo:
    """Immutable description of one voice offered by a TTS provider.

    Attributes:
        id: Machine-readable voice identifier, e.g. ``"azelma"``.
        label: Display name shown to humans, e.g. ``"Azelma"``.
        language: ISO language code of the voice (``"en"``, ``"fr"``,
            ...). Defaults to ``"en"``.
        description: Optional free-text description. Defaults to the
            empty string.
        gender: Optional gender hint (``"male"``, ``"female"``,
            ``"neutral"``). Defaults to ``"neutral"``.
    """

    # Required identity.
    id: str
    label: str

    # Optional descriptive metadata.
    language: str = "en"
    description: str = ""
    gender: str = "neutral"
119
+
120
+
121
@dataclass(frozen=True)
class ProviderConfigResult:
    """Immutable outcome of a provider :meth:`~TTSProvider.configure` call.

    Attributes:
        status: Provider status after the change was applied.
        changed: True when at least one option value actually changed.
            Defaults to False.
        requires_reload: True when the change invalidated cached model
            or voice state, so the provider needs a :meth:`load` call
            before it can be used again. Defaults to False.
        message: Human-readable summary of what was changed. Defaults
            to the empty string.
    """

    # The only required field: the post-change status snapshot.
    status: ProviderStatus

    # Outcome flags and summary text.
    changed: bool = False
    requires_reload: bool = False
    message: str = ""
138
+
139
+
140
@dataclass(frozen=True)
class TranscriptEvent:
    """One incremental transcript chunk emitted by an ASR provider.

    An ASR provider streams a sequence of these events for each audio
    turn. Events with ``final=False`` are hypothesis updates that may
    still change; only the last ``final=True`` event of a stream is
    authoritative for the utterance.

    Attributes:
        text: Transcript text of this chunk: the running hypothesis
            for a non-final chunk, the committed utterance text for
            the final one.
        final: True only for the closing, committed transcript of the
            current utterance.
    """

    # Both fields are required; there are no defaults.
    text: str
    final: bool
159
+
160
+
161
@dataclass(frozen=True)
class AudioChunk:
    """One chunk of encoded audio yielded by a TTS provider.

    TTS providers produce a stream of these chunks. The framework
    forwards the audio bytes to transports without interpreting them,
    but attaches enough metadata for downstream consumers to render or
    persist the audio correctly.

    Attributes:
        data: Raw encoded audio bytes; the encoding is described by
            ``encoding`` / ``mime_type``.
        mime_type: Optional MIME hint (``"audio/wav"``,
            ``"audio/mpeg"``, ...), or ``None`` when the provider
            cannot name the encoding.
        sample_rate: Samples per second of the decoded audio, or
            ``None`` when not applicable (e.g. compressed formats
            served whole).
        channels: Channel count of the decoded audio. Defaults to 1.
        encoding: Encoding name (``"pcm_s16le"``, ``"mp3"``,
            ``"wav"``, ...), matching the value the :mod:`audio_utils`
            helpers expect.
        duration_ms: Duration of this chunk in milliseconds when the
            provider can compute it; ``None`` for the first chunk of
            streaming codecs.
        final: True when this is the last chunk of the current
            synthesis request. Defaults to False.
    """

    # Payload (required).
    data: bytes

    # Format metadata.
    mime_type: str | None = None
    sample_rate: int | None = None
    channels: int = 1
    encoding: str | None = None

    # Timing and end-of-stream marker.
    duration_ms: int | None = None
    final: bool = False
197
+
198
+
199
@dataclass(frozen=True)
class VADEvent:
    """One voice-activity decision emitted by a VAD provider.

    The :class:`AudioUtteranceCollector` state machine consumes a
    stream of these events per :class:`AudioFrame` to drive the
    recording lifecycle.

    Attributes:
        type: Event kind. ``"vad.speech_start"`` marks the leading
            edge of detected speech, ``"vad.speech_end"`` the trailing
            edge, and ``"vad.probability"`` is an intermediate level
            readout that leaves the recording state unchanged.
        probability: Confidence of the decision, in ``[0.0, 1.0]``.
        audio_ms: Position within the current utterance, in
            milliseconds from the first frame of the turn.
    """

    # All three fields are required; there are no defaults.
    type: str
    probability: float
    audio_ms: int
220
+
221
+
222
# Signature of the optional ``progress`` hook accepted by
# ``transcribe_audio`` and ``stream_audio_with_progress``: an awaitable
# callable taking a string and a dict and resolving to ``None``.
# NOTE(review): presumably (event/stage name, payload) -- confirm
# against the pipeline call sites.
ProgressCallback = Callable[[str, dict], Awaitable[None]]
223
+
224
+
225
@runtime_checkable
class VADProvider(Protocol):
    """Structural interface for voice-activity-detection providers.

    An implementation receives parsed :class:`AudioFrame` objects one
    at a time and answers with :class:`VADEvent` decisions, which the
    utterance collector turns into utterance boundaries.

    Status is available in several forms:

    * ``status`` -- property exposing the current
      :class:`ProviderStatus`.
    * :meth:`check_status` -- async form that performs a real probe
      (file existence, model loaded, ...).
    * :meth:`probe_status` -- cheap variant that does not load a
      model; defaults to :meth:`check_status`.
    * :meth:`load_status` -- variant that may load heavy resources;
      defaults to :meth:`probe_status`.
    """

    @property
    def status(self) -> ProviderStatus: ...

    async def check_status(self) -> ProviderStatus: ...

    async def probe_status(self) -> ProviderStatus:
        """Return a cheap readiness snapshot without loading heavy resources."""
        snapshot = await self.check_status()
        return snapshot

    async def load_status(self) -> ProviderStatus:
        """Return readiness, possibly loading or initialising heavy resources."""
        snapshot = await self.probe_status()
        return snapshot

    async def process_frame(self, frame) -> list[VADEvent]: ...

    async def unload(self) -> ProviderStatus: ...
257
+
258
+
259
@runtime_checkable
class ASRProvider(Protocol):
    """Structural interface for automatic-speech-recognition providers.

    An implementation takes either raw 16-bit signed little-endian
    mono PCM bytes (audio path) or a transcript seed string (text
    path) and streams :class:`TranscriptEvent` chunks. The pipeline
    uses the text path to keep the public API symmetric between audio
    and chat front-ends.

    By default :meth:`probe_status` delegates to :meth:`check_status`
    and :meth:`load_status` delegates to :meth:`load`.
    """

    @property
    def status(self) -> ProviderStatus: ...

    async def check_status(self) -> ProviderStatus: ...

    async def probe_status(self) -> ProviderStatus:
        """Return a cheap readiness snapshot without loading heavy resources."""
        snapshot = await self.check_status()
        return snapshot

    async def load_status(self) -> ProviderStatus:
        """Return readiness, possibly loading or initialising heavy resources."""
        snapshot = await self.load()
        return snapshot

    async def load(self) -> ProviderStatus: ...

    def transcribe_text_input(
        self, text: str
    ) -> AsyncIterator[TranscriptEvent]: ...

    def transcribe_audio(
        self,
        pcm_s16le: bytes,
        sample_rate: int,
        progress: ProgressCallback | None = None,
    ) -> AsyncIterator[TranscriptEvent]: ...

    async def unload(self) -> ProviderStatus: ...
295
+
296
+
297
@runtime_checkable
class LLMProvider(Protocol):
    """Structural interface for large-language-model providers.

    An implementation receives an OpenAI-style ``messages`` list
    (``[{"role": ..., "content": ...}, ...]``) and streams token
    strings. The pipeline feeds those tokens into the TTS chunker, so
    an implementation does not have to split sentences itself -- a
    plain token stream is the contract.

    By default both :meth:`probe_status` and :meth:`load_status`
    delegate to :meth:`check_status`.
    """

    @property
    def status(self) -> ProviderStatus: ...

    async def check_status(self) -> ProviderStatus: ...

    async def probe_status(self) -> ProviderStatus:
        """Return a cheap readiness snapshot without loading heavy resources."""
        snapshot = await self.check_status()
        return snapshot

    async def load_status(self) -> ProviderStatus:
        """Return readiness, possibly loading or initialising heavy resources."""
        snapshot = await self.check_status()
        return snapshot

    def stream_response(
        self, messages: list[dict[str, str]]
    ) -> AsyncIterator[str]: ...
322
+
323
+
324
@runtime_checkable
class TTSProvider(Protocol):
    """Structural interface for text-to-speech providers.

    An implementation accepts a single text string and streams
    :class:`AudioChunk` objects back. :meth:`stream_audio` is the
    simple contract; :meth:`stream_audio_with_progress` adds an
    optional progress callback that the pipeline uses to emit
    ``tts.progress`` events to the transport layer.

    By default :meth:`probe_status` delegates to :meth:`check_status`,
    :meth:`load_status` delegates to :meth:`load`, :meth:`configure`
    reports that nothing changed, and :meth:`list_voices` returns an
    empty tuple.
    """

    @property
    def status(self) -> ProviderStatus: ...

    async def check_status(self) -> ProviderStatus: ...

    async def probe_status(self) -> ProviderStatus:
        """Return a cheap readiness snapshot without loading heavy resources."""
        return await self.check_status()

    async def load_status(self) -> ProviderStatus:
        """Return readiness, possibly loading or initialising heavy resources."""
        return await self.load()

    async def load(self) -> ProviderStatus: ...

    async def unload(self) -> ProviderStatus: ...

    async def configure(self, **options) -> ProviderConfigResult:
        """Apply configuration changes.

        Which options are supported depends on the provider
        implementation. This default implementation ignores
        ``options`` and reports that nothing was changed.

        Args:
            **options: Provider-specific option values.

        Returns:
            A :class:`ProviderConfigResult` carrying the current
            status from :meth:`check_status`, ``changed=False``,
            ``requires_reload=False`` and an explanatory message.
        """
        # ProviderConfigResult is defined at module level in this file,
        # so it is referenced directly; no function-scope self-import
        # of the module is needed.
        return ProviderConfigResult(
            status=await self.check_status(),
            changed=False,
            requires_reload=False,
            message="configure() is not implemented by this provider.",
        )

    def list_voices(self) -> tuple[VoiceInfo, ...]:
        """Return structured metadata for the voices this provider supports.

        Implementations should prefer static metadata over importing
        the heavy backend just to enumerate voices. This default
        implementation returns an empty tuple.
        """
        return ()

    def stream_audio(self, text: str) -> AsyncIterator[AudioChunk]: ...

    def stream_audio_with_progress(
        self, text: str, progress: ProgressCallback | None = None
    ) -> AsyncIterator[AudioChunk]: ...
@@ -0,0 +1,159 @@
1
+ """Standardised provider lifecycle event helpers.
2
+
3
+ These helpers produce a consistent event shape for provider loading,
4
+ loaded, and error events across VAD, ASR, LLM, and TTS providers.
5
+ The pipeline and provider code emit these alongside existing
6
+ ``asr.progress`` and ``tts.progress`` events for backward compatibility.
7
+
8
+ Event types emitted:
9
+
10
+ * ``provider.loading`` — a provider has begun loading a model.
11
+ * ``provider.loaded`` — a provider has finished loading.
12
+ * ``provider.error`` — a provider encountered a non-recoverable error.
13
+
14
+ Payload fields:
15
+
16
+ * ``kind`` — ``"vad"`` | ``"asr"`` | ``"llm"`` | ``"tts"``
17
+ * ``provider`` — provider name (:attr:`ProviderStatus.name`)
18
+ * ``provider_id`` — stable identifier (:attr:`ProviderStatus.provider_id`)
19
+ * ``stage`` — substage description (``"loading"``, ``"loaded"``, …)
20
+ * ``message`` — human-readable detail
21
+ * ``error_type`` — exception class name for error events
22
+ * ``loaded`` — bool, whether the provider reports loaded after the event
23
+ * ``latency_ms`` — elapsed milliseconds when available
24
+ * ``turn_id`` and ``mode`` — tied to a turn when emitted from pipeline
25
+
26
+ Typical usage::
27
+
28
+ from converse_framework.provider_events import provider_loading_event
29
+
30
+ await sink.emit(**provider_loading_event(
31
+ kind="asr",
32
+ provider="faster-whisper",
33
+ stage="loading",
34
+ message="Loading model...",
35
+ ))
36
+
37
+ Or with latency and turn context::
38
+
39
+ await sink.emit(
40
+ **provider_error_event(
41
+ kind="tts",
42
+ provider="pocket-tts",
43
+ stage="synthesis",
44
+ message=str(exc),
45
+ error_type=type(exc).__name__,
46
+ ),
47
+ turn_id=turn_id,
48
+ mode=turn_mode,
49
+ latency_ms=elapsed_ms(started),
50
+ )
51
+ """
52
+
53
+ from __future__ import annotations
54
+
55
+ from typing import Any
56
+
57
+
58
def provider_loading_event(
    *,
    kind: str,
    provider: str,
    stage: str = "loading",
    message: str = "",
    **extra: Any,
) -> dict[str, Any]:
    """Build the keyword arguments for a ``provider.loading`` event.

    Args:
        kind: Provider category (``"vad"``, ``"asr"``, ``"llm"``,
            ``"tts"``).
        provider: Provider name from :attr:`ProviderStatus.name`.
        stage: Sub-stage label (e.g. ``"loading"``,
            ``"downloading"``).
        message: Human-readable description.
        **extra: Additional fields merged in verbatim after the
            standard ones; a key that collides with a standard field
            replaces its value.

    Returns:
        A flat dict with the keys ``event_type`` (always
        ``"provider.loading"``), ``kind``, ``provider``, ``stage``,
        ``message`` and ``loaded`` (``False``), followed by any
        ``extra`` entries. It is meant to be keyword-expanded, as in
        ``await sink.emit(**result)``.
    """
    event: dict[str, Any] = dict(
        event_type="provider.loading",
        kind=kind,
        provider=provider,
        stage=stage,
        message=message,
        loaded=False,
    )
    # Caller-supplied fields go last so they may extend or override.
    event.update(extra)
    return event
88
+
89
+
90
def provider_loaded_event(
    *,
    kind: str,
    provider: str,
    stage: str = "loaded",
    message: str = "",
    latency_ms: int | None = None,
    **extra: Any,
) -> dict[str, Any]:
    """Build the keyword arguments for a ``provider.loaded`` event.

    Args:
        kind: Provider category.
        provider: Provider name.
        stage: Sub-stage label (e.g. ``"loaded"``).
        message: Human-readable description.
        latency_ms: Elapsed milliseconds of the load operation. The
            ``latency_ms`` key is left out entirely when this is
            ``None``; a value of ``0`` is kept.
        **extra: Additional fields merged in verbatim after the
            standard ones; a key that collides with a standard field
            replaces its value.

    Returns:
        A flat dict with the keys ``event_type`` (always
        ``"provider.loaded"``), ``kind``, ``provider``, ``stage``,
        ``message``, ``loaded`` (``True``), optionally
        ``latency_ms``, and then any ``extra`` entries. It is meant
        to be keyword-expanded, as in ``await sink.emit(**result)``.
    """
    # Zero-or-one-entry mapping so the key is absent when no timing is given.
    timing: dict[str, Any] = (
        {} if latency_ms is None else {"latency_ms": latency_ms}
    )
    return {
        "event_type": "provider.loaded",
        "kind": kind,
        "provider": provider,
        "stage": stage,
        "message": message,
        "loaded": True,
        **timing,
        **extra,
    }
124
+
125
+
126
def provider_error_event(
    *,
    kind: str,
    provider: str,
    stage: str = "",
    message: str = "",
    error_type: str = "Exception",
    loaded: bool = False,
    **extra: Any,
) -> dict[str, Any]:
    """Build the keyword arguments for a ``provider.error`` event.

    Args:
        kind: Provider category.
        provider: Provider name.
        stage: Sub-stage in which the error occurred.
        message: Human-readable error description.
        error_type: Exception class name (e.g. ``"RuntimeError"``).
        loaded: Whether the provider was loaded when the error
            happened.
        **extra: Additional fields merged in verbatim after the
            standard ones; a key that collides with a standard field
            replaces its value.

    Returns:
        A flat dict with the keys ``event_type`` (always
        ``"provider.error"``), ``kind``, ``provider``, ``stage``,
        ``message``, ``error_type`` and ``loaded``, followed by any
        ``extra`` entries. It is meant to be keyword-expanded, as in
        ``await sink.emit(**result)``.
    """
    event: dict[str, Any] = dict(
        event_type="provider.error",
        kind=kind,
        provider=provider,
        stage=stage,
        message=message,
        error_type=error_type,
        loaded=loaded,
    )
    # Caller-supplied fields go last so they may extend or override.
    event.update(extra)
    return event
@@ -0,0 +1,28 @@
1
+ """Built-in provider implementations.
2
+
3
+ Mock and unavailable providers are imported eagerly because they have no
4
+ heavy dependencies. The concrete providers (``silero``, ``faster-whisper``,
5
+ ``llamacpp``, ``kokoro-onnx``, ``pocket-tts``) are not imported here --
6
+ they are registered with :func:`converse_framework.registry.register_provider`
7
+ by import string and loaded lazily on first use.
8
+ """
9
+
10
+ from converse_framework.providers.mock import (
11
+ MockASRProvider,
12
+ MockLLMProvider,
13
+ MockTTSProvider,
14
+ MockVADProvider,
15
+ )
16
+ from converse_framework.providers.unavailable import (
17
+ UnavailableProvider,
18
+ extra_hint_for,
19
+ )
20
+
21
# Names re-exported by the providers package: the four mock providers
# plus the placeholder provider and its install-hint helper, all of
# which are imported eagerly above.
__all__ = [
    "MockASRProvider",
    "MockLLMProvider",
    "MockTTSProvider",
    "MockVADProvider",
    "UnavailableProvider",
    "extra_hint_for",
]