converse-framework 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. converse_framework/__init__.py +108 -0
  2. converse_framework/audio_utils.py +412 -0
  3. converse_framework/cuda_utils.py +176 -0
  4. converse_framework/events.py +94 -0
  5. converse_framework/examples/__init__.py +20 -0
  6. converse_framework/examples/subprocess_provider.py +439 -0
  7. converse_framework/examples/text_chat.py +308 -0
  8. converse_framework/examples/voice_chat.py +223 -0
  9. converse_framework/examples/websocket_voice_chat.py +174 -0
  10. converse_framework/js/browser-voice-client.js +248 -0
  11. converse_framework/js/mic-frame-sender.js +445 -0
  12. converse_framework/js/speaker-echo-guard.js +308 -0
  13. converse_framework/js/tts-audio-player.js +237 -0
  14. converse_framework/pipeline.py +620 -0
  15. converse_framework/protocols.py +382 -0
  16. converse_framework/provider_events.py +159 -0
  17. converse_framework/providers/__init__.py +28 -0
  18. converse_framework/providers/faster_whisper.py +290 -0
  19. converse_framework/providers/kokoro_onnx.py +391 -0
  20. converse_framework/providers/llamacpp.py +264 -0
  21. converse_framework/providers/mock.py +171 -0
  22. converse_framework/providers/pocket_tts.py +409 -0
  23. converse_framework/providers/silero.py +161 -0
  24. converse_framework/providers/unavailable.py +137 -0
  25. converse_framework/providers/whisper_cpp.py +322 -0
  26. converse_framework/registry.py +397 -0
  27. converse_framework/session.py +315 -0
  28. converse_framework/transport.py +54 -0
  29. converse_framework/utterance_collector.py +336 -0
  30. converse_framework-0.2.0.dist-info/METADATA +992 -0
  31. converse_framework-0.2.0.dist-info/RECORD +33 -0
  32. converse_framework-0.2.0.dist-info/WHEEL +4 -0
  33. converse_framework-0.2.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,94 @@
1
+ """Event sink API and event envelope for the speech stack."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import time
7
+ from dataclasses import dataclass, field
8
+ from typing import Any
9
+
10
+
11
@dataclass
class FrameworkEvent:
    """Envelope for a single event emitted by the framework.

    On the wire an event is ``{"type": str, "ts": float, "payload": dict}``
    (the v0.1 contract).

    * ``type`` is a stable dotted identifier such as ``"turn.started"`` or
      ``"tts.audio"``.
    * ``payload`` carries the keyword arguments given to
      :meth:`EventSink.emit`; it defaults to a fresh empty dict.
    * ``ts`` defaults to the value of :func:`time.perf_counter` at
      construction time, so it is monotonic rather than wall-clock.

    Instances are mutable so a call site may tweak fields before
    forwarding, but the event flow treats them as value objects.
    """

    type: str
    payload: dict[str, Any] = field(default_factory=dict)
    ts: float = field(default_factory=time.perf_counter)

    def to_json(self) -> dict[str, Any]:
        """Return the wire-shaped dict for this event.

        The ``payload`` entry is the same dict object held by the event,
        not a copy.
        """
        wire: dict[str, Any] = {}
        wire["type"] = self.type
        wire["ts"] = self.ts
        wire["payload"] = self.payload
        return wire
35
+
36
+
37
class EventSink:
    """Base interface for emitting typed events during pipeline execution.

    A concrete sink forwards each :meth:`emit` call to whatever delivery
    mechanism it wraps (WebSocket, in-memory queue, log, ...). Pipeline
    code talks to this interface only, so an application can plug in any
    transport without touching the pipeline.

    The base implementation is deliberately unusable: :meth:`emit` raises
    :class:`NotImplementedError` and must be overridden by subclasses.
    """

    async def emit(self, event_type: str, **payload: Any) -> None:
        """Deliver one event; subclasses must provide the implementation.

        Args:
            event_type: Dotted event type string.
            **payload: Event payload as keyword arguments.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()
52
+
53
+
54
class QueueEventSink(EventSink):
    """Event sink that appends wire-shaped dicts to an :class:`asyncio.Queue`.

    Every :meth:`emit` call enqueues one ``{"type", "ts", "payload"}``
    dict on the caller-supplied queue. This lets tests inspect the exact
    event stream a pipeline produced without a real transport.

    Args:
        queue: Queue that receives the events. It stays owned by the
            caller, who is responsible for draining or closing it.
    """

    def __init__(self, queue: asyncio.Queue[dict[str, Any]]):
        self.queue = queue

    async def emit(self, event_type: str, **payload: Any) -> None:
        """Timestamp the event with ``time.perf_counter`` and enqueue it."""
        envelope = {
            "type": event_type,
            "ts": time.perf_counter(),
            "payload": payload,
        }
        await self.queue.put(envelope)
75
+
76
+
77
class TransportEventSink(EventSink):
    """Adapter that routes emitted events through a transport object.

    Intended for consumers that already implement the
    :class:`converse_framework.transport.Transport` protocol and want
    pipeline / collector events delivered via ``send_event`` without
    writing their own adapter class.

    Args:
        transport: Any object exposing an async
            ``send_event(FrameworkEvent)`` method, typically a
            ``Transport`` implementation.
    """

    def __init__(self, transport) -> None:
        self.transport = transport

    async def emit(self, event_type: str, **payload: Any) -> None:
        """Wrap the call in a :class:`FrameworkEvent` and send it."""
        event = FrameworkEvent(event_type, payload)
        await self.transport.send_event(event)
@@ -0,0 +1,20 @@
1
+ """Standalone example consumers for Converse Framework.
2
+
3
+ These examples prove the framework is useful outside the browser
4
+ harness. They live outside the framework's core import path so that
5
+ ``import converse_framework`` stays lightweight and the examples are
6
+ opt-in.
7
+
8
+ Run the text example from the repository root::
9
+
10
+ python -m converse_framework.examples.text_chat
11
+
12
+ The CLI uses the mock provider bundle by default. To try a real
13
+ provider, pass the relevant names and ensure the matching extra is
14
+ installed::
15
+
16
+ python -m converse_framework.examples.text_chat \\
17
+ --asr faster-whisper \\
18
+ --llm llamacpp \\
19
+ --tts kokoro
20
+ """
@@ -0,0 +1,439 @@
1
+ """Subprocess-based ASR provider recipe.
2
+
3
+ This module shows the pattern for wrapping any external CLI binary
4
+ (``whisper-cli`` from whisper.cpp, ``whisper.cpp/main``,
5
+ ``vosk-transcriber``, etc.) as a framework :class:`ASRProvider`. The
6
+ recipe is intentionally minimal so a consumer can copy the class and
7
+ adapt it to any tool that reads audio on stdin and writes a transcript
8
+ on stdout.
9
+
10
+ Pattern:
11
+
12
+ * accept a config with ``binary`` (path or name on PATH) and a
13
+ ``command_template`` (list of argument tokens; ``{model}`` is
14
+ substituted when the command is built for each transcription)
15
+ * in :meth:`transcribe_audio`, build a 44-byte WAV header from the
16
+ caller's sample rate and channels, append the PCM s16le body, and
17
+ pipe the result into the subprocess via ``stdin``
18
+ * read the subprocess's stdout as the transcript and yield a single
19
+ final :class:`TranscriptEvent`
20
+
21
+ The example is testable end-to-end with the bundled fake echo script
22
+ (``--use-fake-echo``), which reads its stdin and prints a fixed transcript
23
+ with the consumed size. That makes the recipe runnable on a clean machine
24
+ without installing ``whisper-cli`` or any other ASR binary — the same
25
+ trick the ``run_subprocess_demo`` driver uses for CI smoke tests.
26
+
27
+ Usage::
28
+
29
+ # with a real whisper-cli install on PATH:
30
+ python -m converse_framework.examples.subprocess_provider \\
31
+ --binary whisper-cli \\
32
+ --model ggml-small.en.bin \\
33
+ --input path/to/16k_mono.wav
34
+
35
+ # to validate the wiring without a real ASR:
36
+ python -m converse_framework.examples.subprocess_provider \\
37
+ --binary "$(which python)" \\
38
+ --use-fake-echo \\
39
+ --input path/to/16k_mono.wav
40
+ """
41
+
42
+ from __future__ import annotations
43
+
44
+ import argparse
45
+ import asyncio
46
+ import io
47
+ import math
48
+ import os
49
+ import shutil
50
+ import struct
51
+ import sys
52
+ import tempfile
53
+ import wave
54
+ from collections.abc import AsyncIterator
55
+ from typing import Any
56
+
57
+ from converse_framework.protocols import (
58
+ ASRProvider,
59
+ ProviderCapabilities,
60
+ ProviderStatus,
61
+ TranscriptEvent,
62
+ )
63
+
64
+
65
# A tiny executable that emits a deterministic transcript on stdout.
# Used by the CLI driver to validate the subprocess wiring
# end-to-end without requiring whisper-cli (or any real ASR binary)
# on PATH. We deliberately do not echo stdin to stdout: raw PCM
# bytes are not valid UTF-8, which would corrupt the captured
# transcript and trip up the test that asserts on it. Instead the
# script prints a known string plus the number of bytes it
# consumed, so the test can confirm both the wiring and the
# round-trip count.
#
# stdin is read through ``sys.stdin.buffer`` (binary mode): the input
# is a WAV file, so a text-mode ``sys.stdin.read()`` could raise
# UnicodeDecodeError on the PCM body and would count decoded
# characters rather than bytes.
FAKE_ECHO_SCRIPT = (
    "#!/usr/bin/env python\n"
    "import sys\n"
    "consumed = len(sys.stdin.buffer.read())\n"
    "sys.stdout.write(f\"fake transcript: consumed={consumed} bytes\")\n"
)
80
+
81
+
82
class SubprocessASRProvider(ASRProvider):
    """ASRProvider that shells out to an external CLI binary.

    The provider wraps a 44-byte WAV header around the caller's PCM
    s16le body and pipes the result into the subprocess's stdin. The
    subprocess's stdout is decoded as UTF-8 and yielded as a single
    final :class:`TranscriptEvent`.

    This is the standard convention for CLI-based ASR tools and is
    the recommended starting point when wrapping any external engine
    that has not yet shipped a first-class provider in the framework
    registry. Consumers can subclass and override
    :meth:`_build_wav_bytes` to swap the WAV-wrapping for a different
    on-disk format the target binary expects.

    Recognised config keys (all optional):

    * ``binary`` - path or PATH name of the CLI tool (default ``""``).
    * ``command_template`` - argument tokens placed after the binary;
      ``{model}`` is replaced with ``model`` each time the command is
      built (default ``["-m", "{model}", "-f", "-"]``).
    * ``model`` - value substituted for ``{model}`` (default ``""``).
    * ``timeout_s`` - per-transcription timeout in seconds (default 120).
    * ``language`` - only reported in the status capabilities; it is
      not added to the command line by this class.
    * ``channels`` - channel count written into the WAV header
      (default 1).
    """

    def __init__(self, config: dict[str, Any]):
        self.binary: str = str(config.get("binary", ""))
        self.command_template: list[str] = list(
            config.get("command_template", ["-m", "{model}", "-f", "-"])
        )
        self.model: str = str(config.get("model", ""))
        self.timeout_s: float = float(config.get("timeout_s", 120))
        self.language: str | None = (
            str(config["language"]) if config.get("language") else None
        )
        self.channels: int = int(config.get("channels", 1))
        # None means "not probed yet"; True/False is the last probe result.
        self._ready: bool | None = None
        self._reason: str = ""

    # ------------------------------------------------------------------
    # Protocol surface
    # ------------------------------------------------------------------

    @property
    def status(self) -> ProviderStatus:
        """Return the current status without changing any cached state.

        When the provider has been marked ready, a ready status with
        capabilities is returned. Otherwise the message explains, in
        order of precedence: missing ``binary`` config, binary not found
        on PATH, or the stored reason / "not initialised".
        """
        if self._ready is True:
            return ProviderStatus(
                name="subprocess",
                kind="asr",
                ready=True,
                message=(
                    f"Subprocess ASR ready: '{self.binary}' "
                    f"with model '{self.model}'."
                ),
                capabilities=ProviderCapabilities(
                    supports_partials=False,
                    languages=(self.language,) if self.language else ("auto",),
                ),
                provider_id="subprocess",
            )
        if not self.binary:
            return self._not_ready_status(
                "Subprocess ASR is not configured: 'binary' is empty. "
                "Set 'binary' in the provider config to the CLI tool "
                "to wrap (e.g. 'whisper-cli' or an absolute path)."
            )
        if not self._is_on_path():
            return self._not_ready_status(
                f"Subprocess ASR cannot find binary '{self.binary}' "
                "on PATH. Install the CLI tool or pass an absolute "
                "path via 'binary'."
            )
        return self._not_ready_status(
            self._reason or "Subprocess ASR not initialised."
        )

    async def check_status(self) -> ProviderStatus:
        """Probe the binary once (if not probed yet) and return the status."""
        if self._ready is None:
            self._probe_binary()
        return self.status

    async def load(self) -> ProviderStatus:
        """Re-probe the binary and return the resulting status.

        Subprocess ASR providers do not preload anything; the binary is
        launched on demand per transcription. Loading only verifies the
        binary is on PATH and remembers the result.
        """
        self._probe_binary()
        return self.status

    async def transcribe_text_input(
        self, text: str
    ) -> AsyncIterator[TranscriptEvent]:
        """Yield the stripped *text* as one final event; nothing if blank."""
        stripped = text.strip()
        if stripped:
            yield TranscriptEvent(text=stripped, final=True)

    async def transcribe_audio(
        self,
        pcm_s16le: bytes,
        sample_rate: int,
        progress=None,
    ) -> AsyncIterator[TranscriptEvent]:
        """Run the configured binary on the audio and yield its transcript.

        Args:
            pcm_s16le: Raw PCM samples, wrapped in a WAV container by
                :meth:`_build_wav_bytes` before being piped to stdin.
            sample_rate: Sample rate written into the WAV header.
            progress: Accepted for interface compatibility; not used by
                this provider.

        Yields:
            A single final :class:`TranscriptEvent` holding the stripped
            stdout, or nothing when stdout is empty.

        Raises:
            RuntimeError: If the provider is not ready, the binary cannot
                be launched, the run exceeds ``timeout_s``, or the
                process exits with a non-zero code.
        """
        if not self._ready:
            await self.load()
        if not self._ready:
            raise RuntimeError(self._reason or self.status.message)

        wav_bytes = self._build_wav_bytes(pcm_s16le, sample_rate=sample_rate)
        cmd = [self.binary, *self._render_command()]
        try:
            proc = await asyncio.create_subprocess_exec(
                *cmd,
                stdin=asyncio.subprocess.PIPE,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
        except FileNotFoundError as exc:
            raise RuntimeError(
                f"Subprocess ASR failed to launch '{self.binary}': {exc}"
            ) from exc
        try:
            stdout, stderr = await asyncio.wait_for(
                proc.communicate(wav_bytes), timeout=self.timeout_s
            )
        except asyncio.TimeoutError as exc:
            await self._kill_and_reap(proc)
            raise RuntimeError(
                f"Subprocess ASR '{self.binary}' timed out after {self.timeout_s}s."
            ) from exc
        except asyncio.CancelledError:
            # The awaiting task was cancelled mid-run: do not leave the
            # child process running after the caller has gone away.
            await self._kill_and_reap(proc)
            raise
        if proc.returncode != 0:
            raise RuntimeError(
                f"Subprocess ASR '{self.binary}' failed with exit code "
                f"{proc.returncode}: "
                f"{stderr.decode('utf-8', errors='replace').strip() or '(no stderr)'}"
            )
        text = stdout.decode("utf-8", errors="replace").strip()
        if text:
            yield TranscriptEvent(text=text, final=True)

    async def unload(self) -> ProviderStatus:
        """Return the current status; there is no persistent state to free."""
        # No persistent state to release; the per-call subprocess already exits.
        return self.status

    # ------------------------------------------------------------------
    # Helpers (subclass-friendly)
    # ------------------------------------------------------------------

    def _not_ready_status(self, message: str) -> ProviderStatus:
        """Build a not-ready status carrying *message*."""
        return ProviderStatus(
            name="subprocess",
            kind="asr",
            ready=False,
            message=message,
            capabilities=ProviderCapabilities(),
            provider_id="subprocess",
        )

    def _probe_binary(self) -> None:
        """Record in ``_ready`` / ``_reason`` whether the binary is usable."""
        if not self.binary:
            self._ready = False
            self._reason = "No 'binary' configured."
        elif not self._is_on_path():
            self._ready = False
            self._reason = f"Binary '{self.binary}' not on PATH."
        else:
            self._ready = True
            self._reason = ""

    @staticmethod
    async def _kill_and_reap(proc: asyncio.subprocess.Process) -> None:
        """Kill *proc* if it is still running, then wait for it to exit."""
        if proc.returncode is None:
            try:
                proc.kill()
            except ProcessLookupError:
                # The process exited between the check and the kill.
                pass
        await proc.wait()

    def _render_command(self) -> list[str]:
        """Return the command template with ``{model}`` substituted."""
        return [
            token.replace("{model}", self.model)
            for token in self.command_template
        ]

    def _is_on_path(self) -> bool:
        """Return True when ``binary`` is set and ``shutil.which`` finds it."""
        return bool(self.binary) and shutil.which(self.binary) is not None

    def _build_wav_bytes(
        self, pcm_s16le: bytes, *, sample_rate: int
    ) -> bytes:
        """Wrap raw PCM s16le bytes in a minimal 44-byte WAV header.

        Most CLI-based ASR engines (whisper-cli, whisper.cpp/main, the
        Vosk CLI, etc.) read a WAV header from stdin and decode the
        body accordingly. This helper is the recommended default;
        override it in a subclass if the target binary expects a
        different on-the-wire format.
        """
        buf = io.BytesIO()
        with wave.open(buf, "wb") as wf:
            wf.setnchannels(self.channels)
            wf.setsampwidth(2)  # 16-bit
            wf.setframerate(sample_rate)
            wf.writeframes(pcm_s16le)
        return buf.getvalue()
280
+
281
+
282
+ # ----------------------------------------------------------------------
283
+ # Driver + CLI
284
+ # ----------------------------------------------------------------------
285
+
286
+
287
def _synthesize_tone_wav(path: str, *, duration_s: float = 1.0, frequency: float = 440.0) -> None:
    """Write a short mono, 16-bit, 16 kHz sine-tone WAV file to *path*.

    The CLI smoke test uses this as its default input so the example
    always has audio to feed the subprocess.

    Args:
        path: Destination file path; it is opened for writing.
        duration_s: Tone length in seconds.
        frequency: Tone frequency in Hz.
    """
    sample_rate = 16000
    n_samples = int(sample_rate * duration_s)
    with wave.open(path, "wb") as writer:
        writer.setnchannels(1)
        writer.setsampwidth(2)
        writer.setframerate(sample_rate)
        # Each sample is a little-endian signed 16-bit value at 30% of
        # full scale.
        pcm = b"".join(
            struct.pack(
                "<h",
                int(0.3 * 32767 * math.sin(2 * math.pi * frequency * k / sample_rate)),
            )
            for k in range(n_samples)
        )
        writer.writeframes(pcm)
304
+
305
+
306
async def run_subprocess_demo(
    *,
    binary: str,
    command_template: list[str],
    input_path: str,
    model: str = "",
    timeout_s: float = 30.0,
) -> str | None:
    """Drive :class:`SubprocessASRProvider` against a WAV file.

    The driver is what the CLI ``__main__`` and the integration test
    both call.

    Args:
        binary: CLI binary to wrap (path or name on PATH).
        command_template: Argument tokens passed after the binary.
        input_path: Path to a mono, 16-bit PCM WAV file.
        model: Value substituted for ``{model}`` in the template.
        timeout_s: Per-transcription timeout in seconds.

    Returns:
        The text of the last transcript event, or ``None`` when the
        subprocess produced no output.

    Raises:
        ValueError: If the input WAV is not mono 16-bit PCM.
    """
    provider = SubprocessASRProvider(
        {
            "binary": binary,
            "command_template": command_template,
            "model": model,
            "timeout_s": timeout_s,
        }
    )
    await provider.load()
    with wave.open(input_path, "rb") as wf:
        # Explicit checks instead of ``assert`` so the validation still
        # runs when Python is started with -O.
        if wf.getnchannels() != 1:
            raise ValueError("demo expects mono input")
        if wf.getsampwidth() != 2:
            raise ValueError("demo expects 16-bit PCM")
        sample_rate = wf.getframerate()
        pcm = wf.readframes(wf.getnframes())
    transcript: str | None = None
    async for event in provider.transcribe_audio(pcm, sample_rate):
        transcript = event.text
    return transcript
338
+
339
+
340
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the subprocess provider example.

    Options: ``--binary`` (required), ``--model``, ``--command-template``
    (zero or more tokens), ``--input`` and the ``--use-fake-echo`` flag.
    """
    cli = argparse.ArgumentParser(
        prog="python -m converse_framework.examples.subprocess_provider",
        description=(
            "Subprocess-based ASR provider recipe. Wraps an external CLI "
            "(e.g. whisper-cli) as an ASRProvider. Use --use-fake-echo to "
            "validate the round-trip without installing a real ASR."
        ),
    )
    # Declared as data and registered in order, so the option list reads
    # as a table.
    option_specs = (
        (
            "--binary",
            {
                "required": True,
                "help": "Path to the CLI binary (or a name on PATH).",
            },
        ),
        (
            "--model",
            {
                "default": "",
                "help": "Model identifier substituted into {model} in the command template.",
            },
        ),
        (
            "--command-template",
            {
                "nargs": "*",
                "default": ["-m", "{model}", "-f", "-"],
                "help": "Tokens to pass after the binary; {model} is replaced with --model.",
            },
        ),
        (
            "--input",
            {
                "type": str,
                "default": None,
                "help": (
                    "Path to a 16 kHz mono WAV file. If omitted, a synthetic 1-second "
                    "tone is generated and fed to the subprocess."
                ),
            },
        ),
        (
            "--use-fake-echo",
            {
                "action": "store_true",
                "help": (
                    "Write the bundled FAKE_ECHO_SCRIPT to a temp file and invoke it "
                    "via --binary's interpreter. Useful for CI smoke tests."
                ),
            },
        ),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    return cli
383
+
384
+
385
def main(argv: list[str] | None = None) -> int:
    """Run the subprocess provider example from the command line.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        ``0`` after printing the transcript (or a placeholder when none
        was returned). Temporary files created along the way are removed
        before returning, including when an error propagates.
    """
    args = _build_arg_parser().parse_args(argv)

    cleanup_paths: list[str] = []
    try:
        binary = args.binary
        if args.use_fake_echo:
            # Write the fake echo script to a temp file and use the
            # configured binary as its interpreter. The command
            # template starts with the script path, so the script
            # receives the WAV on stdin; it never reads its extra
            # arguments and prints a fixed transcript with the
            # consumed size.
            fd, script_path = tempfile.mkstemp(suffix=".py", prefix="fake_echo_")
            # Register for cleanup before writing so a failed write
            # does not leave the file behind.
            cleanup_paths.append(script_path)
            with os.fdopen(fd, "w", encoding="utf-8") as handle:
                handle.write(FAKE_ECHO_SCRIPT)
            command_template = [script_path, *args.command_template]
        else:
            command_template = args.command_template

        if args.input is None:
            fd, tone_path = tempfile.mkstemp(suffix=".wav", prefix="subproc_tone_")
            os.close(fd)
            # Same ordering as above: track the path first, then fill it.
            cleanup_paths.append(tone_path)
            _synthesize_tone_wav(tone_path)
            input_path = tone_path
        else:
            input_path = args.input

        transcript = asyncio.run(
            run_subprocess_demo(
                binary=binary,
                command_template=command_template,
                input_path=input_path,
                model=args.model,
            )
        )

        if transcript is None:
            print("(no transcript returned)")
        else:
            print(f"transcript: {transcript!r}")
        return 0
    finally:
        # Best-effort removal: a file that is already gone is fine.
        for path in cleanup_paths:
            try:
                os.unlink(path)
            except OSError:
                pass
436
+
437
+
438
# Script entry point: exit with the status code returned by main().
if __name__ == "__main__":  # pragma: no cover - manual example
    raise SystemExit(main())