converse-framework 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- converse_framework/__init__.py +108 -0
- converse_framework/audio_utils.py +412 -0
- converse_framework/cuda_utils.py +176 -0
- converse_framework/events.py +94 -0
- converse_framework/examples/__init__.py +20 -0
- converse_framework/examples/subprocess_provider.py +439 -0
- converse_framework/examples/text_chat.py +308 -0
- converse_framework/examples/voice_chat.py +223 -0
- converse_framework/examples/websocket_voice_chat.py +174 -0
- converse_framework/js/browser-voice-client.js +248 -0
- converse_framework/js/mic-frame-sender.js +445 -0
- converse_framework/js/speaker-echo-guard.js +308 -0
- converse_framework/js/tts-audio-player.js +237 -0
- converse_framework/pipeline.py +620 -0
- converse_framework/protocols.py +382 -0
- converse_framework/provider_events.py +159 -0
- converse_framework/providers/__init__.py +28 -0
- converse_framework/providers/faster_whisper.py +290 -0
- converse_framework/providers/kokoro_onnx.py +391 -0
- converse_framework/providers/llamacpp.py +264 -0
- converse_framework/providers/mock.py +171 -0
- converse_framework/providers/pocket_tts.py +409 -0
- converse_framework/providers/silero.py +161 -0
- converse_framework/providers/unavailable.py +137 -0
- converse_framework/providers/whisper_cpp.py +322 -0
- converse_framework/registry.py +397 -0
- converse_framework/session.py +315 -0
- converse_framework/transport.py +54 -0
- converse_framework/utterance_collector.py +336 -0
- converse_framework-0.2.0.dist-info/METADATA +992 -0
- converse_framework-0.2.0.dist-info/RECORD +33 -0
- converse_framework-0.2.0.dist-info/WHEEL +4 -0
- converse_framework-0.2.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""Event sink API and event envelope for the speech stack."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import time
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class FrameworkEvent:
    """Envelope for a single event emitted by the framework.

    Serialised form (the v0.1 wire contract)::

        {"type": str, "ts": float, "payload": dict}

    Attributes are documented inline below. Instances are ordinary
    (non-frozen) dataclass objects, so a call site may tweak a field
    before forwarding the event.
    """

    # Stable dotted event name, e.g. ``"turn.started"`` or ``"tts.audio"``.
    type: str
    # Keyword arguments that were supplied to ``EventSink.emit``.
    payload: dict[str, Any] = field(default_factory=dict)
    # Monotonic timestamp; defaults to ``time.perf_counter()`` at creation.
    ts: float = field(default_factory=time.perf_counter)

    def to_json(self) -> dict[str, Any]:
        """Return the event as a wire-shaped dict.

        Keys are inserted in the order ``type``, ``ts``, ``payload``.
        The payload is passed through by reference, not copied.
        """
        return dict(type=self.type, ts=self.ts, payload=self.payload)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class EventSink:
    """Base interface for objects that receive pipeline events.

    A sink turns each :meth:`emit` call into a delivery on whatever
    channel it wraps (WebSocket, in-memory queue, log, ...). Pipeline
    code talks only to this interface, so an application can plug in
    any transport without touching the pipeline.

    This base implementation delivers nothing: :meth:`emit` raises
    :class:`NotImplementedError` and must be overridden by subclasses.
    """

    async def emit(self, event_type: str, **payload: Any) -> None:
        """Deliver one event.

        Args:
            event_type: Event name, e.g. ``"turn.started"``.
            **payload: Arbitrary keyword data attached to the event.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class QueueEventSink(EventSink):
    """Event sink that appends wire-shaped dicts to an :class:`asyncio.Queue`.

    Every :meth:`emit` call enqueues one ``{"type", "ts", "payload"}``
    dict on the caller-supplied queue. This lets tests inspect the
    exact event stream a pipeline produced without a real transport.

    Args:
        queue: Destination queue. The sink only ever puts into it;
            draining and lifetime management stay with the caller.
    """

    def __init__(self, queue: asyncio.Queue[dict[str, Any]]):
        self.queue = queue

    async def emit(self, event_type: str, **payload: Any) -> None:
        """Enqueue one event, timestamped with ``time.perf_counter()``."""
        # Go through the envelope so the queued dict is produced by the
        # same code path that defines the wire shape.
        wire_event = FrameworkEvent(event_type, payload).to_json()
        await self.queue.put(wire_event)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class TransportEventSink(EventSink):
    """Adapter that routes emitted events into a transport object.

    Intended for consumers that already have an implementation of the
    :class:`converse_framework.transport.Transport` protocol and want
    pipeline / collector events to arrive through its ``send_event``
    method, without writing a dedicated adapter class.

    Args:
        transport: Any object exposing an awaitable
            ``send_event(FrameworkEvent)`` method, typically a
            ``Transport`` implementation.
    """

    def __init__(self, transport) -> None:
        self.transport = transport

    async def emit(self, event_type: str, **payload: Any) -> None:
        """Wrap the call in a :class:`FrameworkEvent` and send it."""
        envelope = FrameworkEvent(type=event_type, payload=payload)
        await self.transport.send_event(envelope)
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""Standalone example consumers for Converse Framework.
|
|
2
|
+
|
|
3
|
+
These examples prove the framework is useful outside the browser
|
|
4
|
+
harness. They live outside the framework's core import path so that
|
|
5
|
+
``import converse_framework`` stays lightweight and the examples are
|
|
6
|
+
opt-in.
|
|
7
|
+
|
|
8
|
+
Run the text example from the repository root::
|
|
9
|
+
|
|
10
|
+
python -m converse_framework.examples.text_chat
|
|
11
|
+
|
|
12
|
+
The CLI uses the mock provider bundle by default. To try a real
|
|
13
|
+
provider, pass the relevant names and ensure the matching extra is
|
|
14
|
+
installed::
|
|
15
|
+
|
|
16
|
+
python -m converse_framework.examples.text_chat \\
|
|
17
|
+
--asr faster-whisper \\
|
|
18
|
+
--llm llamacpp \\
|
|
19
|
+
--tts kokoro
|
|
20
|
+
"""
|
|
@@ -0,0 +1,439 @@
|
|
|
1
|
+
"""Subprocess-based ASR provider recipe.
|
|
2
|
+
|
|
3
|
+
This module shows the pattern for wrapping any external CLI binary
|
|
4
|
+
(``whisper-cli`` from whisper.cpp, ``whisper.cpp/main``,
|
|
5
|
+
``vosk-transcriber``, etc.) as a framework :class:`ASRProvider`. The
|
|
6
|
+
recipe is intentionally minimal so a consumer can copy the class and
|
|
7
|
+
adapt it to any tool that reads audio on stdin and writes a transcript
|
|
8
|
+
on stdout.
|
|
9
|
+
|
|
10
|
+
Pattern:
|
|
11
|
+
|
|
12
|
+
* accept a config with ``binary`` (path or name on PATH) and a
|
|
13
|
+
``command_template`` (list of argument tokens; ``{model}`` is
|
|
14
|
+
substituted each time the command line is rendered)
|
|
15
|
+
* in :meth:`transcribe_audio`, build a 44-byte WAV header from the
|
|
16
|
+
caller's sample rate and channels, append the PCM s16le body, and
|
|
17
|
+
pipe the result into the subprocess via ``stdin``
|
|
18
|
+
* read the subprocess's stdout as the transcript and yield a single
|
|
19
|
+
final :class:`TranscriptEvent`
|
|
20
|
+
|
|
21
|
+
The example is testable end-to-end with the bundled fake echo script
|
|
22
|
+
(``--use-fake-echo``), which prints a fixed transcript plus the number of stdin bytes it consumed. That
|
|
23
|
+
makes the recipe runnable on a clean machine without installing
|
|
24
|
+
``whisper-cli`` or any other ASR binary — the same trick the
|
|
25
|
+
``run_subprocess_demo`` driver uses for CI smoke tests.
|
|
26
|
+
|
|
27
|
+
Usage::
|
|
28
|
+
|
|
29
|
+
# with a real whisper-cli install on PATH:
|
|
30
|
+
python -m converse_framework.examples.subprocess_provider \\
|
|
31
|
+
--binary whisper-cli \\
|
|
32
|
+
--model ggml-small.en.bin \\
|
|
33
|
+
--input path/to/16k_mono.wav
|
|
34
|
+
|
|
35
|
+
# to validate the wiring without a real ASR:
|
|
36
|
+
python -m converse_framework.examples.subprocess_provider \\
|
|
37
|
+
--binary "$(which python)" \\
|
|
38
|
+
--use-fake-echo \\
|
|
39
|
+
--input path/to/16k_mono.wav
|
|
40
|
+
"""
|
|
41
|
+
|
|
42
|
+
from __future__ import annotations
|
|
43
|
+
|
|
44
|
+
import argparse
|
|
45
|
+
import asyncio
|
|
46
|
+
import io
|
|
47
|
+
import math
|
|
48
|
+
import os
|
|
49
|
+
import shutil
|
|
50
|
+
import struct
|
|
51
|
+
import sys
|
|
52
|
+
import tempfile
|
|
53
|
+
import wave
|
|
54
|
+
from collections.abc import AsyncIterator
|
|
55
|
+
from typing import Any
|
|
56
|
+
|
|
57
|
+
from converse_framework.protocols import (
|
|
58
|
+
ASRProvider,
|
|
59
|
+
ProviderCapabilities,
|
|
60
|
+
ProviderStatus,
|
|
61
|
+
TranscriptEvent,
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# A tiny executable that emits a deterministic transcript on stdout.
# Used by the CLI driver to validate the subprocess wiring
# end-to-end without requiring whisper-cli (or any real ASR binary)
# on PATH. We deliberately do not echo stdin to stdout: raw PCM
# bytes are not valid UTF-8, which would corrupt the captured
# transcript and trip up the test that asserts on it. Instead the
# script prints a known string plus the number of bytes it
# consumed, so the test can confirm both the wiring and the
# round-trip count.
#
# stdin is read through ``sys.stdin.buffer`` (binary): the text-mode
# ``sys.stdin`` would try to decode the WAV payload, which either
# raises UnicodeDecodeError or yields a character count rather than
# a byte count.
FAKE_ECHO_SCRIPT = (
    "#!/usr/bin/env python\n"
    "import sys\n"
    "consumed = len(sys.stdin.buffer.read())\n"
    "sys.stdout.write(f\"fake transcript: consumed={consumed} bytes\")\n"
)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class SubprocessASRProvider(ASRProvider):
    """ASRProvider that shells out to an external CLI binary.

    The provider wraps a 44-byte WAV header around the caller's PCM
    s16le body and pipes the result into the subprocess's stdin. The
    subprocess's stdout is decoded as UTF-8 and yielded as a single
    final :class:`TranscriptEvent`.

    This is the standard convention for CLI-based ASR tools and is
    the recommended starting point when wrapping any external engine
    that has not yet shipped a first-class provider in the framework
    registry. Consumers can subclass and override
    :meth:`_build_wav_bytes` to swap the WAV-wrapping for a different
    on-disk format the target binary expects.

    Args:
        config: Provider settings. Recognised keys (all optional):

            * ``binary`` -- executable path or name on PATH (default ``""``).
            * ``command_template`` -- argument tokens passed after the
              binary; ``{model}`` inside a token is replaced with
              ``model`` (default ``["-m", "{model}", "-f", "-"]``).
            * ``model`` -- value substituted for ``{model}`` (default ``""``).
            * ``timeout_s`` -- seconds to wait for the subprocess
              (default ``120``).
            * ``language`` -- language tag reported in the ready status;
              falsy values are stored as ``None``.
            * ``channels`` -- channel count written to the WAV header
              (default ``1``).
    """

    def __init__(self, config: dict[str, Any]):
        self.binary: str = str(config.get("binary", ""))
        self.command_template: list[str] = list(
            config.get("command_template", ["-m", "{model}", "-f", "-"])
        )
        self.model: str = str(config.get("model", ""))
        self.timeout_s: float = float(config.get("timeout_s", 120))
        self.language: str | None = (
            str(config["language"]) if config.get("language") else None
        )
        self.channels: int = int(config.get("channels", 1))
        # ``None`` means "not probed yet"; load()/check_status() settle it.
        self._ready: bool | None = None
        self._reason: str = ""

    # ------------------------------------------------------------------
    # Protocol surface
    # ------------------------------------------------------------------

    @property
    def status(self) -> ProviderStatus:
        """Describe the provider's current readiness.

        Returns a ready status once a probe has succeeded. Otherwise
        the message names the first problem found: no binary
        configured, binary not resolvable on PATH, or (when the binary
        resolves but no probe has succeeded) the stored reason or a
        generic "not initialised" note.
        """
        if self._ready is True:
            return self._make_status(
                True,
                f"Subprocess ASR ready: '{self.binary}' "
                f"with model '{self.model}'.",
                ProviderCapabilities(
                    supports_partials=False,
                    languages=(self.language,) if self.language else ("auto",),
                ),
            )
        if not self.binary:
            message = (
                "Subprocess ASR is not configured: 'binary' is empty. "
                "Set 'binary' in the provider config to the CLI tool "
                "to wrap (e.g. 'whisper-cli' or an absolute path)."
            )
        elif not self._is_on_path():
            message = (
                f"Subprocess ASR cannot find binary '{self.binary}' "
                "on PATH. Install the CLI tool or pass an absolute "
                "path via 'binary'."
            )
        else:
            message = self._reason or "Subprocess ASR not initialised."
        return self._make_status(False, message, ProviderCapabilities())

    async def check_status(self) -> ProviderStatus:
        """Probe the binary once (if not probed yet) and return the status."""
        if self._ready is None:
            self._probe()
        return self.status

    async def load(self) -> ProviderStatus:
        """Re-probe the binary and return the resulting status.

        Subprocess ASR providers do not preload anything; the binary
        is launched on demand per transcription. Loading only checks
        that the binary resolves and remembers the result.
        """
        self._probe()
        return self.status

    async def transcribe_text_input(
        self, text: str
    ) -> AsyncIterator[TranscriptEvent]:
        """Yield *text* (stripped) as one final transcript, if non-empty."""
        stripped = text.strip()
        if stripped:
            yield TranscriptEvent(text=stripped, final=True)

    async def transcribe_audio(
        self,
        pcm_s16le: bytes,
        sample_rate: int,
        progress=None,
    ) -> AsyncIterator[TranscriptEvent]:
        """Run the binary on one utterance and yield its transcript.

        Args:
            pcm_s16le: Raw PCM body; it is written into the WAV
                container unchanged with a 16-bit sample width.
            sample_rate: Sample rate written to the WAV header.
            progress: Accepted for signature compatibility; this
                implementation does not use it.

        Yields:
            A single final :class:`TranscriptEvent` holding the
            stripped stdout, or nothing when stdout is empty.

        Raises:
            RuntimeError: If the provider is not ready, the binary
                cannot be launched, the subprocess exceeds
                ``timeout_s``, or it exits with a non-zero code.
        """
        if not self._ready:
            await self.load()
        if not self._ready:
            raise RuntimeError(self._reason or self.status.message)

        wav_bytes = self._build_wav_bytes(pcm_s16le, sample_rate=sample_rate)
        cmd = [self.binary, *self._render_command()]
        try:
            proc = await asyncio.create_subprocess_exec(
                *cmd,
                stdin=asyncio.subprocess.PIPE,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
        except OSError as exc:
            # FileNotFoundError is an OSError; catching the base class
            # also covers permission and exec-format failures so every
            # launch problem surfaces as the same RuntimeError.
            raise RuntimeError(
                f"Subprocess ASR failed to launch '{self.binary}': {exc}"
            ) from exc
        try:
            stdout, stderr = await asyncio.wait_for(
                proc.communicate(wav_bytes), timeout=self.timeout_s
            )
        except asyncio.TimeoutError as exc:
            raise RuntimeError(
                f"Subprocess ASR '{self.binary}' timed out after {self.timeout_s}s."
            ) from exc
        finally:
            # Reached on timeout and on task cancellation alike: never
            # leave the child running once we stop waiting for it.
            if proc.returncode is None:
                try:
                    proc.kill()
                except ProcessLookupError:
                    # The child exited between the check and the kill.
                    pass
                await proc.wait()
        if proc.returncode != 0:
            raise RuntimeError(
                f"Subprocess ASR '{self.binary}' failed with exit code "
                f"{proc.returncode}: "
                f"{stderr.decode('utf-8', errors='replace').strip() or '(no stderr)'}"
            )
        text = stdout.decode("utf-8", errors="replace").strip()
        if text:
            yield TranscriptEvent(text=text, final=True)

    async def unload(self) -> ProviderStatus:
        """Return the current status; there is nothing to release."""
        # No persistent state to release; the per-call subprocess already exits.
        return self.status

    # ------------------------------------------------------------------
    # Helpers (subclass-friendly)
    # ------------------------------------------------------------------

    def _make_status(
        self,
        ready: bool,
        message: str,
        capabilities: ProviderCapabilities,
    ) -> ProviderStatus:
        """Build a ``ProviderStatus`` with this provider's fixed identity."""
        return ProviderStatus(
            name="subprocess",
            kind="asr",
            ready=ready,
            message=message,
            capabilities=capabilities,
            provider_id="subprocess",
        )

    def _probe(self) -> None:
        """Record in ``_ready`` / ``_reason`` whether the binary resolves."""
        if not self.binary:
            self._ready = False
            self._reason = "No 'binary' configured."
        elif not self._is_on_path():
            self._ready = False
            self._reason = f"Binary '{self.binary}' not on PATH."
        else:
            self._ready = True
            self._reason = ""

    def _render_command(self) -> list[str]:
        """Return the template tokens with ``{model}`` filled in."""
        return [
            token.replace("{model}", self.model)
            for token in self.command_template
        ]

    def _is_on_path(self) -> bool:
        """Return True when ``shutil.which`` can resolve the binary."""
        return bool(self.binary) and shutil.which(self.binary) is not None

    def _build_wav_bytes(
        self, pcm_s16le: bytes, *, sample_rate: int
    ) -> bytes:
        """Wrap raw PCM s16le bytes in a minimal 44-byte WAV header.

        Most CLI-based ASR engines (whisper-cli, whisper.cpp/main, the
        Vosk CLI, etc.) read a WAV header from stdin and decode the
        body accordingly. This helper is the recommended default;
        override it in a subclass if the target binary expects a
        different on-the-wire format.
        """
        buf = io.BytesIO()
        with wave.open(buf, "wb") as wf:
            wf.setnchannels(self.channels)
            wf.setsampwidth(2)  # 16-bit
            wf.setframerate(sample_rate)
            wf.writeframes(pcm_s16le)
        return buf.getvalue()
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
# ----------------------------------------------------------------------
|
|
283
|
+
# Driver + CLI
|
|
284
|
+
# ----------------------------------------------------------------------
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
def _synthesize_tone_wav(path: str, *, duration_s: float = 1.0, frequency: float = 440.0) -> None:
    """Write a mono, 16-bit, 16 kHz sine tone to *path* as a WAV file.

    The CLI uses this to produce a default input when none is given,
    so the example always has audio to feed the subprocess.

    Args:
        path: Destination file; opened for writing via :mod:`wave`.
        duration_s: Tone length in seconds.
        frequency: Tone frequency in Hz.
    """
    sample_rate = 16000
    frame_count = int(sample_rate * duration_s)
    with wave.open(path, "wb") as out:
        out.setnchannels(1)
        out.setsampwidth(2)
        out.setframerate(sample_rate)
        # 0.3 of full scale keeps every sample well inside int16 range.
        samples = [
            int(0.3 * 32767 * math.sin(2 * math.pi * frequency * n / sample_rate))
            for n in range(frame_count)
        ]
        # One pack call emits all samples as little-endian int16.
        out.writeframes(struct.pack(f"<{len(samples)}h", *samples))
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
async def run_subprocess_demo(
    *,
    binary: str,
    command_template: list[str],
    input_path: str,
    model: str = "",
    timeout_s: float = 30.0,
) -> str | None:
    """Drive :class:`SubprocessASRProvider` against a WAV file.

    The driver is what the CLI ``__main__`` and the integration test
    both call.

    Args:
        binary: Executable path or name handed to the provider.
        command_template: Argument tokens passed after the binary.
        input_path: WAV file to transcribe; must be mono 16-bit PCM.
        model: Value substituted for ``{model}`` in the template.
        timeout_s: Seconds the provider waits for the subprocess.

    Returns:
        The transcript text, or ``None`` when the subprocess produced
        no output.

    Raises:
        ValueError: If the input file is not mono or not 16-bit PCM.
    """
    provider = SubprocessASRProvider(
        {
            "binary": binary,
            "command_template": command_template,
            "model": model,
            "timeout_s": timeout_s,
        }
    )
    await provider.load()
    with wave.open(input_path, "rb") as wf:
        # Explicit checks rather than ``assert``: assertions vanish
        # under ``python -O``, which would let a mismatched file
        # through to be re-wrapped with a wrong WAV header.
        if wf.getnchannels() != 1:
            raise ValueError("demo expects mono input")
        if wf.getsampwidth() != 2:
            raise ValueError("demo expects 16-bit PCM")
        sample_rate = wf.getframerate()
        pcm = wf.readframes(wf.getnframes())
    transcript: str | None = None
    # Keep the text of the last event yielded by the provider.
    async for event in provider.transcribe_audio(pcm, sample_rate):
        transcript = event.text
    return transcript
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the subprocess ASR demo.

    Options: ``--binary`` (required), ``--model``,
    ``--command-template`` (zero or more tokens), ``--input`` and the
    ``--use-fake-echo`` switch.
    """
    summary = (
        "Subprocess-based ASR provider recipe. Wraps an external CLI "
        "(e.g. whisper-cli) as an ASRProvider. Use --use-fake-echo to "
        "validate the round-trip without installing a real ASR."
    )
    cli = argparse.ArgumentParser(
        prog="python -m converse_framework.examples.subprocess_provider",
        description=summary,
    )

    # (flag, add_argument keyword options), in registration order.
    option_specs = [
        (
            "--binary",
            {
                "required": True,
                "help": "Path to the CLI binary (or a name on PATH).",
            },
        ),
        (
            "--model",
            {
                "default": "",
                "help": "Model identifier substituted into {model} in the command template.",
            },
        ),
        (
            "--command-template",
            {
                "nargs": "*",
                "default": ["-m", "{model}", "-f", "-"],
                "help": "Tokens to pass after the binary; {model} is replaced with --model.",
            },
        ),
        (
            "--input",
            {
                "type": str,
                "default": None,
                "help": (
                    "Path to a 16 kHz mono WAV file. If omitted, a synthetic 1-second "
                    "tone is generated and fed to the subprocess."
                ),
            },
        ),
        (
            "--use-fake-echo",
            {
                "action": "store_true",
                "help": (
                    "Write the bundled FAKE_ECHO_SCRIPT to a temp file and invoke it "
                    "via --binary's interpreter. Useful for CI smoke tests."
                ),
            },
        ),
    ]
    for flag, options in option_specs:
        cli.add_argument(flag, **options)
    return cli
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
def main(argv: list[str] | None = None) -> int:
    """Run the subprocess ASR demo from the command line.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        ``0`` after the transcript (or a "no transcript" note) has
        been printed. Errors from the demo propagate as exceptions.
    """
    parser = _build_arg_parser()
    args = parser.parse_args(argv)

    # Temp files created below; removed in ``finally`` on every exit path.
    cleanup_paths: list[str] = []
    try:
        if args.use_fake_echo:
            # Write the fake script to a temp file and run it with the
            # configured binary as its interpreter. The script path
            # leads the command template, so the script receives the
            # WAV bytes on stdin and prints FAKE_ECHO_SCRIPT's output.
            fd, script_path = tempfile.mkstemp(suffix=".py", prefix="fake_echo_")
            # Register before writing so a failed write is still cleaned up.
            cleanup_paths.append(script_path)
            with os.fdopen(fd, "w", encoding="utf-8") as handle:
                handle.write(FAKE_ECHO_SCRIPT)
            command_template = [script_path, *args.command_template]
        else:
            command_template = args.command_template

        if args.input is None:
            fd, tone_path = tempfile.mkstemp(suffix=".wav", prefix="subproc_tone_")
            # Register before synthesizing, for the same reason as above.
            cleanup_paths.append(tone_path)
            os.close(fd)
            _synthesize_tone_wav(tone_path)
            input_path = tone_path
        else:
            input_path = args.input

        transcript = asyncio.run(
            run_subprocess_demo(
                binary=args.binary,
                command_template=command_template,
                input_path=input_path,
                model=args.model,
            )
        )

        if transcript is None:
            print("(no transcript returned)")
        else:
            print(f"transcript: {transcript!r}")
        return 0
    finally:
        for path in cleanup_paths:
            try:
                os.unlink(path)
            except OSError:
                # Best-effort cleanup: a file that is already gone or
                # cannot be removed must not mask the real outcome.
                pass
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
# Script entry point: exit the interpreter with main()'s return code.
if __name__ == "__main__":  # pragma: no cover - manual example
    raise SystemExit(main())
|