voice-runtime 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- voice_runtime/__init__.py +30 -0
- voice_runtime/audio.py +278 -0
- voice_runtime/mock/__init__.py +1 -0
- voice_runtime/mock/stt.py +78 -0
- voice_runtime/mock/tts.py +48 -0
- voice_runtime/providers/__init__.py +57 -0
- voice_runtime/providers/azure_stt.py +286 -0
- voice_runtime/providers/azure_tts.py +123 -0
- voice_runtime/providers/elevenlabs_stt.py +287 -0
- voice_runtime/providers/elevenlabs_tts.py +107 -0
- voice_runtime/session.py +346 -0
- voice_runtime/stt.py +49 -0
- voice_runtime/stt_tee.py +145 -0
- voice_runtime/transport.py +20 -0
- voice_runtime/transports/__init__.py +0 -0
- voice_runtime/transports/mock_bridge.py +215 -0
- voice_runtime/transports/twilio_call.py +88 -0
- voice_runtime/transports/twilio_sms.py +36 -0
- voice_runtime/transports/twilio_ws.py +206 -0
- voice_runtime/tts.py +25 -0
- voice_runtime-0.1.0.dist-info/METADATA +516 -0
- voice_runtime-0.1.0.dist-info/RECORD +25 -0
- voice_runtime-0.1.0.dist-info/WHEEL +5 -0
- voice_runtime-0.1.0.dist-info/licenses/LICENSE +21 -0
- voice_runtime-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# voice_runtime — provider-agnostic voice call runtime for telephony projects
#
# Public API surface. Import from here rather than internal modules.

from voice_runtime.audio import AudioMixer, mix_frames
from voice_runtime.providers import SttProvider, TtsProvider
from voice_runtime.session import (
    CallHangupError,
    CallNotAnsweredError,
    MissingStreamUrlError,
    VoiceSession,
)
from voice_runtime.stt import create_stt, get_stt_class
from voice_runtime.transport import get_sms_transport
from voice_runtime.tts import create_tts

# Explicit public API — mirrors the imports above; keep the two lists in sync.
__all__ = [
    "VoiceSession",
    "MissingStreamUrlError",
    "CallNotAnsweredError",
    "CallHangupError",
    "create_stt",
    "get_stt_class",
    "create_tts",
    "get_sms_transport",
    "AudioMixer",
    "mix_frames",
    "SttProvider",
    "TtsProvider",
]
|
voice_runtime/audio.py
ADDED
|
@@ -0,0 +1,278 @@
|
|
|
1
|
+
"""G.711 μ-law codec and real-time audio mixer.
|
|
2
|
+
|
|
3
|
+
NC-152: Extracted from outcaller/nodes/audio_mixer.py (228 lines, zero
|
|
4
|
+
project imports). Provides mulaw encode/decode, frame mixing, and the
|
|
5
|
+
AudioMixer class for local audio monitoring.
|
|
6
|
+
|
|
7
|
+
NC-235: Optional WAV file recording (record_path parameter).
|
|
8
|
+
Shared by outcaller and ninchat_voice (vendored). Additive change —
|
|
9
|
+
record_path=None default preserves existing behavior. Cross-project
|
|
10
|
+
impact disappears post NC-230 extraction.
|
|
11
|
+
|
|
12
|
+
Architecture:
|
|
13
|
+
tap_caller(chunk) → caller deque ─┐
|
|
14
|
+
├─ mix thread (20ms tick) → ffplay stdin
|
|
15
|
+
tap_agent(chunk) → agent deque ─┘ → WAV file (opt)
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import collections
|
|
21
|
+
import contextlib
|
|
22
|
+
import logging
|
|
23
|
+
import struct
|
|
24
|
+
import subprocess
|
|
25
|
+
import threading
|
|
26
|
+
import time
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
|
|
29
|
+
logger = logging.getLogger(__name__)

# Framing constants for 8 kHz mono G.711 μ-law audio (1 byte per sample).
# A 20 ms tick therefore carries exactly 160 bytes.
FRAME_BYTES: int = 160  # 20ms @ 8kHz, 1 byte/sample (mulaw)
FRAME_INTERVAL: float = 0.020  # 20ms
MAX_FRAMES: int = 400  # 8 seconds at 20ms/frame
SILENCE_FRAME: bytes = b"\xff" * FRAME_BYTES  # mulaw silence (zero amplitude)
CATCHUP_LIMIT: float = 5 * FRAME_INTERVAL  # 100ms — reset threshold
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# ---------------------------------------------------------------------------
|
|
39
|
+
# G.711 μ-law codec (ITU-T, public domain tables)
|
|
40
|
+
# Inline implementation — avoids audioop dependency removed in Python 3.13.
|
|
41
|
+
# ---------------------------------------------------------------------------
|
|
42
|
+
|
|
43
|
+
_ULAW_TO_LINEAR: list[int] = []


def _build_ulaw_table() -> None:
    """Populate _ULAW_TO_LINEAR (256 entries) from the G.711 formula.

    Each μ-law byte decodes to a signed 16-bit linear PCM sample. The byte
    is stored bit-inverted on the wire, with a sign bit, a 3-bit segment
    (exponent) and a 4-bit mantissa.
    """
    # Per-segment base offsets from the G.711 decode expansion.
    seg_offsets = (0, 132, 396, 924, 1980, 4092, 8316, 16764)

    def decode(code: int) -> int:
        inverted = ~code  # undo the bit inversion applied at encode time
        seg = (inverted >> 4) & 0x07
        magnitude = seg_offsets[seg] + ((inverted & 0x0F) << (seg + 3))
        return -magnitude if inverted & 0x80 else magnitude

    _ULAW_TO_LINEAR.extend(decode(code) for code in range(256))


_build_ulaw_table()
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _linear_to_ulaw(sample: int) -> int:
    """Encode a single 16-bit signed sample to μ-law byte.

    Inverse of the _ULAW_TO_LINEAR table: clip, bias, find the segment
    (highest significant bit), quantize the mantissa, and bit-invert.
    """
    bias, clip = 0x84, 32635
    sign_bit = 0x80 if sample < 0 else 0
    magnitude = min(abs(sample), clip) + bias
    # Locate the segment: highest set bit between bit 7 and bit 14.
    # magnitude >= bias guarantees bit 7 is reachable, so segment >= 0.
    segment = 7
    probe = 0x4000
    while segment > 0 and not (magnitude & probe):
        segment -= 1
        probe >>= 1
    quantized = (magnitude >> (segment + 3)) & 0x0F
    # G.711 transmits the byte bit-inverted.
    return ~(sign_bit | (segment << 4) | quantized) & 0xFF
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def mix_frames(caller: bytes, agent: bytes) -> bytes:
    """Mix two mulaw frames by decoding, adding (clamped), re-encoding.

    Args:
        caller: FRAME_BYTES of mulaw caller audio
        agent: FRAME_BYTES of mulaw agent audio

    Returns:
        FRAME_BYTES of mixed mulaw audio
    """
    result = bytearray(FRAME_BYTES)
    for idx in range(FRAME_BYTES):
        # Decode both samples to linear PCM, sum, and clamp to int16 range.
        total = _ULAW_TO_LINEAR[caller[idx]] + _ULAW_TO_LINEAR[agent[idx]]
        result[idx] = _linear_to_ulaw(max(-32768, min(32767, total)))
    return bytes(result)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _write_wav_header(f, data_size: int) -> None:
    """Write a WAV header for 8kHz mono mulaw audio.

    WAV mulaw: codec ID 7, 8000 Hz, 1 channel, 8 bits/sample.
    fmt chunk is 18 bytes (includes cbSize=0 for non-PCM).
    """
    FMT_CHUNK_SIZE = 18
    # RIFF size field covers everything after "RIFF"+size itself:
    # "WAVE" tag + fmt chunk (header + body) + data chunk (header + payload).
    riff_payload = 4 + (8 + FMT_CHUNK_SIZE) + (8 + data_size)
    f.seek(0)
    header = b"".join((
        struct.pack("<4sI4s", b"RIFF", riff_payload, b"WAVE"),
        # codec=7(mulaw), channels=1, rate=8000, byterate=8000,
        # blockalign=1, bits=8, cbSize=0
        struct.pack("<4sIHHIIHHH", b"fmt ", FMT_CHUNK_SIZE, 7, 1, 8000, 8000, 1, 8, 0),
        struct.pack("<4sI", b"data", data_size),
    ))
    f.write(header)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
class AudioMixer:
    """Real-time mulaw mixer: two input channels → one mixed output.

    Uses bounded deques (not ring buffers) — correct for bursty agent audio.
    Each direction has an independent FIFO. The mix thread pops one frame from
    each deque every 20ms. Empty deque → silence. Overfull deque (agent burst)
    → frames accumulate and drain at 1×; maxlen drops oldest on overflow.

    Output goes to a local ffplay process (live monitoring); NC-235: when
    record_path is set, mixed audio is also written to a WAV file.
    """

    def __init__(self, record_path: Path | None = None) -> None:
        # Per-direction bounded FIFOs; maxlen silently drops the oldest frame.
        self._caller: collections.deque[bytes] = collections.deque(maxlen=MAX_FRAMES)
        self._agent: collections.deque[bytes] = collections.deque(maxlen=MAX_FRAMES)
        self._proc: subprocess.Popen[bytes] | None = None
        self._thread: threading.Thread | None = None
        self._running: bool = False
        # NC-235: WAV recording
        self._record_path = record_path
        self._record_file = None  # open binary file handle while recording, else None
        self._record_bytes: int = 0  # bytes of mixed audio written so far (payload only)

    def start(self) -> None:
        """Start ffplay and the mix drain thread.

        Raises:
            RuntimeError: if the ffplay binary is not on PATH.
        """
        import shutil
        if not shutil.which("ffplay"):
            raise RuntimeError(
                "ffplay not found — install ffmpeg to use AudioMixer: "
                "https://ffmpeg.org/download.html"
            )
        # Low-latency playback: no window, tiny probe, no input buffering.
        cmd = [
            "ffplay", "-nodisp",
            "-probesize", "32",
            "-fflags", "nobuffer",
            "-f", "mulaw", "-ar", "8000", "-",
        ]
        self._proc = subprocess.Popen(
            cmd,
            stdin=subprocess.PIPE,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            bufsize=0,  # unbuffered stdin — frames reach ffplay immediately
        )
        # NC-235: Open recording file
        if self._record_path:
            self._record_path.parent.mkdir(parents=True, exist_ok=True)
            self._record_file = open(self._record_path, "wb")  # noqa: SIM115
            _write_wav_header(self._record_file, 0)  # placeholder header
            logger.info("AudioMixer recording to %s", self._record_path)

        self._running = True
        self._thread = threading.Thread(target=self._mix_loop, daemon=True)
        self._thread.start()
        logger.info("AudioMixer started (pid=%d)", self._proc.pid)

    def write_caller(self, chunk: bytes) -> None:
        """Enqueue caller mulaw frames (160B each) into the caller deque."""
        for i in range(0, len(chunk), FRAME_BYTES):
            frame = chunk[i : i + FRAME_BYTES]
            if len(frame) < FRAME_BYTES:
                # Pad a trailing partial frame with mulaw silence.
                frame = frame + b"\xff" * (FRAME_BYTES - len(frame))
            self._caller.append(frame)

    def write_agent(self, chunk: bytes) -> None:
        """Enqueue agent mulaw frames (variable size) into the agent deque."""
        for i in range(0, len(chunk), FRAME_BYTES):
            frame = chunk[i : i + FRAME_BYTES]
            if len(frame) < FRAME_BYTES:
                # Pad a trailing partial frame with mulaw silence.
                frame = frame + b"\xff" * (FRAME_BYTES - len(frame))
            self._agent.append(frame)

    def _mix_loop(self) -> None:
        """Every 20ms: pop one frame from each deque, mix, write to ffplay."""
        next_tick = time.monotonic()

        while self._running and self._proc and self._proc.stdin:
            now = time.monotonic()

            # Fell more than CATCHUP_LIMIT behind (e.g. scheduler stall):
            # abandon catch-up and re-anchor to real time instead of
            # blasting a burst of frames at ffplay.
            if now > next_tick + CATCHUP_LIMIT:
                next_tick = now
            elif now < next_tick:
                time.sleep(next_tick - now)

            next_tick += FRAME_INTERVAL

            try:
                caller_frame = self._caller.popleft()
            except IndexError:
                caller_frame = SILENCE_FRAME

            try:
                agent_frame = self._agent.popleft()
            except IndexError:
                agent_frame = SILENCE_FRAME

            # Identity checks against SILENCE_FRAME let us skip the
            # decode/add/encode pass when one or both sides are idle.
            if caller_frame is SILENCE_FRAME and agent_frame is SILENCE_FRAME:
                mixed = SILENCE_FRAME
            elif agent_frame is SILENCE_FRAME:
                mixed = caller_frame
            elif caller_frame is SILENCE_FRAME:
                mixed = agent_frame
            else:
                mixed = mix_frames(caller_frame, agent_frame)

            try:
                self._proc.stdin.write(mixed)
            except (BrokenPipeError, OSError):
                logger.warning("AudioMixer: ffplay pipe broken")
                self._running = False
                break

            # NC-235: Write to recording file
            if self._record_file:
                try:
                    self._record_file.write(mixed)
                    self._record_bytes += len(mixed)
                except OSError:
                    # Best-effort recording: disable it, keep playback alive.
                    logger.warning("AudioMixer: recording write failed")
                    self._record_file = None

    def shutdown(self) -> None:
        """Stop mix thread, finalize recording, and stop ffplay process."""
        self._running = False
        if self._thread:
            self._thread.join(timeout=2.0)

        # NC-235: Finalize WAV header with actual data size
        if self._record_file:
            try:
                _write_wav_header(self._record_file, self._record_bytes)
                self._record_file.close()
                # 8000 bytes/s: 8 kHz sample rate × 1 byte/sample (mulaw).
                duration_s = self._record_bytes / 8000
                logger.info(
                    "AudioMixer recording saved: %s (%.1fs, %d bytes)",
                    self._record_path, duration_s, self._record_bytes,
                )
            except OSError:
                logger.warning("AudioMixer: failed to finalize recording")
            self._record_file = None
        if self._proc:
            pid = self._proc.pid
            # Closing stdin lets ffplay drain and exit before terminate lands.
            with contextlib.suppress(OSError):
                if self._proc.stdin:
                    self._proc.stdin.close()
                self._proc.terminate()
            try:
                self._proc.wait(timeout=2.0)
            except subprocess.TimeoutExpired:
                # NC-170 Fix 5: SIGKILL after terminate timeout
                logger.warning("AudioMixer: ffplay pid=%d did not terminate, sending SIGKILL", pid)
                self._proc.kill()
                with contextlib.suppress(Exception):
                    self._proc.wait(timeout=1.0)
            logger.info("AudioMixer shutdown (pid=%d)", pid)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Mock voice providers for scripted testing (NC-267)."""
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""Mock STT provider for scripted testing.
|
|
2
|
+
|
|
3
|
+
NC-267: Feeds pre-scripted utterances via inject(). Cross-thread safe
|
|
4
|
+
using loop.call_soon_threadsafe for inject from sync callers.
|
|
5
|
+
Conforms to SttProvider protocol.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import asyncio
|
|
11
|
+
import logging
|
|
12
|
+
from typing import TYPE_CHECKING, Any
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from collections.abc import Callable
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class MockStt:
|
|
21
|
+
"""STT provider that yields scripted utterances via inject()."""
|
|
22
|
+
|
|
23
|
+
def __init__(self, **kwargs: Any) -> None:
|
|
24
|
+
self.on_committed: Callable[[str], None] | None = None
|
|
25
|
+
self.on_recognizing: Callable[[str], None] | None = None
|
|
26
|
+
self.on_error: Callable[[str], None] | None = None
|
|
27
|
+
self._utterances: asyncio.Queue[str] = asyncio.Queue()
|
|
28
|
+
self._running = False
|
|
29
|
+
self._loop: asyncio.AbstractEventLoop | None = None
|
|
30
|
+
self._kwargs = kwargs
|
|
31
|
+
|
|
32
|
+
def inject(self, text: str) -> None:
|
|
33
|
+
"""Enqueue a scripted utterance (thread-safe).
|
|
34
|
+
|
|
35
|
+
Can be called from any thread. Uses call_soon_threadsafe when
|
|
36
|
+
the event loop is running in another thread.
|
|
37
|
+
"""
|
|
38
|
+
if self._loop is not None and self._loop.is_running():
|
|
39
|
+
self._loop.call_soon_threadsafe(self._utterances.put_nowait, text)
|
|
40
|
+
else:
|
|
41
|
+
self._utterances.put_nowait(text)
|
|
42
|
+
|
|
43
|
+
def set_speaking(self, speaking: bool) -> None:
|
|
44
|
+
"""No-op — mock has no echo discard logic."""
|
|
45
|
+
|
|
46
|
+
async def start(self, inbound_queue: asyncio.Queue[bytes | None]) -> None:
|
|
47
|
+
"""Start the mock STT consumer loop.
|
|
48
|
+
|
|
49
|
+
Spawns a background task that waits for injected utterances and
|
|
50
|
+
fires on_committed. The inbound_queue (raw audio) is ignored —
|
|
51
|
+
transcripts come from inject() calls.
|
|
52
|
+
|
|
53
|
+
Returns immediately (like ElevenLabs/Azure providers) so callers
|
|
54
|
+
can await start() without blocking.
|
|
55
|
+
"""
|
|
56
|
+
self._loop = asyncio.get_running_loop()
|
|
57
|
+
self._running = True
|
|
58
|
+
self._feed_task = asyncio.create_task(
|
|
59
|
+
self._consume_loop(), name="mock_stt_consume"
|
|
60
|
+
)
|
|
61
|
+
logger.info("MockStt started")
|
|
62
|
+
|
|
63
|
+
async def _consume_loop(self) -> None:
|
|
64
|
+
"""Background task: dispatch injected utterances to on_committed."""
|
|
65
|
+
while self._running:
|
|
66
|
+
try:
|
|
67
|
+
text = await asyncio.wait_for(self._utterances.get(), timeout=0.1)
|
|
68
|
+
except asyncio.TimeoutError:
|
|
69
|
+
continue
|
|
70
|
+
if self.on_committed:
|
|
71
|
+
self.on_committed(text)
|
|
72
|
+
logger.debug("MockStt committed: %s", text[:60])
|
|
73
|
+
|
|
74
|
+
async def stop(self) -> None:
|
|
75
|
+
"""Stop the consumer loop."""
|
|
76
|
+
self._running = False
|
|
77
|
+
if hasattr(self, "_feed_task") and self._feed_task:
|
|
78
|
+
self._feed_task.cancel()
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""Mock TTS provider for scripted testing.
|
|
2
|
+
|
|
3
|
+
NC-267: Records spoken text without audio synthesis.
|
|
4
|
+
NC-271: Adds send_mark_and_wait for FSM timing + on_spoken callback for text relay.
|
|
5
|
+
Conforms to TtsProvider protocol.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import logging
|
|
11
|
+
import threading
|
|
12
|
+
from typing import TYPE_CHECKING, Any
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from collections.abc import Callable
|
|
16
|
+
|
|
17
|
+
from voice_runtime.session import VoiceSession
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)


class MockTts:
    """TTS provider that records calls without producing audio.

    Every speak() call is appended to ``spoken`` and optionally relayed to
    the on_spoken callback; mark signalling is only attempted when the
    session has an event loop wired up.
    """

    def __init__(self, on_spoken: Callable[[str], None] | None = None, **kwargs: Any) -> None:
        self.on_error: Callable[[str], None] | None = None
        self.on_spoken: Callable[[str], None] | None = on_spoken
        self.spoken: list[str] = []
        self._kwargs = kwargs

    def speak(
        self,
        text: str,
        session: VoiceSession,
        stop_event: threading.Event | None = None,
    ) -> dict[str, Any]:
        """Record text, fire on_spoken relay, and signal mark completion."""
        self.spoken.append(text)
        was_interrupted = bool(stop_event and stop_event.is_set())
        relay = self.on_spoken
        if relay is not None:
            try:
                relay(text)
            except Exception:
                # Relay failures must not break the speak contract.
                logger.exception("on_spoken callback failed")
        # Skip mark wait when no event loop is wired (pure unit test context)
        if session._loop is not None:
            session.send_mark_and_wait("tts_complete", timeout=10.0)
        return {"last_spoken": text, "interrupted": was_interrupted}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Voice runtime provider protocols.
|
|
2
|
+
|
|
3
|
+
NC-165: SttProvider Protocol defines the consumer-facing contract for
|
|
4
|
+
all STT providers. Enforced by type checker (pyright/mypy), not at runtime.
|
|
5
|
+
NC-166: Simplified — routing decisions moved to consumers. Provider fires
|
|
6
|
+
on_committed callback, consumer decides action.
|
|
7
|
+
NC-258: on_error callback for STT death detection and recovery.
|
|
8
|
+
NC-260 Gap A: TtsProvider Protocol for TTS error detection.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import asyncio
|
|
14
|
+
import threading
|
|
15
|
+
from typing import TYPE_CHECKING, Any, Protocol
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
from collections.abc import Callable
|
|
19
|
+
|
|
20
|
+
from voice_runtime.session import VoiceSession
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class SttProvider(Protocol):
    """Structural interface for speech-to-text providers.

    NC-166: Provider normalizes audio → text and fires on_committed
    for every committed utterance past echo discard. Consumer decides
    routing (queue, dispatch, ignore). Provider does not make policy
    decisions.
    NC-258: on_error fires when STT encounters a fatal error and
    reconnect has been exhausted. Consumer forwards to FSM.
    """

    # Fired once per committed (final) utterance; consumer decides routing.
    on_committed: Callable[[str], None] | None
    # Fired with in-progress recognition text — presumably interim
    # hypotheses before commit; confirm exact semantics per provider.
    on_recognizing: Callable[[str], None] | None
    # Fired on fatal provider error after reconnects are exhausted (NC-258).
    on_error: Callable[[str], None] | None

    # Tell the provider whether the agent is currently speaking (used by
    # providers with echo-discard logic; MockStt no-ops this).
    def set_speaking(self, speaking: bool) -> None: ...
    # Begin consuming raw audio from inbound_queue; returns once the
    # background recognition task is spawned (non-blocking contract).
    async def start(self, inbound_queue: asyncio.Queue[bytes | None]) -> None: ...
    # Stop recognition and release provider resources.
    async def stop(self) -> None: ...
|
|
43
|
+
class TtsProvider(Protocol):
    """Structural interface for text-to-speech providers.

    NC-260 Gap A: TTS providers must expose on_error so synthesis failures
    are reported to the FSM instead of hanging in a speaking_* state.
    """

    # Fired when synthesis fails (NC-260 Gap A).
    on_error: Callable[[str], None] | None

    # Synchronous speak: synthesize `text` into the session's media stream.
    # stop_event, when set, requests interruption of in-progress speech
    # (MockTts reports it via the "interrupted" key of the returned dict).
    def speak(
        self,
        text: str,
        session: VoiceSession,
        stop_event: threading.Event | None = ...,
    ) -> dict[str, Any]: ...
|