voice-runtime 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,30 @@
1
+ # voice_runtime — provider-agnostic voice call runtime for telephony projects
2
+ #
3
+ # Public API surface. Import from here rather than internal modules.
4
+
5
+ from voice_runtime.audio import AudioMixer, mix_frames
6
+ from voice_runtime.providers import SttProvider, TtsProvider
7
+ from voice_runtime.session import (
8
+ CallHangupError,
9
+ CallNotAnsweredError,
10
+ MissingStreamUrlError,
11
+ VoiceSession,
12
+ )
13
+ from voice_runtime.stt import create_stt, get_stt_class
14
+ from voice_runtime.transport import get_sms_transport
15
+ from voice_runtime.tts import create_tts
16
+
17
# Public API surface of the package — the only names consumers should import.
__all__ = [
    "VoiceSession",
    "MissingStreamUrlError",
    "CallNotAnsweredError",
    "CallHangupError",
    "create_stt",
    "get_stt_class",
    "create_tts",
    "get_sms_transport",
    "AudioMixer",
    "mix_frames",
    "SttProvider",
    "TtsProvider",
]
voice_runtime/audio.py ADDED
@@ -0,0 +1,278 @@
1
+ """G.711 μ-law codec and real-time audio mixer.
2
+
3
+ NC-152: Extracted from outcaller/nodes/audio_mixer.py (228 lines, zero
4
+ project imports). Provides mulaw encode/decode, frame mixing, and the
5
+ AudioMixer class for local audio monitoring.
6
+
7
+ NC-235: Optional WAV file recording (record_path parameter).
8
+ Shared by outcaller and ninchat_voice (vendored). Additive change —
9
+ record_path=None default preserves existing behavior. Cross-project
10
+ impact disappears post NC-230 extraction.
11
+
12
+ Architecture:
13
+ tap_caller(chunk) → caller deque ─┐
14
+ ├─ mix thread (20ms tick) → ffplay stdin
15
+ tap_agent(chunk) → agent deque ─┘ → WAV file (opt)
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import collections
21
+ import contextlib
22
+ import logging
23
+ import struct
24
+ import subprocess
25
+ import threading
26
+ import time
27
+ from pathlib import Path
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
# Telephony frame geometry: G.711 μ-law, 8 kHz mono, 20 ms frames.
FRAME_BYTES: int = 160  # 20ms @ 8kHz, 1 byte/sample (mulaw)
FRAME_INTERVAL: float = 0.020  # 20ms
MAX_FRAMES: int = 400  # 8 seconds at 20ms/frame — per-direction deque bound
SILENCE_FRAME: bytes = b"\xff" * FRAME_BYTES  # mulaw silence (zero amplitude)
CATCHUP_LIMIT: float = 5 * FRAME_INTERVAL  # 100ms — reset threshold for the mix clock
36
+
37
+
38
+ # ---------------------------------------------------------------------------
39
+ # G.711 μ-law codec (ITU-T, public domain tables)
40
+ # Inline implementation — avoids audioop dependency removed in Python 3.13.
41
+ # ---------------------------------------------------------------------------
42
+
43
+ _ULAW_TO_LINEAR: list[int] = []
44
+
45
+
46
+ def _build_ulaw_table() -> None:
47
+ """Populate _ULAW_TO_LINEAR (256 entries) from the G.711 formula."""
48
+ _exp_lut = [0, 132, 396, 924, 1980, 4092, 8316, 16764]
49
+ for i in range(256):
50
+ val = ~i
51
+ sign = val & 0x80
52
+ exponent = (val >> 4) & 0x07
53
+ mantissa = val & 0x0F
54
+ sample = _exp_lut[exponent] + (mantissa << (exponent + 3))
55
+ if sign != 0:
56
+ sample = -sample
57
+ _ULAW_TO_LINEAR.append(sample)
58
+
59
+
60
+ _build_ulaw_table()
61
+
62
+
63
+ def _linear_to_ulaw(sample: int) -> int:
64
+ """Encode a single 16-bit signed sample to μ-law byte."""
65
+ BIAS = 0x84
66
+ CLIP = 32635
67
+ sign = 0
68
+ if sample < 0:
69
+ sign = 0x80
70
+ sample = -sample
71
+ if sample > CLIP:
72
+ sample = CLIP
73
+ sample += BIAS
74
+ exponent = 7
75
+ exp_mask = 0x4000
76
+ for _ in range(8):
77
+ if sample & exp_mask:
78
+ break
79
+ exponent -= 1
80
+ exp_mask >>= 1
81
+ mantissa = (sample >> (exponent + 3)) & 0x0F
82
+ return ~(sign | (exponent << 4) | mantissa) & 0xFF
83
+
84
+
85
def mix_frames(caller: bytes, agent: bytes) -> bytes:
    """Mix two mulaw frames by decoding, adding (clamped), re-encoding.

    Args:
        caller: FRAME_BYTES of mulaw caller audio
        agent: FRAME_BYTES of mulaw agent audio

    Returns:
        FRAME_BYTES of mixed mulaw audio
    """
    decode = _ULAW_TO_LINEAR  # hoist the global lookup out of the loop
    mixed = bytearray(FRAME_BYTES)
    for idx in range(FRAME_BYTES):
        # Sum the linear samples, clamping to the signed 16-bit range.
        total = decode[caller[idx]] + decode[agent[idx]]
        mixed[idx] = _linear_to_ulaw(max(-32768, min(32767, total)))
    return bytes(mixed)
106
+
107
+
108
+ def _write_wav_header(f, data_size: int) -> None:
109
+ """Write a WAV header for 8kHz mono mulaw audio.
110
+
111
+ WAV mulaw: codec ID 7, 8000 Hz, 1 channel, 8 bits/sample.
112
+ fmt chunk is 18 bytes (includes cbSize=0 for non-PCM).
113
+ """
114
+ fmt_size = 18
115
+ header_size = 4 + (8 + fmt_size) + (8 + data_size) # WAVE + fmt chunk + data chunk
116
+ f.seek(0)
117
+ # RIFF header
118
+ f.write(struct.pack("<4sI4s", b"RIFF", header_size, b"WAVE"))
119
+ # fmt chunk: codec=7(mulaw), channels=1, rate=8000, byterate=8000, blockalign=1, bits=8, cbSize=0
120
+ f.write(struct.pack("<4sIHHIIHHH", b"fmt ", fmt_size, 7, 1, 8000, 8000, 1, 8, 0))
121
+ # data chunk header
122
+ f.write(struct.pack("<4sI", b"data", data_size))
123
+
124
+
125
class AudioMixer:
    """Real-time mulaw mixer: two input channels → one mixed output.

    Uses bounded deques (not ring buffers) — correct for bursty agent audio.
    Each direction has an independent FIFO. The mix thread pops one frame from
    each deque every 20ms. Empty deque → silence. Overfull deque (agent burst)
    → frames accumulate and drain at 1×; maxlen drops oldest on overflow.

    NC-235: When record_path is set, mixed audio is also written to a WAV file.
    """

    def __init__(self, record_path: Path | None = None) -> None:
        """Create an idle mixer.

        Args:
            record_path: Optional WAV output path. None (default) disables
                recording and preserves pre-NC-235 behavior.
        """
        self._caller: collections.deque[bytes] = collections.deque(maxlen=MAX_FRAMES)
        self._agent: collections.deque[bytes] = collections.deque(maxlen=MAX_FRAMES)
        self._proc: subprocess.Popen[bytes] | None = None
        self._thread: threading.Thread | None = None
        self._running: bool = False
        # NC-235: WAV recording state (open handle + running byte count).
        self._record_path = record_path
        self._record_file = None
        self._record_bytes: int = 0

    def start(self) -> None:
        """Start ffplay and the mix drain thread.

        Raises:
            RuntimeError: if the ffplay binary is not on PATH.
        """
        import shutil  # local import: only needed here to locate ffplay

        if not shutil.which("ffplay"):
            raise RuntimeError(
                "ffplay not found — install ffmpeg to use AudioMixer: "
                "https://ffmpeg.org/download.html"
            )
        cmd = [
            "ffplay", "-nodisp",
            "-probesize", "32",
            "-fflags", "nobuffer",
            "-f", "mulaw", "-ar", "8000", "-",
        ]
        self._proc = subprocess.Popen(
            cmd,
            stdin=subprocess.PIPE,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            bufsize=0,  # unbuffered stdin so frames reach ffplay immediately
        )
        # NC-235: Open recording file; real data size is patched by shutdown().
        if self._record_path:
            self._record_path.parent.mkdir(parents=True, exist_ok=True)
            self._record_file = open(self._record_path, "wb")  # noqa: SIM115
            _write_wav_header(self._record_file, 0)  # placeholder header
            logger.info("AudioMixer recording to %s", self._record_path)

        self._running = True
        self._thread = threading.Thread(target=self._mix_loop, daemon=True)
        self._thread.start()
        logger.info("AudioMixer started (pid=%d)", self._proc.pid)

    def _enqueue_frames(self, queue: collections.deque[bytes], chunk: bytes) -> None:
        """Split chunk into FRAME_BYTES frames and append them to queue.

        Shared by both tap directions; a short tail frame is padded with
        mulaw silence so every queued frame is exactly FRAME_BYTES long.
        """
        for i in range(0, len(chunk), FRAME_BYTES):
            frame = chunk[i : i + FRAME_BYTES]
            if len(frame) < FRAME_BYTES:
                frame = frame + b"\xff" * (FRAME_BYTES - len(frame))
            queue.append(frame)

    def write_caller(self, chunk: bytes) -> None:
        """Enqueue caller mulaw frames (160B each) into the caller deque."""
        self._enqueue_frames(self._caller, chunk)

    def write_agent(self, chunk: bytes) -> None:
        """Enqueue agent mulaw frames (variable size) into the agent deque."""
        self._enqueue_frames(self._agent, chunk)

    def _close_recording(self) -> None:
        """Close and drop the recording handle, swallowing close errors."""
        if self._record_file:
            with contextlib.suppress(OSError):
                self._record_file.close()
        self._record_file = None

    def _mix_loop(self) -> None:
        """Every 20ms: pop one frame from each deque, mix, write to ffplay."""
        next_tick = time.monotonic()

        while self._running and self._proc and self._proc.stdin:
            now = time.monotonic()

            # Fell far behind (e.g. host slept)? Reset the clock instead of
            # bursting frames to catch up.
            if now > next_tick + CATCHUP_LIMIT:
                next_tick = now
            elif now < next_tick:
                time.sleep(next_tick - now)

            next_tick += FRAME_INTERVAL

            try:
                caller_frame = self._caller.popleft()
            except IndexError:
                caller_frame = SILENCE_FRAME

            try:
                agent_frame = self._agent.popleft()
            except IndexError:
                agent_frame = SILENCE_FRAME

            # Identity checks against SILENCE_FRAME let us skip the
            # per-sample mix when one (or both) direction is silent.
            if caller_frame is SILENCE_FRAME and agent_frame is SILENCE_FRAME:
                mixed = SILENCE_FRAME
            elif agent_frame is SILENCE_FRAME:
                mixed = caller_frame
            elif caller_frame is SILENCE_FRAME:
                mixed = agent_frame
            else:
                mixed = mix_frames(caller_frame, agent_frame)

            try:
                self._proc.stdin.write(mixed)
            except (BrokenPipeError, OSError):
                logger.warning("AudioMixer: ffplay pipe broken")
                self._running = False
                break

            # NC-235: Write to recording file. On failure, close the handle
            # (previously leaked open) and stop recording for this call.
            if self._record_file:
                try:
                    self._record_file.write(mixed)
                    self._record_bytes += len(mixed)
                except OSError:
                    logger.warning("AudioMixer: recording write failed")
                    self._close_recording()

    def shutdown(self) -> None:
        """Stop mix thread, finalize recording, and stop ffplay process."""
        self._running = False
        if self._thread:
            self._thread.join(timeout=2.0)

        # NC-235: Finalize WAV header with actual data size. The handle is
        # now closed even when the header rewrite fails (previously leaked).
        if self._record_file:
            try:
                _write_wav_header(self._record_file, self._record_bytes)
                duration_s = self._record_bytes / 8000  # mulaw: 8000 bytes/sec
                logger.info(
                    "AudioMixer recording saved: %s (%.1fs, %d bytes)",
                    self._record_path, duration_s, self._record_bytes,
                )
            except OSError:
                logger.warning("AudioMixer: failed to finalize recording")
            finally:
                self._close_recording()
        if self._proc:
            pid = self._proc.pid
            with contextlib.suppress(OSError):
                if self._proc.stdin:
                    self._proc.stdin.close()
                self._proc.terminate()
            try:
                self._proc.wait(timeout=2.0)
            except subprocess.TimeoutExpired:
                # NC-170 Fix 5: SIGKILL after terminate timeout
                logger.warning("AudioMixer: ffplay pid=%d did not terminate, sending SIGKILL", pid)
                self._proc.kill()
                with contextlib.suppress(Exception):
                    self._proc.wait(timeout=1.0)
            logger.info("AudioMixer shutdown (pid=%d)", pid)
@@ -0,0 +1 @@
1
+ """Mock voice providers for scripted testing (NC-267)."""
@@ -0,0 +1,78 @@
1
+ """Mock STT provider for scripted testing.
2
+
3
+ NC-267: Feeds pre-scripted utterances via inject(). Cross-thread safe
4
+ using loop.call_soon_threadsafe for inject from sync callers.
5
+ Conforms to SttProvider protocol.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import asyncio
11
+ import logging
12
+ from typing import TYPE_CHECKING, Any
13
+
14
+ if TYPE_CHECKING:
15
+ from collections.abc import Callable
16
+
17
logger = logging.getLogger(__name__)


class MockStt:
    """STT provider that yields scripted utterances via inject().

    Conforms to the SttProvider protocol: consumers wire the callbacks,
    await start(), and later await stop(). Raw audio on the inbound queue
    is ignored — transcripts come exclusively from inject() calls.
    """

    def __init__(self, **kwargs: Any) -> None:
        # Provider-protocol callbacks (wired by the consumer after construction).
        self.on_committed: Callable[[str], None] | None = None
        self.on_recognizing: Callable[[str], None] | None = None
        self.on_error: Callable[[str], None] | None = None
        self._utterances: asyncio.Queue[str] = asyncio.Queue()
        self._running = False
        self._loop: asyncio.AbstractEventLoop | None = None
        # Fix: initialize the consumer task so stop() needs no hasattr() probe.
        self._feed_task: asyncio.Task[None] | None = None
        # Accepted (and ignored) so real-provider kwargs can be passed through.
        self._kwargs = kwargs

    def inject(self, text: str) -> None:
        """Enqueue a scripted utterance (thread-safe).

        Can be called from any thread. Uses call_soon_threadsafe when
        the event loop is running in another thread.
        """
        if self._loop is not None and self._loop.is_running():
            self._loop.call_soon_threadsafe(self._utterances.put_nowait, text)
        else:
            self._utterances.put_nowait(text)

    def set_speaking(self, speaking: bool) -> None:
        """No-op — mock has no echo discard logic."""

    async def start(self, inbound_queue: asyncio.Queue[bytes | None]) -> None:
        """Start the mock STT consumer loop.

        Spawns a background task that waits for injected utterances and
        fires on_committed. The inbound_queue (raw audio) is ignored —
        transcripts come from inject() calls.

        Returns immediately (like ElevenLabs/Azure providers) so callers
        can await start() without blocking.
        """
        self._loop = asyncio.get_running_loop()
        self._running = True
        self._feed_task = asyncio.create_task(
            self._consume_loop(), name="mock_stt_consume"
        )
        logger.info("MockStt started")

    async def _consume_loop(self) -> None:
        """Background task: dispatch injected utterances to on_committed."""
        while self._running:
            try:
                # Short timeout so a stop() flag flip is noticed promptly.
                text = await asyncio.wait_for(self._utterances.get(), timeout=0.1)
            except asyncio.TimeoutError:
                continue
            if self.on_committed:
                self.on_committed(text)
            logger.debug("MockStt committed: %s", text[:60])

    async def stop(self) -> None:
        """Stop the consumer loop and wait for its task to finish."""
        self._running = False
        task = self._feed_task
        if task is not None:
            task.cancel()
            # Fix: await the cancelled task so no stray on_committed callback
            # (or "Task was destroyed" warning) can fire after stop() returns.
            try:
                await task
            except asyncio.CancelledError:
                pass
            self._feed_task = None
@@ -0,0 +1,48 @@
1
+ """Mock TTS provider for scripted testing.
2
+
3
+ NC-267: Records spoken text without audio synthesis.
4
+ NC-271: Adds send_mark_and_wait for FSM timing + on_spoken callback for text relay.
5
+ Conforms to TtsProvider protocol.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+ import threading
12
+ from typing import TYPE_CHECKING, Any
13
+
14
+ if TYPE_CHECKING:
15
+ from collections.abc import Callable
16
+
17
+ from voice_runtime.session import VoiceSession
18
+
19
logger = logging.getLogger(__name__)


class MockTts:
    """TTS provider that records calls without producing audio.

    Each spoken text is appended to ``spoken`` and optionally relayed
    through the ``on_spoken`` callback; no synthesis is performed.
    """

    def __init__(self, on_spoken: Callable[[str], None] | None = None, **kwargs: Any) -> None:
        # TtsProvider protocol surface.
        self.on_error: Callable[[str], None] | None = None
        # NC-271: text relay hook for scripted tests.
        self.on_spoken: Callable[[str], None] | None = on_spoken
        # Chronological record of everything "spoken".
        self.spoken: list[str] = []
        # Accepted (and ignored) so real-provider kwargs can be passed through.
        self._kwargs = kwargs

    def speak(
        self,
        text: str,
        session: VoiceSession,
        stop_event: threading.Event | None = None,
    ) -> dict[str, Any]:
        """Record text, fire on_spoken relay, and signal mark completion."""
        self.spoken.append(text)
        was_interrupted = bool(stop_event and stop_event.is_set())
        relay = self.on_spoken
        if relay is not None:
            try:
                relay(text)
            except Exception:
                logger.exception("on_spoken callback failed")
        # Skip mark wait when no event loop is wired (pure unit test context)
        if session._loop is not None:
            session.send_mark_and_wait("tts_complete", timeout=10.0)
        return {"last_spoken": text, "interrupted": was_interrupted}
@@ -0,0 +1,57 @@
1
+ """Voice runtime provider protocols.
2
+
3
+ NC-165: SttProvider Protocol defines the consumer-facing contract for
4
+ all STT providers. Enforced by type checker (pyright/mypy), not at runtime.
5
+ NC-166: Simplified — routing decisions moved to consumers. Provider fires
6
+ on_committed callback, consumer decides action.
7
+ NC-258: on_error callback for STT death detection and recovery.
8
+ NC-260 Gap A: TtsProvider Protocol for TTS error detection.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import asyncio
14
+ import threading
15
+ from typing import TYPE_CHECKING, Any, Protocol
16
+
17
+ if TYPE_CHECKING:
18
+ from collections.abc import Callable
19
+
20
+ from voice_runtime.session import VoiceSession
21
+
22
+
23
class SttProvider(Protocol):
    """Structural interface for speech-to-text providers.

    NC-166: Provider normalizes audio → text and fires on_committed
    for every committed utterance past echo discard. Consumer decides
    routing (queue, dispatch, ignore). Provider does not make policy
    decisions.
    NC-258: on_error fires when STT encounters a fatal error and
    reconnect has been exhausted. Consumer forwards to FSM.

    Checked structurally by pyright/mypy only; there is no runtime
    enforcement (not @runtime_checkable).
    """

    # Fired with the final text of each committed utterance.
    on_committed: Callable[[str], None] | None
    # Fired with interim (partial) recognition text.
    on_recognizing: Callable[[str], None] | None
    # NC-258: fired with an error description once recovery is exhausted.
    on_error: Callable[[str], None] | None

    # Hint that TTS output is (not) playing — presumably gates echo discard;
    # confirm against concrete providers.
    def set_speaking(self, speaking: bool) -> None: ...
    # Begin consuming raw audio from inbound_queue; returns once started.
    async def start(self, inbound_queue: asyncio.Queue[bytes | None]) -> None: ...
    # Stop recognition and release provider resources.
    async def stop(self) -> None: ...
41
+
42
+
43
class TtsProvider(Protocol):
    """Structural interface for text-to-speech providers.

    NC-260 Gap A: TTS providers must expose on_error so synthesis failures
    are reported to the FSM instead of hanging in a speaking_* state.
    """

    # NC-260 Gap A: fired with an error description when synthesis fails.
    on_error: Callable[[str], None] | None

    # Synchronous speak call: synthesizes text into the session's audio
    # stream, honoring stop_event for barge-in; returns a result dict
    # (see provider implementations for the expected keys).
    def speak(
        self,
        text: str,
        session: VoiceSession,
        stop_event: threading.Event | None = ...,
    ) -> dict[str, Any]: ...