voice-runtime 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. voice_runtime-0.1.0/LICENSE +21 -0
  2. voice_runtime-0.1.0/PKG-INFO +516 -0
  3. voice_runtime-0.1.0/README.md +472 -0
  4. voice_runtime-0.1.0/pyproject.toml +37 -0
  5. voice_runtime-0.1.0/setup.cfg +4 -0
  6. voice_runtime-0.1.0/tests/test_audio.py +254 -0
  7. voice_runtime-0.1.0/tests/test_azure_stt.py +223 -0
  8. voice_runtime-0.1.0/tests/test_azure_tts.py +188 -0
  9. voice_runtime-0.1.0/tests/test_elevenlabs_tts.py +116 -0
  10. voice_runtime-0.1.0/tests/test_factories.py +81 -0
  11. voice_runtime-0.1.0/tests/test_nc165_stt_protocol.py +156 -0
  12. voice_runtime-0.1.0/tests/test_nc166_on_committed.py +373 -0
  13. voice_runtime-0.1.0/tests/test_nc170_resilience.py +438 -0
  14. voice_runtime-0.1.0/tests/test_nc193_twilio_sms.py +91 -0
  15. voice_runtime-0.1.0/tests/test_nc199_on_recognizing.py +170 -0
  16. voice_runtime-0.1.0/tests/test_nc236_unique_mark_names.py +109 -0
  17. voice_runtime-0.1.0/tests/test_nc260_gap_a_tts_on_error.py +138 -0
  18. voice_runtime-0.1.0/tests/test_nc260_gap_b_ws_disconnect.py +167 -0
  19. voice_runtime-0.1.0/tests/test_nc260_gap_c_stt_fatal_errors.py +159 -0
  20. voice_runtime-0.1.0/tests/test_nc260_gap_e_stt_ready.py +36 -0
  21. voice_runtime-0.1.0/tests/test_nc260_gap_f_loop_warning.py +41 -0
  22. voice_runtime-0.1.0/tests/test_nc267_mock_providers.py +253 -0
  23. voice_runtime-0.1.0/tests/test_nc271_mock_bridge.py +180 -0
  24. voice_runtime-0.1.0/tests/test_session.py +603 -0
  25. voice_runtime-0.1.0/tests/test_stt_lifecycle.py +119 -0
  26. voice_runtime-0.1.0/tests/test_stt_tee.py +159 -0
  27. voice_runtime-0.1.0/tests/test_twilio_call.py +110 -0
  28. voice_runtime-0.1.0/tests/test_twilio_ws.py +159 -0
  29. voice_runtime-0.1.0/voice_runtime/__init__.py +30 -0
  30. voice_runtime-0.1.0/voice_runtime/audio.py +278 -0
  31. voice_runtime-0.1.0/voice_runtime/mock/__init__.py +1 -0
  32. voice_runtime-0.1.0/voice_runtime/mock/stt.py +78 -0
  33. voice_runtime-0.1.0/voice_runtime/mock/tts.py +48 -0
  34. voice_runtime-0.1.0/voice_runtime/providers/__init__.py +57 -0
  35. voice_runtime-0.1.0/voice_runtime/providers/azure_stt.py +286 -0
  36. voice_runtime-0.1.0/voice_runtime/providers/azure_tts.py +123 -0
  37. voice_runtime-0.1.0/voice_runtime/providers/elevenlabs_stt.py +287 -0
  38. voice_runtime-0.1.0/voice_runtime/providers/elevenlabs_tts.py +107 -0
  39. voice_runtime-0.1.0/voice_runtime/session.py +346 -0
  40. voice_runtime-0.1.0/voice_runtime/stt.py +49 -0
  41. voice_runtime-0.1.0/voice_runtime/stt_tee.py +145 -0
  42. voice_runtime-0.1.0/voice_runtime/transport.py +20 -0
  43. voice_runtime-0.1.0/voice_runtime/transports/__init__.py +0 -0
  44. voice_runtime-0.1.0/voice_runtime/transports/mock_bridge.py +215 -0
  45. voice_runtime-0.1.0/voice_runtime/transports/twilio_call.py +88 -0
  46. voice_runtime-0.1.0/voice_runtime/transports/twilio_sms.py +36 -0
  47. voice_runtime-0.1.0/voice_runtime/transports/twilio_ws.py +206 -0
  48. voice_runtime-0.1.0/voice_runtime/tts.py +25 -0
  49. voice_runtime-0.1.0/voice_runtime.egg-info/PKG-INFO +516 -0
  50. voice_runtime-0.1.0/voice_runtime.egg-info/SOURCES.txt +51 -0
  51. voice_runtime-0.1.0/voice_runtime.egg-info/dependency_links.txt +1 -0
  52. voice_runtime-0.1.0/voice_runtime.egg-info/requires.txt +15 -0
  53. voice_runtime-0.1.0/voice_runtime.egg-info/top_level.txt +1 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Kasper Heikkinen, Sami Heikkinen
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,516 @@
1
+ Metadata-Version: 2.4
2
+ Name: voice-runtime
3
+ Version: 0.1.0
4
+ Summary: Provider-agnostic voice call runtime for telephony projects
5
+ Author: Kasper Heikkinen, Sami Heikkinen
6
+ License: MIT License
7
+
8
+ Copyright (c) 2026 Kasper Heikkinen, Sami Heikkinen
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Requires-Python: >=3.11
29
+ Description-Content-Type: text/markdown
30
+ License-File: LICENSE
31
+ Requires-Dist: fastapi>=0.110.0
32
+ Requires-Dist: uvicorn>=0.29.0
33
+ Requires-Dist: twilio>=9.0.0
34
+ Requires-Dist: httpx>=0.27.0
35
+ Provides-Extra: elevenlabs
36
+ Requires-Dist: elevenlabs>=1.9.0; extra == "elevenlabs"
37
+ Provides-Extra: azure
38
+ Requires-Dist: azure-cognitiveservices-speech>=1.38.0; extra == "azure"
39
+ Provides-Extra: dev
40
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
41
+ Requires-Dist: pytest-asyncio>=0.23.0; extra == "dev"
42
+ Requires-Dist: pytest-mock>=3.12.0; extra == "dev"
43
+ Dynamic: license-file
44
+
45
+ # voice_runtime
46
+
47
+ Provider-agnostic voice call runtime for telephony projects. Manages audio queues, mark synchronization, STT/TTS providers, and transport protocols — so consumers focus on conversation logic, not plumbing.
48
+
49
+ ## Quick Example
50
+
51
+ Make a call, say something, listen for a response via the `on_committed` callback, and hang up:
52
+
53
+ ```python
54
+ import asyncio
55
+ import threading
56
+ import time
57
+ import uvicorn
58
+ from fastapi import FastAPI
59
+
60
+ from voice_runtime.session import VoiceSession
61
+ from voice_runtime.transports.twilio_ws import register_voice_websocket
62
+ from voice_runtime.transports.twilio_call import initiate_outbound_call
63
+ from voice_runtime.tts import create_tts
64
+ from voice_runtime.stt import create_stt
65
+
66
+ # 1. Create session and start WebSocket server
67
+ session = VoiceSession()
68
+ app = FastAPI()
69
+ register_voice_websocket(app, session)
70
+
71
+ def run_server():
72
+     loop = asyncio.new_event_loop()
73
+     asyncio.set_event_loop(loop)
74
+     session.set_loop(loop)
75
+     loop.run_until_complete(uvicorn.Server(
76
+         uvicorn.Config(app, host="0.0.0.0", port=8080, log_level="warning")
77
+     ).serve())
78
+
79
+ threading.Thread(target=run_server, daemon=True).start()
80
+ time.sleep(1) # wait for server to start
81
+
82
+ # 2. Initiate call (Twilio calls back to our /voice WebSocket)
83
+ call_sid = initiate_outbound_call("+358401234567")
84
+ session.call_sid = call_sid
85
+ session.wait_for_ws_connect(timeout=30)
86
+
87
+ # 3. Speak
88
+ tts = create_tts() # default: ElevenLabs
89
+ tts.speak("Hello! How are you today?", session)
90
+ session.send_mark_and_wait("after-greeting") # block until playback done
91
+
92
+ # 4. Listen — persistent STT with on_committed callback
93
+ transcript_event = threading.Event()
94
+ heard = []
95
+
96
+ def on_committed(text: str):
97
+     heard.append(text)
98
+     transcript_event.set()
99
+
100
+ session.stt_factory = lambda: create_stt() # default: ElevenLabs
101
+ # Transport starts STT automatically; on_committed fires for each utterance
102
+
103
+ transcript_event.wait(timeout=30)
104
+ print(f"Caller said: {heard}")
105
+
106
+ # 5. Hang up
107
+ session.request_disconnect()
108
+ ```
109
+
110
+ ### What happens under the hood
111
+
112
+ ```
113
+ Consumer thread voice_runtime Transport (Twilio)
114
+ ─────────────────────────────────────────────────────────────────────────────
115
+ initiate_outbound_call() → Twilio REST calls.create() → Twilio dials phone
116
+ build_stream_twiml() with <Connect><Stream>
117
+ wait_for_ws_connect() ← signal_ws_connected() ← Twilio opens /voice WS
118
+ stt_factory() → stt.start()
119
+ tts.speak(text, session) → ffmpeg MP3→μ-law → send_audio task
120
+ put_outbound_sync() sends base64 frames
121
+ send_mark_and_wait() → get_pending_mark() → WS sends mark JSON
122
+ ← signal_mark_received() ← WS receives mark echo
123
+ on_committed(text) ← stt.on_committed() ← WS receives media frames
124
+ decodes base64 → STT
125
+ request_disconnect() → _disconnect_requested.set() → watch_disconnect task
126
+ stt.stop(), closes WS
127
+ Twilio ends call
128
+ ```
129
+
130
+ ## Architecture
131
+
132
+ ```
133
+ ┌──────────────────────────────────────────────┐
134
+ │ Consumer (outcaller, ninchat_voice) │
135
+ │ - Subclass VoiceSession (e.g. TelcoSession) │
136
+ │ - Call speak(), listen(), hang up │
137
+ ├──────────────────────────────────────────────┤
138
+ │ voice_runtime │
139
+ │ - VoiceSession: queues, marks, intents │
140
+ │ - Factories: create_stt / create_tts │
141
+ │ - Providers: ElevenLabs, Azure (STT + TTS) │
142
+ │ - SttTee: dual-provider fan-out │
143
+ │ - Audio: G.711 μ-law codec + mixer │
144
+ ├──────────────────────────────────────────────┤
145
+ │ Transport (protocol-specific) │
146
+ │ - twilio_ws: Media Streams WebSocket │
147
+ │ - twilio_call: REST call initiation + TwiML │
148
+ └──────────────────────────────────────────────┘
149
+ ```
150
+
151
+ **Key invariant:** VoiceSession has zero transport or provider imports. Consumers never import Twilio or ElevenLabs directly — they use factories and the intent API.
152
+
153
+ ## Factories
154
+
155
+ Provider-agnostic factories mirror the yamlgraph `create_llm()` pattern:
156
+
157
+ ```python
158
+ from voice_runtime.stt import create_stt, get_stt_class
159
+ from voice_runtime.tts import create_tts
160
+ from voice_runtime.transport import create_transport
161
+
162
+ stt = create_stt(provider="elevenlabs") # or "azure"
163
+ tts = create_tts(provider="elevenlabs") # or "azure"
164
+ transport = create_transport(provider="twilio")
165
+
166
+ # get_stt_class returns the class without instantiating (for factory arguments)
167
+ SttClass = get_stt_class(provider="elevenlabs")
168
+ session.stt_factory = lambda: SttClass(language_code="en")
169
+ ```
170
+
171
+ ## SttProvider Protocol
172
+
173
+ All STT providers implement this structural protocol (defined in `providers/__init__.py`):
174
+
175
+ ```python
176
+ class SttProvider(Protocol):
177
+     on_committed: Callable[[str], None] | None  # final transcript for utterance
178
+     on_recognizing: Callable[[str], None] | None  # interim hypothesis (NC-199)
179
+     on_error: Callable[[str], None] | None  # fatal error after reconnect exhausted (NC-258)
180
+
181
+     def set_speaking(self, speaking: bool) -> None: ...
182
+     async def start(self, inbound_queue: asyncio.Queue[bytes | None]) -> None: ...
183
+     async def stop(self) -> None: ...
184
+ ```
185
+
186
+ | Callback | When it fires | Typical consumer action |
187
+ |----------|---------------|------------------------|
188
+ | `on_committed` | Final transcript past echo discard window | Route to LLM / FSM |
189
+ | `on_recognizing` | Interim hypothesis (may change) | Show live transcription UI |
190
+ | `on_error` | Reconnect attempts exhausted (fatal) | Transition FSM to error state |
191
+
192
+ Transport starts/stops the provider; the consumer decides routing (queue, dispatch, ignore).
193
+
194
+ ## TtsProvider Protocol
195
+
196
+ All TTS providers implement this structural protocol (NC-260 Gap A):
197
+
198
+ ```python
199
+ class TtsProvider(Protocol):
200
+     on_error: Callable[[str], None] | None  # synthesis failure (NC-260 Gap A)
200
+
201
+     def speak(
202
+         self,
203
+         text: str,
204
+         session: VoiceSession,
205
+         stop_event: threading.Event | None = ...,
206
+     ) -> dict[str, Any]: ...
208
+ ```
209
+
210
+ `speak()` returns a dict that always contains `last_spoken` (str) and may additionally contain `call_disconnected` (bool) or `interrupted` (bool). `on_error` fires on synthesis failures so that the FSM doesn't hang in a speaking state.
211
+
212
+ ## VoiceSession
213
+
214
+ Central coordinator between sync tool threads, async transport, and STT/TTS providers.
215
+
216
+ ### Audio I/O
217
+
218
+ | Method | Thread safety | Purpose |
219
+ |--------|---------------|---------|
220
+ | `put_inbound(data)` | Any → async | Enqueue caller audio (transport calls this) |
221
+ | `get_outbound()` | async only | Dequeue agent audio (transport reads this) |
222
+ | `put_outbound_sync(data)` | Sync → async | Enqueue agent audio (TTS provider calls this) |
223
+ | `clear_inbound()` | Any | Drain stale audio frames |
224
+
225
+ All sync→async bridging uses `asyncio.run_coroutine_threadsafe()`.
226
+
227
+ ### Mark Synchronization
228
+
229
+ Marks let synchronous tool code wait (block) until the transport confirms that audio playback has reached a given point. This is how you know a TTS utterance has finished playing before you start listening.
230
+
231
+ ```python
232
+ tts.speak("What is your name?", session)
233
+ session.send_mark_and_wait("after-question", timeout=10.0)
234
+ # Now safe to start listening — caller heard the full question
235
+
236
+ session.clear_inbound()
237
+ transcript = stt.listen(session, timeout=30)
238
+ ```
239
+
240
+ | Method | Purpose |
241
+ |--------|---------|
242
+ | `send_mark_and_wait(name, timeout)` | Block sync thread until mark echoed |
243
+ | `signal_mark_received(name)` | Called by transport when mark arrives |
244
+ | `get_pending_mark()` | Async — transport reads next mark to send |
245
+
246
+ ### Transport Intent (NC-154)
247
+
248
+ Consumers signal *what* they want; the transport decides *how*.
249
+
250
+ ```python
251
+ session.request_disconnect() # transport closes connection, call ends
252
+ session.request_clear_buffer() # transport discards buffered audio (barge-in)
253
+ ```
254
+
255
+ Both are thread-safe. The transport watches `_disconnect_requested` (asyncio.Event) and `_clear_queue` (asyncio.Queue) and acts in its own protocol's terms — e.g. Twilio closes the WebSocket, which ends the call; SIP would send BYE.
256
+
257
+ ### STT Factory
258
+
259
+ Attach an STT factory and the transport manages its lifecycle automatically:
260
+
261
+ ```python
262
+ from voice_runtime.stt import create_stt
263
+
264
+ session.stt_factory = lambda: create_stt(provider="elevenlabs")
265
+ # Transport calls stt_factory() on stream start, stt.stop() on disconnect
266
+ ```
267
+
268
+ Optional secondary STT for parallel logging/comparison (via `SttTee`):
269
+
270
+ ```python
271
+ session.stt_secondary_factory = lambda: create_stt(provider="azure")
272
+ # Transport wraps both in SttTee — primary drives on_committed, secondary logs only
273
+ ```
274
+
275
+ ### STT Ready Hook (NC-260 Gap E)
276
+
277
+ Wire callbacks *after* the transport creates the STT instance but *before* `start()`:
278
+
279
+ ```python
280
+ def wire_callbacks(stt: SttProvider):
281
+     stt.on_committed = handle_transcript
282
+     stt.on_recognizing = handle_interim
283
+     stt.on_error = handle_stt_death
284
+
285
+ session.on_stt_ready = wire_callbacks
286
+ ```
287
+
288
+ This replaces the old pattern of wiring callbacks before attaching the factory. The transport calls `on_stt_ready(stt)` after construction, guaranteeing callbacks are set before `start()` fires.
289
+
290
+ ### Lifecycle
291
+
292
+ | Method | Purpose |
293
+ |--------|---------|
294
+ | `signal_ws_connected(stream_sid)` | Transport calls when connection established |
295
+ | `wait_for_ws_connect(timeout)` | Consumer blocks until connected; raises `CallNotAnsweredError` |
296
+ | `signal_disconnected()` | Transport calls on hangup |
297
+ | `is_disconnected` | Property — check if call ended |
298
+ | `reset()` | Clear all state for session reuse (multi-call servers) |
299
+
300
+ ### Audio Monitoring
301
+
302
+ Optional two-channel mixer for real-time call monitoring (requires `ffplay`) and WAV recording (NC-235):
303
+
304
+ ```python
305
+ from pathlib import Path
306
+ from voice_runtime.audio import AudioMixer
307
+
308
+ # Monitor only (plays mixed audio through ffplay)
309
+ mixer = AudioMixer()
310
+
311
+ # Monitor + record to WAV file (8kHz mono mulaw)
312
+ mixer = AudioMixer(record_path=Path("recordings/call_001.wav"))
313
+
314
+ mixer.start()
315
+ session.set_mixer(mixer)
316
+ # session.tap_caller() / session.tap_agent() now feed audio to ffplay + WAV
317
+ ```
318
+
319
+ ### Exceptions
320
+
321
+ | Exception | When |
322
+ |-----------|------|
323
+ | `MissingStreamUrlError` | `VOICE_STREAM_URL` env var not set |
324
+ | `CallNotAnsweredError(timeout)` | WebSocket didn't connect within timeout |
325
+ | `CallHangupError` | Call hung up during a listen operation |
326
+
327
+ ## Transport: Twilio
328
+
329
+ ### WebSocket Handler
330
+
331
+ Registers a `/voice` endpoint on a FastAPI app implementing Twilio Media Streams:
332
+
333
+ ```python
334
+ from voice_runtime.transports.twilio_ws import register_voice_websocket
335
+
336
+ app = FastAPI()
337
+ register_voice_websocket(app, session)
338
+ ```
339
+
340
+ Runs up to five async tasks on stream start: `send_audio`, `send_marks`, `watch_disconnect`, `send_clears`, and `stt` (only if an STT factory is provided).
341
+
342
+ ### Call Initiation
343
+
344
+ ```python
345
+ from voice_runtime.transports.twilio_call import (
346
+ initiate_outbound_call,
347
+ build_stream_twiml, # alias: build_stream_xml
348
+ )
349
+
350
+ # Outbound: dial phone, Twilio connects back to /voice WebSocket
351
+ call_sid = initiate_outbound_call("+358401234567")
352
+
353
+ # Inbound webhook: return TwiML that tells Twilio to stream audio to /voice
354
+ xml = build_stream_twiml("wss://example.ngrok.io")
355
+ ```
356
+
357
+ ## Providers
358
+
359
+ ### ElevenLabs TTS
360
+
361
+ Streams text → ElevenLabs API → ffmpeg (MP3 → μ-law 8kHz) → session outbound queue.
362
+
363
+ ```python
364
+ from voice_runtime.tts import create_tts
365
+
366
+ tts = create_tts(provider="elevenlabs")
367
+ result = tts.speak("Hello", session, stop_event=barge_in_event)
368
+ # result: {"last_spoken": "Hello"} or {"last_spoken": "Hello", "call_disconnected": True}
369
+ ```
370
+
371
+ Supports barge-in interrupt: pass a `threading.Event` as `stop_event`; set it from another thread to cut TTS mid-stream. Result may include `{"interrupted": True}`.
372
+
373
+ ### ElevenLabs STT
374
+
375
+ Persistent Scribe WebSocket per call lifetime. Barge-in detection, echo discard, reconnect on fatal errors.
376
+
377
+ ```python
378
+ from voice_runtime.stt import create_stt
379
+
380
+ stt = create_stt(provider="elevenlabs")
381
+ stt.on_committed = lambda text: print(f"Heard: {text}")
382
+ await stt.start(session.inbound)
383
+ # ... later
384
+ await stt.stop()
385
+ ```
386
+
387
+ Typically managed by the transport via `session.stt_factory` rather than started manually.
388
+
389
+ ### Azure TTS
390
+
391
+ Streams text → Azure Speech SDK → μ-law 8kHz → session outbound queue.
392
+
393
+ ```python
394
+ from voice_runtime.tts import create_tts
395
+
396
+ tts = create_tts(provider="azure")
397
+ result = tts.speak("Hello", session, stop_event=barge_in_event)
398
+ ```
399
+
400
+ Same `speak()` interface as ElevenLabs. Uses `AZURE_SPEECH_KEY`, `AZURE_SPEECH_REGION`, `AZURE_TTS_VOICE` env vars.
401
+
402
+ ### Azure STT
403
+
404
+ Persistent Azure Speech SDK recognizer with continuous recognition.
405
+
406
+ ```python
407
+ from voice_runtime.stt import create_stt
408
+
409
+ stt = create_stt(provider="azure", language_code="fi-FI", silence_timeout_ms=1500)
410
+ stt.on_committed = lambda text: print(f"Heard: {text}")
411
+ await stt.start(session.inbound)
412
+ ```
413
+
414
+ Implements the same `SttProvider` protocol as ElevenLabs, and applies a 400 ms echo discard window after TTS ends.
415
+
416
+ ### SttTee (Dual STT)
417
+
418
+ Fan-out adapter running two STT providers on the same audio stream:
419
+
420
+ ```python
421
+ from voice_runtime.stt_tee import SttTee
422
+
423
+ tee = SttTee(primary=elevenlabs_stt, secondary=azure_stt)
424
+ tee.on_committed = handler # proxied to primary only
425
+ await tee.start(session.inbound)
426
+ ```
427
+
428
+ The primary provider drives production (`on_committed`). The secondary provider receives the same frames for logging/comparison only — its errors never propagate. Typically wired automatically via `session.stt_secondary_factory`.
429
+
430
+ ## Audio Codec
431
+
432
+ G.711 μ-law at 8kHz — Twilio's native format. 160 bytes = 20ms frame.
433
+
434
+ ```python
435
+ from voice_runtime.audio import mix_frames
436
+
437
+ mixed = mix_frames(caller_chunk, agent_chunk) # mix two 160-byte frames
438
+ ```
439
+
440
+ ## Environment Variables
441
+
442
+ | Variable | Purpose | Required |
443
+ |----------|---------|----------|
444
+ | `VOICE_STREAM_URL` | Public WebSocket URL for transport callback | Yes |
445
+ | `VOICE_SERVER_PORT` | Uvicorn listen port | No (default: 8080) |
446
+ | `TWILIO_ACCOUNT_SID` | Twilio credentials (call initiation) | For outbound |
447
+ | `TWILIO_AUTH_TOKEN` | Twilio credentials | For outbound |
448
+ | `TWILIO_PHONE_NUMBER` | Outbound caller ID | For outbound |
449
+ | `ELEVENLABS_API_KEY` | ElevenLabs authentication | For ElevenLabs |
450
+ | `ELEVENLABS_VOICE_ID` | TTS voice | No (default: Rachel) |
451
+ | `ELEVENLABS_MODEL` | TTS model | No (default: `eleven_multilingual_v2`) |
452
+ | `STT_MODEL_ID` | ElevenLabs STT model | No (default: `scribe_v2_realtime`) |
453
+ | `STT_LANGUAGE_CODE` | ElevenLabs STT language | No (default: `fi`) |
454
+ | `AZURE_SPEECH_KEY` | Azure Speech SDK authentication | For Azure |
455
+ | `AZURE_SPEECH_REGION` | Azure Speech SDK region | No (default: `westeurope`) |
456
+ | `AZURE_TTS_VOICE` | Azure TTS voice name | No (default: `fi-FI-NooraNeural`) |
457
+ | `VOICE_MONITOR` | Enable AudioMixer monitoring | No (default: off) |
458
+
459
+ ## Transport: SMS (NC-193)
460
+
461
+ Send SMS via Twilio REST API without importing the Twilio SDK in consumers:
462
+
463
+ ```python
464
+ from voice_runtime.transport import get_sms_transport
465
+
466
+ sms = get_sms_transport() # default: twilio
467
+ result = sms.send_sms(to="+358401234567", body="Your appointment is confirmed.")
468
+ # result: {"message_sid": "SM...", "status": "queued", "to": "+358..."}
469
+ ```
470
+
471
+ Requires `TWILIO_ACCOUNT_SID`, `TWILIO_AUTH_TOKEN`, and `TWILIO_PHONE_NUMBER` env vars.
472
+
473
+ ## Consumer Pattern
474
+
475
+ A typical consumer subclasses `VoiceSession` and adds server lifecycle management:
476
+
477
+ ```python
478
+ from dataclasses import dataclass
479
+ from voice_runtime.session import VoiceSession
480
+
481
+ @dataclass
482
+ class TelcoSession(VoiceSession):
483
+     def start(self):
483
+         app = FastAPI()
484
+         register_voice_websocket(app, self)
485
+         # Run uvicorn in daemon thread
486
+         threading.Thread(target=self._run_loop, daemon=True).start()
487
+
488
+     def shutdown(self):
489
+         # Signal event loop to stop, join thread
490
+         ...
492
+ ```
493
+
494
+ Tool nodes then use the session for audio I/O, mark sync, and transport intents — without knowing anything about Twilio, WebSockets, or ElevenLabs.
495
+
496
+ ### Known Consumers
497
+
498
+ Any project that subclasses `VoiceSession` and registers a FastAPI WebSocket handler via `register_voice_websocket` is a consumer. See the Consumer Pattern section above for the canonical implementation.
499
+
500
+ ## Multi-Call Session Reuse
501
+
502
+ When servers handle multiple sequential calls on the same `VoiceSession` instance, `reset()` clears all state between calls:
503
+
504
+ - Stops active STT via `asyncio.run_coroutine_threadsafe(stt.stop(), loop)` before clearing the reference (prevents orphaned WebSocket connections)
505
+ - Drains inbound and outbound queues
506
+ - Resets mark synchronization and transport intent events
507
+
508
+ The STT `start()` method also drains the inbound queue as defense-in-depth against sentinel values from prior call cleanup.
509
+
510
+ ### STT Reconnect on Fatal Errors
511
+
512
+ `PersistentSttSession` detects fatal WebSocket errors (connection closed, protocol errors) and automatically reconnects:
513
+
514
+ - `_on_error()` schedules `_reconnect_after_error()` for errors in `_FATAL_ERRORS`
515
+ - Reconnect drains stale frames, creates a new WebSocket, and resumes the feed task
516
+ - `_feed_audio()` wraps `send()` in try/except for dead socket resilience