openspeechapi 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118) hide show
  1. openspeech/__init__.py +75 -0
  2. openspeech/__main__.py +5 -0
  3. openspeech/cli.py +413 -0
  4. openspeech/client/__init__.py +4 -0
  5. openspeech/client/client.py +145 -0
  6. openspeech/config.py +212 -0
  7. openspeech/core/__init__.py +0 -0
  8. openspeech/core/base.py +75 -0
  9. openspeech/core/enums.py +39 -0
  10. openspeech/core/models.py +61 -0
  11. openspeech/core/registry.py +37 -0
  12. openspeech/core/settings.py +8 -0
  13. openspeech/demo.py +675 -0
  14. openspeech/dispatch/__init__.py +0 -0
  15. openspeech/dispatch/context.py +34 -0
  16. openspeech/dispatch/dispatcher.py +661 -0
  17. openspeech/dispatch/executors/__init__.py +0 -0
  18. openspeech/dispatch/executors/base.py +34 -0
  19. openspeech/dispatch/executors/in_process.py +66 -0
  20. openspeech/dispatch/executors/remote.py +64 -0
  21. openspeech/dispatch/executors/subprocess_exec.py +446 -0
  22. openspeech/dispatch/fanout.py +95 -0
  23. openspeech/dispatch/filters.py +73 -0
  24. openspeech/dispatch/lifecycle.py +178 -0
  25. openspeech/dispatch/watcher.py +82 -0
  26. openspeech/engine_catalog.py +236 -0
  27. openspeech/engine_registry.yaml +347 -0
  28. openspeech/exceptions.py +51 -0
  29. openspeech/factory.py +325 -0
  30. openspeech/local_engines/__init__.py +12 -0
  31. openspeech/local_engines/aim_resolver.py +91 -0
  32. openspeech/local_engines/backends/__init__.py +1 -0
  33. openspeech/local_engines/backends/docker_backend.py +490 -0
  34. openspeech/local_engines/backends/native_backend.py +902 -0
  35. openspeech/local_engines/base.py +30 -0
  36. openspeech/local_engines/engines/__init__.py +1 -0
  37. openspeech/local_engines/engines/faster_whisper.py +36 -0
  38. openspeech/local_engines/engines/fish_speech.py +33 -0
  39. openspeech/local_engines/engines/sherpa_onnx.py +56 -0
  40. openspeech/local_engines/engines/whisper.py +41 -0
  41. openspeech/local_engines/engines/whisperlivekit.py +60 -0
  42. openspeech/local_engines/manager.py +208 -0
  43. openspeech/local_engines/models.py +50 -0
  44. openspeech/local_engines/progress.py +69 -0
  45. openspeech/local_engines/registry.py +19 -0
  46. openspeech/local_engines/task_store.py +52 -0
  47. openspeech/local_engines/tasks.py +71 -0
  48. openspeech/logging_config.py +607 -0
  49. openspeech/observe/__init__.py +0 -0
  50. openspeech/observe/base.py +79 -0
  51. openspeech/observe/debug.py +44 -0
  52. openspeech/observe/latency.py +19 -0
  53. openspeech/observe/metrics.py +47 -0
  54. openspeech/observe/tracing.py +44 -0
  55. openspeech/observe/usage.py +27 -0
  56. openspeech/providers/__init__.py +0 -0
  57. openspeech/providers/_template.py +101 -0
  58. openspeech/providers/stt/__init__.py +0 -0
  59. openspeech/providers/stt/alibaba.py +86 -0
  60. openspeech/providers/stt/assemblyai.py +135 -0
  61. openspeech/providers/stt/azure_speech.py +99 -0
  62. openspeech/providers/stt/baidu.py +135 -0
  63. openspeech/providers/stt/deepgram.py +311 -0
  64. openspeech/providers/stt/elevenlabs.py +385 -0
  65. openspeech/providers/stt/faster_whisper.py +211 -0
  66. openspeech/providers/stt/google_cloud.py +106 -0
  67. openspeech/providers/stt/iflytek.py +427 -0
  68. openspeech/providers/stt/macos_speech.py +226 -0
  69. openspeech/providers/stt/openai.py +84 -0
  70. openspeech/providers/stt/sherpa_onnx.py +353 -0
  71. openspeech/providers/stt/tencent.py +212 -0
  72. openspeech/providers/stt/volcengine.py +107 -0
  73. openspeech/providers/stt/whisper.py +153 -0
  74. openspeech/providers/stt/whisperlivekit.py +530 -0
  75. openspeech/providers/stt/windows_speech.py +249 -0
  76. openspeech/providers/tts/__init__.py +0 -0
  77. openspeech/providers/tts/alibaba.py +95 -0
  78. openspeech/providers/tts/azure_speech.py +123 -0
  79. openspeech/providers/tts/baidu.py +143 -0
  80. openspeech/providers/tts/coqui.py +64 -0
  81. openspeech/providers/tts/cosyvoice.py +90 -0
  82. openspeech/providers/tts/deepgram.py +174 -0
  83. openspeech/providers/tts/elevenlabs.py +311 -0
  84. openspeech/providers/tts/fish_speech.py +158 -0
  85. openspeech/providers/tts/google_cloud.py +107 -0
  86. openspeech/providers/tts/iflytek.py +209 -0
  87. openspeech/providers/tts/macos_say.py +251 -0
  88. openspeech/providers/tts/minimax.py +122 -0
  89. openspeech/providers/tts/openai.py +104 -0
  90. openspeech/providers/tts/piper.py +104 -0
  91. openspeech/providers/tts/tencent.py +189 -0
  92. openspeech/providers/tts/volcengine.py +117 -0
  93. openspeech/providers/tts/windows_sapi.py +234 -0
  94. openspeech/server/__init__.py +1 -0
  95. openspeech/server/app.py +72 -0
  96. openspeech/server/auth.py +42 -0
  97. openspeech/server/middleware.py +75 -0
  98. openspeech/server/routes/__init__.py +1 -0
  99. openspeech/server/routes/management.py +848 -0
  100. openspeech/server/routes/stt.py +121 -0
  101. openspeech/server/routes/tts.py +159 -0
  102. openspeech/server/routes/webui.py +29 -0
  103. openspeech/server/webui/app.js +2649 -0
  104. openspeech/server/webui/index.html +216 -0
  105. openspeech/server/webui/styles.css +617 -0
  106. openspeech/server/ws/__init__.py +1 -0
  107. openspeech/server/ws/stt_stream.py +263 -0
  108. openspeech/server/ws/tts_stream.py +207 -0
  109. openspeech/telemetry/__init__.py +21 -0
  110. openspeech/telemetry/perf.py +307 -0
  111. openspeech/utils/__init__.py +5 -0
  112. openspeech/utils/audio_converter.py +406 -0
  113. openspeech/utils/audio_playback.py +156 -0
  114. openspeech/vendor_registry.yaml +74 -0
  115. openspeechapi-0.1.0.dist-info/METADATA +101 -0
  116. openspeechapi-0.1.0.dist-info/RECORD +118 -0
  117. openspeechapi-0.1.0.dist-info/WHEEL +4 -0
  118. openspeechapi-0.1.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,530 @@
1
+ """WhisperLiveKit STT provider (local service, Deepgram-compatible WS)."""
2
+ from __future__ import annotations
3
+
4
+ from collections.abc import AsyncIterator
5
+ from dataclasses import dataclass
6
+ import io
7
+ import json
8
+ from pathlib import Path
9
+ import re
10
+ import time
11
+ from typing import Any
12
+ from uuid import uuid4
13
+ from urllib.parse import urlencode, urljoin, urlparse, urlunparse
14
+ import wave
15
+
16
+ from openspeech.core.base import STTProvider
17
+ from openspeech.core.enums import Capability, ExecMode, ProviderType
18
+ from openspeech.core.models import AudioData, STTOptions, Transcription
19
+ from openspeech.core.settings import BaseSettings
20
+ from openspeech.logging_config import logger
21
+
22
@dataclass
class WhisperLiveKitSTTSettings(BaseSettings):
    """Configuration for the WhisperLiveKit STT provider (local service endpoints and tuning)."""

    # Base URL of the local WhisperLiveKit HTTP/WS service.
    api_url: str = "http://127.0.0.1:12100"
    # WebSocket endpoint path; "/asr" selects the native protocol, any other
    # path is treated as a Deepgram-compatible listen endpoint (see _build_ws_url).
    ws_path: str = "/asr"
    # Message mode requested on the native /asr endpoint (used as "mode" query param).
    ws_mode: str = "diff"
    # HTTP path used for one-shot (batch) transcription requests.
    http_transcribe_path: str = "/v1/audio/transcriptions"
    # Model identifier forwarded to the service.
    model: str = "openai/whisper-large-v3-turbo"
    # Language hint; empty string means no hint (auto-detect server-side).
    language: str = ""
    # PCM parameters advertised on Deepgram-style WS connections.
    sample_rate: int = 16000
    encoding: str = "linear16"
    interim_results: bool = True
    punctuate: bool = True
    smart_format: bool = True
    # HTTP client timeout in seconds.
    timeout_s: float = 30.0
    # Extra HTTP attempts after the first failure (0 = no retry).
    retries: int = 0
    # Trace logs for streaming parse/debug (JSONL).
    trace_enabled: bool = False
    trace_path: str = ".tmp/logs/stt_wlk_trace.jsonl"
    # Per-value truncation length applied to traced payload strings (min 40 enforced).
    trace_max_chars: int = 240
    # WhisperLiveKit (mlx/simul) may return empty text for very short WAV clips.
    # To improve UX, pad trailing silence up to this duration before request.
    min_audio_duration_ms: int = 6000
44
+
45
class WhisperLiveKitSTT(STTProvider):
    """STT provider backed by a local WhisperLiveKit service.

    Supports one-shot HTTP transcription and streaming over WebSocket, using
    either the native ``/asr`` protocol (snapshot/diff messages) or a
    Deepgram-compatible protocol, selected by ``ws_path`` in the settings.
    """

    name = "whisperlivekit-stt"
    provider_type = ProviderType.STT
    execution_mode = ExecMode.LOCAL
    settings_cls = WhisperLiveKitSTTSettings
    capabilities = {
        Capability.STREAMING,
        Capability.BATCH,
        Capability.MULTILINGUAL,
    }
    # Choices surfaced to configuration UIs for the "language" setting.
    field_options = {"language": ["auto", "en", "zh", "ja", "ko", "es", "fr", "de"]}
56
+
57
+ def __init__(self, settings: WhisperLiveKitSTTSettings | None = None) -> None:
58
+ self.settings = settings or WhisperLiveKitSTTSettings()
59
+ self._client: Any = None
60
+ self._owns_client: bool = True
61
+ self._trace_file: Path | None = None
62
+
63
    def set_http_client(self, client: Any) -> None:
        """Inject a shared httpx-style async client; the provider will not close it on stop()."""
        self._client = client
        self._owns_client = False
66
+
67
+ async def start(self) -> None:
68
+ if self._client is None:
69
+ try:
70
+ import httpx
71
+ except ImportError:
72
+ raise ImportError(
73
+ "Install httpx + websockets: pip install openspeech[server] openspeech[whisperlivekit]"
74
+ )
75
+ self._client = httpx.AsyncClient(timeout=self.settings.timeout_s, trust_env=False)
76
+ self._owns_client = True
77
+ logger.info("{} provider started", self.name)
78
+ if self.settings.trace_enabled:
79
+ self._trace_file = Path(self.settings.trace_path).expanduser()
80
+ self._trace_file.parent.mkdir(parents=True, exist_ok=True)
81
+ self._trace("start", ws_path=self.settings.ws_path, ws_mode=self.settings.ws_mode)
82
+
83
+ async def stop(self) -> None:
84
+ if self._client and self._owns_client:
85
+ await self._client.aclose()
86
+ self._client = None
87
+ logger.info("{} provider stopped", self.name)
88
+ self._trace("stop")
89
+
90
+ def _truncate(self, value: Any) -> Any:
91
+ if value is None:
92
+ return None
93
+ s = str(value)
94
+ max_chars = max(40, int(self.settings.trace_max_chars))
95
+ if len(s) <= max_chars:
96
+ return s
97
+ return s[:max_chars] + "..."
98
+
99
+ def _trace(self, event: str, **payload: Any) -> None:
100
+ if not self.settings.trace_enabled or self._trace_file is None:
101
+ return
102
+ row = {
103
+ "ts": time.time(),
104
+ "event": event,
105
+ **payload,
106
+ }
107
+ try:
108
+ self._trace_file.parent.mkdir(parents=True, exist_ok=True)
109
+ with self._trace_file.open("a", encoding="utf-8") as f:
110
+ f.write(json.dumps(row, ensure_ascii=False) + "\n")
111
+ except Exception as exc: # noqa: BLE001
112
+ logger.warning("Failed to write stt_wlk trace log: {}", exc)
113
+
114
+ async def health_check(self) -> bool:
115
+ if self._client is None:
116
+ return False
117
+ try:
118
+ url = urljoin(self.settings.api_url.rstrip("/") + "/", "health")
119
+ r = await self._client.get(url)
120
+ return r.status_code < 500
121
+ except Exception:
122
+ return False
123
+
124
+ async def transcribe(
125
+ self, audio: AudioData, opts: STTOptions | None = None
126
+ ) -> Transcription:
127
+ if self._client is None:
128
+ raise RuntimeError("Provider not started — call start() first")
129
+ logger.info("{}: request received, audio={} bytes", self.name, len(audio.data))
130
+ opts = opts or STTOptions()
131
+ url = urljoin(
132
+ self.settings.api_url.rstrip("/") + "/",
133
+ self.settings.http_transcribe_path.lstrip("/"),
134
+ )
135
+ model = (opts.model or self.settings.model).strip()
136
+ language = (opts.language or self.settings.language).strip()
137
+ wav_bytes = self._maybe_pad_wav_for_short_clip(
138
+ audio.data, min_duration_ms=max(0, int(self.settings.min_audio_duration_ms))
139
+ )
140
+ files = {
141
+ "file": ("audio.wav", wav_bytes, "audio/wav"),
142
+ }
143
+ data: dict[str, Any] = {"model": model}
144
+ if language:
145
+ data["language"] = language
146
+ if opts.prompt:
147
+ data["prompt"] = opts.prompt
148
+ if opts.temperature is not None:
149
+ data["temperature"] = opts.temperature
150
+
151
+ last_exc: Exception | None = None
152
+ attempts = max(0, int(self.settings.retries)) + 1
153
+ resp = None
154
+ started_at = time.perf_counter()
155
+ for _ in range(attempts):
156
+ try:
157
+ resp = await self._client.post(url, files=files, data=data)
158
+ break
159
+ except Exception as exc: # noqa: BLE001
160
+ last_exc = exc
161
+ if resp is None:
162
+ raise RuntimeError(f"WhisperLiveKit request failed: {last_exc}") from last_exc
163
+ resp.raise_for_status()
164
+ payload = resp.json()
165
+ text = str(payload.get("text", "")).strip()
166
+ if not text:
167
+ # Deepgram-like response fallback
168
+ channel = payload.get("channel", {})
169
+ alts = channel.get("alternatives", []) if isinstance(channel, dict) else []
170
+ if alts:
171
+ text = str(alts[0].get("transcript", "")).strip()
172
+ elapsed_ms = int((time.perf_counter() - started_at) * 1000)
173
+ duration_ms = payload.get("duration_ms")
174
+ if duration_ms is None:
175
+ metadata = payload.get("metadata", {})
176
+ if isinstance(metadata, dict):
177
+ md_dur = metadata.get("duration")
178
+ if isinstance(md_dur, (int, float)):
179
+ duration_ms = int(float(md_dur) * 1000)
180
+ if duration_ms is None:
181
+ duration_ms = elapsed_ms
182
+ result = Transcription(
183
+ text=text,
184
+ language=payload.get("language") or language or None,
185
+ confidence=payload.get("confidence"),
186
+ duration_ms=int(duration_ms),
187
+ )
188
+ logger.info("{}: completed in {:.0f}ms, result={} chars", self.name, elapsed_ms, len(result.text))
189
+ return result
190
+
191
+ @staticmethod
192
+ def _maybe_pad_wav_for_short_clip(data: bytes, min_duration_ms: int) -> bytes:
193
+ """Pad trailing silence for short PCM WAV clips to avoid empty short-utterance outputs."""
194
+ if min_duration_ms <= 0 or len(data) < 44:
195
+ return data
196
+ if not (data.startswith(b"RIFF") and data[8:12] == b"WAVE"):
197
+ return data
198
+ try:
199
+ with wave.open(io.BytesIO(data), "rb") as r:
200
+ params = r.getparams()
201
+ framerate = int(r.getframerate())
202
+ sampwidth = int(r.getsampwidth())
203
+ channels = int(r.getnchannels())
204
+ frames = r.readframes(r.getnframes())
205
+ except Exception:
206
+ return data
207
+ if framerate <= 0 or sampwidth <= 0 or channels <= 0:
208
+ return data
209
+ target_frames = int((min_duration_ms / 1000.0) * framerate)
210
+ current_frames = len(frames) // max(1, sampwidth * channels)
211
+ if current_frames >= target_frames:
212
+ return data
213
+ add_frames = target_frames - current_frames
214
+ silence = b"\x00" * (add_frames * sampwidth * channels)
215
+ out = io.BytesIO()
216
+ with wave.open(out, "wb") as w:
217
+ w.setparams(params)
218
+ w.writeframes(frames + silence)
219
+ return out.getvalue()
220
+
221
+ def _build_ws_url(self, model: str, language: str) -> str:
222
+ base = self.settings.api_url.rstrip("/")
223
+ u = urlparse(base)
224
+ scheme = "wss" if u.scheme == "https" else "ws"
225
+ path = self.settings.ws_path if self.settings.ws_path.startswith("/") else f"/{self.settings.ws_path}"
226
+ query: dict[str, str] = {}
227
+ if path == "/asr":
228
+ query["mode"] = self.settings.ws_mode or "diff"
229
+ if language:
230
+ query["language"] = language
231
+ else:
232
+ query = {
233
+ "model": model,
234
+ "encoding": self.settings.encoding,
235
+ "sample_rate": str(int(self.settings.sample_rate)),
236
+ "interim_results": "true" if self.settings.interim_results else "false",
237
+ "punctuate": "true" if self.settings.punctuate else "false",
238
+ "smart_format": "true" if self.settings.smart_format else "false",
239
+ }
240
+ if language:
241
+ query["language"] = language
242
+ if path != "/asr" and language:
243
+ query["language"] = language
244
+ return urlunparse((scheme, u.netloc, path, "", urlencode(query), ""))
245
+
246
+ @staticmethod
247
+ def _merge_text(prev: str, incoming: str) -> str:
248
+ p = (prev or "").strip()
249
+ n = (incoming or "").strip()
250
+ if not n:
251
+ return p
252
+ if not p:
253
+ return n
254
+ if n == p:
255
+ return p
256
+ if n.startswith(p):
257
+ return n
258
+ if p.startswith(n):
259
+ return p
260
+ if p in n:
261
+ return n
262
+ if n in p:
263
+ return p
264
+ return f"{p} {n}".strip()
265
+
266
+ @staticmethod
267
+ def _snapshot_text(prev: str, incoming: str) -> str:
268
+ p = (prev or "").strip()
269
+ n = (incoming or "").strip()
270
+ if not n:
271
+ return p
272
+ if n == p:
273
+ return p
274
+ # WhisperLiveKit /asr (diff/snapshot) messages represent latest hypothesis.
275
+ # Replace instead of append to avoid repeated growth on revisions.
276
+ return n
277
+
278
+ @staticmethod
279
+ def _stabilize_asr_text(text: str) -> str:
280
+ t = str(text or "").strip()
281
+ if not t:
282
+ return ""
283
+ # Collapse obvious repeated Latin chunks, e.g. "HelloHelloHello" -> "Hello".
284
+ t = re.sub(r"(?i)\b([a-z][a-z0-9']{1,})(?:\1){1,}\b", r"\1", t)
285
+ # Collapse repeated word with spaces, e.g. "hello hello hello" -> "hello".
286
+ t = re.sub(r"(?i)\b([a-z][a-z0-9']{1,})\b(?:\s+\1\b){1,}", r"\1", t)
287
+ # Collapse CJK/punctuation long runs (3+) caused by unstable interim decoding.
288
+ t = re.sub(r"([\u4e00-\u9fff])\1{2,}", r"\1", t)
289
+ t = re.sub(r"([,,。.!!??、;;::\"'`~\-])\1{2,}", r"\1", t)
290
+ return t.strip()
291
+
292
+ @staticmethod
293
+ def _extract_asr_text_from_lines(lines: list[Any]) -> str:
294
+ parts: list[str] = []
295
+ for line in lines:
296
+ if not isinstance(line, dict):
297
+ continue
298
+ if int(line.get("speaker", 0) or 0) == -2:
299
+ continue
300
+ t = str(line.get("text") or "").strip()
301
+ if t:
302
+ parts.append(t)
303
+ return " ".join(parts).strip()
304
+
305
+ def _extract_asr_message_text(
306
+ self,
307
+ data: dict[str, Any],
308
+ committed_lines: list[dict[str, Any]],
309
+ ) -> str:
310
+ t = str(data.get("type") or "").strip()
311
+ if t == "snapshot":
312
+ lines = data.get("lines") if isinstance(data.get("lines"), list) else []
313
+ committed_lines.clear()
314
+ committed_lines.extend([x for x in lines if isinstance(x, dict)])
315
+ elif t == "diff":
316
+ pruned = int(data.get("lines_pruned") or 0)
317
+ if pruned > 0:
318
+ del committed_lines[:pruned]
319
+ new_lines = data.get("new_lines") if isinstance(data.get("new_lines"), list) else []
320
+ committed_lines.extend([x for x in new_lines if isinstance(x, dict)])
321
+ elif t == "ready_to_stop":
322
+ return ""
323
+ elif "lines" in data and isinstance(data.get("lines"), list):
324
+ committed_lines.clear()
325
+ committed_lines.extend([x for x in data.get("lines", []) if isinstance(x, dict)])
326
+ base = self._extract_asr_text_from_lines(committed_lines)
327
+ buffer_t = str(data.get("buffer_transcription") or "").strip()
328
+ if buffer_t:
329
+ return self._merge_text(base, buffer_t)
330
+ return base
331
+
332
+ @staticmethod
333
+ def _extract_deepgram_text(data: dict[str, Any]) -> tuple[str, str | None, float | None]:
334
+ if data.get("type") != "Results":
335
+ return "", None, None
336
+ channel = data.get("channel", {})
337
+ alts = channel.get("alternatives", []) if isinstance(channel, dict) else []
338
+ if not alts:
339
+ return "", None, None
340
+ transcript = str(alts[0].get("transcript", "")).strip()
341
+ language = channel.get("detected_language")
342
+ confidence = alts[0].get("confidence")
343
+ return transcript, language, confidence
344
+
345
+ async def transcribe_stream(
346
+ self, stream: AsyncIterator[bytes]
347
+ ) -> AsyncIterator[Any]:
348
+ if self._client is None:
349
+ raise RuntimeError("Provider not started — call start() first")
350
+ try:
351
+ import websockets
352
+ except ImportError:
353
+ raise ImportError(
354
+ "Install websockets: pip install openspeech[server]"
355
+ )
356
+
357
+ _t0 = time.perf_counter()
358
+
359
+ ws_url = self._build_ws_url(self.settings.model, self.settings.language)
360
+ is_asr = self.settings.ws_path.startswith("/asr")
361
+ stream_id = uuid4().hex[:12]
362
+ def trace(ev: str, **kw: Any) -> None:
363
+ self._trace(ev, stream_id=stream_id, **kw)
364
+
365
+ logger.debug("{}: connecting to WhisperLiveKit WebSocket...", self.name)
366
+ trace("stream_open", ws_url=ws_url, is_asr=is_asr)
367
+ async with websockets.connect(ws_url, ping_interval=20, ping_timeout=20) as ws:
368
+ _t_connected = time.perf_counter()
369
+ logger.info("{}: WS connected in {:.0f}ms", self.name,
370
+ (_t_connected - _t0) * 1000)
371
+ committed_lines: list[dict[str, Any]] = []
372
+ latest_text = ""
373
+ sent_chunks = 0
374
+ sent_bytes = 0
375
+ _resp_count = 0
376
+
377
+ _sender_stop = asyncio.Event()
378
+
379
+ async def send_audio() -> None:
380
+ nonlocal sent_chunks, sent_bytes
381
+ try:
382
+ async for chunk in stream:
383
+ if _sender_stop.is_set():
384
+ break
385
+ if chunk:
386
+ await ws.send(chunk)
387
+ sent_chunks += 1
388
+ sent_bytes += len(chunk)
389
+ if sent_chunks == 1:
390
+ logger.debug("{}: first frame sent at {:.0f}ms",
391
+ self.name, (time.perf_counter() - _t0) * 1000)
392
+ trace("stream_send_chunk", sent_chunks=sent_chunks, sent_bytes=sent_bytes)
393
+ # WhisperLiveKit stop sentinel for both /asr and /v1/listen.
394
+ if not _sender_stop.is_set():
395
+ await ws.send(b"")
396
+ trace("stream_send_stop", sent_chunks=sent_chunks, sent_bytes=sent_bytes)
397
+ except websockets.exceptions.ConnectionClosed:
398
+ pass
399
+ finally:
400
+ logger.debug(
401
+ "{}: stream sender done, sent {} frames ({} bytes) in {:.0f}ms",
402
+ self.name, sent_chunks, sent_bytes,
403
+ (time.perf_counter() - _t0) * 1000,
404
+ )
405
+
406
+ import asyncio
407
+ send_task = asyncio.create_task(send_audio())
408
+ try:
409
+ while True:
410
+ try:
411
+ msg = await asyncio.wait_for(ws.recv(), timeout=1.2)
412
+ except asyncio.TimeoutError:
413
+ if send_task.done():
414
+ trace("stream_recv_timeout_done")
415
+ break
416
+ continue
417
+ if isinstance(msg, bytes):
418
+ trace("stream_recv_binary", size=len(msg))
419
+ continue
420
+ data = json.loads(msg)
421
+ msg_type = str(data.get("type") or "")
422
+ if is_asr:
423
+ trace(
424
+ "stream_recv_asr",
425
+ type=msg_type,
426
+ seq=data.get("seq"),
427
+ status=data.get("status"),
428
+ n_lines=data.get("n_lines"),
429
+ lines_pruned=data.get("lines_pruned"),
430
+ buffer_transcription=self._truncate(data.get("buffer_transcription")),
431
+ new_lines=self._truncate(
432
+ " | ".join(
433
+ str(x.get("text", ""))
434
+ for x in (data.get("new_lines") or [])
435
+ if isinstance(x, dict)
436
+ )
437
+ ),
438
+ )
439
+ else:
440
+ transcript, _lang, _conf = self._extract_deepgram_text(data)
441
+ trace(
442
+ "stream_recv_deepgram",
443
+ type=msg_type,
444
+ transcript=self._truncate(transcript),
445
+ )
446
+ _resp_count += 1
447
+ if _resp_count == 1:
448
+ logger.debug("{}: first response at {:.0f}ms type={}",
449
+ self.name, (time.perf_counter() - _t0) * 1000, msg_type)
450
+ if msg_type in {"error", "Error"}:
451
+ detail = str(data.get("detail") or data.get("message") or data)
452
+ trace("stream_error", detail=self._truncate(detail))
453
+ raise RuntimeError(f"WhisperLiveKit stream error: {detail}")
454
+ if is_asr:
455
+ parsed_text = self._extract_asr_message_text(data, committed_lines)
456
+ stabilized_text = self._stabilize_asr_text(parsed_text)
457
+ trace(
458
+ "stream_parse_asr",
459
+ parsed_text=self._truncate(parsed_text),
460
+ stabilized_text=self._truncate(stabilized_text),
461
+ latest_text_before=self._truncate(latest_text),
462
+ )
463
+ if not stabilized_text:
464
+ continue
465
+ merged = self._snapshot_text(latest_text, stabilized_text)
466
+ trace(
467
+ "stream_merge_asr",
468
+ merged_text=self._truncate(merged),
469
+ changed=(merged != latest_text),
470
+ )
471
+ if merged == latest_text:
472
+ continue
473
+ latest_text = merged
474
+ trace("stream_emit_asr", emit_text=self._truncate(latest_text))
475
+ yield Transcription(
476
+ text=latest_text,
477
+ language=self.settings.language or None,
478
+ confidence=None,
479
+ is_partial=True,
480
+ )
481
+ else:
482
+ transcript, language, confidence = self._extract_deepgram_text(data)
483
+ trace(
484
+ "stream_parse_deepgram",
485
+ transcript=self._truncate(transcript),
486
+ latest_text_before=self._truncate(latest_text),
487
+ )
488
+ if not transcript:
489
+ continue
490
+ merged = self._merge_text(latest_text, transcript)
491
+ trace(
492
+ "stream_merge_deepgram",
493
+ merged_text=self._truncate(merged),
494
+ changed=(merged != latest_text),
495
+ )
496
+ if merged == latest_text:
497
+ continue
498
+ latest_text = merged
499
+ trace("stream_emit_deepgram", emit_text=self._truncate(latest_text))
500
+ yield Transcription(
501
+ text=latest_text,
502
+ language=language or self.settings.language or None,
503
+ confidence=confidence,
504
+ is_partial=True,
505
+ )
506
+ finally:
507
+ _sender_stop.set()
508
+ send_task.cancel()
509
+ try:
510
+ await send_task
511
+ except asyncio.CancelledError:
512
+ pass
513
+ trace("stream_close", sent_chunks=sent_chunks, sent_bytes=sent_bytes)
514
+
515
+ # Emit final snapshot after the receive loop exits
516
+ if latest_text:
517
+ logger.info("{}: final result at {:.0f}ms responses={} text='{}'",
518
+ self.name, (time.perf_counter() - _t0) * 1000,
519
+ _resp_count, latest_text[:60])
520
+ yield Transcription(
521
+ text=latest_text,
522
+ language=self.settings.language or None,
523
+ confidence=None,
524
+ is_partial=False,
525
+ )
526
+
527
+ logger.info(
528
+ "{}: stream completed in {:.0f}ms, frames={}",
529
+ self.name, (time.perf_counter() - _t0) * 1000, sent_chunks,
530
+ )