openspeechapi 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. openspeech/__init__.py +75 -0
  2. openspeech/__main__.py +5 -0
  3. openspeech/cli.py +413 -0
  4. openspeech/client/__init__.py +4 -0
  5. openspeech/client/client.py +145 -0
  6. openspeech/config.py +212 -0
  7. openspeech/core/__init__.py +0 -0
  8. openspeech/core/base.py +75 -0
  9. openspeech/core/enums.py +39 -0
  10. openspeech/core/models.py +61 -0
  11. openspeech/core/registry.py +37 -0
  12. openspeech/core/settings.py +8 -0
  13. openspeech/demo.py +675 -0
  14. openspeech/dispatch/__init__.py +0 -0
  15. openspeech/dispatch/context.py +34 -0
  16. openspeech/dispatch/dispatcher.py +661 -0
  17. openspeech/dispatch/executors/__init__.py +0 -0
  18. openspeech/dispatch/executors/base.py +34 -0
  19. openspeech/dispatch/executors/in_process.py +66 -0
  20. openspeech/dispatch/executors/remote.py +64 -0
  21. openspeech/dispatch/executors/subprocess_exec.py +446 -0
  22. openspeech/dispatch/fanout.py +95 -0
  23. openspeech/dispatch/filters.py +73 -0
  24. openspeech/dispatch/lifecycle.py +178 -0
  25. openspeech/dispatch/watcher.py +82 -0
  26. openspeech/engine_catalog.py +236 -0
  27. openspeech/engine_registry.yaml +347 -0
  28. openspeech/exceptions.py +51 -0
  29. openspeech/factory.py +325 -0
  30. openspeech/local_engines/__init__.py +12 -0
  31. openspeech/local_engines/aim_resolver.py +91 -0
  32. openspeech/local_engines/backends/__init__.py +1 -0
  33. openspeech/local_engines/backends/docker_backend.py +490 -0
  34. openspeech/local_engines/backends/native_backend.py +902 -0
  35. openspeech/local_engines/base.py +30 -0
  36. openspeech/local_engines/engines/__init__.py +1 -0
  37. openspeech/local_engines/engines/faster_whisper.py +36 -0
  38. openspeech/local_engines/engines/fish_speech.py +33 -0
  39. openspeech/local_engines/engines/sherpa_onnx.py +56 -0
  40. openspeech/local_engines/engines/whisper.py +41 -0
  41. openspeech/local_engines/engines/whisperlivekit.py +60 -0
  42. openspeech/local_engines/manager.py +208 -0
  43. openspeech/local_engines/models.py +50 -0
  44. openspeech/local_engines/progress.py +69 -0
  45. openspeech/local_engines/registry.py +19 -0
  46. openspeech/local_engines/task_store.py +52 -0
  47. openspeech/local_engines/tasks.py +71 -0
  48. openspeech/logging_config.py +607 -0
  49. openspeech/observe/__init__.py +0 -0
  50. openspeech/observe/base.py +79 -0
  51. openspeech/observe/debug.py +44 -0
  52. openspeech/observe/latency.py +19 -0
  53. openspeech/observe/metrics.py +47 -0
  54. openspeech/observe/tracing.py +44 -0
  55. openspeech/observe/usage.py +27 -0
  56. openspeech/providers/__init__.py +0 -0
  57. openspeech/providers/_template.py +101 -0
  58. openspeech/providers/stt/__init__.py +0 -0
  59. openspeech/providers/stt/alibaba.py +86 -0
  60. openspeech/providers/stt/assemblyai.py +135 -0
  61. openspeech/providers/stt/azure_speech.py +99 -0
  62. openspeech/providers/stt/baidu.py +135 -0
  63. openspeech/providers/stt/deepgram.py +311 -0
  64. openspeech/providers/stt/elevenlabs.py +385 -0
  65. openspeech/providers/stt/faster_whisper.py +211 -0
  66. openspeech/providers/stt/google_cloud.py +106 -0
  67. openspeech/providers/stt/iflytek.py +427 -0
  68. openspeech/providers/stt/macos_speech.py +226 -0
  69. openspeech/providers/stt/openai.py +84 -0
  70. openspeech/providers/stt/sherpa_onnx.py +353 -0
  71. openspeech/providers/stt/tencent.py +212 -0
  72. openspeech/providers/stt/volcengine.py +107 -0
  73. openspeech/providers/stt/whisper.py +153 -0
  74. openspeech/providers/stt/whisperlivekit.py +530 -0
  75. openspeech/providers/stt/windows_speech.py +249 -0
  76. openspeech/providers/tts/__init__.py +0 -0
  77. openspeech/providers/tts/alibaba.py +95 -0
  78. openspeech/providers/tts/azure_speech.py +123 -0
  79. openspeech/providers/tts/baidu.py +143 -0
  80. openspeech/providers/tts/coqui.py +64 -0
  81. openspeech/providers/tts/cosyvoice.py +90 -0
  82. openspeech/providers/tts/deepgram.py +174 -0
  83. openspeech/providers/tts/elevenlabs.py +311 -0
  84. openspeech/providers/tts/fish_speech.py +158 -0
  85. openspeech/providers/tts/google_cloud.py +107 -0
  86. openspeech/providers/tts/iflytek.py +209 -0
  87. openspeech/providers/tts/macos_say.py +251 -0
  88. openspeech/providers/tts/minimax.py +122 -0
  89. openspeech/providers/tts/openai.py +104 -0
  90. openspeech/providers/tts/piper.py +104 -0
  91. openspeech/providers/tts/tencent.py +189 -0
  92. openspeech/providers/tts/volcengine.py +117 -0
  93. openspeech/providers/tts/windows_sapi.py +234 -0
  94. openspeech/server/__init__.py +1 -0
  95. openspeech/server/app.py +72 -0
  96. openspeech/server/auth.py +42 -0
  97. openspeech/server/middleware.py +75 -0
  98. openspeech/server/routes/__init__.py +1 -0
  99. openspeech/server/routes/management.py +848 -0
  100. openspeech/server/routes/stt.py +121 -0
  101. openspeech/server/routes/tts.py +159 -0
  102. openspeech/server/routes/webui.py +29 -0
  103. openspeech/server/webui/app.js +2649 -0
  104. openspeech/server/webui/index.html +216 -0
  105. openspeech/server/webui/styles.css +617 -0
  106. openspeech/server/ws/__init__.py +1 -0
  107. openspeech/server/ws/stt_stream.py +263 -0
  108. openspeech/server/ws/tts_stream.py +207 -0
  109. openspeech/telemetry/__init__.py +21 -0
  110. openspeech/telemetry/perf.py +307 -0
  111. openspeech/utils/__init__.py +5 -0
  112. openspeech/utils/audio_converter.py +406 -0
  113. openspeech/utils/audio_playback.py +156 -0
  114. openspeech/vendor_registry.yaml +74 -0
  115. openspeechapi-0.1.0.dist-info/METADATA +101 -0
  116. openspeechapi-0.1.0.dist-info/RECORD +118 -0
  117. openspeechapi-0.1.0.dist-info/WHEEL +4 -0
  118. openspeechapi-0.1.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,106 @@
1
+ """Google Cloud STT provider adapter (batch, httpx)."""
2
+ from __future__ import annotations
3
+
4
+ import base64
5
+ from openspeech.logging_config import logger
6
+ import time
7
+ from collections.abc import AsyncIterator
8
+ from dataclasses import dataclass
9
+ from typing import Any
10
+
11
+ import httpx
12
+
13
+ from openspeech.core.base import STTProvider
14
+
15
+ from openspeech.core.enums import Capability, ExecMode, ProviderType
16
+ from openspeech.core.models import AudioData, STTOptions, Transcription
17
+ from openspeech.core.settings import BaseSettings
18
+
19
+ @dataclass
20
+ class GoogleCloudSTTSettings(BaseSettings):
21
+ api_key: str = ""
22
+ model: str = "latest_long"
23
+ language: str = "en-US"
24
+
25
class GoogleCloudSTT(STTProvider):
    """Google Cloud Speech-to-Text batch adapter (REST ``speech:recognize``).

    Sends base64-encoded LINEAR16 audio to the v1 recognize endpoint via
    httpx and returns a single :class:`Transcription`. Streaming input is
    not supported by this adapter (see :meth:`transcribe_stream`).
    """

    name = "google-stt"
    provider_type = ProviderType.STT
    execution_mode = ExecMode.REMOTE
    settings_cls = GoogleCloudSTTSettings
    capabilities = {Capability.BATCH, Capability.MULTILINGUAL}
    field_options = {"model": ["latest_long", "latest_short", "telephony", "command_and_search"], "language": ["en-US", "zh-CN", "ja-JP", "ko-KR", "es-ES", "fr-FR", "de-DE", "pt-BR", "it-IT", "ru-RU", "ar-SA", "hi-IN"]}

    def __init__(self, settings: GoogleCloudSTTSettings | None = None) -> None:
        self.settings = settings or GoogleCloudSTTSettings()
        self._client: httpx.AsyncClient | None = None
        self._owns_client: bool = True

    def set_http_client(self, client) -> None:
        """Inject a shared httpx AsyncClient; this provider will not close it."""
        self._client = client
        self._owns_client = False

    async def start(self) -> None:
        """Create an owned httpx client unless one was injected."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=60.0)
            self._owns_client = True
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Close the HTTP client only if this provider owns it."""
        if self._client and self._owns_client:
            await self._client.aclose()
        self._client = None
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        """Local readiness check: an API key is configured."""
        return bool(self.settings.api_key)

    async def transcribe(
        self, audio: AudioData, opts: STTOptions | None = None
    ) -> Transcription:
        """Transcribe *audio* in a single batch request.

        Args:
            audio: PCM payload; ``sample_rate`` is forwarded to the API.
            opts: Optional request options; ``opts.language`` overrides the
                configured default language.

        Raises:
            RuntimeError: if :meth:`start` was not called, or the API
                returns a non-200 status.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, audio={} bytes", self.name, len(audio.data))
        _t0 = time.perf_counter()
        opts = opts or STTOptions()
        language = opts.language or self.settings.language
        model = self.settings.model

        b64_audio = base64.b64encode(audio.data).decode("utf-8")
        url = (
            "https://speech.googleapis.com/v1/speech:recognize"
            f"?key={self.settings.api_key}"
        )
        body = {
            "config": {
                "encoding": "LINEAR16",
                "sampleRateHertz": audio.sample_rate,
                "languageCode": language,
                "model": model,
            },
            "audio": {"content": b64_audio},
        }
        response = await self._client.post(url, json=body)
        if response.status_code != 200:
            raise RuntimeError(
                f"Google Cloud STT API error {response.status_code}: {response.text}"
            )
        data = response.json()
        results = data.get("results", [])
        if not results:
            return Transcription(text="", language=language)
        # The API may split long audio into several sequential results;
        # concatenate the top alternative of each one instead of keeping
        # only results[0] (which silently dropped later segments). Also
        # guard against a result whose "alternatives" list is empty, which
        # previously raised IndexError.
        parts: list[str] = []
        confidence = None
        for res in results:
            alternatives = res.get("alternatives") or []
            if not alternatives:
                continue
            best = alternatives[0]
            parts.append(best.get("transcript", ""))
            if confidence is None:
                # Report the first segment's confidence; the model emits one
                # value per result and Transcription carries a single score.
                confidence = best.get("confidence")
        result = Transcription(
            text="".join(parts),
            language=language,
            confidence=confidence,
        )
        logger.info("{}: completed in {:.0f}ms, result={} chars", self.name, (time.perf_counter() - _t0) * 1000, len(result.text))
        return result

    async def transcribe_stream(
        self, stream: AsyncIterator[bytes]
    ) -> AsyncIterator[Any]:
        """Not supported — this is a batch-only adapter.

        The trailing ``yield`` makes this an async generator so the error
        is raised on first iteration rather than at call time.
        """
        raise NotImplementedError(
            "Google Cloud STT batch provider does not support streaming input"
        )
        yield  # pragma: no cover
@@ -0,0 +1,427 @@
1
+ """iFlytek (讯飞) STT provider adapter — WebSocket-based."""
2
+ from __future__ import annotations
3
+
4
+ import asyncio
5
+ import base64
6
+ import hashlib
7
+ import hmac
8
+ import json
9
+ from openspeech.logging_config import logger
10
+ import time
11
+ import urllib.parse
12
+ from collections.abc import AsyncIterator
13
+ from dataclasses import dataclass
14
+ from datetime import datetime, timezone
15
+ from email.utils import formatdate
16
+ from typing import Any
17
+
18
+ import httpx
19
+ import websockets
20
+
21
+ from openspeech.core.base import STTProvider
22
+
23
+ from openspeech.core.enums import Capability, ExecMode, ProviderType
24
+ from openspeech.core.models import AudioData, STTOptions, Transcription
25
+ from openspeech.core.settings import BaseSettings
26
+
27
+ @dataclass
28
+ class IflytekSTTSettings(BaseSettings):
29
+ app_id: str = ""
30
+ api_key: str = ""
31
+ api_secret: str = ""
32
+ language: str = "zh_cn"
33
+
34
class IflytekSTT(STTProvider):
    """iFlytek IAT speech-to-text adapter over a signed WebSocket.

    Supports batch transcription (:meth:`transcribe`) and streaming with
    wpgs dynamic correction (:meth:`transcribe_stream`). Audio is expected
    as raw PCM, 16 kHz, 16-bit, mono (``audio/L16;rate=16000``).
    """

    name = "iflytek-stt"
    provider_type = ProviderType.STT
    execution_mode = ExecMode.IN_PROCESS
    settings_cls = IflytekSTTSettings
    capabilities = {Capability.BATCH, Capability.STREAMING, Capability.MULTILINGUAL}
    field_options = {"language": ["zh_cn", "en_us"]}

    _WS_HOST = "iat-api.xfyun.cn"
    _WS_PATH = "/v2/iat"

    def __init__(self, settings: IflytekSTTSettings | None = None) -> None:
        self.settings = settings or IflytekSTTSettings()
        self._client: httpx.AsyncClient | None = None
        self._owns_client: bool = True

    def set_http_client(self, client) -> None:
        """Inject a shared httpx AsyncClient; this provider will not close it."""
        self._client = client
        self._owns_client = False

    async def start(self) -> None:
        """Create an owned httpx client unless one was injected."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=60.0)
            self._owns_client = True
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Close the HTTP client only if this provider owns it."""
        if self._client and self._owns_client:
            await self._client.aclose()
        self._client = None
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        """Local readiness check: all three credentials are configured."""
        return bool(self.settings.app_id) and bool(self.settings.api_key) and bool(self.settings.api_secret)

    def _build_auth_url(self) -> str:
        """Build HMAC-SHA256 signed WebSocket URL."""
        now = datetime.now(tz=timezone.utc)
        # RFC 1123 date in GMT, required verbatim in the signed string.
        date = formatdate(timeval=now.timestamp(), localtime=False, usegmt=True)

        # Canonical request string per iFlytek's signing scheme:
        # host, date, then the request line.
        signature_origin = (
            f"host: {self._WS_HOST}\n"
            f"date: {date}\n"
            f"GET {self._WS_PATH} HTTP/1.1"
        )
        signature_sha = hmac.new(
            self.settings.api_secret.encode("utf-8"),
            signature_origin.encode("utf-8"),
            hashlib.sha256,
        ).digest()
        signature = base64.b64encode(signature_sha).decode("utf-8")

        authorization_origin = (
            f'api_key="{self.settings.api_key}", '
            f'algorithm="hmac-sha256", '
            f'headers="host date request-line", '
            f'signature="{signature}"'
        )
        # The whole authorization clause is itself base64-wrapped.
        authorization = base64.b64encode(
            authorization_origin.encode("utf-8")
        ).decode("utf-8")

        params = urllib.parse.urlencode(
            {"authorization": authorization, "date": date, "host": self._WS_HOST}
        )
        return f"wss://{self._WS_HOST}{self._WS_PATH}?{params}"

    async def transcribe(
        self, audio: AudioData, opts: STTOptions | None = None
    ) -> Transcription:
        """Transcribe pre-recorded *audio* over one WebSocket session.

        Audio is chunked into frames, sent with pacing, then results are
        read until the server reports status 2 (utterance complete).

        Raises:
            RuntimeError: if :meth:`start` was not called, or iFlytek
                returns a non-zero error code.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, audio={} bytes", self.name, len(audio.data))
        _t0 = time.perf_counter()

        url = self._build_auth_url()
        audio_bytes = audio.data
        # iFlytek recommends ~40ms per frame at 16kHz 16bit mono = 1280 bytes.
        # Use larger frames (8000 bytes = ~250ms) with pacing to avoid server
        # read-timeout when sending pre-recorded audio faster than real-time.
        frame_size = 8000  # bytes per chunk (~250ms of 16kHz 16bit mono)
        # Pacing: 8000 bytes = 250ms of audio; 10ms interval = ~25x real-time.
        frame_interval = 0.01  # 10ms between frames (~25x real-time)

        result_texts: list[str] = []

        async with websockets.connect(url) as ws:
            total = len(audio_bytes)
            offset = 0
            frames_sent = 0
            is_first = True
            status = 0  # last frame status sent: 0=first, 1=continue, 2=last

            while offset < total:
                end = min(offset + frame_size, total)
                chunk = audio_bytes[offset:end]
                frame_data = base64.b64encode(chunk).decode("utf-8")

                if is_first:
                    # The first frame MUST carry status 0 plus the common
                    # and business parameters — even when the whole audio
                    # fits in one frame. (Previously a single-frame input
                    # was sent as status 2 without app_id, which the
                    # server rejects; the utterance is now closed by the
                    # explicit empty status-2 frame below instead.)
                    status = 0
                    msg = {
                        "common": {"app_id": self.settings.app_id},
                        "business": {
                            "language": self.settings.language,
                            "domain": "iat",
                            "accent": "mandarin",
                            "vad_eos": 2000,
                        },
                        "data": {
                            "status": 0,
                            "format": "audio/L16;rate=16000",
                            "encoding": "raw",
                            "audio": frame_data,
                        },
                    }
                    is_first = False
                else:
                    status = 2 if end >= total else 1
                    msg = {
                        "data": {
                            "status": status,
                            "format": "audio/L16;rate=16000",
                            "encoding": "raw",
                            "audio": frame_data,
                        }
                    }

                await ws.send(json.dumps(msg))
                frames_sent += 1
                offset = end

                # Pacing: small delay between frames to avoid server timeout
                if status != 2 and frame_interval > 0:
                    await asyncio.sleep(frame_interval)

            if status != 2:
                # No status-2 frame was emitted in the loop (single-frame
                # or empty audio) — close the utterance explicitly.
                await ws.send(json.dumps({
                    "data": {
                        "status": 2,
                        "format": "audio/L16;rate=16000",
                        "encoding": "raw",
                        "audio": "",
                    }
                }))

            logger.debug("{}: sent {} frames in {:.0f}ms", self.name, frames_sent,
                         (time.perf_counter() - _t0) * 1000)

            # Receive results until the server marks the utterance done.
            async for message in ws:
                resp = json.loads(message)
                code = resp.get("code", -1)
                if code != 0:
                    raise RuntimeError(
                        f"iFlytek STT error [{code}]: {resp.get('message', 'unknown')}"
                    )

                data = resp.get("data", {})
                result = data.get("result", {})
                ws_items = result.get("ws", [])
                # Reuse the shared segment parser for consistency with
                # transcribe_stream (joins every cw.w in order).
                result_texts.append(self._extract_segment_text(ws_items))

                if data.get("status") == 2:
                    break

        result = Transcription(text="".join(result_texts))
        logger.info("{}: completed in {:.0f}ms, result={} chars", self.name, (time.perf_counter() - _t0) * 1000, len(result.text))
        return result

    @staticmethod
    def _extract_segment_text(ws_items: list[dict]) -> str:
        """Extract text from a single response's ws array."""
        parts: list[str] = []
        for ws_item in ws_items:
            for cw in ws_item.get("cw", []):
                parts.append(cw.get("w", ""))
        return "".join(parts)

    async def transcribe_stream(
        self, stream: AsyncIterator[bytes]
    ) -> AsyncIterator[Transcription]:
        """Stream audio chunks to iFlytek via WebSocket and yield transcriptions.

        Each bytes chunk from *stream* is a raw PCM frame (16kHz 16bit mono).
        A ``None`` or empty chunk signals end-of-stream (VAD end).

        The implementation uses two concurrent coroutines:
        - **sender**: reads chunks from *stream* and forwards them to iFlytek
          with natural pacing (frames arrive at ~real-time from the mic).
        - **receiver**: reads iFlytek responses, parses ``dwa=wpgs`` dynamic
          correction fields (``pgs``/``rg``), maintains a segment array, and
          yields partial ``Transcription`` on every response plus a final one
          on ``status == 2``.

        wpgs protocol:
        - ``pgs="apd"``: append — new segment at index ``sn``
        - ``pgs="rpl"``: replace — replace segments ``rg[0]..rg[1]`` with
          new content, effectively a correction of earlier partial results
        - No ``pgs`` field: legacy mode (no dynamic correction) — accumulate
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")

        url = self._build_auth_url()
        results: asyncio.Queue[Transcription | None] = asyncio.Queue()
        _t0 = time.perf_counter()
        _frames_sent = 0

        # Event to signal sender to stop (set by receiver when iFlytek
        # returns status=2 or the connection closes). This handles the case
        # where the user doesn't click stop — iFlytek's VAD triggers a final
        # result, and we need the sender to stop consuming the frame queue.
        _sender_stop = asyncio.Event()

        logger.debug("{}: connecting to iFlytek WebSocket...", self.name)
        async with websockets.connect(url) as ws:
            _t_connected = time.perf_counter()
            logger.info("{}: WS connected in {:.0f}ms", self.name,
                        (_t_connected - _t0) * 1000)

            async def sender() -> None:
                # Forward mic frames to iFlytek; first frame carries the
                # session parameters (with dwa=wpgs enabled).
                nonlocal _frames_sent
                is_first = True
                try:
                    async for chunk in stream:
                        # Check if receiver signaled us to stop
                        if _sender_stop.is_set():
                            break
                        if chunk is None or len(chunk) == 0:
                            # End-of-stream sentinel — send last frame
                            break
                        frame_data = base64.b64encode(chunk).decode("utf-8")
                        if is_first:
                            msg = {
                                "common": {"app_id": self.settings.app_id},
                                "business": {
                                    "language": self.settings.language,
                                    "domain": "iat",
                                    "accent": "mandarin",
                                    "dwa": "wpgs",
                                    "vad_eos": 2000,
                                },
                                "data": {
                                    "status": 0,
                                    "format": "audio/L16;rate=16000",
                                    "encoding": "raw",
                                    "audio": frame_data,
                                },
                            }
                            is_first = False
                        else:
                            msg = {
                                "data": {
                                    "status": 1,
                                    "format": "audio/L16;rate=16000",
                                    "encoding": "raw",
                                    "audio": frame_data,
                                }
                            }
                        await ws.send(json.dumps(msg))
                        _frames_sent += 1
                        if _frames_sent == 1:
                            logger.debug("{}: first frame sent at {:.0f}ms",
                                         self.name, (time.perf_counter() - _t0) * 1000)

                    # Send empty last frame to signal end (only if WS still open)
                    if not _sender_stop.is_set():
                        last_msg = {
                            "data": {
                                "status": 2,
                                "format": "audio/L16;rate=16000",
                                "encoding": "raw",
                                "audio": "",
                            }
                        }
                        await ws.send(json.dumps(last_msg))
                except websockets.exceptions.ConnectionClosed:
                    # iFlytek closed the connection (e.g. server read timeout
                    # or VAD-triggered close). This is expected when the user
                    # doesn't click stop — just exit silently.
                    pass
                finally:
                    logger.debug(
                        "{}: stream sender done, sent {} frames in {:.0f}ms",
                        self.name, _frames_sent, (time.perf_counter() - _t0) * 1000,
                    )

            async def receiver() -> None:
                # Segment array for wpgs dynamic correction.
                # Index = sn (sentence number from iFlytek).
                # Each element is the text for that segment.
                segments: list[str] = []
                _resp_count = 0

                try:
                    async for message in ws:
                        resp = json.loads(message)
                        code = resp.get("code", -1)
                        if code != 0:
                            raise RuntimeError(
                                f"iFlytek STT error [{code}]: {resp.get('message', 'unknown')}"
                            )

                        _resp_count += 1
                        data = resp.get("data", {})
                        resp_status = data.get("status", 0)
                        result = data.get("result", {})
                        ws_items = result.get("ws", [])
                        pgs = result.get("pgs", "")  # "apd" or "rpl"
                        rg = result.get("rg", [])  # [start, end] for rpl
                        sn = result.get("sn", 0)  # segment number

                        seg_text = self._extract_segment_text(ws_items)

                        if _resp_count == 1:
                            logger.debug("{}: first response at {:.0f}ms sn={} pgs={}",
                                         self.name, (time.perf_counter() - _t0) * 1000,
                                         sn, pgs or "none")

                        if pgs == "rpl" and len(rg) == 2:
                            # Replace: clear segments rg[0]..rg[1], put new
                            # text at rg[0], remove the rest in range.
                            start, end = rg[0], rg[1]
                            # Ensure segments list is large enough
                            while len(segments) <= end:
                                segments.append("")
                            # Clear the replaced range
                            for i in range(start, end + 1):
                                segments[i] = ""
                            # Put new text at start position
                            segments[start] = seg_text
                        elif pgs == "apd":
                            # Append: add/overwrite segment at index sn
                            while len(segments) <= sn:
                                segments.append("")
                            segments[sn] = seg_text
                        else:
                            # No pgs (legacy / non-wpgs fallback): append
                            while len(segments) <= sn:
                                segments.append("")
                            segments[sn] = seg_text

                        # Build current full text from all segments
                        current_text = "".join(segments).strip()

                        if resp_status == 2:
                            # Final result — stop sender
                            _sender_stop.set()
                            logger.info("{}: final result at {:.0f}ms responses={} text='{}'",
                                        self.name, (time.perf_counter() - _t0) * 1000,
                                        _resp_count, current_text[:60])
                            if current_text:
                                await results.put(
                                    Transcription(text=current_text, is_partial=False)
                                )
                            break
                        else:
                            # Partial result — yield for real-time display
                            if current_text:
                                await results.put(
                                    Transcription(text=current_text, is_partial=True)
                                )

                except websockets.exceptions.ConnectionClosed:
                    # Connection closed by server — stop sender, emit whatever we have
                    _sender_stop.set()
                    current_text = "".join(segments).strip()
                    if current_text:
                        await results.put(
                            Transcription(text=current_text, is_partial=False)
                        )
                finally:
                    _sender_stop.set()  # ensure sender stops in all cases
                    await results.put(None)  # sentinel

            send_task = asyncio.create_task(sender())
            recv_task = asyncio.create_task(receiver())

            # Drain the queue on the caller's task so partials are yielded
            # as soon as the receiver produces them.
            while True:
                item = await results.get()
                if item is None:
                    break
                yield item

            logger.info(
                "{}: stream completed in {:.0f}ms, frames={}",
                self.name, (time.perf_counter() - _t0) * 1000, _frames_sent,
            )
            # Wait for tasks; suppress sender errors (e.g. ConnectionClosed
            # that slipped past the try/except if timing was tight).
            for task in (send_task, recv_task):
                try:
                    await task
                except websockets.exceptions.ConnectionClosed:
                    pass