openspeechapi 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. openspeech/__init__.py +75 -0
  2. openspeech/__main__.py +5 -0
  3. openspeech/cli.py +413 -0
  4. openspeech/client/__init__.py +4 -0
  5. openspeech/client/client.py +145 -0
  6. openspeech/config.py +212 -0
  7. openspeech/core/__init__.py +0 -0
  8. openspeech/core/base.py +75 -0
  9. openspeech/core/enums.py +39 -0
  10. openspeech/core/models.py +61 -0
  11. openspeech/core/registry.py +37 -0
  12. openspeech/core/settings.py +8 -0
  13. openspeech/demo.py +675 -0
  14. openspeech/dispatch/__init__.py +0 -0
  15. openspeech/dispatch/context.py +34 -0
  16. openspeech/dispatch/dispatcher.py +661 -0
  17. openspeech/dispatch/executors/__init__.py +0 -0
  18. openspeech/dispatch/executors/base.py +34 -0
  19. openspeech/dispatch/executors/in_process.py +66 -0
  20. openspeech/dispatch/executors/remote.py +64 -0
  21. openspeech/dispatch/executors/subprocess_exec.py +446 -0
  22. openspeech/dispatch/fanout.py +95 -0
  23. openspeech/dispatch/filters.py +73 -0
  24. openspeech/dispatch/lifecycle.py +178 -0
  25. openspeech/dispatch/watcher.py +82 -0
  26. openspeech/engine_catalog.py +236 -0
  27. openspeech/engine_registry.yaml +347 -0
  28. openspeech/exceptions.py +51 -0
  29. openspeech/factory.py +325 -0
  30. openspeech/local_engines/__init__.py +12 -0
  31. openspeech/local_engines/aim_resolver.py +91 -0
  32. openspeech/local_engines/backends/__init__.py +1 -0
  33. openspeech/local_engines/backends/docker_backend.py +490 -0
  34. openspeech/local_engines/backends/native_backend.py +902 -0
  35. openspeech/local_engines/base.py +30 -0
  36. openspeech/local_engines/engines/__init__.py +1 -0
  37. openspeech/local_engines/engines/faster_whisper.py +36 -0
  38. openspeech/local_engines/engines/fish_speech.py +33 -0
  39. openspeech/local_engines/engines/sherpa_onnx.py +56 -0
  40. openspeech/local_engines/engines/whisper.py +41 -0
  41. openspeech/local_engines/engines/whisperlivekit.py +60 -0
  42. openspeech/local_engines/manager.py +208 -0
  43. openspeech/local_engines/models.py +50 -0
  44. openspeech/local_engines/progress.py +69 -0
  45. openspeech/local_engines/registry.py +19 -0
  46. openspeech/local_engines/task_store.py +52 -0
  47. openspeech/local_engines/tasks.py +71 -0
  48. openspeech/logging_config.py +607 -0
  49. openspeech/observe/__init__.py +0 -0
  50. openspeech/observe/base.py +79 -0
  51. openspeech/observe/debug.py +44 -0
  52. openspeech/observe/latency.py +19 -0
  53. openspeech/observe/metrics.py +47 -0
  54. openspeech/observe/tracing.py +44 -0
  55. openspeech/observe/usage.py +27 -0
  56. openspeech/providers/__init__.py +0 -0
  57. openspeech/providers/_template.py +101 -0
  58. openspeech/providers/stt/__init__.py +0 -0
  59. openspeech/providers/stt/alibaba.py +86 -0
  60. openspeech/providers/stt/assemblyai.py +135 -0
  61. openspeech/providers/stt/azure_speech.py +99 -0
  62. openspeech/providers/stt/baidu.py +135 -0
  63. openspeech/providers/stt/deepgram.py +311 -0
  64. openspeech/providers/stt/elevenlabs.py +385 -0
  65. openspeech/providers/stt/faster_whisper.py +211 -0
  66. openspeech/providers/stt/google_cloud.py +106 -0
  67. openspeech/providers/stt/iflytek.py +427 -0
  68. openspeech/providers/stt/macos_speech.py +226 -0
  69. openspeech/providers/stt/openai.py +84 -0
  70. openspeech/providers/stt/sherpa_onnx.py +353 -0
  71. openspeech/providers/stt/tencent.py +212 -0
  72. openspeech/providers/stt/volcengine.py +107 -0
  73. openspeech/providers/stt/whisper.py +153 -0
  74. openspeech/providers/stt/whisperlivekit.py +530 -0
  75. openspeech/providers/stt/windows_speech.py +249 -0
  76. openspeech/providers/tts/__init__.py +0 -0
  77. openspeech/providers/tts/alibaba.py +95 -0
  78. openspeech/providers/tts/azure_speech.py +123 -0
  79. openspeech/providers/tts/baidu.py +143 -0
  80. openspeech/providers/tts/coqui.py +64 -0
  81. openspeech/providers/tts/cosyvoice.py +90 -0
  82. openspeech/providers/tts/deepgram.py +174 -0
  83. openspeech/providers/tts/elevenlabs.py +311 -0
  84. openspeech/providers/tts/fish_speech.py +158 -0
  85. openspeech/providers/tts/google_cloud.py +107 -0
  86. openspeech/providers/tts/iflytek.py +209 -0
  87. openspeech/providers/tts/macos_say.py +251 -0
  88. openspeech/providers/tts/minimax.py +122 -0
  89. openspeech/providers/tts/openai.py +104 -0
  90. openspeech/providers/tts/piper.py +104 -0
  91. openspeech/providers/tts/tencent.py +189 -0
  92. openspeech/providers/tts/volcengine.py +117 -0
  93. openspeech/providers/tts/windows_sapi.py +234 -0
  94. openspeech/server/__init__.py +1 -0
  95. openspeech/server/app.py +72 -0
  96. openspeech/server/auth.py +42 -0
  97. openspeech/server/middleware.py +75 -0
  98. openspeech/server/routes/__init__.py +1 -0
  99. openspeech/server/routes/management.py +848 -0
  100. openspeech/server/routes/stt.py +121 -0
  101. openspeech/server/routes/tts.py +159 -0
  102. openspeech/server/routes/webui.py +29 -0
  103. openspeech/server/webui/app.js +2649 -0
  104. openspeech/server/webui/index.html +216 -0
  105. openspeech/server/webui/styles.css +617 -0
  106. openspeech/server/ws/__init__.py +1 -0
  107. openspeech/server/ws/stt_stream.py +263 -0
  108. openspeech/server/ws/tts_stream.py +207 -0
  109. openspeech/telemetry/__init__.py +21 -0
  110. openspeech/telemetry/perf.py +307 -0
  111. openspeech/utils/__init__.py +5 -0
  112. openspeech/utils/audio_converter.py +406 -0
  113. openspeech/utils/audio_playback.py +156 -0
  114. openspeech/vendor_registry.yaml +74 -0
  115. openspeechapi-0.1.0.dist-info/METADATA +101 -0
  116. openspeechapi-0.1.0.dist-info/RECORD +118 -0
  117. openspeechapi-0.1.0.dist-info/WHEEL +4 -0
  118. openspeechapi-0.1.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,99 @@
1
+ """Azure Speech STT provider adapter (batch, httpx)."""
2
+ from __future__ import annotations
3
+
4
+ from openspeech.logging_config import logger
5
+ import time
6
+ from collections.abc import AsyncIterator
7
+ from dataclasses import dataclass
8
+ from typing import Any
9
+
10
+ import httpx
11
+
12
+ from openspeech.core.base import STTProvider
13
+
14
+ from openspeech.core.enums import Capability, ExecMode, ProviderType
15
+ from openspeech.core.models import AudioData, STTOptions, Transcription
16
+ from openspeech.core.settings import BaseSettings
17
+
18
@dataclass
class AzureSpeechSTTSettings(BaseSettings):
    """Settings for the Azure Speech STT provider."""

    subscription_key: str = ""  # Azure Cognitive Services subscription key (required at runtime)
    region: str = "eastus"  # Azure region of the Speech resource (selects the endpoint host)
    language: str = "en-US"  # default recognition language; per-request STTOptions.language wins
23
+
24
class AzureSpeechSTT(STTProvider):
    """Azure Speech batch STT adapter using the REST v1 endpoint via httpx."""

    name = "azure-stt"
    provider_type = ProviderType.STT
    execution_mode = ExecMode.REMOTE
    settings_cls = AzureSpeechSTTSettings
    capabilities = {Capability.BATCH, Capability.MULTILINGUAL}
    field_options = {"language": ["en-US", "zh-CN", "ja-JP", "ko-KR", "es-ES", "fr-FR", "de-DE", "pt-BR", "it-IT", "ru-RU", "ar-SA", "hi-IN"], "region": ["eastus", "westus2", "westeurope", "eastasia", "southeastasia"]}

    def __init__(self, settings: AzureSpeechSTTSettings | None = None) -> None:
        self.settings = settings or AzureSpeechSTTSettings()
        self._client: httpx.AsyncClient | None = None
        # True when this provider created the client and must close it in stop().
        self._owns_client: bool = True

    def set_http_client(self, client) -> None:
        """Inject a shared httpx.AsyncClient; the caller retains ownership."""
        self._client = client
        self._owns_client = False

    async def start(self) -> None:
        """Create an owned HTTP client unless one was injected."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=60.0)
            self._owns_client = True
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Close the HTTP client if owned, then drop the reference."""
        if self._client and self._owns_client:
            await self._client.aclose()
        self._client = None
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        # Remote provider: considered healthy once credentials are configured.
        return bool(self.settings.subscription_key)

    async def transcribe(
        self, audio: AudioData, opts: STTOptions | None = None
    ) -> Transcription:
        """Transcribe a WAV audio buffer via the Azure Speech REST API.

        Args:
            audio: WAV-encoded audio payload (sent as ``audio/wav``).
            opts: Optional per-request options; ``opts.language`` overrides
                the configured default language.

        Returns:
            A ``Transcription`` with the recognized text. A non-"Success"
            ``RecognitionStatus`` (e.g. NoMatch) yields an empty text result
            rather than raising.

        Raises:
            RuntimeError: if the provider was not started, or the API
                returns a non-200 status.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, audio={} bytes", self.name, len(audio.data))
        _t0 = time.perf_counter()
        opts = opts or STTOptions()
        language = opts.language or self.settings.language
        region = self.settings.region

        url = (
            f"https://{region}.stt.speech.microsoft.com"
            f"/speech/recognition/conversation/cognitiveservices/v1"
        )
        headers = {
            "Ocp-Apim-Subscription-Key": self.settings.subscription_key,
            "Content-Type": "audio/wav",
        }
        # FIX: pass `language` via `params` so httpx URL-encodes it; the
        # previous f-string interpolation sent the value unescaped. This also
        # matches the Deepgram provider's request style in this package.
        response = await self._client.post(
            url,
            params={"language": language},
            headers=headers,
            content=audio.data,
        )
        if response.status_code != 200:
            raise RuntimeError(
                f"Azure Speech STT API error {response.status_code}: {response.text}"
            )
        data = response.json()
        recognition_status = data.get("RecognitionStatus", "")
        if recognition_status != "Success":
            # NoMatch / InitialSilenceTimeout etc. → empty transcription.
            return Transcription(text="", language=language)
        result = Transcription(
            text=data.get("DisplayText", ""),
            language=language,
            # NOTE(review): the simple (DisplayText) response format may not
            # include a top-level "Confidence" field — .get() tolerates that.
            confidence=data.get("Confidence"),
        )
        logger.info("{}: completed in {:.0f}ms, result={} chars", self.name, (time.perf_counter() - _t0) * 1000, len(result.text))
        return result

    async def transcribe_stream(
        self, stream: AsyncIterator[bytes]
    ) -> AsyncIterator[Any]:
        """Streaming is not supported by this batch adapter."""
        raise NotImplementedError(
            "Azure Speech STT batch provider does not support streaming input"
        )
        yield  # pragma: no cover
@@ -0,0 +1,135 @@
1
+ """Baidu Cloud ASR STT provider adapter."""
2
+ from __future__ import annotations
3
+
4
+ import base64
5
+ from openspeech.logging_config import logger
6
+ import time
7
+ from collections.abc import AsyncIterator
8
+ from dataclasses import dataclass
9
+ from typing import Any
10
+
11
+ import httpx
12
+
13
+ from openspeech.core.base import STTProvider
14
+
15
+ from openspeech.core.enums import Capability, ExecMode, ProviderType
16
+ from openspeech.core.models import AudioData, STTOptions, Transcription
17
+ from openspeech.core.settings import BaseSettings
18
+
19
@dataclass
class BaiduSTTSettings(BaseSettings):
    """Settings for the Baidu Cloud ASR provider."""

    api_key: str = ""  # Baidu Cloud application API key (OAuth client_id)
    secret_key: str = ""  # Baidu Cloud application secret key (OAuth client_secret)
    dev_pid: int = 1537  # recognition model: 1537=Mandarin, 1737=English, 1637=Cantonese
24
+
25
class BaiduSTT(STTProvider):
    """Baidu Cloud short-speech ASR adapter (REST, with cached OAuth token)."""

    name = "baidu-stt"
    provider_type = ProviderType.STT
    execution_mode = ExecMode.IN_PROCESS
    settings_cls = BaiduSTTSettings
    capabilities = {Capability.BATCH, Capability.MULTILINGUAL}
    field_options = {"dev_pid": [1537, 1737, 1637, 1936, 1836]}

    def __init__(self, settings: BaiduSTTSettings | None = None) -> None:
        self.settings = settings if settings is not None else BaiduSTTSettings()
        self._client: httpx.AsyncClient | None = None
        # True when this provider created the client and must close it in stop().
        self._owns_client: bool = True
        self._token: str | None = None
        self._token_expires_at: float = 0.0

    def set_http_client(self, client) -> None:
        """Use an externally managed AsyncClient; we will not close it."""
        self._owns_client = False
        self._client = client

    async def start(self) -> None:
        """Create an owned HTTP client when none was injected."""
        if self._client is None:
            self._owns_client = True
            self._client = httpx.AsyncClient(timeout=60.0)
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Release the HTTP client (if owned) and forget the cached token."""
        if self._client is not None and self._owns_client:
            await self._client.aclose()
        self._client = None
        self._token = None
        self._token_expires_at = 0.0
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        # Healthy once both halves of the credential pair are configured.
        return bool(self.settings.api_key and self.settings.secret_key)

    async def _get_token(self) -> str:
        """Return a cached OAuth access token, refreshing it when expired."""
        now = time.time()
        if self._token and now < self._token_expires_at:
            return self._token

        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")

        oauth_params = {
            "grant_type": "client_credentials",
            "client_id": self.settings.api_key,
            "client_secret": self.settings.secret_key,
        }
        response = await self._client.get(
            "https://aip.baidubce.com/oauth/2.0/token", params=oauth_params
        )
        response.raise_for_status()
        data = response.json()

        if "access_token" not in data:
            raise RuntimeError(
                f"Baidu OAuth error: {data.get('error_description', 'unknown')}"
            )

        self._token = data["access_token"]
        # Refresh one minute before Baidu's reported expiry (default ~30 days).
        self._token_expires_at = time.time() + data.get("expires_in", 2592000) - 60
        return self._token  # type: ignore[return-value]

    async def transcribe(
        self, audio: AudioData, opts: STTOptions | None = None
    ) -> Transcription:
        """Transcribe a 16 kHz mono WAV buffer via Baidu's short-speech API.

        Raises:
            RuntimeError: if the provider was not started, OAuth fails,
                or the API reports a non-zero ``err_no``.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, audio={} bytes", self.name, len(audio.data))
        started = time.perf_counter()

        token = await self._get_token()
        encoded = base64.b64encode(audio.data).decode("utf-8")

        body = {
            "format": "wav",
            "rate": 16000,
            "channel": 1,
            "cuid": "openspeech",
            "token": token,
            "dev_pid": self.settings.dev_pid,
            "speech": encoded,
            # Baidu expects the length of the raw (pre-base64) audio bytes.
            "len": len(audio.data),
        }

        response = await self._client.post(
            "https://vop.baidu.com/server_api",
            json=body,
        )
        response.raise_for_status()
        data = response.json()

        err_no = data.get("err_no", 0)
        if err_no != 0:
            raise RuntimeError(
                f"Baidu ASR error [{err_no}]: {data.get('err_msg', 'unknown')}"
            )

        candidates = data.get("result", [])
        transcription = Transcription(text=candidates[0] if candidates else "")
        logger.info(
            "{}: completed in {:.0f}ms, result={} chars",
            self.name,
            (time.perf_counter() - started) * 1000,
            len(transcription.text),
        )
        return transcription

    async def transcribe_stream(
        self, stream: AsyncIterator[bytes]
    ) -> AsyncIterator[Any]:
        """Streaming is not supported by this batch adapter."""
        raise NotImplementedError("Baidu STT streaming not implemented")
        yield  # noqa: unreachable — makes this an async generator
@@ -0,0 +1,311 @@
1
+ """Deepgram STT provider adapter (batch + streaming, httpx-based, no SDK needed)."""
2
+ from __future__ import annotations
3
+
4
+ import asyncio
5
+ import inspect
6
+ import json
7
+ from openspeech.logging_config import logger
8
+ import time
9
+ from collections.abc import AsyncIterator
10
+ from dataclasses import dataclass
11
+ from typing import Any
12
+
13
+ import httpx
14
+
15
+ from openspeech.core.base import STTProvider
16
+
17
+ from openspeech.core.enums import Capability, ExecMode, ProviderType
18
+ from openspeech.core.models import AudioData, STTOptions, Transcription, Word
19
+ from openspeech.core.settings import BaseSettings
20
+
21
+
22
+ def _ws_connect_with_headers(websockets_mod, url: str, headers: dict[str, str]):
23
+ """Compatible connect kwargs across websockets versions."""
24
+ try:
25
+ sig = inspect.signature(websockets_mod.connect)
26
+ if "additional_headers" in sig.parameters:
27
+ return websockets_mod.connect(url, additional_headers=headers)
28
+ except Exception:
29
+ pass
30
+ return websockets_mod.connect(url, extra_headers=headers)
31
+
32
+
33
@dataclass
class DeepgramSTTSettings(BaseSettings):
    """Settings for the Deepgram STT provider."""

    api_key: str = ""  # Deepgram API key, sent as "Authorization: Token <key>"
    model: str = "nova-2"  # Deepgram model id (see field_options on the provider)
    language: str = "en-US"  # default language; per-request STTOptions.language wins in batch mode
    punctuate: bool = True  # ask Deepgram to insert punctuation
    smart_format: bool = True  # ask Deepgram to apply smart formatting (numbers, dates, ...)
40
+
41
class DeepgramSTT(STTProvider):
    """Deepgram STT adapter: batch over REST (httpx), streaming over WebSocket."""

    name = "deepgram"
    provider_type = ProviderType.STT
    execution_mode = ExecMode.IN_PROCESS
    settings_cls = DeepgramSTTSettings
    capabilities = {
        Capability.STREAMING,
        Capability.BATCH,
        Capability.MULTILINGUAL,
    }
    # Valid values surfaced to the config UI for the corresponding settings fields.
    field_options = {
        "model": [
            "nova-3", "nova-3-general", "nova-3-medical",
            "nova-2", "nova-2-general", "nova-2-meeting", "nova-2-phonecall",
            "nova-2-voicemail", "nova-2-finance", "nova-2-medical",
            "enhanced", "enhanced-general",
            "base", "base-general",
            "whisper-large", "whisper-medium", "whisper-small",
        ],
        "language": [
            "multi", "en", "en-US", "en-GB", "en-AU", "en-IN",
            "zh", "zh-CN", "zh-TW", "zh-HK",
            "ja", "ko", "ko-KR",
            "es", "es-419", "fr", "fr-CA", "de", "de-CH",
            "pt", "pt-BR", "pt-PT", "it", "nl", "nl-BE",
            "ru", "uk", "pl", "cs", "sk",
            "sv", "sv-SE", "da", "da-DK", "no", "fi",
            "tr", "el", "ro", "hu", "bg",
            "ar", "he", "fa", "hi", "hi-Latn", "bn", "ta", "te", "ur",
            "id", "ms", "th", "th-TH", "vi", "tl",
        ],
    }

    def __init__(self, settings: DeepgramSTTSettings | None = None) -> None:
        self.settings = settings or DeepgramSTTSettings()
        self._client: httpx.AsyncClient | None = None
        # True when this provider created the client and must close it in stop().
        self._owns_client: bool = True

    def set_http_client(self, client) -> None:
        """Inject a shared httpx.AsyncClient; the caller retains ownership."""
        self._client = client
        self._owns_client = False

    async def start(self) -> None:
        """Create an owned HTTP client unless one was injected."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=60.0)
            self._owns_client = True
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Close the HTTP client if owned, then drop the reference."""
        if self._client and self._owns_client:
            await self._client.aclose()
        self._client = None
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        # Cloud provider: healthy if API key is configured (client is lazy-started)
        return bool(self.settings.api_key)

    async def transcribe(
        self, audio: AudioData, opts: STTOptions | None = None
    ) -> Transcription:
        """Transcribe a WAV audio buffer via Deepgram's batch REST endpoint.

        Args:
            audio: WAV-encoded audio payload (sent as ``audio/wav``).
            opts: Optional per-request options; ``opts.language`` overrides
                the configured default language.

        Returns:
            A ``Transcription`` with text, detected language, confidence,
            and word-level timings when Deepgram returns them.

        Raises:
            RuntimeError: if the provider was not started, or the API
                returns a non-200 status.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, audio={} bytes", self.name, len(audio.data))
        _t0 = time.perf_counter()
        opts = opts or STTOptions()

        language = opts.language or self.settings.language
        params = {
            "model": self.settings.model,
            "language": language,
            # Deepgram expects lowercase string booleans in the query string.
            "punctuate": str(self.settings.punctuate).lower(),
            "smart_format": str(self.settings.smart_format).lower(),
        }
        headers = {
            "Authorization": f"Token {self.settings.api_key}",
            "Content-Type": "audio/wav",
        }

        resp = await self._client.post(
            "https://api.deepgram.com/v1/listen",
            params=params,
            headers=headers,
            content=audio.data,
        )
        if resp.status_code != 200:
            raise RuntimeError(f"Deepgram API error ({resp.status_code}): {resp.text}")

        data = resp.json()
        # Single-channel audio: take the first channel's first alternative.
        channel = data.get("results", {}).get("channels", [{}])[0]
        alt = channel.get("alternatives", [{}])[0]

        words: list[Word] = []
        for w in alt.get("words", []):
            words.append(
                Word(
                    text=w.get("word", ""),
                    # Deepgram reports seconds (float); convert to integer ms.
                    start_ms=int(w.get("start", 0) * 1000),
                    end_ms=int(w.get("end", 0) * 1000),
                    confidence=w.get("confidence"),
                )
            )

        result = Transcription(
            text=alt.get("transcript", ""),
            language=channel.get("detected_language"),
            confidence=alt.get("confidence"),
            words=words if words else None,
        )
        logger.info("{}: completed in {:.0f}ms, result={} chars", self.name, (time.perf_counter() - _t0) * 1000, len(result.text))
        return result

    async def transcribe_stream(
        self, stream: AsyncIterator[bytes]
    ) -> AsyncIterator[Any]:
        """Stream audio chunks to Deepgram via WebSocket and yield transcriptions.

        Deepgram streaming protocol (``interim_results=true``):
        - Each ``Results`` message contains an utterance-level transcript with
          ``is_final`` (bool) indicating whether the utterance is finalized.
        - ``speech_final`` (bool) indicates end-of-speech (VAD silence).
        - This implementation accumulates finalized utterances and appends the
          latest interim text so each yield is a **full-text snapshot** that
          the frontend can display directly via ``streamTextSnapshot``.
        - ``is_partial`` on the yielded ``Transcription`` is set according to
          ``speech_final`` so the server can forward ``"type": "final"`` and
          trigger auto-stop on the client.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")

        # Imported lazily so the provider is usable without websockets installed.
        import websockets

        _t0 = time.perf_counter()
        _frames_sent = 0

        # NOTE(review): streaming hard-codes linear16 @ 16 kHz — callers must
        # send raw PCM in that format; confirm against the upstream audio path.
        params = (
            f"model={self.settings.model}"
            f"&language={self.settings.language}"
            f"&punctuate={'true' if self.settings.punctuate else 'false'}"
            f"&smart_format={'true' if self.settings.smart_format else 'false'}"
            f"&encoding=linear16&sample_rate=16000"
            f"&interim_results=true"
            f"&vad_events=true"
        )
        url = f"wss://api.deepgram.com/v1/listen?{params}"
        headers = {"Authorization": f"Token {self.settings.api_key}"}

        # receive_results() produces Transcriptions; None is the end sentinel.
        results: asyncio.Queue[Transcription | None] = asyncio.Queue()
        # Set when sending should stop (final result seen or connection closed).
        _sender_stop = asyncio.Event()

        logger.debug("{}: connecting to Deepgram WebSocket...", self.name)
        async with _ws_connect_with_headers(websockets, url, headers) as ws:
            _t_connected = time.perf_counter()
            logger.info("{}: WS connected in {:.0f}ms", self.name,
                        (_t_connected - _t0) * 1000)

            async def send_audio() -> None:
                # Forward caller-supplied audio chunks to the socket, then
                # signal end-of-audio with a CloseStream control message.
                nonlocal _frames_sent
                try:
                    async for chunk in stream:
                        if _sender_stop.is_set():
                            break
                        if chunk:
                            await ws.send(chunk)
                            _frames_sent += 1
                            if _frames_sent == 1:
                                logger.debug("{}: first frame sent at {:.0f}ms",
                                             self.name, (time.perf_counter() - _t0) * 1000)
                    if not _sender_stop.is_set():
                        await ws.send(json.dumps({"type": "CloseStream"}))
                except websockets.exceptions.ConnectionClosed:
                    pass
                finally:
                    logger.debug(
                        "{}: stream sender done, sent {} frames in {:.0f}ms",
                        self.name, _frames_sent, (time.perf_counter() - _t0) * 1000,
                    )

            async def receive_results() -> None:
                # Accumulate finalized utterance parts so each yield is a
                # full-text snapshot (not just the latest utterance fragment).
                confirmed_parts: list[str] = []
                _resp_count = 0

                try:
                    async for msg in ws:
                        data = json.loads(msg)
                        if data.get("type") != "Results":
                            continue

                        _resp_count += 1
                        channel = data.get("channel", {})
                        alts = channel.get("alternatives", [])
                        if not alts:
                            continue
                        transcript = alts[0].get("transcript", "").strip()
                        is_final = data.get("is_final", False)
                        speech_final = data.get("speech_final", False)
                        detected_language = channel.get(
                            "detected_language", self.settings.language
                        )
                        confidence = alts[0].get("confidence")

                        if _resp_count == 1:
                            logger.debug("{}: first response at {:.0f}ms is_final={} speech_final={}",
                                         self.name, (time.perf_counter() - _t0) * 1000,
                                         is_final, speech_final)

                        if is_final and transcript:
                            confirmed_parts.append(transcript)

                        # Build full-text snapshot: confirmed + current interim
                        if is_final:
                            snapshot = " ".join(confirmed_parts)
                        else:
                            parts = list(confirmed_parts)
                            if transcript:
                                parts.append(transcript)
                            snapshot = " ".join(parts)

                        if not snapshot:
                            continue

                        # speech_final = Deepgram VAD detected end of speech
                        is_partial = not speech_final
                        await results.put(Transcription(
                            text=snapshot,
                            confidence=confidence,
                            language=detected_language or self.settings.language,
                            is_partial=is_partial,
                        ))

                        if speech_final:
                            _sender_stop.set()
                            logger.info("{}: final result at {:.0f}ms responses={} text='{}'",
                                        self.name, (time.perf_counter() - _t0) * 1000,
                                        _resp_count, snapshot[:60])
                            break
                except websockets.exceptions.ConnectionClosed:
                    # Emit whatever we have as final
                    snapshot = " ".join(confirmed_parts).strip()
                    if snapshot:
                        await results.put(Transcription(
                            text=snapshot, is_partial=False,
                            language=self.settings.language,
                        ))
                    _sender_stop.set()
                finally:
                    _sender_stop.set()
                    # Always enqueue the sentinel so the consumer loop exits.
                    await results.put(None)

            send_task = asyncio.create_task(send_audio())
            recv_task = asyncio.create_task(receive_results())

            # Drain the queue until the receiver posts the None sentinel.
            while True:
                item = await results.get()
                if item is None:
                    break
                yield item

            logger.info(
                "{}: stream completed in {:.0f}ms, frames={}",
                self.name, (time.perf_counter() - _t0) * 1000, _frames_sent,
            )
            # The sender may still be blocked on the input stream; cancel it.
            send_task.cancel()
            try:
                await send_task
            except asyncio.CancelledError:
                pass
            await recv_task