openspeechapi 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. openspeech/__init__.py +75 -0
  2. openspeech/__main__.py +5 -0
  3. openspeech/cli.py +413 -0
  4. openspeech/client/__init__.py +4 -0
  5. openspeech/client/client.py +145 -0
  6. openspeech/config.py +212 -0
  7. openspeech/core/__init__.py +0 -0
  8. openspeech/core/base.py +75 -0
  9. openspeech/core/enums.py +39 -0
  10. openspeech/core/models.py +61 -0
  11. openspeech/core/registry.py +37 -0
  12. openspeech/core/settings.py +8 -0
  13. openspeech/demo.py +675 -0
  14. openspeech/dispatch/__init__.py +0 -0
  15. openspeech/dispatch/context.py +34 -0
  16. openspeech/dispatch/dispatcher.py +661 -0
  17. openspeech/dispatch/executors/__init__.py +0 -0
  18. openspeech/dispatch/executors/base.py +34 -0
  19. openspeech/dispatch/executors/in_process.py +66 -0
  20. openspeech/dispatch/executors/remote.py +64 -0
  21. openspeech/dispatch/executors/subprocess_exec.py +446 -0
  22. openspeech/dispatch/fanout.py +95 -0
  23. openspeech/dispatch/filters.py +73 -0
  24. openspeech/dispatch/lifecycle.py +178 -0
  25. openspeech/dispatch/watcher.py +82 -0
  26. openspeech/engine_catalog.py +236 -0
  27. openspeech/engine_registry.yaml +347 -0
  28. openspeech/exceptions.py +51 -0
  29. openspeech/factory.py +325 -0
  30. openspeech/local_engines/__init__.py +12 -0
  31. openspeech/local_engines/aim_resolver.py +91 -0
  32. openspeech/local_engines/backends/__init__.py +1 -0
  33. openspeech/local_engines/backends/docker_backend.py +490 -0
  34. openspeech/local_engines/backends/native_backend.py +902 -0
  35. openspeech/local_engines/base.py +30 -0
  36. openspeech/local_engines/engines/__init__.py +1 -0
  37. openspeech/local_engines/engines/faster_whisper.py +36 -0
  38. openspeech/local_engines/engines/fish_speech.py +33 -0
  39. openspeech/local_engines/engines/sherpa_onnx.py +56 -0
  40. openspeech/local_engines/engines/whisper.py +41 -0
  41. openspeech/local_engines/engines/whisperlivekit.py +60 -0
  42. openspeech/local_engines/manager.py +208 -0
  43. openspeech/local_engines/models.py +50 -0
  44. openspeech/local_engines/progress.py +69 -0
  45. openspeech/local_engines/registry.py +19 -0
  46. openspeech/local_engines/task_store.py +52 -0
  47. openspeech/local_engines/tasks.py +71 -0
  48. openspeech/logging_config.py +607 -0
  49. openspeech/observe/__init__.py +0 -0
  50. openspeech/observe/base.py +79 -0
  51. openspeech/observe/debug.py +44 -0
  52. openspeech/observe/latency.py +19 -0
  53. openspeech/observe/metrics.py +47 -0
  54. openspeech/observe/tracing.py +44 -0
  55. openspeech/observe/usage.py +27 -0
  56. openspeech/providers/__init__.py +0 -0
  57. openspeech/providers/_template.py +101 -0
  58. openspeech/providers/stt/__init__.py +0 -0
  59. openspeech/providers/stt/alibaba.py +86 -0
  60. openspeech/providers/stt/assemblyai.py +135 -0
  61. openspeech/providers/stt/azure_speech.py +99 -0
  62. openspeech/providers/stt/baidu.py +135 -0
  63. openspeech/providers/stt/deepgram.py +311 -0
  64. openspeech/providers/stt/elevenlabs.py +385 -0
  65. openspeech/providers/stt/faster_whisper.py +211 -0
  66. openspeech/providers/stt/google_cloud.py +106 -0
  67. openspeech/providers/stt/iflytek.py +427 -0
  68. openspeech/providers/stt/macos_speech.py +226 -0
  69. openspeech/providers/stt/openai.py +84 -0
  70. openspeech/providers/stt/sherpa_onnx.py +353 -0
  71. openspeech/providers/stt/tencent.py +212 -0
  72. openspeech/providers/stt/volcengine.py +107 -0
  73. openspeech/providers/stt/whisper.py +153 -0
  74. openspeech/providers/stt/whisperlivekit.py +530 -0
  75. openspeech/providers/stt/windows_speech.py +249 -0
  76. openspeech/providers/tts/__init__.py +0 -0
  77. openspeech/providers/tts/alibaba.py +95 -0
  78. openspeech/providers/tts/azure_speech.py +123 -0
  79. openspeech/providers/tts/baidu.py +143 -0
  80. openspeech/providers/tts/coqui.py +64 -0
  81. openspeech/providers/tts/cosyvoice.py +90 -0
  82. openspeech/providers/tts/deepgram.py +174 -0
  83. openspeech/providers/tts/elevenlabs.py +311 -0
  84. openspeech/providers/tts/fish_speech.py +158 -0
  85. openspeech/providers/tts/google_cloud.py +107 -0
  86. openspeech/providers/tts/iflytek.py +209 -0
  87. openspeech/providers/tts/macos_say.py +251 -0
  88. openspeech/providers/tts/minimax.py +122 -0
  89. openspeech/providers/tts/openai.py +104 -0
  90. openspeech/providers/tts/piper.py +104 -0
  91. openspeech/providers/tts/tencent.py +189 -0
  92. openspeech/providers/tts/volcengine.py +117 -0
  93. openspeech/providers/tts/windows_sapi.py +234 -0
  94. openspeech/server/__init__.py +1 -0
  95. openspeech/server/app.py +72 -0
  96. openspeech/server/auth.py +42 -0
  97. openspeech/server/middleware.py +75 -0
  98. openspeech/server/routes/__init__.py +1 -0
  99. openspeech/server/routes/management.py +848 -0
  100. openspeech/server/routes/stt.py +121 -0
  101. openspeech/server/routes/tts.py +159 -0
  102. openspeech/server/routes/webui.py +29 -0
  103. openspeech/server/webui/app.js +2649 -0
  104. openspeech/server/webui/index.html +216 -0
  105. openspeech/server/webui/styles.css +617 -0
  106. openspeech/server/ws/__init__.py +1 -0
  107. openspeech/server/ws/stt_stream.py +263 -0
  108. openspeech/server/ws/tts_stream.py +207 -0
  109. openspeech/telemetry/__init__.py +21 -0
  110. openspeech/telemetry/perf.py +307 -0
  111. openspeech/utils/__init__.py +5 -0
  112. openspeech/utils/audio_converter.py +406 -0
  113. openspeech/utils/audio_playback.py +156 -0
  114. openspeech/vendor_registry.yaml +74 -0
  115. openspeechapi-0.1.0.dist-info/METADATA +101 -0
  116. openspeechapi-0.1.0.dist-info/RECORD +118 -0
  117. openspeechapi-0.1.0.dist-info/WHEEL +4 -0
  118. openspeechapi-0.1.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,226 @@
1
+ """macOS native STT provider using SFSpeechRecognizer via compiled Swift binary.
2
+
3
+ The Swift helper lives inside a .app bundle so that macOS TCC can track its
4
+ Speech Recognition authorization by bundle ID. The provider launches it via
5
+ ``open -W`` which inherits the .app TCC grant; results are exchanged through
6
+ a temporary JSON file (``--output``).
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import asyncio
11
+ import json
12
+ from openspeech.logging_config import logger
13
+ import tempfile
14
+ import time
15
+ from collections.abc import AsyncIterator
16
+ from dataclasses import dataclass
17
+ from pathlib import Path
18
+ from typing import Any
19
+
20
+ from openspeech.core.base import STTProvider
21
+ from openspeech.core.enums import Capability, ExecMode, ProviderType
22
+ from openspeech.core.models import AudioData, STTOptions, Transcription
23
+ from openspeech.core.settings import BaseSettings
24
+ from openspeech.utils.audio_converter import AudioConverter
25
+
26
@dataclass
class MacOSSpeechSettings(BaseSettings):
    """Settings for the macOS SFSpeechRecognizer STT provider."""

    # BCP-47 locale passed to the Swift helper's --language flag.
    language: str = "zh-CN"
    # Optional path to a custom helper binary; when set, the provider runs it
    # directly (bypassing the .app bundle) and the user manages TCC authorization.
    binary_path: str = ""
30
+
31
class MacOSSpeechSTT(STTProvider):
    """STT provider wrapping macOS SFSpeechRecognizer via a compiled Swift helper.

    Batch-only (``transcribe_stream`` raises). Results are exchanged with the
    helper either through a temporary JSON file (``--output``, when launched
    via ``open -W`` so the .app's TCC grant applies) or via stdout (direct
    execution, used for ``--check`` and for a custom ``binary_path``).
    """

    name = "macos-stt"
    provider_type = ProviderType.STT
    execution_mode = ExecMode.IN_PROCESS
    settings_cls = MacOSSpeechSettings
    capabilities = {Capability.BATCH, Capability.MULTILINGUAL}
    field_options = {"language": ["zh-CN", "en-US", "en-AE", "ja-JP", "ko-KR", "fr-FR", "de-DE", "es-ES"]}

    # .app bundle that contains the Swift helper binary.
    _DEFAULT_APP_BUNDLE = (
        Path(__file__).resolve().parents[3]
        / "scripts"
        / "engines"
        / "macos-stt"
        / "MacOSSTTHelper.app"
    )

    def __init__(self, settings: MacOSSpeechSettings | None = None) -> None:
        self.settings = settings or MacOSSpeechSettings()
        self._app_bundle: Path | None = None  # set by start(); None in custom-binary mode
        self._binary: Path | None = None      # resolved helper executable
        self._started = False
        self._auth_ok = False

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    async def _run_via_open(self, *extra_args: str, output_file: str | None = None) -> dict:
        """Run the helper and return its parsed JSON result (never raises).

        If *output_file* is given the helper is launched through ``open -W``
        so it inherits the .app TCC grants and writes JSON to that file;
        otherwise the binary is executed directly and JSON is read from
        stdout — sufficient for ``--check`` (no TCC needed) and for a
        user-supplied ``binary_path``.
        """
        cmd: list[str]
        if output_file is not None:
            # `open -W` requires the .app bundle so macOS applies its TCC grant.
            # BUGFIX: this guard used to sit unconditionally at the top of the
            # method, which made the direct-execution path (custom binary_path,
            # where _app_bundle is None) fail before ever spawning the helper.
            if self._app_bundle is None:
                return {"status": "error", "error": "app bundle not set"}
            cmd = [
                "open", "-W", str(self._app_bundle),
                "--args", *extra_args, "--output", output_file,
            ]
        else:
            # Direct execution is fine for --check (no TCC needed)
            if self._binary is None:
                return {"status": "error", "error": "binary not set"}
            cmd = [str(self._binary), *extra_args]

        proc = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        stdout, _ = await proc.communicate()

        # Read result from output file (open -W path) or stdout (direct path).
        raw = ""
        if output_file is not None:
            out_path = Path(output_file)
            if out_path.exists():
                raw = out_path.read_text(encoding="utf-8").strip()
                out_path.unlink(missing_ok=True)
        else:
            raw = stdout.decode(errors="replace").strip() if stdout else ""

        if not raw:
            return {"status": "error", "error": f"no output (rc={proc.returncode})"}
        try:
            return json.loads(raw)
        except json.JSONDecodeError:
            # Helper printed something that isn't JSON — surface it verbatim.
            return {"status": "error", "error": raw}

    async def _run_check(self) -> dict:
        """Run ``--check`` — direct execution, no TCC needed."""
        return await self._run_via_open(
            "--check", "--language", self.settings.language,
        )

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------

    async def start(self) -> None:
        """Resolve the helper binary and verify speech-recognition access.

        Raises:
            RuntimeError: if the helper binary is missing, or ``--check``
                reports speech recognition is unavailable/unauthorized.
        """
        if self.settings.binary_path:
            # Custom binary path — use directly (user manages TCC)
            candidate = Path(self.settings.binary_path)
            if not candidate.exists():
                raise RuntimeError(
                    f"macos-stt binary not found at {candidate}."
                )
            self._binary = candidate
            self._app_bundle = None
        else:
            app = self._DEFAULT_APP_BUNDLE
            binary = app / "Contents" / "MacOS" / "macos-stt-helper"
            if not binary.exists():
                raise RuntimeError(
                    f"macos-stt binary not found at {binary}. "
                    "Run scripts/engines/macos-stt/install.sh to compile it."
                )
            self._app_bundle = app
            self._binary = binary

        # Verify speech recognition availability via --check
        check = await self._run_check()
        if check.get("status") != "ok":
            err = check.get("error", "unknown authorization issue")
            logger.warning("macos-stt auth check failed: {}", err)
            raise RuntimeError(f"macos-stt not authorized: {err}")

        self._auth_ok = True
        self._started = True
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Forget resolved paths and reset state flags."""
        self._binary = None
        self._app_bundle = None
        self._started = False
        self._auth_ok = False
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        """Cheap local check: started + authorized, or the helper binary exists."""
        if self._started and self._auth_ok:
            return True
        # Pre-start check: verify binary exists
        if self._binary is not None and self._binary.exists():
            return True
        # Check default location
        default = self._DEFAULT_APP_BUNDLE / "Contents" / "MacOS" / "macos-stt-helper"
        return default.exists()

    # ------------------------------------------------------------------
    # Transcription
    # ------------------------------------------------------------------

    async def transcribe(
        self, audio: AudioData, opts: STTOptions | None = None
    ) -> Transcription:
        """Transcribe *audio* in one shot via the Swift helper.

        Raises:
            RuntimeError: if the provider is not started or the helper
                reports an error.
        """
        if not self._started or self._binary is None:
            raise RuntimeError("Provider not started -- call start() first")
        logger.info("{}: request received, audio={} bytes", self.name, len(audio.data))
        _t0 = time.perf_counter()

        opts = opts or STTOptions()
        language = opts.language or self.settings.language

        # Convert to WAV for the Swift helper.
        wav_audio = AudioConverter.to_wav(audio)

        tmp_wav: str | None = None
        tmp_out: str | None = None
        try:
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                f.write(wav_audio.data)
                tmp_wav = f.name
            with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as f:
                tmp_out = f.name

            # output_file is only used on the .app path; direct execution
            # (custom binary_path) returns its JSON via stdout instead.
            # NOTE: _run_via_open always returns a dict, so a separate
            # "no output" check here would be dead code — errors surface
            # through the "error" key below.
            result = await self._run_via_open(
                "--audio", tmp_wav,
                "--language", language,
                output_file=tmp_out if self._app_bundle else None,
            )

            if "error" in result:
                raise RuntimeError(
                    f"macos-stt-helper error: {result['error']}"
                )

            transcription = Transcription(
                text=result.get("text", ""),
                language=result.get("language"),
                confidence=result.get("confidence"),
            )
            logger.info("{}: completed in {:.0f}ms, result={} chars", self.name, (time.perf_counter() - _t0) * 1000, len(transcription.text))
            return transcription
        finally:
            # Best-effort cleanup of both temp files.
            if tmp_wav is not None:
                Path(tmp_wav).unlink(missing_ok=True)
            if tmp_out is not None:
                Path(tmp_out).unlink(missing_ok=True)

    def transcribe_stream(
        self, stream: AsyncIterator[bytes]
    ) -> AsyncIterator[Any]:
        """Streaming is unsupported by this provider; raises immediately."""
        raise NotImplementedError("macOS SFSpeechRecognizer does not support streaming via this provider")
@@ -0,0 +1,84 @@
1
+ """OpenAI STT provider adapter (Whisper API)."""
2
+ from __future__ import annotations
3
+
4
+ import io
5
+ from openspeech.logging_config import logger
6
+ import time
7
+ from collections.abc import AsyncIterator
8
+ from dataclasses import dataclass
9
+ from typing import Any
10
+
11
+ from openspeech.core.base import STTProvider
12
+
13
+ from openspeech.core.enums import Capability, ExecMode, ProviderType
14
+ from openspeech.core.models import AudioData, STTOptions, Transcription
15
+ from openspeech.core.settings import BaseSettings
16
+
17
@dataclass
class OpenAISTTSettings(BaseSettings):
    """Settings for the OpenAI hosted transcription API."""

    # API key; health_check() only verifies this is non-empty.
    api_key: str = ""
    # Optional custom endpoint (e.g. an OpenAI-compatible proxy); empty = default.
    base_url: str = ""
    # Transcription model name sent with each request.
    model: str = "whisper-1"
22
+
23
class OpenAISTT(STTProvider):
    """STT provider for OpenAI's hosted transcription API (Whisper et al.)."""

    name = "openai-stt"
    provider_type = ProviderType.STT
    execution_mode = ExecMode.IN_PROCESS
    settings_cls = OpenAISTTSettings
    capabilities = {Capability.BATCH, Capability.MULTILINGUAL}
    field_options = {"model": ["whisper-1", "gpt-4o-transcribe", "gpt-4o-mini-transcribe"]}

    def __init__(self, settings: OpenAISTTSettings | None = None) -> None:
        self.settings = settings or OpenAISTTSettings()
        self._client: Any = None  # AsyncOpenAI instance once start() has run

    async def start(self) -> None:
        """Create the AsyncOpenAI client.

        Raises:
            ImportError: if the optional ``openai`` package is not installed.
        """
        # Keep the try body minimal: only the import itself should map to the
        # "not installed" message. The previous version also wrapped client
        # construction, so an ImportError raised *inside* the SDK would have
        # been mislabelled as a missing package; it also dropped the cause.
        try:
            from openai import AsyncOpenAI
        except ImportError as err:
            raise ImportError("Install openai: pip install openspeech[openai]") from err
        kwargs: dict[str, Any] = {"api_key": self.settings.api_key}
        if self.settings.base_url:
            kwargs["base_url"] = self.settings.base_url
        self._client = AsyncOpenAI(**kwargs)
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Drop the client reference; no explicit close is performed here."""
        self._client = None
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        # Cheap local check only: an API key is configured. No network call.
        return bool(self.settings.api_key)

    async def transcribe(
        self, audio: AudioData, opts: STTOptions | None = None
    ) -> Transcription:
        """Transcribe *audio* with one request to the transcriptions endpoint.

        Raises:
            RuntimeError: if the provider was never started.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, audio={} bytes", self.name, len(audio.data))
        _t0 = time.perf_counter()
        opts = opts or STTOptions()
        # The SDK infers the upload format from the file-like object's name.
        audio_file = io.BytesIO(audio.data)
        audio_file.name = "audio.wav"
        kwargs: dict[str, Any] = {
            "model": self.settings.model,
            "file": audio_file,
        }
        # Only forward optional parameters the caller actually set.
        if opts.language:
            kwargs["language"] = opts.language
        if opts.prompt:
            kwargs["prompt"] = opts.prompt
        if opts.temperature is not None:
            kwargs["temperature"] = opts.temperature
        response = await self._client.audio.transcriptions.create(**kwargs)
        # The response carries no detected language; echo the requested one.
        result = Transcription(text=response.text, language=opts.language)
        logger.info("{}: completed in {:.0f}ms, result={} chars", self.name, (time.perf_counter() - _t0) * 1000, len(result.text))
        return result

    async def transcribe_stream(
        self, stream: AsyncIterator[bytes]
    ) -> AsyncIterator[Any]:
        """Not supported; raises on first iteration of the returned generator."""
        raise NotImplementedError(
            "OpenAI Whisper API does not support streaming input"
        )
        # The unreachable yield makes this an async generator, so callers get
        # a generator object and the error surfaces on iteration.
        yield  # pragma: no cover
@@ -0,0 +1,353 @@
1
+ """Sherpa-ONNX STT provider adapter (local service)."""
2
+ from __future__ import annotations
3
+
4
+ from collections.abc import AsyncIterator
5
+ from dataclasses import dataclass
6
+ import io
7
+ import json
8
+ from openspeech.logging_config import logger
9
+ import math
10
+ import struct
11
+ import time
12
+ from urllib.parse import urlparse, urlunparse
13
+ import wave
14
+ from typing import Any
15
+ from urllib.parse import urljoin
16
+
17
+ from openspeech.core.base import STTProvider
18
+
19
+ from openspeech.core.enums import Capability, ExecMode, ProviderType
20
+ from openspeech.core.models import AudioData, STTOptions, Transcription
21
+ from openspeech.core.settings import BaseSettings
22
+
23
@dataclass
class SherpaOnnxSTTSettings(BaseSettings):
    """Settings for a locally hosted sherpa-onnx speech service."""

    # Base HTTP URL of the sherpa-onnx service.
    api_url: str = "http://127.0.0.1:17000"
    # Path probed by health_check(), joined onto api_url.
    health_path: str = "/health"
    # WebSocket path used for both batch and streaming transcription.
    ws_path: str = "/"
    # NOTE(review): model is not referenced by SherpaOnnxSTT's visible code.
    model: str = ""
    # Language hint; "auto" results in no language on the Transcription.
    language: str = "auto"
    # Sample rate the server expects; WAV input is resampled to this.
    sample_rate: int = 16000
    # Batch mode: milliseconds of audio sent per WebSocket frame.
    chunk_ms: int = 120
    # Idle receive timeout used to detect the end of server responses.
    recv_timeout_s: float = 1.0
    # HTTP client timeout and WebSocket open timeout.
    timeout_s: float = 60.0
    # NOTE(review): retries is not referenced by SherpaOnnxSTT's visible code.
    retries: int = 0
35
+
36
class SherpaOnnxSTT(STTProvider):
    """STT provider for a sherpa-onnx server reached over HTTP + WebSocket.

    Batch transcription is implemented on top of the same WebSocket protocol
    as streaming: float32 PCM frames are sent, then the literal string
    ``"Done"``, and JSON text events (``{"text": ..., "segment": ...}``) are
    collected until the server goes quiet for ``recv_timeout_s``.
    """

    name = "sherpa-onnx-stt"
    provider_type = ProviderType.STT
    execution_mode = ExecMode.LOCAL
    settings_cls = SherpaOnnxSTTSettings
    capabilities = {
        Capability.STREAMING,
        Capability.BATCH,
        Capability.MULTILINGUAL,
    }
    field_options = {"language": ["auto", "en", "zh", "ja"]}

    def __init__(self, settings: SherpaOnnxSTTSettings | None = None) -> None:
        self.settings = settings or SherpaOnnxSTTSettings()
        self._client: Any = None        # httpx.AsyncClient (owned or injected)
        self._owns_client: bool = True  # whether stop() should close it

    def set_http_client(self, client: Any) -> None:
        """Inject a shared HTTP client; the provider will not close it."""
        self._client = client
        self._owns_client = False

    async def start(self) -> None:
        """Create an owned httpx client unless one was injected.

        Raises:
            ImportError: if httpx is not installed.
        """
        if self._client is None:
            try:
                import httpx
            except ImportError:
                raise ImportError(
                    "Install httpx: pip install openspeech[server]"
                )
            # trust_env=False: ignore proxy env vars for the local service.
            self._client = httpx.AsyncClient(timeout=self.settings.timeout_s, trust_env=False)
            self._owns_client = True
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Close the HTTP client only if this provider created it."""
        if self._client and self._owns_client:
            await self._client.aclose()
        self._client = None
        logger.info("{} provider stopped", self.name)

    def _ws_url(self) -> str:
        """Derive the WebSocket URL (ws/wss) from api_url and ws_path."""
        u = urlparse(self.settings.api_url.rstrip("/"))
        scheme = "wss" if u.scheme == "https" else "ws"
        path = self.settings.ws_path or "/"
        if not path.startswith("/"):
            path = f"/{path}"
        return urlunparse((scheme, u.netloc, path, "", "", ""))

    async def health_check(self) -> bool:
        """GET the health endpoint; any non-5xx status counts as healthy."""
        if self._client is None:
            return False
        try:
            url = urljoin(
                self.settings.api_url.rstrip("/") + "/",
                self.settings.health_path.lstrip("/"),
            )
            resp = await self._client.get(url)
            return resp.status_code < 500
        except Exception:
            # Any transport failure (refused, timeout, DNS) means unhealthy.
            return False

    async def transcribe(
        self, audio: AudioData, opts: STTOptions | None = None
    ) -> Transcription:
        """Batch-transcribe *audio* by replaying it over the WebSocket.

        Raises:
            RuntimeError: if the provider is not started or the audio is
                a WAV that is not PCM16.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, audio={} bytes", self.name, len(audio.data))
        opts = opts or STTOptions()
        language = (opts.language or self.settings.language).strip()

        samples = self._audio_to_float32_samples(audio)
        started_at = time.perf_counter()
        text = await self._transcribe_samples_via_ws(samples)

        # Prefer the real audio duration when known; otherwise fall back to
        # wall-clock processing time as an approximation.
        duration_ms = int((time.perf_counter() - started_at) * 1000)
        if audio.duration_ms is not None:
            duration_ms = int(audio.duration_ms)

        result = Transcription(
            text=text,
            language=language if language != "auto" else None,
            confidence=None,
            duration_ms=duration_ms,
        )
        logger.info("{}: completed in {}ms, result={} chars", self.name, duration_ms, len(result.text))
        return result

    async def transcribe_stream(
        self, stream: AsyncIterator[bytes]
    ) -> AsyncIterator[Any]:
        """Stream PCM16 chunks to the server, yielding partial Transcriptions.

        A background task forwards *stream* to the WebSocket (converted to
        float32) while this generator consumes text events, merging segments
        and yielding a growing partial transcript; a final non-partial
        Transcription is yielded once the sender finishes and the server
        goes quiet.

        Raises:
            RuntimeError: if the provider is not started.
            ImportError: if websockets is not installed.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        try:
            import websockets
        except ImportError:
            raise ImportError("Install websockets: pip install openspeech[server]")

        import asyncio

        _t0 = time.perf_counter()
        _frames_sent = 0

        ws_url = self._ws_url()
        _sender_stop = asyncio.Event()

        logger.debug("{}: connecting to Sherpa-ONNX WebSocket...", self.name)
        async with websockets.connect(ws_url, open_timeout=self.settings.timeout_s) as ws:
            _t_connected = time.perf_counter()
            logger.info("{}: WS connected in {:.0f}ms", self.name,
                        (_t_connected - _t0) * 1000)
            sender_done = False
            current_segment = 0
            final_parts: list[str] = []
            current_text = ""
            _resp_count = 0

            async def _sender() -> None:
                # Forwards the caller's chunks to the server; runs concurrently
                # with the receive loop below and flips sender_done when the
                # input stream is exhausted (or the socket closes).
                nonlocal sender_done, _frames_sent
                try:
                    async for chunk in stream:
                        if _sender_stop.is_set():
                            break
                        if not chunk:
                            continue
                        data = self._pcm16_bytes_to_float32_bytes(chunk)
                        if data:
                            await ws.send(data)
                            _frames_sent += 1
                            if _frames_sent == 1:
                                logger.debug("{}: first frame sent at {:.0f}ms",
                                             self.name, (time.perf_counter() - _t0) * 1000)
                    if not _sender_stop.is_set():
                        # "Done" tells the server no more audio is coming.
                        await ws.send("Done")
                except websockets.exceptions.ConnectionClosed:
                    pass
                finally:
                    sender_done = True
                    logger.debug(
                        "{}: stream sender done, sent {} frames in {:.0f}ms",
                        self.name, _frames_sent, (time.perf_counter() - _t0) * 1000,
                    )

            send_task = asyncio.create_task(_sender())
            try:
                while True:
                    try:
                        raw = await asyncio.wait_for(ws.recv(), timeout=self.settings.recv_timeout_s)
                    except asyncio.TimeoutError:
                        # Quiet socket: stop once the sender has finished,
                        # otherwise keep waiting for more events.
                        if sender_done:
                            break
                        continue
                    if not isinstance(raw, str):
                        continue
                    evt = self._parse_ws_event(raw)
                    if evt is None:
                        continue
                    _resp_count += 1
                    seg, txt = evt
                    if _resp_count == 1:
                        logger.debug("{}: first response at {:.0f}ms seg={}",
                                     self.name, (time.perf_counter() - _t0) * 1000, seg)
                    # A higher segment number finalizes the previous segment's
                    # text; within a segment, each event replaces the text.
                    if seg > current_segment:
                        if current_text:
                            final_parts.append(current_text)
                        current_segment = seg
                        current_text = txt
                    else:
                        current_text = txt
                    merged = self._merge_with_current(final_parts, current_text)
                    if merged:
                        yield Transcription(text=merged, is_partial=True)
            finally:
                # Always tear down the sender, even if the consumer bails out.
                send_task.cancel()
                try:
                    await send_task
                except asyncio.CancelledError:
                    pass

            merged = self._merge_with_current(final_parts, current_text)
            if merged:
                logger.info("{}: final result at {:.0f}ms responses={} text='{}'",
                            self.name, (time.perf_counter() - _t0) * 1000,
                            _resp_count, merged[:60])
                yield Transcription(text=merged, is_partial=False)

            logger.info(
                "{}: stream completed in {:.0f}ms, frames={}",
                self.name, (time.perf_counter() - _t0) * 1000, _frames_sent,
            )

    @staticmethod
    def _merge_with_current(parts: list[str], current: str) -> str:
        """Join finalized segment texts plus the in-progress one with spaces."""
        data = [p.strip() for p in parts if p and p.strip()]
        c = (current or "").strip()
        if c:
            data.append(c)
        return " ".join(data).strip()

    def _parse_ws_event(self, raw: str) -> tuple[int, str] | None:
        """Parse a server message into (segment, text), or None if empty.

        Non-JSON messages are treated as bare text for segment 0.
        """
        try:
            payload = json.loads(raw)
        except Exception:
            txt = raw.strip()
            return (0, txt) if txt else None
        txt = str(payload.get("text", "")).strip()
        if not txt:
            return None
        seg = int(payload.get("segment", 0) or 0)
        return seg, txt

    def _transcribe_chunk_frames(self, sample_count: int) -> int:
        """Return the number of samples per batch-mode WS frame (>= 1)."""
        # Clamp to sane floors so misconfigured settings can't produce
        # zero-length (or negative) chunks.
        sample_rate = max(8000, int(self.settings.sample_rate))
        chunk_ms = max(20, int(self.settings.chunk_ms))
        frames = max(1, int(sample_rate * (chunk_ms / 1000.0)))
        if sample_count < frames:
            return sample_count
        return frames

    async def _transcribe_samples_via_ws(self, samples: list[float]) -> str:
        """Send all *samples* over the WebSocket and collect the merged text.

        Raises:
            ImportError: if websockets is not installed.
        """
        try:
            import websockets
        except ImportError:
            raise ImportError("Install websockets: pip install openspeech[server]")
        ws_url = self._ws_url()
        current_segment = 0
        final_parts: list[str] = []
        current_text = ""
        import asyncio

        async with websockets.connect(ws_url, open_timeout=self.settings.timeout_s) as ws:
            # Push the whole clip in fixed-size float32 chunks.
            i = 0
            step = self._transcribe_chunk_frames(len(samples))
            while i < len(samples):
                chunk = samples[i:i + step]
                i += step
                if chunk:
                    await ws.send(self._float32_list_to_bytes(chunk))

            await ws.send("Done")
            # Drain responses until the server is quiet for recv_timeout_s.
            while True:
                try:
                    raw = await asyncio.wait_for(ws.recv(), timeout=self.settings.recv_timeout_s)
                except asyncio.TimeoutError:
                    break
                if not isinstance(raw, str):
                    continue
                evt = self._parse_ws_event(raw)
                if evt is None:
                    continue
                seg, txt = evt
                # Same segment-merge logic as transcribe_stream.
                if seg > current_segment:
                    if current_text:
                        final_parts.append(current_text)
                    current_segment = seg
                    current_text = txt
                else:
                    current_text = txt
        return self._merge_with_current(final_parts, current_text)

    @staticmethod
    def _float32_list_to_bytes(samples: list[float]) -> bytes:
        """Pack floats as little-endian float32; empty input -> b""."""
        if not samples:
            return b""
        return struct.pack(f"<{len(samples)}f", *samples)

    @staticmethod
    def _pcm16_bytes_to_float32_bytes(data: bytes) -> bytes:
        """Convert little-endian PCM16 bytes to float32 bytes in [-1, 1].

        A trailing odd byte is dropped rather than erroring.
        """
        if not data:
            return b""
        usable = len(data) - (len(data) % 2)
        if usable <= 0:
            return b""
        values = struct.unpack(f"<{usable // 2}h", data[:usable])
        # 32768 divisor with clamping keeps -32768 from mapping below -1.0.
        floats = [max(-1.0, min(1.0, v / 32768.0)) for v in values]
        return struct.pack(f"<{len(floats)}f", *floats)

    def _audio_to_float32_samples(self, audio: AudioData) -> list[float]:
        """Decode *audio* into mono float32 samples at settings.sample_rate.

        WAV (RIFF/WAVE) input is parsed, downmixed to mono by averaging, and
        linearly resampled if needed; anything else is assumed to be raw
        PCM16 at the target rate.

        Raises:
            RuntimeError: for WAV input whose sample width is not 16-bit.
        """
        # Sniff the RIFF/WAVE magic rather than trusting any mime/extension.
        if len(audio.data) > 12 and audio.data[:4] == b"RIFF" and audio.data[8:12] == b"WAVE":
            with wave.open(io.BytesIO(audio.data), "rb") as wf:
                channels = max(1, int(wf.getnchannels()))
                sampwidth = int(wf.getsampwidth())
                sample_rate = int(wf.getframerate())
                frames = wf.readframes(wf.getnframes())
                if sampwidth != 2:
                    raise RuntimeError("Only PCM16 WAV is supported for sherpa transcribe")
                ints = struct.unpack(f"<{len(frames) // 2}h", frames)
                if channels > 1:
                    # Downmix interleaved channels by per-frame average.
                    mono: list[int] = []
                    for i in range(0, len(ints), channels):
                        frame = ints[i:i + channels]
                        mono.append(int(sum(frame) / max(1, len(frame))))
                    ints = tuple(mono)
                samples = [max(-1.0, min(1.0, v / 32768.0)) for v in ints]
                if sample_rate != int(self.settings.sample_rate):
                    return self._resample_linear(samples, sample_rate, int(self.settings.sample_rate))
                return samples

        # Raw PCM16 fallback.
        raw = self._pcm16_bytes_to_float32_bytes(audio.data)
        if not raw:
            return []
        count = len(raw) // 4
        return list(struct.unpack(f"<{count}f", raw))

    @staticmethod
    def _resample_linear(samples: list[float], src_rate: int, dst_rate: int) -> list[float]:
        """Linearly interpolate *samples* from src_rate to dst_rate.

        Returns the input unchanged for degenerate rates, equal rates, or
        empty input.
        """
        if src_rate <= 0 or dst_rate <= 0 or src_rate == dst_rate or not samples:
            return samples
        ratio = dst_rate / src_rate
        out_len = max(1, int(math.floor(len(samples) * ratio)))
        out: list[float] = []
        for i in range(out_len):
            # Map each output index back to a fractional source position.
            src_pos = i / ratio
            left = int(math.floor(src_pos))
            right = min(left + 1, len(samples) - 1)
            frac = src_pos - left
            val = samples[left] * (1.0 - frac) + samples[right] * frac
            out.append(float(val))
        return out