openspeechapi 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openspeech/__init__.py +75 -0
- openspeech/__main__.py +5 -0
- openspeech/cli.py +413 -0
- openspeech/client/__init__.py +4 -0
- openspeech/client/client.py +145 -0
- openspeech/config.py +212 -0
- openspeech/core/__init__.py +0 -0
- openspeech/core/base.py +75 -0
- openspeech/core/enums.py +39 -0
- openspeech/core/models.py +61 -0
- openspeech/core/registry.py +37 -0
- openspeech/core/settings.py +8 -0
- openspeech/demo.py +675 -0
- openspeech/dispatch/__init__.py +0 -0
- openspeech/dispatch/context.py +34 -0
- openspeech/dispatch/dispatcher.py +661 -0
- openspeech/dispatch/executors/__init__.py +0 -0
- openspeech/dispatch/executors/base.py +34 -0
- openspeech/dispatch/executors/in_process.py +66 -0
- openspeech/dispatch/executors/remote.py +64 -0
- openspeech/dispatch/executors/subprocess_exec.py +446 -0
- openspeech/dispatch/fanout.py +95 -0
- openspeech/dispatch/filters.py +73 -0
- openspeech/dispatch/lifecycle.py +178 -0
- openspeech/dispatch/watcher.py +82 -0
- openspeech/engine_catalog.py +236 -0
- openspeech/engine_registry.yaml +347 -0
- openspeech/exceptions.py +51 -0
- openspeech/factory.py +325 -0
- openspeech/local_engines/__init__.py +12 -0
- openspeech/local_engines/aim_resolver.py +91 -0
- openspeech/local_engines/backends/__init__.py +1 -0
- openspeech/local_engines/backends/docker_backend.py +490 -0
- openspeech/local_engines/backends/native_backend.py +902 -0
- openspeech/local_engines/base.py +30 -0
- openspeech/local_engines/engines/__init__.py +1 -0
- openspeech/local_engines/engines/faster_whisper.py +36 -0
- openspeech/local_engines/engines/fish_speech.py +33 -0
- openspeech/local_engines/engines/sherpa_onnx.py +56 -0
- openspeech/local_engines/engines/whisper.py +41 -0
- openspeech/local_engines/engines/whisperlivekit.py +60 -0
- openspeech/local_engines/manager.py +208 -0
- openspeech/local_engines/models.py +50 -0
- openspeech/local_engines/progress.py +69 -0
- openspeech/local_engines/registry.py +19 -0
- openspeech/local_engines/task_store.py +52 -0
- openspeech/local_engines/tasks.py +71 -0
- openspeech/logging_config.py +607 -0
- openspeech/observe/__init__.py +0 -0
- openspeech/observe/base.py +79 -0
- openspeech/observe/debug.py +44 -0
- openspeech/observe/latency.py +19 -0
- openspeech/observe/metrics.py +47 -0
- openspeech/observe/tracing.py +44 -0
- openspeech/observe/usage.py +27 -0
- openspeech/providers/__init__.py +0 -0
- openspeech/providers/_template.py +101 -0
- openspeech/providers/stt/__init__.py +0 -0
- openspeech/providers/stt/alibaba.py +86 -0
- openspeech/providers/stt/assemblyai.py +135 -0
- openspeech/providers/stt/azure_speech.py +99 -0
- openspeech/providers/stt/baidu.py +135 -0
- openspeech/providers/stt/deepgram.py +311 -0
- openspeech/providers/stt/elevenlabs.py +385 -0
- openspeech/providers/stt/faster_whisper.py +211 -0
- openspeech/providers/stt/google_cloud.py +106 -0
- openspeech/providers/stt/iflytek.py +427 -0
- openspeech/providers/stt/macos_speech.py +226 -0
- openspeech/providers/stt/openai.py +84 -0
- openspeech/providers/stt/sherpa_onnx.py +353 -0
- openspeech/providers/stt/tencent.py +212 -0
- openspeech/providers/stt/volcengine.py +107 -0
- openspeech/providers/stt/whisper.py +153 -0
- openspeech/providers/stt/whisperlivekit.py +530 -0
- openspeech/providers/stt/windows_speech.py +249 -0
- openspeech/providers/tts/__init__.py +0 -0
- openspeech/providers/tts/alibaba.py +95 -0
- openspeech/providers/tts/azure_speech.py +123 -0
- openspeech/providers/tts/baidu.py +143 -0
- openspeech/providers/tts/coqui.py +64 -0
- openspeech/providers/tts/cosyvoice.py +90 -0
- openspeech/providers/tts/deepgram.py +174 -0
- openspeech/providers/tts/elevenlabs.py +311 -0
- openspeech/providers/tts/fish_speech.py +158 -0
- openspeech/providers/tts/google_cloud.py +107 -0
- openspeech/providers/tts/iflytek.py +209 -0
- openspeech/providers/tts/macos_say.py +251 -0
- openspeech/providers/tts/minimax.py +122 -0
- openspeech/providers/tts/openai.py +104 -0
- openspeech/providers/tts/piper.py +104 -0
- openspeech/providers/tts/tencent.py +189 -0
- openspeech/providers/tts/volcengine.py +117 -0
- openspeech/providers/tts/windows_sapi.py +234 -0
- openspeech/server/__init__.py +1 -0
- openspeech/server/app.py +72 -0
- openspeech/server/auth.py +42 -0
- openspeech/server/middleware.py +75 -0
- openspeech/server/routes/__init__.py +1 -0
- openspeech/server/routes/management.py +848 -0
- openspeech/server/routes/stt.py +121 -0
- openspeech/server/routes/tts.py +159 -0
- openspeech/server/routes/webui.py +29 -0
- openspeech/server/webui/app.js +2649 -0
- openspeech/server/webui/index.html +216 -0
- openspeech/server/webui/styles.css +617 -0
- openspeech/server/ws/__init__.py +1 -0
- openspeech/server/ws/stt_stream.py +263 -0
- openspeech/server/ws/tts_stream.py +207 -0
- openspeech/telemetry/__init__.py +21 -0
- openspeech/telemetry/perf.py +307 -0
- openspeech/utils/__init__.py +5 -0
- openspeech/utils/audio_converter.py +406 -0
- openspeech/utils/audio_playback.py +156 -0
- openspeech/vendor_registry.yaml +74 -0
- openspeechapi-0.1.0.dist-info/METADATA +101 -0
- openspeechapi-0.1.0.dist-info/RECORD +118 -0
- openspeechapi-0.1.0.dist-info/WHEEL +4 -0
- openspeechapi-0.1.0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
"""macOS native STT provider using SFSpeechRecognizer via compiled Swift binary.
|
|
2
|
+
|
|
3
|
+
The Swift helper lives inside a .app bundle so that macOS TCC can track its
|
|
4
|
+
Speech Recognition authorization by bundle ID. The provider launches it via
|
|
5
|
+
``open -W`` which inherits the .app TCC grant; results are exchanged through
|
|
6
|
+
a temporary JSON file (``--output``).
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import asyncio
|
|
11
|
+
import json
|
|
12
|
+
from openspeech.logging_config import logger
|
|
13
|
+
import tempfile
|
|
14
|
+
import time
|
|
15
|
+
from collections.abc import AsyncIterator
|
|
16
|
+
from dataclasses import dataclass
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
from openspeech.core.base import STTProvider
|
|
21
|
+
from openspeech.core.enums import Capability, ExecMode, ProviderType
|
|
22
|
+
from openspeech.core.models import AudioData, STTOptions, Transcription
|
|
23
|
+
from openspeech.core.settings import BaseSettings
|
|
24
|
+
from openspeech.utils.audio_converter import AudioConverter
|
|
25
|
+
|
|
26
|
+
@dataclass
|
|
27
|
+
class MacOSSpeechSettings(BaseSettings):
|
|
28
|
+
language: str = "zh-CN"
|
|
29
|
+
binary_path: str = ""
|
|
30
|
+
|
|
31
|
+
class MacOSSpeechSTT(STTProvider):
|
|
32
|
+
"""STT provider wrapping macOS SFSpeechRecognizer via a compiled Swift helper."""
|
|
33
|
+
|
|
34
|
+
name = "macos-stt"
|
|
35
|
+
provider_type = ProviderType.STT
|
|
36
|
+
execution_mode = ExecMode.IN_PROCESS
|
|
37
|
+
settings_cls = MacOSSpeechSettings
|
|
38
|
+
capabilities = {Capability.BATCH, Capability.MULTILINGUAL}
|
|
39
|
+
field_options = {"language": ["zh-CN", "en-US", "en-AE", "ja-JP", "ko-KR", "fr-FR", "de-DE", "es-ES"]}
|
|
40
|
+
|
|
41
|
+
# .app bundle that contains the Swift helper binary.
|
|
42
|
+
_DEFAULT_APP_BUNDLE = (
|
|
43
|
+
Path(__file__).resolve().parents[3]
|
|
44
|
+
/ "scripts"
|
|
45
|
+
/ "engines"
|
|
46
|
+
/ "macos-stt"
|
|
47
|
+
/ "MacOSSTTHelper.app"
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
def __init__(self, settings: MacOSSpeechSettings | None = None) -> None:
|
|
51
|
+
self.settings = settings or MacOSSpeechSettings()
|
|
52
|
+
self._app_bundle: Path | None = None
|
|
53
|
+
self._binary: Path | None = None
|
|
54
|
+
self._started = False
|
|
55
|
+
self._auth_ok = False
|
|
56
|
+
|
|
57
|
+
# ------------------------------------------------------------------
|
|
58
|
+
# Internal helpers
|
|
59
|
+
# ------------------------------------------------------------------
|
|
60
|
+
|
|
61
|
+
async def _run_via_open(self, *extra_args: str, output_file: str | None = None) -> dict:
|
|
62
|
+
"""Launch the helper through ``open -W`` so it inherits .app TCC grants.
|
|
63
|
+
|
|
64
|
+
If *output_file* is given the helper writes JSON there (``--output``);
|
|
65
|
+
otherwise we fall back to direct execution for ``--check`` which
|
|
66
|
+
doesn't need TCC.
|
|
67
|
+
"""
|
|
68
|
+
if self._app_bundle is None:
|
|
69
|
+
return {"status": "error", "error": "app bundle not set"}
|
|
70
|
+
|
|
71
|
+
cmd: list[str]
|
|
72
|
+
if output_file is not None:
|
|
73
|
+
# Launch via `open -W` for TCC context, results via --output file
|
|
74
|
+
cmd = [
|
|
75
|
+
"open", "-W", str(self._app_bundle),
|
|
76
|
+
"--args", *extra_args, "--output", output_file,
|
|
77
|
+
]
|
|
78
|
+
else:
|
|
79
|
+
# Direct execution is fine for --check (no TCC needed)
|
|
80
|
+
if self._binary is None:
|
|
81
|
+
return {"status": "error", "error": "binary not set"}
|
|
82
|
+
cmd = [str(self._binary), *extra_args]
|
|
83
|
+
|
|
84
|
+
proc = await asyncio.create_subprocess_exec(
|
|
85
|
+
*cmd,
|
|
86
|
+
stdout=asyncio.subprocess.PIPE,
|
|
87
|
+
stderr=asyncio.subprocess.PIPE,
|
|
88
|
+
)
|
|
89
|
+
stdout, _ = await proc.communicate()
|
|
90
|
+
|
|
91
|
+
# Read result from output file or stdout
|
|
92
|
+
raw = ""
|
|
93
|
+
if output_file is not None:
|
|
94
|
+
out_path = Path(output_file)
|
|
95
|
+
if out_path.exists():
|
|
96
|
+
raw = out_path.read_text(encoding="utf-8").strip()
|
|
97
|
+
out_path.unlink(missing_ok=True)
|
|
98
|
+
else:
|
|
99
|
+
raw = stdout.decode(errors="replace").strip() if stdout else ""
|
|
100
|
+
|
|
101
|
+
if not raw:
|
|
102
|
+
return {"status": "error", "error": f"no output (rc={proc.returncode})"}
|
|
103
|
+
try:
|
|
104
|
+
return json.loads(raw)
|
|
105
|
+
except json.JSONDecodeError:
|
|
106
|
+
return {"status": "error", "error": raw}
|
|
107
|
+
|
|
108
|
+
async def _run_check(self) -> dict:
|
|
109
|
+
"""Run ``--check`` — direct execution, no TCC needed."""
|
|
110
|
+
return await self._run_via_open(
|
|
111
|
+
"--check", "--language", self.settings.language,
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
# ------------------------------------------------------------------
|
|
115
|
+
# Lifecycle
|
|
116
|
+
# ------------------------------------------------------------------
|
|
117
|
+
|
|
118
|
+
async def start(self) -> None:
|
|
119
|
+
if self.settings.binary_path:
|
|
120
|
+
# Custom binary path — use directly (user manages TCC)
|
|
121
|
+
candidate = Path(self.settings.binary_path)
|
|
122
|
+
if not candidate.exists():
|
|
123
|
+
raise RuntimeError(
|
|
124
|
+
f"macos-stt binary not found at {candidate}."
|
|
125
|
+
)
|
|
126
|
+
self._binary = candidate
|
|
127
|
+
self._app_bundle = None
|
|
128
|
+
else:
|
|
129
|
+
app = self._DEFAULT_APP_BUNDLE
|
|
130
|
+
binary = app / "Contents" / "MacOS" / "macos-stt-helper"
|
|
131
|
+
if not binary.exists():
|
|
132
|
+
raise RuntimeError(
|
|
133
|
+
f"macos-stt binary not found at {binary}. "
|
|
134
|
+
"Run scripts/engines/macos-stt/install.sh to compile it."
|
|
135
|
+
)
|
|
136
|
+
self._app_bundle = app
|
|
137
|
+
self._binary = binary
|
|
138
|
+
|
|
139
|
+
# Verify speech recognition availability via --check
|
|
140
|
+
check = await self._run_check()
|
|
141
|
+
if check.get("status") != "ok":
|
|
142
|
+
err = check.get("error", "unknown authorization issue")
|
|
143
|
+
logger.warning("macos-stt auth check failed: {}", err)
|
|
144
|
+
raise RuntimeError(f"macos-stt not authorized: {err}")
|
|
145
|
+
|
|
146
|
+
self._auth_ok = True
|
|
147
|
+
self._started = True
|
|
148
|
+
logger.info("{} provider started", self.name)
|
|
149
|
+
|
|
150
|
+
async def stop(self) -> None:
|
|
151
|
+
self._binary = None
|
|
152
|
+
self._app_bundle = None
|
|
153
|
+
self._started = False
|
|
154
|
+
self._auth_ok = False
|
|
155
|
+
logger.info("{} provider stopped", self.name)
|
|
156
|
+
|
|
157
|
+
async def health_check(self) -> bool:
|
|
158
|
+
if self._started and self._auth_ok:
|
|
159
|
+
return True
|
|
160
|
+
# Pre-start check: verify binary exists
|
|
161
|
+
if self._binary is not None and self._binary.exists():
|
|
162
|
+
return True
|
|
163
|
+
# Check default location
|
|
164
|
+
default = self._DEFAULT_APP_BUNDLE / "Contents" / "MacOS" / "macos-stt-helper"
|
|
165
|
+
return default.exists()
|
|
166
|
+
|
|
167
|
+
# ------------------------------------------------------------------
|
|
168
|
+
# Transcription
|
|
169
|
+
# ------------------------------------------------------------------
|
|
170
|
+
|
|
171
|
+
async def transcribe(
|
|
172
|
+
self, audio: AudioData, opts: STTOptions | None = None
|
|
173
|
+
) -> Transcription:
|
|
174
|
+
if not self._started or self._binary is None:
|
|
175
|
+
raise RuntimeError("Provider not started -- call start() first")
|
|
176
|
+
logger.info("{}: request received, audio={} bytes", self.name, len(audio.data))
|
|
177
|
+
_t0 = time.perf_counter()
|
|
178
|
+
|
|
179
|
+
opts = opts or STTOptions()
|
|
180
|
+
language = opts.language or self.settings.language
|
|
181
|
+
|
|
182
|
+
# Convert to WAV for the Swift helper.
|
|
183
|
+
wav_audio = AudioConverter.to_wav(audio)
|
|
184
|
+
|
|
185
|
+
tmp_wav: str | None = None
|
|
186
|
+
tmp_out: str | None = None
|
|
187
|
+
try:
|
|
188
|
+
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
|
|
189
|
+
f.write(wav_audio.data)
|
|
190
|
+
tmp_wav = f.name
|
|
191
|
+
with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as f:
|
|
192
|
+
tmp_out = f.name
|
|
193
|
+
|
|
194
|
+
result = await self._run_via_open(
|
|
195
|
+
"--audio", tmp_wav,
|
|
196
|
+
"--language", language,
|
|
197
|
+
output_file=tmp_out if self._app_bundle else None,
|
|
198
|
+
)
|
|
199
|
+
|
|
200
|
+
# When running without .app bundle (custom binary_path), _run_via_open
|
|
201
|
+
# uses direct execution and reads stdout — handle that path too.
|
|
202
|
+
if self._app_bundle is None and not result:
|
|
203
|
+
raise RuntimeError("macos-stt-helper returned no output")
|
|
204
|
+
|
|
205
|
+
if "error" in result:
|
|
206
|
+
raise RuntimeError(
|
|
207
|
+
f"macos-stt-helper error: {result['error']}"
|
|
208
|
+
)
|
|
209
|
+
|
|
210
|
+
transcription = Transcription(
|
|
211
|
+
text=result.get("text", ""),
|
|
212
|
+
language=result.get("language"),
|
|
213
|
+
confidence=result.get("confidence"),
|
|
214
|
+
)
|
|
215
|
+
logger.info("{}: completed in {:.0f}ms, result={} chars", self.name, (time.perf_counter() - _t0) * 1000, len(transcription.text))
|
|
216
|
+
return transcription
|
|
217
|
+
finally:
|
|
218
|
+
if tmp_wav is not None:
|
|
219
|
+
Path(tmp_wav).unlink(missing_ok=True)
|
|
220
|
+
if tmp_out is not None:
|
|
221
|
+
Path(tmp_out).unlink(missing_ok=True)
|
|
222
|
+
|
|
223
|
+
def transcribe_stream(
|
|
224
|
+
self, stream: AsyncIterator[bytes]
|
|
225
|
+
) -> AsyncIterator[Any]:
|
|
226
|
+
raise NotImplementedError("macOS SFSpeechRecognizer does not support streaming via this provider")
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""OpenAI STT provider adapter (Whisper API)."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import io
|
|
5
|
+
from openspeech.logging_config import logger
|
|
6
|
+
import time
|
|
7
|
+
from collections.abc import AsyncIterator
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from openspeech.core.base import STTProvider
|
|
12
|
+
|
|
13
|
+
from openspeech.core.enums import Capability, ExecMode, ProviderType
|
|
14
|
+
from openspeech.core.models import AudioData, STTOptions, Transcription
|
|
15
|
+
from openspeech.core.settings import BaseSettings
|
|
16
|
+
|
|
17
|
+
@dataclass
class OpenAISTTSettings(BaseSettings):
    """Configuration for the OpenAI Whisper-API STT provider."""

    # OpenAI API key; health_check() only verifies this is non-empty.
    api_key: str = ""
    # Optional custom endpoint (e.g. an OpenAI-compatible proxy); empty
    # string means the SDK default.
    base_url: str = ""
    # Transcription model name sent with each request.
    model: str = "whisper-1"
|
|
23
|
+
class OpenAISTT(STTProvider):
    """STT provider calling the OpenAI audio transcription (Whisper) API."""

    name = "openai-stt"
    provider_type = ProviderType.STT
    execution_mode = ExecMode.IN_PROCESS
    settings_cls = OpenAISTTSettings
    capabilities = {Capability.BATCH, Capability.MULTILINGUAL}
    field_options = {"model": ["whisper-1", "gpt-4o-transcribe", "gpt-4o-mini-transcribe"]}

    def __init__(self, settings: OpenAISTTSettings | None = None) -> None:
        self.settings = settings or OpenAISTTSettings()
        self._client: Any = None

    async def start(self) -> None:
        """Create the async OpenAI client from the configured settings.

        Raises:
            ImportError: if the ``openai`` package is not installed.
        """
        # Keep the try body to the import alone and chain the original error
        # so the missing-module detail is not lost (was: broad try without
        # `from err`).
        try:
            from openai import AsyncOpenAI
        except ImportError as err:
            raise ImportError("Install openai: pip install openspeech[openai]") from err
        kwargs: dict[str, Any] = {"api_key": self.settings.api_key}
        if self.settings.base_url:
            kwargs["base_url"] = self.settings.base_url
        self._client = AsyncOpenAI(**kwargs)
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Drop the client reference; the SDK client needs no explicit close here."""
        self._client = None
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        """Cheap local check: an API key is configured (no network round-trip)."""
        return bool(self.settings.api_key)

    async def transcribe(
        self, audio: AudioData, opts: STTOptions | None = None
    ) -> Transcription:
        """Transcribe *audio* in a single batch request.

        Optional ``language``, ``prompt`` and ``temperature`` from *opts* are
        forwarded to the API only when set.

        Raises:
            RuntimeError: if the provider has not been started.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, audio={} bytes", self.name, len(audio.data))
        _t0 = time.perf_counter()
        opts = opts or STTOptions()
        # The SDK infers the upload format from the file-like object's name.
        audio_file = io.BytesIO(audio.data)
        audio_file.name = "audio.wav"
        kwargs: dict[str, Any] = {
            "model": self.settings.model,
            "file": audio_file,
        }
        if opts.language:
            kwargs["language"] = opts.language
        if opts.prompt:
            kwargs["prompt"] = opts.prompt
        if opts.temperature is not None:
            kwargs["temperature"] = opts.temperature
        response = await self._client.audio.transcriptions.create(**kwargs)
        # The API does not return a detected language; echo the requested one.
        result = Transcription(text=response.text, language=opts.language)
        logger.info("{}: completed in {:.0f}ms, result={} chars", self.name, (time.perf_counter() - _t0) * 1000, len(result.text))
        return result

    async def transcribe_stream(
        self, stream: AsyncIterator[bytes]
    ) -> AsyncIterator[Any]:
        """Unsupported; raises on first iteration of the async generator."""
        raise NotImplementedError(
            "OpenAI Whisper API does not support streaming input"
        )
        # The unreachable yield makes this an async generator, matching the
        # declared AsyncIterator interface.
        yield  # pragma: no cover
|
@@ -0,0 +1,353 @@
|
|
|
1
|
+
"""Sherpa-ONNX STT provider adapter (local service)."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from collections.abc import AsyncIterator
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
import io
|
|
7
|
+
import json
|
|
8
|
+
from openspeech.logging_config import logger
|
|
9
|
+
import math
|
|
10
|
+
import struct
|
|
11
|
+
import time
|
|
12
|
+
from urllib.parse import urlparse, urlunparse
|
|
13
|
+
import wave
|
|
14
|
+
from typing import Any
|
|
15
|
+
from urllib.parse import urljoin
|
|
16
|
+
|
|
17
|
+
from openspeech.core.base import STTProvider
|
|
18
|
+
|
|
19
|
+
from openspeech.core.enums import Capability, ExecMode, ProviderType
|
|
20
|
+
from openspeech.core.models import AudioData, STTOptions, Transcription
|
|
21
|
+
from openspeech.core.settings import BaseSettings
|
|
22
|
+
|
|
23
|
+
@dataclass
class SherpaOnnxSTTSettings(BaseSettings):
    """Configuration for the local sherpa-onnx WebSocket STT provider."""

    # Base HTTP URL of the local sherpa-onnx server.
    api_url: str = "http://127.0.0.1:17000"
    # Path of the HTTP health endpoint, joined onto api_url.
    health_path: str = "/health"
    # WebSocket path; a missing leading "/" is added automatically.
    ws_path: str = "/"
    # Model identifier (currently unused by this adapter's request path).
    model: str = ""
    # Default language hint; "auto" maps to no language on the result.
    language: str = "auto"
    # Target sample rate the server expects; inputs are resampled to this.
    sample_rate: int = 16000
    # Size of each audio chunk sent over the WS, in milliseconds (min 20).
    chunk_ms: int = 120
    # Receive timeout used to detect that the server has gone quiet.
    recv_timeout_s: float = 1.0
    # HTTP/WS connect timeout.
    timeout_s: float = 60.0
    # Retry count (not used in this adapter's current code path).
    retries: int = 0
|
|
36
|
+
class SherpaOnnxSTT(STTProvider):
    """STT provider talking to a locally running sherpa-onnx WebSocket server.

    Batch transcription decodes the input to mono float32 samples, streams
    them over the server's WebSocket in chunks, then drains JSON events and
    merges the returned segments.  Streaming transcription forwards PCM16
    chunks as they arrive and yields partial/final ``Transcription`` objects.
    """

    name = "sherpa-onnx-stt"
    provider_type = ProviderType.STT
    execution_mode = ExecMode.LOCAL
    settings_cls = SherpaOnnxSTTSettings
    capabilities = {
        Capability.STREAMING,
        Capability.BATCH,
        Capability.MULTILINGUAL,
    }
    field_options = {"language": ["auto", "en", "zh", "ja"]}

    def __init__(self, settings: SherpaOnnxSTTSettings | None = None) -> None:
        self.settings = settings or SherpaOnnxSTTSettings()
        # HTTP client used only for health_check(); audio goes over `websockets`.
        self._client: Any = None
        # False when an external client was injected via set_http_client().
        self._owns_client: bool = True

    def set_http_client(self, client) -> None:
        """Inject a shared httpx client; the provider will not close it."""
        self._client = client
        self._owns_client = False

    async def start(self) -> None:
        """Create an httpx client unless one was injected.

        Raises:
            ImportError: if httpx is not installed.
        """
        if self._client is None:
            try:
                import httpx
            except ImportError:
                raise ImportError(
                    "Install httpx: pip install openspeech[server]"
                )
            # trust_env=False: ignore proxy environment variables when talking
            # to the local server.
            self._client = httpx.AsyncClient(timeout=self.settings.timeout_s, trust_env=False)
            self._owns_client = True
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Close the HTTP client if owned, then drop the reference."""
        if self._client and self._owns_client:
            await self._client.aclose()
        self._client = None
        logger.info("{} provider stopped", self.name)

    def _ws_url(self) -> str:
        """Derive the ws:// (wss:// for https) URL from ``api_url`` + ``ws_path``."""
        u = urlparse(self.settings.api_url.rstrip("/"))
        scheme = "wss" if u.scheme == "https" else "ws"
        path = self.settings.ws_path or "/"
        if not path.startswith("/"):
            path = f"/{path}"
        return urlunparse((scheme, u.netloc, path, "", "", ""))

    async def health_check(self) -> bool:
        """GET the health endpoint; any non-5xx response counts as healthy."""
        if self._client is None:
            return False
        try:
            url = urljoin(
                self.settings.api_url.rstrip("/") + "/",
                self.settings.health_path.lstrip("/"),
            )
            resp = await self._client.get(url)
            return resp.status_code < 500
        except Exception:
            return False

    async def transcribe(
        self, audio: AudioData, opts: STTOptions | None = None
    ) -> Transcription:
        """Batch-transcribe *audio* over the WebSocket endpoint.

        Raises:
            RuntimeError: if the provider has not been started.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, audio={} bytes", self.name, len(audio.data))
        opts = opts or STTOptions()
        language = (opts.language or self.settings.language).strip()

        samples = self._audio_to_float32_samples(audio)
        started_at = time.perf_counter()
        text = await self._transcribe_samples_via_ws(samples)

        # Prefer the audio's own duration when known; otherwise fall back to
        # the wall-clock decode time.  NOTE(review): the "completed in {}ms"
        # log below then reports audio duration, not processing time — confirm
        # that is intended.
        duration_ms = int((time.perf_counter() - started_at) * 1000)
        if audio.duration_ms is not None:
            duration_ms = int(audio.duration_ms)

        result = Transcription(
            text=text,
            language=language if language != "auto" else None,
            confidence=None,
            duration_ms=duration_ms,
        )
        logger.info("{}: completed in {}ms, result={} chars", self.name, duration_ms, len(result.text))
        return result

    async def transcribe_stream(
        self, stream: AsyncIterator[bytes]
    ) -> AsyncIterator[Any]:
        """Stream PCM16 chunks to the server, yielding partial then final results.

        A background task forwards audio frames (converted to float32 bytes)
        while this coroutine consumes JSON events.  Segment boundaries from
        the server are merged into a running transcript; the receive loop ends
        when the sender has finished and no event arrives within
        ``recv_timeout_s``.

        Raises:
            RuntimeError: if the provider has not been started.
            ImportError: if websockets is not installed.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        try:
            import websockets
        except ImportError:
            raise ImportError("Install websockets: pip install openspeech[server]")

        import asyncio

        _t0 = time.perf_counter()
        _frames_sent = 0

        ws_url = self._ws_url()
        _sender_stop = asyncio.Event()

        logger.debug("{}: connecting to Sherpa-ONNX WebSocket...", self.name)
        async with websockets.connect(ws_url, open_timeout=self.settings.timeout_s) as ws:
            _t_connected = time.perf_counter()
            logger.info("{}: WS connected in {:.0f}ms", self.name,
                        (_t_connected - _t0) * 1000)
            sender_done = False
            current_segment = 0
            final_parts: list[str] = []
            current_text = ""
            _resp_count = 0

            async def _sender() -> None:
                # Forward PCM16 chunks as float32 frames, then signal "Done"
                # so the server can flush its final result.
                nonlocal sender_done, _frames_sent
                try:
                    async for chunk in stream:
                        if _sender_stop.is_set():
                            break
                        if not chunk:
                            continue
                        data = self._pcm16_bytes_to_float32_bytes(chunk)
                        if data:
                            await ws.send(data)
                            _frames_sent += 1
                            if _frames_sent == 1:
                                logger.debug("{}: first frame sent at {:.0f}ms",
                                             self.name, (time.perf_counter() - _t0) * 1000)
                    if not _sender_stop.is_set():
                        await ws.send("Done")
                except websockets.exceptions.ConnectionClosed:
                    # Server closed mid-stream; the receive loop handles exit.
                    pass
                finally:
                    sender_done = True
                    logger.debug(
                        "{}: stream sender done, sent {} frames in {:.0f}ms",
                        self.name, _frames_sent, (time.perf_counter() - _t0) * 1000,
                    )

            send_task = asyncio.create_task(_sender())
            try:
                while True:
                    try:
                        raw = await asyncio.wait_for(ws.recv(), timeout=self.settings.recv_timeout_s)
                    except asyncio.TimeoutError:
                        # Quiet window: stop once the sender finished,
                        # otherwise keep waiting for more events.
                        if sender_done:
                            break
                        continue
                    if not isinstance(raw, str):
                        # Ignore unexpected binary frames.
                        continue
                    evt = self._parse_ws_event(raw)
                    if evt is None:
                        continue
                    _resp_count += 1
                    seg, txt = evt
                    if _resp_count == 1:
                        logger.debug("{}: first response at {:.0f}ms seg={}",
                                     self.name, (time.perf_counter() - _t0) * 1000, seg)
                    if seg > current_segment:
                        # New segment started: archive the finished one.
                        if current_text:
                            final_parts.append(current_text)
                        current_segment = seg
                        current_text = txt
                    else:
                        # Same segment: the server resends the full segment text.
                        current_text = txt
                    merged = self._merge_with_current(final_parts, current_text)
                    if merged:
                        yield Transcription(text=merged, is_partial=True)
            finally:
                send_task.cancel()
                try:
                    await send_task
                except asyncio.CancelledError:
                    pass

            merged = self._merge_with_current(final_parts, current_text)
            if merged:
                logger.info("{}: final result at {:.0f}ms responses={} text='{}'",
                            self.name, (time.perf_counter() - _t0) * 1000,
                            _resp_count, merged[:60])
                yield Transcription(text=merged, is_partial=False)

            logger.info(
                "{}: stream completed in {:.0f}ms, frames={}",
                self.name, (time.perf_counter() - _t0) * 1000, _frames_sent,
            )

    @staticmethod
    def _merge_with_current(parts: list[str], current: str) -> str:
        """Join finished segment texts plus the in-progress text with spaces."""
        data = [p.strip() for p in parts if p and p.strip()]
        c = (current or "").strip()
        if c:
            data.append(c)
        return " ".join(data).strip()

    def _parse_ws_event(self, raw: str) -> tuple[int, str] | None:
        """Parse a server message into ``(segment, text)``.

        Non-JSON payloads are treated as plain text for segment 0; empty text
        yields ``None``.
        """
        try:
            payload = json.loads(raw)
        except Exception:
            txt = raw.strip()
            return (0, txt) if txt else None
        txt = str(payload.get("text", "")).strip()
        if not txt:
            return None
        seg = int(payload.get("segment", 0) or 0)
        return seg, txt

    def _transcribe_chunk_frames(self, sample_count: int) -> int:
        """Samples per WS send, from ``chunk_ms`` (floored at 20 ms / 8 kHz)."""
        sample_rate = max(8000, int(self.settings.sample_rate))
        chunk_ms = max(20, int(self.settings.chunk_ms))
        frames = max(1, int(sample_rate * (chunk_ms / 1000.0)))
        if sample_count < frames:
            return sample_count
        return frames

    async def _transcribe_samples_via_ws(self, samples: list[float]) -> str:
        """Send all *samples* over the WS, then drain responses until quiet.

        The receive loop ends when no event arrives within ``recv_timeout_s``
        after "Done" was sent.

        Raises:
            ImportError: if websockets is not installed.
        """
        try:
            import websockets
        except ImportError:
            raise ImportError("Install websockets: pip install openspeech[server]")
        ws_url = self._ws_url()
        current_segment = 0
        final_parts: list[str] = []
        current_text = ""
        import asyncio

        async with websockets.connect(ws_url, open_timeout=self.settings.timeout_s) as ws:
            i = 0
            step = self._transcribe_chunk_frames(len(samples))
            while i < len(samples):
                chunk = samples[i:i + step]
                i += step
                if chunk:
                    await ws.send(self._float32_list_to_bytes(chunk))

            await ws.send("Done")
            while True:
                try:
                    raw = await asyncio.wait_for(ws.recv(), timeout=self.settings.recv_timeout_s)
                except asyncio.TimeoutError:
                    # Server went quiet: assume decoding finished.
                    break
                if not isinstance(raw, str):
                    continue
                evt = self._parse_ws_event(raw)
                if evt is None:
                    continue
                seg, txt = evt
                if seg > current_segment:
                    if current_text:
                        final_parts.append(current_text)
                    current_segment = seg
                    current_text = txt
                else:
                    current_text = txt
        return self._merge_with_current(final_parts, current_text)

    @staticmethod
    def _float32_list_to_bytes(samples: list[float]) -> bytes:
        """Pack floats as little-endian float32 bytes."""
        if not samples:
            return b""
        return struct.pack(f"<{len(samples)}f", *samples)

    @staticmethod
    def _pcm16_bytes_to_float32_bytes(data: bytes) -> bytes:
        """Convert little-endian PCM16 bytes to float32 bytes in [-1, 1]."""
        if not data:
            return b""
        # Drop a trailing odd byte so struct sees whole 16-bit samples.
        usable = len(data) - (len(data) % 2)
        if usable <= 0:
            return b""
        values = struct.unpack(f"<{usable // 2}h", data[:usable])
        floats = [max(-1.0, min(1.0, v / 32768.0)) for v in values]
        return struct.pack(f"<{len(floats)}f", *floats)

    def _audio_to_float32_samples(self, audio: AudioData) -> list[float]:
        """Decode WAV (PCM16) or raw PCM16 bytes into mono float32 samples.

        Multichannel WAV is downmixed by averaging; the rate is linearly
        resampled to ``settings.sample_rate`` when it differs.  Non-WAV input
        is assumed to be raw PCM16 at the configured rate.

        Raises:
            RuntimeError: if the WAV is not 16-bit PCM.
        """
        if len(audio.data) > 12 and audio.data[:4] == b"RIFF" and audio.data[8:12] == b"WAVE":
            with wave.open(io.BytesIO(audio.data), "rb") as wf:
                channels = max(1, int(wf.getnchannels()))
                sampwidth = int(wf.getsampwidth())
                sample_rate = int(wf.getframerate())
                frames = wf.readframes(wf.getnframes())
                if sampwidth != 2:
                    raise RuntimeError("Only PCM16 WAV is supported for sherpa transcribe")
                ints = struct.unpack(f"<{len(frames) // 2}h", frames)
                if channels > 1:
                    # Downmix interleaved channels by arithmetic mean.
                    mono: list[int] = []
                    for i in range(0, len(ints), channels):
                        frame = ints[i:i + channels]
                        mono.append(int(sum(frame) / max(1, len(frame))))
                    ints = tuple(mono)
                samples = [max(-1.0, min(1.0, v / 32768.0)) for v in ints]
                if sample_rate != int(self.settings.sample_rate):
                    return self._resample_linear(samples, sample_rate, int(self.settings.sample_rate))
                return samples

        # Raw PCM16 fallback.
        raw = self._pcm16_bytes_to_float32_bytes(audio.data)
        if not raw:
            return []
        count = len(raw) // 4
        return list(struct.unpack(f"<{count}f", raw))

    @staticmethod
    def _resample_linear(samples: list[float], src_rate: int, dst_rate: int) -> list[float]:
        """Linearly interpolate *samples* from *src_rate* to *dst_rate*."""
        if src_rate <= 0 or dst_rate <= 0 or src_rate == dst_rate or not samples:
            return samples
        ratio = dst_rate / src_rate
        out_len = max(1, int(math.floor(len(samples) * ratio)))
        out: list[float] = []
        for i in range(out_len):
            src_pos = i / ratio
            left = int(math.floor(src_pos))
            right = min(left + 1, len(samples) - 1)
            frac = src_pos - left
            val = samples[left] * (1.0 - frac) + samples[right] * frac
            out.append(float(val))
        return out
|