openspeechapi 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openspeech/__init__.py +75 -0
- openspeech/__main__.py +5 -0
- openspeech/cli.py +413 -0
- openspeech/client/__init__.py +4 -0
- openspeech/client/client.py +145 -0
- openspeech/config.py +212 -0
- openspeech/core/__init__.py +0 -0
- openspeech/core/base.py +75 -0
- openspeech/core/enums.py +39 -0
- openspeech/core/models.py +61 -0
- openspeech/core/registry.py +37 -0
- openspeech/core/settings.py +8 -0
- openspeech/demo.py +675 -0
- openspeech/dispatch/__init__.py +0 -0
- openspeech/dispatch/context.py +34 -0
- openspeech/dispatch/dispatcher.py +661 -0
- openspeech/dispatch/executors/__init__.py +0 -0
- openspeech/dispatch/executors/base.py +34 -0
- openspeech/dispatch/executors/in_process.py +66 -0
- openspeech/dispatch/executors/remote.py +64 -0
- openspeech/dispatch/executors/subprocess_exec.py +446 -0
- openspeech/dispatch/fanout.py +95 -0
- openspeech/dispatch/filters.py +73 -0
- openspeech/dispatch/lifecycle.py +178 -0
- openspeech/dispatch/watcher.py +82 -0
- openspeech/engine_catalog.py +236 -0
- openspeech/engine_registry.yaml +347 -0
- openspeech/exceptions.py +51 -0
- openspeech/factory.py +325 -0
- openspeech/local_engines/__init__.py +12 -0
- openspeech/local_engines/aim_resolver.py +91 -0
- openspeech/local_engines/backends/__init__.py +1 -0
- openspeech/local_engines/backends/docker_backend.py +490 -0
- openspeech/local_engines/backends/native_backend.py +902 -0
- openspeech/local_engines/base.py +30 -0
- openspeech/local_engines/engines/__init__.py +1 -0
- openspeech/local_engines/engines/faster_whisper.py +36 -0
- openspeech/local_engines/engines/fish_speech.py +33 -0
- openspeech/local_engines/engines/sherpa_onnx.py +56 -0
- openspeech/local_engines/engines/whisper.py +41 -0
- openspeech/local_engines/engines/whisperlivekit.py +60 -0
- openspeech/local_engines/manager.py +208 -0
- openspeech/local_engines/models.py +50 -0
- openspeech/local_engines/progress.py +69 -0
- openspeech/local_engines/registry.py +19 -0
- openspeech/local_engines/task_store.py +52 -0
- openspeech/local_engines/tasks.py +71 -0
- openspeech/logging_config.py +607 -0
- openspeech/observe/__init__.py +0 -0
- openspeech/observe/base.py +79 -0
- openspeech/observe/debug.py +44 -0
- openspeech/observe/latency.py +19 -0
- openspeech/observe/metrics.py +47 -0
- openspeech/observe/tracing.py +44 -0
- openspeech/observe/usage.py +27 -0
- openspeech/providers/__init__.py +0 -0
- openspeech/providers/_template.py +101 -0
- openspeech/providers/stt/__init__.py +0 -0
- openspeech/providers/stt/alibaba.py +86 -0
- openspeech/providers/stt/assemblyai.py +135 -0
- openspeech/providers/stt/azure_speech.py +99 -0
- openspeech/providers/stt/baidu.py +135 -0
- openspeech/providers/stt/deepgram.py +311 -0
- openspeech/providers/stt/elevenlabs.py +385 -0
- openspeech/providers/stt/faster_whisper.py +211 -0
- openspeech/providers/stt/google_cloud.py +106 -0
- openspeech/providers/stt/iflytek.py +427 -0
- openspeech/providers/stt/macos_speech.py +226 -0
- openspeech/providers/stt/openai.py +84 -0
- openspeech/providers/stt/sherpa_onnx.py +353 -0
- openspeech/providers/stt/tencent.py +212 -0
- openspeech/providers/stt/volcengine.py +107 -0
- openspeech/providers/stt/whisper.py +153 -0
- openspeech/providers/stt/whisperlivekit.py +530 -0
- openspeech/providers/stt/windows_speech.py +249 -0
- openspeech/providers/tts/__init__.py +0 -0
- openspeech/providers/tts/alibaba.py +95 -0
- openspeech/providers/tts/azure_speech.py +123 -0
- openspeech/providers/tts/baidu.py +143 -0
- openspeech/providers/tts/coqui.py +64 -0
- openspeech/providers/tts/cosyvoice.py +90 -0
- openspeech/providers/tts/deepgram.py +174 -0
- openspeech/providers/tts/elevenlabs.py +311 -0
- openspeech/providers/tts/fish_speech.py +158 -0
- openspeech/providers/tts/google_cloud.py +107 -0
- openspeech/providers/tts/iflytek.py +209 -0
- openspeech/providers/tts/macos_say.py +251 -0
- openspeech/providers/tts/minimax.py +122 -0
- openspeech/providers/tts/openai.py +104 -0
- openspeech/providers/tts/piper.py +104 -0
- openspeech/providers/tts/tencent.py +189 -0
- openspeech/providers/tts/volcengine.py +117 -0
- openspeech/providers/tts/windows_sapi.py +234 -0
- openspeech/server/__init__.py +1 -0
- openspeech/server/app.py +72 -0
- openspeech/server/auth.py +42 -0
- openspeech/server/middleware.py +75 -0
- openspeech/server/routes/__init__.py +1 -0
- openspeech/server/routes/management.py +848 -0
- openspeech/server/routes/stt.py +121 -0
- openspeech/server/routes/tts.py +159 -0
- openspeech/server/routes/webui.py +29 -0
- openspeech/server/webui/app.js +2649 -0
- openspeech/server/webui/index.html +216 -0
- openspeech/server/webui/styles.css +617 -0
- openspeech/server/ws/__init__.py +1 -0
- openspeech/server/ws/stt_stream.py +263 -0
- openspeech/server/ws/tts_stream.py +207 -0
- openspeech/telemetry/__init__.py +21 -0
- openspeech/telemetry/perf.py +307 -0
- openspeech/utils/__init__.py +5 -0
- openspeech/utils/audio_converter.py +406 -0
- openspeech/utils/audio_playback.py +156 -0
- openspeech/vendor_registry.yaml +74 -0
- openspeechapi-0.1.0.dist-info/METADATA +101 -0
- openspeechapi-0.1.0.dist-info/RECORD +118 -0
- openspeechapi-0.1.0.dist-info/WHEEL +4 -0
- openspeechapi-0.1.0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,530 @@
|
|
|
1
|
+
"""WhisperLiveKit STT provider (local service, Deepgram-compatible WS)."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from collections.abc import AsyncIterator
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
import io
|
|
7
|
+
import json
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
import re
|
|
10
|
+
import time
|
|
11
|
+
from typing import Any
|
|
12
|
+
from uuid import uuid4
|
|
13
|
+
from urllib.parse import urlencode, urljoin, urlparse, urlunparse
|
|
14
|
+
import wave
|
|
15
|
+
|
|
16
|
+
from openspeech.core.base import STTProvider
|
|
17
|
+
from openspeech.core.enums import Capability, ExecMode, ProviderType
|
|
18
|
+
from openspeech.core.models import AudioData, STTOptions, Transcription
|
|
19
|
+
from openspeech.core.settings import BaseSettings
|
|
20
|
+
from openspeech.logging_config import logger
|
|
21
|
+
|
|
22
|
+
@dataclass
class WhisperLiveKitSTTSettings(BaseSettings):
    """Settings for the WhisperLiveKit STT provider (local service)."""

    # Base HTTP URL of the local WhisperLiveKit service.
    api_url: str = "http://127.0.0.1:12100"
    # WebSocket path; "/asr" selects the native protocol, anything else the
    # Deepgram-compatible query-parameter protocol (see _build_ws_url).
    ws_path: str = "/asr"
    # Message mode sent on the /asr endpoint ("diff" by default).
    ws_mode: str = "diff"
    # HTTP path for batch transcription (OpenAI-compatible shape).
    http_transcribe_path: str = "/v1/audio/transcriptions"
    model: str = "openai/whisper-large-v3-turbo"
    # Empty string means "let the service auto-detect".
    language: str = ""
    # Audio parameters advertised on the Deepgram-style WS query string.
    sample_rate: int = 16000
    encoding: str = "linear16"
    interim_results: bool = True
    punctuate: bool = True
    smart_format: bool = True
    # Per-request HTTP timeout in seconds.
    timeout_s: float = 30.0
    # Extra HTTP attempts on failure (0 = single attempt).
    retries: int = 0
    # Trace logs for streaming parse/debug (JSONL).
    trace_enabled: bool = False
    trace_path: str = ".tmp/logs/stt_wlk_trace.jsonl"
    # Per-field cap for values written to the trace file (min 40 enforced).
    trace_max_chars: int = 240
    # WhisperLiveKit (mlx/simul) may return empty text for very short WAV clips.
    # To improve UX, pad trailing silence up to this duration before request.
    min_audio_duration_ms: int = 6000
|
|
44
|
+
|
|
45
|
+
class WhisperLiveKitSTT(STTProvider):
    """STT provider backed by a local WhisperLiveKit service.

    Supports batch transcription over HTTP (OpenAI-compatible endpoint with a
    Deepgram-shaped fallback) and streaming over WebSocket (native /asr
    protocol or a Deepgram-compatible one, chosen by ``ws_path``).
    """

    name = "whisperlivekit-stt"
    provider_type = ProviderType.STT
    execution_mode = ExecMode.LOCAL
    settings_cls = WhisperLiveKitSTTSettings
    # Capabilities advertised to the registry/factory layer —
    # NOTE(review): consumers of this set are not visible from this file.
    capabilities = {
        Capability.STREAMING,
        Capability.BATCH,
        Capability.MULTILINGUAL,
    }
    # UI hint: selectable values for the "language" field.
    field_options = {"language": ["auto", "en", "zh", "ja", "ko", "es", "fr", "de"]}
|
+
|
|
57
|
+
def __init__(self, settings: WhisperLiveKitSTTSettings | None = None) -> None:
|
|
58
|
+
self.settings = settings or WhisperLiveKitSTTSettings()
|
|
59
|
+
self._client: Any = None
|
|
60
|
+
self._owns_client: bool = True
|
|
61
|
+
self._trace_file: Path | None = None
|
|
62
|
+
|
|
63
|
+
def set_http_client(self, client) -> None:
|
|
64
|
+
self._client = client
|
|
65
|
+
self._owns_client = False
|
|
66
|
+
|
|
67
|
+
async def start(self) -> None:
|
|
68
|
+
if self._client is None:
|
|
69
|
+
try:
|
|
70
|
+
import httpx
|
|
71
|
+
except ImportError:
|
|
72
|
+
raise ImportError(
|
|
73
|
+
"Install httpx + websockets: pip install openspeech[server] openspeech[whisperlivekit]"
|
|
74
|
+
)
|
|
75
|
+
self._client = httpx.AsyncClient(timeout=self.settings.timeout_s, trust_env=False)
|
|
76
|
+
self._owns_client = True
|
|
77
|
+
logger.info("{} provider started", self.name)
|
|
78
|
+
if self.settings.trace_enabled:
|
|
79
|
+
self._trace_file = Path(self.settings.trace_path).expanduser()
|
|
80
|
+
self._trace_file.parent.mkdir(parents=True, exist_ok=True)
|
|
81
|
+
self._trace("start", ws_path=self.settings.ws_path, ws_mode=self.settings.ws_mode)
|
|
82
|
+
|
|
83
|
+
async def stop(self) -> None:
|
|
84
|
+
if self._client and self._owns_client:
|
|
85
|
+
await self._client.aclose()
|
|
86
|
+
self._client = None
|
|
87
|
+
logger.info("{} provider stopped", self.name)
|
|
88
|
+
self._trace("stop")
|
|
89
|
+
|
|
90
|
+
def _truncate(self, value: Any) -> Any:
|
|
91
|
+
if value is None:
|
|
92
|
+
return None
|
|
93
|
+
s = str(value)
|
|
94
|
+
max_chars = max(40, int(self.settings.trace_max_chars))
|
|
95
|
+
if len(s) <= max_chars:
|
|
96
|
+
return s
|
|
97
|
+
return s[:max_chars] + "..."
|
|
98
|
+
|
|
99
|
+
def _trace(self, event: str, **payload: Any) -> None:
|
|
100
|
+
if not self.settings.trace_enabled or self._trace_file is None:
|
|
101
|
+
return
|
|
102
|
+
row = {
|
|
103
|
+
"ts": time.time(),
|
|
104
|
+
"event": event,
|
|
105
|
+
**payload,
|
|
106
|
+
}
|
|
107
|
+
try:
|
|
108
|
+
self._trace_file.parent.mkdir(parents=True, exist_ok=True)
|
|
109
|
+
with self._trace_file.open("a", encoding="utf-8") as f:
|
|
110
|
+
f.write(json.dumps(row, ensure_ascii=False) + "\n")
|
|
111
|
+
except Exception as exc: # noqa: BLE001
|
|
112
|
+
logger.warning("Failed to write stt_wlk trace log: {}", exc)
|
|
113
|
+
|
|
114
|
+
async def health_check(self) -> bool:
|
|
115
|
+
if self._client is None:
|
|
116
|
+
return False
|
|
117
|
+
try:
|
|
118
|
+
url = urljoin(self.settings.api_url.rstrip("/") + "/", "health")
|
|
119
|
+
r = await self._client.get(url)
|
|
120
|
+
return r.status_code < 500
|
|
121
|
+
except Exception:
|
|
122
|
+
return False
|
|
123
|
+
|
|
124
|
+
    async def transcribe(
        self, audio: AudioData, opts: STTOptions | None = None
    ) -> Transcription:
        """Transcribe a complete audio clip via the service's HTTP endpoint.

        Short WAV clips are padded with trailing silence first (see
        ``min_audio_duration_ms``). The request is retried up to
        ``settings.retries`` extra times on transport errors.

        Raises:
            RuntimeError: if the provider was not started or all attempts failed.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, audio={} bytes", self.name, len(audio.data))
        opts = opts or STTOptions()
        url = urljoin(
            self.settings.api_url.rstrip("/") + "/",
            self.settings.http_transcribe_path.lstrip("/"),
        )
        # Per-request options override the configured defaults.
        model = (opts.model or self.settings.model).strip()
        language = (opts.language or self.settings.language).strip()
        wav_bytes = self._maybe_pad_wav_for_short_clip(
            audio.data, min_duration_ms=max(0, int(self.settings.min_audio_duration_ms))
        )
        files = {
            "file": ("audio.wav", wav_bytes, "audio/wav"),
        }
        data: dict[str, Any] = {"model": model}
        if language:
            data["language"] = language
        if opts.prompt:
            data["prompt"] = opts.prompt
        if opts.temperature is not None:
            data["temperature"] = opts.temperature

        # Retry loop: attempts = retries + 1, no backoff between attempts.
        last_exc: Exception | None = None
        attempts = max(0, int(self.settings.retries)) + 1
        resp = None
        started_at = time.perf_counter()
        for _ in range(attempts):
            try:
                resp = await self._client.post(url, files=files, data=data)
                break
            except Exception as exc:  # noqa: BLE001
                last_exc = exc
        if resp is None:
            raise RuntimeError(f"WhisperLiveKit request failed: {last_exc}") from last_exc
        resp.raise_for_status()
        payload = resp.json()
        # Prefer OpenAI-style {"text": ...}; fall back to Deepgram-shaped payloads.
        text = str(payload.get("text", "")).strip()
        if not text:
            # Deepgram-like response fallback
            channel = payload.get("channel", {})
            alts = channel.get("alternatives", []) if isinstance(channel, dict) else []
            if alts:
                text = str(alts[0].get("transcript", "")).strip()
        elapsed_ms = int((time.perf_counter() - started_at) * 1000)
        # Duration: prefer explicit duration_ms, then metadata.duration (seconds),
        # finally fall back to wall-clock request time.
        duration_ms = payload.get("duration_ms")
        if duration_ms is None:
            metadata = payload.get("metadata", {})
            if isinstance(metadata, dict):
                md_dur = metadata.get("duration")
                if isinstance(md_dur, (int, float)):
                    duration_ms = int(float(md_dur) * 1000)
        if duration_ms is None:
            duration_ms = elapsed_ms
        result = Transcription(
            text=text,
            language=payload.get("language") or language or None,
            confidence=payload.get("confidence"),
            duration_ms=int(duration_ms),
        )
        logger.info("{}: completed in {:.0f}ms, result={} chars", self.name, elapsed_ms, len(result.text))
        return result
|
+
|
|
191
|
+
@staticmethod
|
|
192
|
+
def _maybe_pad_wav_for_short_clip(data: bytes, min_duration_ms: int) -> bytes:
|
|
193
|
+
"""Pad trailing silence for short PCM WAV clips to avoid empty short-utterance outputs."""
|
|
194
|
+
if min_duration_ms <= 0 or len(data) < 44:
|
|
195
|
+
return data
|
|
196
|
+
if not (data.startswith(b"RIFF") and data[8:12] == b"WAVE"):
|
|
197
|
+
return data
|
|
198
|
+
try:
|
|
199
|
+
with wave.open(io.BytesIO(data), "rb") as r:
|
|
200
|
+
params = r.getparams()
|
|
201
|
+
framerate = int(r.getframerate())
|
|
202
|
+
sampwidth = int(r.getsampwidth())
|
|
203
|
+
channels = int(r.getnchannels())
|
|
204
|
+
frames = r.readframes(r.getnframes())
|
|
205
|
+
except Exception:
|
|
206
|
+
return data
|
|
207
|
+
if framerate <= 0 or sampwidth <= 0 or channels <= 0:
|
|
208
|
+
return data
|
|
209
|
+
target_frames = int((min_duration_ms / 1000.0) * framerate)
|
|
210
|
+
current_frames = len(frames) // max(1, sampwidth * channels)
|
|
211
|
+
if current_frames >= target_frames:
|
|
212
|
+
return data
|
|
213
|
+
add_frames = target_frames - current_frames
|
|
214
|
+
silence = b"\x00" * (add_frames * sampwidth * channels)
|
|
215
|
+
out = io.BytesIO()
|
|
216
|
+
with wave.open(out, "wb") as w:
|
|
217
|
+
w.setparams(params)
|
|
218
|
+
w.writeframes(frames + silence)
|
|
219
|
+
return out.getvalue()
|
|
220
|
+
|
|
221
|
+
def _build_ws_url(self, model: str, language: str) -> str:
|
|
222
|
+
base = self.settings.api_url.rstrip("/")
|
|
223
|
+
u = urlparse(base)
|
|
224
|
+
scheme = "wss" if u.scheme == "https" else "ws"
|
|
225
|
+
path = self.settings.ws_path if self.settings.ws_path.startswith("/") else f"/{self.settings.ws_path}"
|
|
226
|
+
query: dict[str, str] = {}
|
|
227
|
+
if path == "/asr":
|
|
228
|
+
query["mode"] = self.settings.ws_mode or "diff"
|
|
229
|
+
if language:
|
|
230
|
+
query["language"] = language
|
|
231
|
+
else:
|
|
232
|
+
query = {
|
|
233
|
+
"model": model,
|
|
234
|
+
"encoding": self.settings.encoding,
|
|
235
|
+
"sample_rate": str(int(self.settings.sample_rate)),
|
|
236
|
+
"interim_results": "true" if self.settings.interim_results else "false",
|
|
237
|
+
"punctuate": "true" if self.settings.punctuate else "false",
|
|
238
|
+
"smart_format": "true" if self.settings.smart_format else "false",
|
|
239
|
+
}
|
|
240
|
+
if language:
|
|
241
|
+
query["language"] = language
|
|
242
|
+
if path != "/asr" and language:
|
|
243
|
+
query["language"] = language
|
|
244
|
+
return urlunparse((scheme, u.netloc, path, "", urlencode(query), ""))
|
|
245
|
+
|
|
246
|
+
@staticmethod
|
|
247
|
+
def _merge_text(prev: str, incoming: str) -> str:
|
|
248
|
+
p = (prev or "").strip()
|
|
249
|
+
n = (incoming or "").strip()
|
|
250
|
+
if not n:
|
|
251
|
+
return p
|
|
252
|
+
if not p:
|
|
253
|
+
return n
|
|
254
|
+
if n == p:
|
|
255
|
+
return p
|
|
256
|
+
if n.startswith(p):
|
|
257
|
+
return n
|
|
258
|
+
if p.startswith(n):
|
|
259
|
+
return p
|
|
260
|
+
if p in n:
|
|
261
|
+
return n
|
|
262
|
+
if n in p:
|
|
263
|
+
return p
|
|
264
|
+
return f"{p} {n}".strip()
|
|
265
|
+
|
|
266
|
+
@staticmethod
|
|
267
|
+
def _snapshot_text(prev: str, incoming: str) -> str:
|
|
268
|
+
p = (prev or "").strip()
|
|
269
|
+
n = (incoming or "").strip()
|
|
270
|
+
if not n:
|
|
271
|
+
return p
|
|
272
|
+
if n == p:
|
|
273
|
+
return p
|
|
274
|
+
# WhisperLiveKit /asr (diff/snapshot) messages represent latest hypothesis.
|
|
275
|
+
# Replace instead of append to avoid repeated growth on revisions.
|
|
276
|
+
return n
|
|
277
|
+
|
|
278
|
+
    @staticmethod
    def _stabilize_asr_text(text: str) -> str:
        """Heuristically collapse repeated fragments from unstable interim decoding.

        Applies four regex passes; returns "" for empty/whitespace input.
        """
        t = str(text or "").strip()
        if not t:
            return ""
        # Collapse obvious repeated Latin chunks, e.g. "HelloHelloHello" -> "Hello".
        t = re.sub(r"(?i)\b([a-z][a-z0-9']{1,})(?:\1){1,}\b", r"\1", t)
        # Collapse repeated word with spaces, e.g. "hello hello hello" -> "hello".
        t = re.sub(r"(?i)\b([a-z][a-z0-9']{1,})\b(?:\s+\1\b){1,}", r"\1", t)
        # Collapse CJK/punctuation long runs (3+) caused by unstable interim decoding.
        t = re.sub(r"([\u4e00-\u9fff])\1{2,}", r"\1", t)
        t = re.sub(r"([,,。.!!??、;;::\"'`~\-])\1{2,}", r"\1", t)
        return t.strip()
|
+
|
|
292
|
+
@staticmethod
|
|
293
|
+
def _extract_asr_text_from_lines(lines: list[Any]) -> str:
|
|
294
|
+
parts: list[str] = []
|
|
295
|
+
for line in lines:
|
|
296
|
+
if not isinstance(line, dict):
|
|
297
|
+
continue
|
|
298
|
+
if int(line.get("speaker", 0) or 0) == -2:
|
|
299
|
+
continue
|
|
300
|
+
t = str(line.get("text") or "").strip()
|
|
301
|
+
if t:
|
|
302
|
+
parts.append(t)
|
|
303
|
+
return " ".join(parts).strip()
|
|
304
|
+
|
|
305
|
+
    def _extract_asr_message_text(
        self,
        data: dict[str, Any],
        committed_lines: list[dict[str, Any]],
    ) -> str:
        """Update *committed_lines* from one /asr message and return the full text.

        Mutates ``committed_lines`` in place:
        - "snapshot": replaces the whole committed list;
        - "diff": drops ``lines_pruned`` leading lines, appends ``new_lines``;
        - "ready_to_stop": terminal marker, returns "" without touching state;
        - any other message with a "lines" list: treated like a snapshot.

        The in-flight ``buffer_transcription`` (if any) is merged onto the
        committed text before returning.
        """
        t = str(data.get("type") or "").strip()
        if t == "snapshot":
            lines = data.get("lines") if isinstance(data.get("lines"), list) else []
            committed_lines.clear()
            committed_lines.extend([x for x in lines if isinstance(x, dict)])
        elif t == "diff":
            pruned = int(data.get("lines_pruned") or 0)
            if pruned > 0:
                del committed_lines[:pruned]
            new_lines = data.get("new_lines") if isinstance(data.get("new_lines"), list) else []
            committed_lines.extend([x for x in new_lines if isinstance(x, dict)])
        elif t == "ready_to_stop":
            return ""
        elif "lines" in data and isinstance(data.get("lines"), list):
            committed_lines.clear()
            committed_lines.extend([x for x in data.get("lines", []) if isinstance(x, dict)])
        base = self._extract_asr_text_from_lines(committed_lines)
        buffer_t = str(data.get("buffer_transcription") or "").strip()
        if buffer_t:
            return self._merge_text(base, buffer_t)
        return base
|
+
|
|
332
|
+
@staticmethod
|
|
333
|
+
def _extract_deepgram_text(data: dict[str, Any]) -> tuple[str, str | None, float | None]:
|
|
334
|
+
if data.get("type") != "Results":
|
|
335
|
+
return "", None, None
|
|
336
|
+
channel = data.get("channel", {})
|
|
337
|
+
alts = channel.get("alternatives", []) if isinstance(channel, dict) else []
|
|
338
|
+
if not alts:
|
|
339
|
+
return "", None, None
|
|
340
|
+
transcript = str(alts[0].get("transcript", "")).strip()
|
|
341
|
+
language = channel.get("detected_language")
|
|
342
|
+
confidence = alts[0].get("confidence")
|
|
343
|
+
return transcript, language, confidence
|
|
344
|
+
|
|
345
|
+
async def transcribe_stream(
|
|
346
|
+
self, stream: AsyncIterator[bytes]
|
|
347
|
+
) -> AsyncIterator[Any]:
|
|
348
|
+
if self._client is None:
|
|
349
|
+
raise RuntimeError("Provider not started — call start() first")
|
|
350
|
+
try:
|
|
351
|
+
import websockets
|
|
352
|
+
except ImportError:
|
|
353
|
+
raise ImportError(
|
|
354
|
+
"Install websockets: pip install openspeech[server]"
|
|
355
|
+
)
|
|
356
|
+
|
|
357
|
+
_t0 = time.perf_counter()
|
|
358
|
+
|
|
359
|
+
ws_url = self._build_ws_url(self.settings.model, self.settings.language)
|
|
360
|
+
is_asr = self.settings.ws_path.startswith("/asr")
|
|
361
|
+
stream_id = uuid4().hex[:12]
|
|
362
|
+
def trace(ev: str, **kw: Any) -> None:
|
|
363
|
+
self._trace(ev, stream_id=stream_id, **kw)
|
|
364
|
+
|
|
365
|
+
logger.debug("{}: connecting to WhisperLiveKit WebSocket...", self.name)
|
|
366
|
+
trace("stream_open", ws_url=ws_url, is_asr=is_asr)
|
|
367
|
+
async with websockets.connect(ws_url, ping_interval=20, ping_timeout=20) as ws:
|
|
368
|
+
_t_connected = time.perf_counter()
|
|
369
|
+
logger.info("{}: WS connected in {:.0f}ms", self.name,
|
|
370
|
+
(_t_connected - _t0) * 1000)
|
|
371
|
+
committed_lines: list[dict[str, Any]] = []
|
|
372
|
+
latest_text = ""
|
|
373
|
+
sent_chunks = 0
|
|
374
|
+
sent_bytes = 0
|
|
375
|
+
_resp_count = 0
|
|
376
|
+
|
|
377
|
+
_sender_stop = asyncio.Event()
|
|
378
|
+
|
|
379
|
+
async def send_audio() -> None:
|
|
380
|
+
nonlocal sent_chunks, sent_bytes
|
|
381
|
+
try:
|
|
382
|
+
async for chunk in stream:
|
|
383
|
+
if _sender_stop.is_set():
|
|
384
|
+
break
|
|
385
|
+
if chunk:
|
|
386
|
+
await ws.send(chunk)
|
|
387
|
+
sent_chunks += 1
|
|
388
|
+
sent_bytes += len(chunk)
|
|
389
|
+
if sent_chunks == 1:
|
|
390
|
+
logger.debug("{}: first frame sent at {:.0f}ms",
|
|
391
|
+
self.name, (time.perf_counter() - _t0) * 1000)
|
|
392
|
+
trace("stream_send_chunk", sent_chunks=sent_chunks, sent_bytes=sent_bytes)
|
|
393
|
+
# WhisperLiveKit stop sentinel for both /asr and /v1/listen.
|
|
394
|
+
if not _sender_stop.is_set():
|
|
395
|
+
await ws.send(b"")
|
|
396
|
+
trace("stream_send_stop", sent_chunks=sent_chunks, sent_bytes=sent_bytes)
|
|
397
|
+
except websockets.exceptions.ConnectionClosed:
|
|
398
|
+
pass
|
|
399
|
+
finally:
|
|
400
|
+
logger.debug(
|
|
401
|
+
"{}: stream sender done, sent {} frames ({} bytes) in {:.0f}ms",
|
|
402
|
+
self.name, sent_chunks, sent_bytes,
|
|
403
|
+
(time.perf_counter() - _t0) * 1000,
|
|
404
|
+
)
|
|
405
|
+
|
|
406
|
+
import asyncio
|
|
407
|
+
send_task = asyncio.create_task(send_audio())
|
|
408
|
+
try:
|
|
409
|
+
while True:
|
|
410
|
+
try:
|
|
411
|
+
msg = await asyncio.wait_for(ws.recv(), timeout=1.2)
|
|
412
|
+
except asyncio.TimeoutError:
|
|
413
|
+
if send_task.done():
|
|
414
|
+
trace("stream_recv_timeout_done")
|
|
415
|
+
break
|
|
416
|
+
continue
|
|
417
|
+
if isinstance(msg, bytes):
|
|
418
|
+
trace("stream_recv_binary", size=len(msg))
|
|
419
|
+
continue
|
|
420
|
+
data = json.loads(msg)
|
|
421
|
+
msg_type = str(data.get("type") or "")
|
|
422
|
+
if is_asr:
|
|
423
|
+
trace(
|
|
424
|
+
"stream_recv_asr",
|
|
425
|
+
type=msg_type,
|
|
426
|
+
seq=data.get("seq"),
|
|
427
|
+
status=data.get("status"),
|
|
428
|
+
n_lines=data.get("n_lines"),
|
|
429
|
+
lines_pruned=data.get("lines_pruned"),
|
|
430
|
+
buffer_transcription=self._truncate(data.get("buffer_transcription")),
|
|
431
|
+
new_lines=self._truncate(
|
|
432
|
+
" | ".join(
|
|
433
|
+
str(x.get("text", ""))
|
|
434
|
+
for x in (data.get("new_lines") or [])
|
|
435
|
+
if isinstance(x, dict)
|
|
436
|
+
)
|
|
437
|
+
),
|
|
438
|
+
)
|
|
439
|
+
else:
|
|
440
|
+
transcript, _lang, _conf = self._extract_deepgram_text(data)
|
|
441
|
+
trace(
|
|
442
|
+
"stream_recv_deepgram",
|
|
443
|
+
type=msg_type,
|
|
444
|
+
transcript=self._truncate(transcript),
|
|
445
|
+
)
|
|
446
|
+
_resp_count += 1
|
|
447
|
+
if _resp_count == 1:
|
|
448
|
+
logger.debug("{}: first response at {:.0f}ms type={}",
|
|
449
|
+
self.name, (time.perf_counter() - _t0) * 1000, msg_type)
|
|
450
|
+
if msg_type in {"error", "Error"}:
|
|
451
|
+
detail = str(data.get("detail") or data.get("message") or data)
|
|
452
|
+
trace("stream_error", detail=self._truncate(detail))
|
|
453
|
+
raise RuntimeError(f"WhisperLiveKit stream error: {detail}")
|
|
454
|
+
if is_asr:
|
|
455
|
+
parsed_text = self._extract_asr_message_text(data, committed_lines)
|
|
456
|
+
stabilized_text = self._stabilize_asr_text(parsed_text)
|
|
457
|
+
trace(
|
|
458
|
+
"stream_parse_asr",
|
|
459
|
+
parsed_text=self._truncate(parsed_text),
|
|
460
|
+
stabilized_text=self._truncate(stabilized_text),
|
|
461
|
+
latest_text_before=self._truncate(latest_text),
|
|
462
|
+
)
|
|
463
|
+
if not stabilized_text:
|
|
464
|
+
continue
|
|
465
|
+
merged = self._snapshot_text(latest_text, stabilized_text)
|
|
466
|
+
trace(
|
|
467
|
+
"stream_merge_asr",
|
|
468
|
+
merged_text=self._truncate(merged),
|
|
469
|
+
changed=(merged != latest_text),
|
|
470
|
+
)
|
|
471
|
+
if merged == latest_text:
|
|
472
|
+
continue
|
|
473
|
+
latest_text = merged
|
|
474
|
+
trace("stream_emit_asr", emit_text=self._truncate(latest_text))
|
|
475
|
+
yield Transcription(
|
|
476
|
+
text=latest_text,
|
|
477
|
+
language=self.settings.language or None,
|
|
478
|
+
confidence=None,
|
|
479
|
+
is_partial=True,
|
|
480
|
+
)
|
|
481
|
+
else:
|
|
482
|
+
transcript, language, confidence = self._extract_deepgram_text(data)
|
|
483
|
+
trace(
|
|
484
|
+
"stream_parse_deepgram",
|
|
485
|
+
transcript=self._truncate(transcript),
|
|
486
|
+
latest_text_before=self._truncate(latest_text),
|
|
487
|
+
)
|
|
488
|
+
if not transcript:
|
|
489
|
+
continue
|
|
490
|
+
merged = self._merge_text(latest_text, transcript)
|
|
491
|
+
trace(
|
|
492
|
+
"stream_merge_deepgram",
|
|
493
|
+
merged_text=self._truncate(merged),
|
|
494
|
+
changed=(merged != latest_text),
|
|
495
|
+
)
|
|
496
|
+
if merged == latest_text:
|
|
497
|
+
continue
|
|
498
|
+
latest_text = merged
|
|
499
|
+
trace("stream_emit_deepgram", emit_text=self._truncate(latest_text))
|
|
500
|
+
yield Transcription(
|
|
501
|
+
text=latest_text,
|
|
502
|
+
language=language or self.settings.language or None,
|
|
503
|
+
confidence=confidence,
|
|
504
|
+
is_partial=True,
|
|
505
|
+
)
|
|
506
|
+
finally:
|
|
507
|
+
_sender_stop.set()
|
|
508
|
+
send_task.cancel()
|
|
509
|
+
try:
|
|
510
|
+
await send_task
|
|
511
|
+
except asyncio.CancelledError:
|
|
512
|
+
pass
|
|
513
|
+
trace("stream_close", sent_chunks=sent_chunks, sent_bytes=sent_bytes)
|
|
514
|
+
|
|
515
|
+
# Emit final snapshot after the receive loop exits
|
|
516
|
+
if latest_text:
|
|
517
|
+
logger.info("{}: final result at {:.0f}ms responses={} text='{}'",
|
|
518
|
+
self.name, (time.perf_counter() - _t0) * 1000,
|
|
519
|
+
_resp_count, latest_text[:60])
|
|
520
|
+
yield Transcription(
|
|
521
|
+
text=latest_text,
|
|
522
|
+
language=self.settings.language or None,
|
|
523
|
+
confidence=None,
|
|
524
|
+
is_partial=False,
|
|
525
|
+
)
|
|
526
|
+
|
|
527
|
+
logger.info(
|
|
528
|
+
"{}: stream completed in {:.0f}ms, frames={}",
|
|
529
|
+
self.name, (time.perf_counter() - _t0) * 1000, sent_chunks,
|
|
530
|
+
)
|