openspeechapi 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openspeech/__init__.py +75 -0
- openspeech/__main__.py +5 -0
- openspeech/cli.py +413 -0
- openspeech/client/__init__.py +4 -0
- openspeech/client/client.py +145 -0
- openspeech/config.py +212 -0
- openspeech/core/__init__.py +0 -0
- openspeech/core/base.py +75 -0
- openspeech/core/enums.py +39 -0
- openspeech/core/models.py +61 -0
- openspeech/core/registry.py +37 -0
- openspeech/core/settings.py +8 -0
- openspeech/demo.py +675 -0
- openspeech/dispatch/__init__.py +0 -0
- openspeech/dispatch/context.py +34 -0
- openspeech/dispatch/dispatcher.py +661 -0
- openspeech/dispatch/executors/__init__.py +0 -0
- openspeech/dispatch/executors/base.py +34 -0
- openspeech/dispatch/executors/in_process.py +66 -0
- openspeech/dispatch/executors/remote.py +64 -0
- openspeech/dispatch/executors/subprocess_exec.py +446 -0
- openspeech/dispatch/fanout.py +95 -0
- openspeech/dispatch/filters.py +73 -0
- openspeech/dispatch/lifecycle.py +178 -0
- openspeech/dispatch/watcher.py +82 -0
- openspeech/engine_catalog.py +236 -0
- openspeech/engine_registry.yaml +347 -0
- openspeech/exceptions.py +51 -0
- openspeech/factory.py +325 -0
- openspeech/local_engines/__init__.py +12 -0
- openspeech/local_engines/aim_resolver.py +91 -0
- openspeech/local_engines/backends/__init__.py +1 -0
- openspeech/local_engines/backends/docker_backend.py +490 -0
- openspeech/local_engines/backends/native_backend.py +902 -0
- openspeech/local_engines/base.py +30 -0
- openspeech/local_engines/engines/__init__.py +1 -0
- openspeech/local_engines/engines/faster_whisper.py +36 -0
- openspeech/local_engines/engines/fish_speech.py +33 -0
- openspeech/local_engines/engines/sherpa_onnx.py +56 -0
- openspeech/local_engines/engines/whisper.py +41 -0
- openspeech/local_engines/engines/whisperlivekit.py +60 -0
- openspeech/local_engines/manager.py +208 -0
- openspeech/local_engines/models.py +50 -0
- openspeech/local_engines/progress.py +69 -0
- openspeech/local_engines/registry.py +19 -0
- openspeech/local_engines/task_store.py +52 -0
- openspeech/local_engines/tasks.py +71 -0
- openspeech/logging_config.py +607 -0
- openspeech/observe/__init__.py +0 -0
- openspeech/observe/base.py +79 -0
- openspeech/observe/debug.py +44 -0
- openspeech/observe/latency.py +19 -0
- openspeech/observe/metrics.py +47 -0
- openspeech/observe/tracing.py +44 -0
- openspeech/observe/usage.py +27 -0
- openspeech/providers/__init__.py +0 -0
- openspeech/providers/_template.py +101 -0
- openspeech/providers/stt/__init__.py +0 -0
- openspeech/providers/stt/alibaba.py +86 -0
- openspeech/providers/stt/assemblyai.py +135 -0
- openspeech/providers/stt/azure_speech.py +99 -0
- openspeech/providers/stt/baidu.py +135 -0
- openspeech/providers/stt/deepgram.py +311 -0
- openspeech/providers/stt/elevenlabs.py +385 -0
- openspeech/providers/stt/faster_whisper.py +211 -0
- openspeech/providers/stt/google_cloud.py +106 -0
- openspeech/providers/stt/iflytek.py +427 -0
- openspeech/providers/stt/macos_speech.py +226 -0
- openspeech/providers/stt/openai.py +84 -0
- openspeech/providers/stt/sherpa_onnx.py +353 -0
- openspeech/providers/stt/tencent.py +212 -0
- openspeech/providers/stt/volcengine.py +107 -0
- openspeech/providers/stt/whisper.py +153 -0
- openspeech/providers/stt/whisperlivekit.py +530 -0
- openspeech/providers/stt/windows_speech.py +249 -0
- openspeech/providers/tts/__init__.py +0 -0
- openspeech/providers/tts/alibaba.py +95 -0
- openspeech/providers/tts/azure_speech.py +123 -0
- openspeech/providers/tts/baidu.py +143 -0
- openspeech/providers/tts/coqui.py +64 -0
- openspeech/providers/tts/cosyvoice.py +90 -0
- openspeech/providers/tts/deepgram.py +174 -0
- openspeech/providers/tts/elevenlabs.py +311 -0
- openspeech/providers/tts/fish_speech.py +158 -0
- openspeech/providers/tts/google_cloud.py +107 -0
- openspeech/providers/tts/iflytek.py +209 -0
- openspeech/providers/tts/macos_say.py +251 -0
- openspeech/providers/tts/minimax.py +122 -0
- openspeech/providers/tts/openai.py +104 -0
- openspeech/providers/tts/piper.py +104 -0
- openspeech/providers/tts/tencent.py +189 -0
- openspeech/providers/tts/volcengine.py +117 -0
- openspeech/providers/tts/windows_sapi.py +234 -0
- openspeech/server/__init__.py +1 -0
- openspeech/server/app.py +72 -0
- openspeech/server/auth.py +42 -0
- openspeech/server/middleware.py +75 -0
- openspeech/server/routes/__init__.py +1 -0
- openspeech/server/routes/management.py +848 -0
- openspeech/server/routes/stt.py +121 -0
- openspeech/server/routes/tts.py +159 -0
- openspeech/server/routes/webui.py +29 -0
- openspeech/server/webui/app.js +2649 -0
- openspeech/server/webui/index.html +216 -0
- openspeech/server/webui/styles.css +617 -0
- openspeech/server/ws/__init__.py +1 -0
- openspeech/server/ws/stt_stream.py +263 -0
- openspeech/server/ws/tts_stream.py +207 -0
- openspeech/telemetry/__init__.py +21 -0
- openspeech/telemetry/perf.py +307 -0
- openspeech/utils/__init__.py +5 -0
- openspeech/utils/audio_converter.py +406 -0
- openspeech/utils/audio_playback.py +156 -0
- openspeech/vendor_registry.yaml +74 -0
- openspeechapi-0.1.0.dist-info/METADATA +101 -0
- openspeechapi-0.1.0.dist-info/RECORD +118 -0
- openspeechapi-0.1.0.dist-info/WHEEL +4 -0
- openspeechapi-0.1.0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""Azure Speech STT provider adapter (batch, httpx)."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from openspeech.logging_config import logger
|
|
5
|
+
import time
|
|
6
|
+
from collections.abc import AsyncIterator
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
import httpx
|
|
11
|
+
|
|
12
|
+
from openspeech.core.base import STTProvider
|
|
13
|
+
|
|
14
|
+
from openspeech.core.enums import Capability, ExecMode, ProviderType
|
|
15
|
+
from openspeech.core.models import AudioData, STTOptions, Transcription
|
|
16
|
+
from openspeech.core.settings import BaseSettings
|
|
17
|
+
|
|
18
|
+
@dataclass
class AzureSpeechSTTSettings(BaseSettings):
    """Configuration for the Azure Speech batch STT adapter."""

    # Azure Cognitive Services subscription key (required at runtime).
    subscription_key: str = ""
    # Azure region hosting the Speech resource.
    region: str = "eastus"
    # Default recognition language (BCP-47 tag), used when the request
    # options do not specify one.
    language: str = "en-US"
|
|
23
|
+
|
|
24
|
+
class AzureSpeechSTT(STTProvider):
    """Batch speech-to-text adapter for the Azure Speech REST endpoint.

    Sends a complete WAV payload in a single POST to the short-audio
    recognition endpoint; streaming input is not supported by this adapter.
    """

    name = "azure-stt"
    provider_type = ProviderType.STT
    execution_mode = ExecMode.REMOTE
    settings_cls = AzureSpeechSTTSettings
    capabilities = {Capability.BATCH, Capability.MULTILINGUAL}
    field_options = {"language": ["en-US", "zh-CN", "ja-JP", "ko-KR", "es-ES", "fr-FR", "de-DE", "pt-BR", "it-IT", "ru-RU", "ar-SA", "hi-IN"], "region": ["eastus", "westus2", "westeurope", "eastasia", "southeastasia"]}

    def __init__(self, settings: AzureSpeechSTTSettings | None = None) -> None:
        self.settings = settings if settings is not None else AzureSpeechSTTSettings()
        self._client: httpx.AsyncClient | None = None
        # True when this instance created the client and must close it.
        self._owns_client: bool = True

    def set_http_client(self, client) -> None:
        """Inject a shared AsyncClient; this provider then won't close it."""
        self._client = client
        self._owns_client = False

    async def start(self) -> None:
        """Lazily create an owned httpx client unless one was injected."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=60.0)
            self._owns_client = True
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Close the client (only if owned) and drop the reference."""
        if self._client and self._owns_client:
            await self._client.aclose()
        self._client = None
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        # Healthy when a subscription key is configured; no network call.
        return bool(self.settings.subscription_key)

    async def transcribe(
        self, audio: AudioData, opts: STTOptions | None = None
    ) -> Transcription:
        """Transcribe a WAV buffer via the Azure short-audio REST API.

        Returns an empty Transcription when Azure reports any
        RecognitionStatus other than "Success" (silence, noise, errors).

        Raises:
            RuntimeError: if the provider was not started, or on a non-200
                HTTP response from Azure.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, audio={} bytes", self.name, len(audio.data))
        _t0 = time.perf_counter()

        effective = opts or STTOptions()
        language = effective.language or self.settings.language
        endpoint = (
            f"https://{self.settings.region}.stt.speech.microsoft.com"
            "/speech/recognition/conversation/cognitiveservices/v1"
            f"?language={language}"
        )
        resp = await self._client.post(
            endpoint,
            headers={
                "Ocp-Apim-Subscription-Key": self.settings.subscription_key,
                "Content-Type": "audio/wav",
            },
            content=audio.data,
        )
        if resp.status_code != 200:
            raise RuntimeError(
                f"Azure Speech STT API error {resp.status_code}: {resp.text}"
            )

        payload = resp.json()
        if payload.get("RecognitionStatus", "") != "Success":
            # No usable hypothesis — return an empty transcription.
            return Transcription(text="", language=language)

        result = Transcription(
            text=payload.get("DisplayText", ""),
            language=language,
            # NOTE(review): the simple-format response may omit "Confidence";
            # this then stays None — confirm against the API version in use.
            confidence=payload.get("Confidence"),
        )
        logger.info("{}: completed in {:.0f}ms, result={} chars", self.name, (time.perf_counter() - _t0) * 1000, len(result.text))
        return result

    async def transcribe_stream(
        self, stream: AsyncIterator[bytes]
    ) -> AsyncIterator[Any]:
        """Streaming is unsupported for this batch-only adapter."""
        raise NotImplementedError(
            "Azure Speech STT batch provider does not support streaming input"
        )
        yield  # pragma: no cover
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
"""Baidu Cloud ASR STT provider adapter."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import base64
|
|
5
|
+
from openspeech.logging_config import logger
|
|
6
|
+
import time
|
|
7
|
+
from collections.abc import AsyncIterator
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
import httpx
|
|
12
|
+
|
|
13
|
+
from openspeech.core.base import STTProvider
|
|
14
|
+
|
|
15
|
+
from openspeech.core.enums import Capability, ExecMode, ProviderType
|
|
16
|
+
from openspeech.core.models import AudioData, STTOptions, Transcription
|
|
17
|
+
from openspeech.core.settings import BaseSettings
|
|
18
|
+
|
|
19
|
+
@dataclass
class BaiduSTTSettings(BaseSettings):
    """Configuration for the Baidu Cloud ASR adapter."""

    # Baidu Cloud application API key.
    api_key: str = ""
    # Baidu Cloud application secret key.
    secret_key: str = ""
    # Recognition model id: 1537=Mandarin, 1737=English, 1637=Cantonese
    dev_pid: int = 1537
|
|
24
|
+
|
|
25
|
+
class BaiduSTT(STTProvider):
    """Baidu Cloud short-speech ASR adapter (OAuth token + JSON POST)."""

    name = "baidu-stt"
    provider_type = ProviderType.STT
    execution_mode = ExecMode.IN_PROCESS
    settings_cls = BaiduSTTSettings
    capabilities = {Capability.BATCH, Capability.MULTILINGUAL}
    field_options = {"dev_pid": [1537, 1737, 1637, 1936, 1836]}

    def __init__(self, settings: BaiduSTTSettings | None = None) -> None:
        self.settings = settings or BaiduSTTSettings()
        self._client: httpx.AsyncClient | None = None
        # True when this instance created the client and must close it.
        self._owns_client: bool = True
        # OAuth access-token cache; refreshed lazily by _get_token().
        self._token: str | None = None
        self._token_expires_at: float = 0.0

    def set_http_client(self, client) -> None:
        """Inject a shared AsyncClient; this provider then won't close it."""
        self._client = client
        self._owns_client = False

    async def start(self) -> None:
        """Lazily create an owned httpx client unless one was injected."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=60.0)
            self._owns_client = True
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Close the client (only if owned) and reset cached token state."""
        if self._client and self._owns_client:
            await self._client.aclose()
        self._client = None
        self._token = None
        self._token_expires_at = 0.0
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        # Healthy when both credentials are configured; no network call.
        return bool(self.settings.api_key) and bool(self.settings.secret_key)

    async def _get_token(self) -> str:
        """Fetch or return cached OAuth access token.

        Tokens are cached until 60 seconds before their reported expiry.

        Raises:
            RuntimeError: if the provider was not started or Baidu rejects
                the credentials.
        """
        if self._token and time.time() < self._token_expires_at:
            return self._token

        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")

        resp = await self._client.get(
            "https://aip.baidubce.com/oauth/2.0/token",
            params={
                "grant_type": "client_credentials",
                "client_id": self.settings.api_key,
                "client_secret": self.settings.secret_key,
            },
        )
        resp.raise_for_status()
        data = resp.json()

        if "access_token" not in data:
            raise RuntimeError(
                f"Baidu OAuth error: {data.get('error_description', 'unknown')}"
            )

        # Bind to a local so the return type is plainly `str` (removes the
        # previous `# type: ignore[return-value]`) and so a concurrent stop()
        # clearing self._token cannot make us return None.
        token: str = data["access_token"]
        self._token = token
        # Refresh 60s early to avoid using a token that expires in flight.
        self._token_expires_at = time.time() + data.get("expires_in", 2592000) - 60
        return token

    async def transcribe(
        self, audio: AudioData, opts: STTOptions | None = None
    ) -> Transcription:
        """Transcribe a 16 kHz mono WAV buffer via Baidu's REST endpoint.

        NOTE(review): ``opts`` is currently unused — language selection is
        driven by ``settings.dev_pid`` and format/rate are hard-coded to
        wav/16000/mono. Confirm against callers before wiring opts through.

        Raises:
            RuntimeError: if the provider was not started or Baidu returns
                a non-zero ``err_no``.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, audio={} bytes", self.name, len(audio.data))
        _t0 = time.perf_counter()

        token = await self._get_token()
        base64_audio = base64.b64encode(audio.data).decode("utf-8")

        payload = {
            "format": "wav",
            "rate": 16000,
            "channel": 1,
            "cuid": "openspeech",
            "token": token,
            "dev_pid": self.settings.dev_pid,
            "speech": base64_audio,
            # Baidu expects the length of the RAW audio, not the base64 text.
            "len": len(audio.data),
        }

        resp = await self._client.post(
            "https://vop.baidu.com/server_api",
            json=payload,
        )
        resp.raise_for_status()
        data = resp.json()

        err_no = data.get("err_no", 0)
        if err_no != 0:
            raise RuntimeError(
                f"Baidu ASR error [{err_no}]: {data.get('err_msg', 'unknown')}"
            )

        # "result" is a list of hypotheses; the first is the best one.
        results = data.get("result", [])
        text = results[0] if results else ""

        result = Transcription(text=text)
        logger.info("{}: completed in {:.0f}ms, result={} chars", self.name, (time.perf_counter() - _t0) * 1000, len(result.text))
        return result

    async def transcribe_stream(
        self, stream: AsyncIterator[bytes]
    ) -> AsyncIterator[Any]:
        """Streaming recognition is not implemented for Baidu."""
        raise NotImplementedError("Baidu STT streaming not implemented")
        yield  # noqa: unreachable — makes this an async generator
|
|
@@ -0,0 +1,311 @@
|
|
|
1
|
+
"""Deepgram STT provider adapter (batch + streaming, httpx-based, no SDK needed)."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import asyncio
|
|
5
|
+
import inspect
|
|
6
|
+
import json
|
|
7
|
+
from openspeech.logging_config import logger
|
|
8
|
+
import time
|
|
9
|
+
from collections.abc import AsyncIterator
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
import httpx
|
|
14
|
+
|
|
15
|
+
from openspeech.core.base import STTProvider
|
|
16
|
+
|
|
17
|
+
from openspeech.core.enums import Capability, ExecMode, ProviderType
|
|
18
|
+
from openspeech.core.models import AudioData, STTOptions, Transcription, Word
|
|
19
|
+
from openspeech.core.settings import BaseSettings
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _ws_connect_with_headers(websockets_mod, url: str, headers: dict[str, str]):
|
|
23
|
+
"""Compatible connect kwargs across websockets versions."""
|
|
24
|
+
try:
|
|
25
|
+
sig = inspect.signature(websockets_mod.connect)
|
|
26
|
+
if "additional_headers" in sig.parameters:
|
|
27
|
+
return websockets_mod.connect(url, additional_headers=headers)
|
|
28
|
+
except Exception:
|
|
29
|
+
pass
|
|
30
|
+
return websockets_mod.connect(url, extra_headers=headers)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass
class DeepgramSTTSettings(BaseSettings):
    """Configuration for the Deepgram STT adapter."""

    # Deepgram API key (required at runtime).
    api_key: str = ""
    # Recognition model identifier.
    model: str = "nova-2"
    # Default language (BCP-47 tag, or "multi").
    language: str = "en-US"
    # Ask Deepgram to add punctuation to transcripts.
    punctuate: bool = True
    # Ask Deepgram to apply smart formatting (numbers, dates, ...).
    smart_format: bool = True
|
|
40
|
+
|
|
41
|
+
class DeepgramSTT(STTProvider):
    """Deepgram STT adapter: batch over HTTPS, streaming over WebSocket."""

    name = "deepgram"
    provider_type = ProviderType.STT
    execution_mode = ExecMode.IN_PROCESS
    settings_cls = DeepgramSTTSettings
    capabilities = {
        Capability.STREAMING,
        Capability.BATCH,
        Capability.MULTILINGUAL,
    }
    # Values offered to the UI/config layer for these settings fields.
    field_options = {
        "model": [
            "nova-3", "nova-3-general", "nova-3-medical",
            "nova-2", "nova-2-general", "nova-2-meeting", "nova-2-phonecall",
            "nova-2-voicemail", "nova-2-finance", "nova-2-medical",
            "enhanced", "enhanced-general",
            "base", "base-general",
            "whisper-large", "whisper-medium", "whisper-small",
        ],
        "language": [
            "multi", "en", "en-US", "en-GB", "en-AU", "en-IN",
            "zh", "zh-CN", "zh-TW", "zh-HK",
            "ja", "ko", "ko-KR",
            "es", "es-419", "fr", "fr-CA", "de", "de-CH",
            "pt", "pt-BR", "pt-PT", "it", "nl", "nl-BE",
            "ru", "uk", "pl", "cs", "sk",
            "sv", "sv-SE", "da", "da-DK", "no", "fi",
            "tr", "el", "ro", "hu", "bg",
            "ar", "he", "fa", "hi", "hi-Latn", "bn", "ta", "te", "ur",
            "id", "ms", "th", "th-TH", "vi", "tl",
        ],
    }

    def __init__(self, settings: DeepgramSTTSettings | None = None) -> None:
        self.settings = settings or DeepgramSTTSettings()
        self._client: httpx.AsyncClient | None = None
        # True when this instance created the client and must close it.
        self._owns_client: bool = True

    def set_http_client(self, client) -> None:
        """Inject a shared AsyncClient; this provider then won't close it."""
        self._client = client
        self._owns_client = False

    async def start(self) -> None:
        # Lazily create an owned client unless one was injected.
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=60.0)
            self._owns_client = True
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        # Close only if we own the client; always drop the reference.
        if self._client and self._owns_client:
            await self._client.aclose()
        self._client = None
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        # Cloud provider: healthy if API key is configured (client is lazy-started)
        return bool(self.settings.api_key)

    async def transcribe(
        self, audio: AudioData, opts: STTOptions | None = None
    ) -> Transcription:
        """Batch transcription: POST a full WAV buffer to /v1/listen.

        Raises:
            RuntimeError: if the provider was not started, or on a non-200
                HTTP response from Deepgram.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, audio={} bytes", self.name, len(audio.data))
        _t0 = time.perf_counter()
        opts = opts or STTOptions()

        # Per-request language overrides the configured default.
        language = opts.language or self.settings.language
        params = {
            "model": self.settings.model,
            "language": language,
            # Deepgram expects lowercase "true"/"false" query values.
            "punctuate": str(self.settings.punctuate).lower(),
            "smart_format": str(self.settings.smart_format).lower(),
        }
        headers = {
            "Authorization": f"Token {self.settings.api_key}",
            "Content-Type": "audio/wav",
        }

        resp = await self._client.post(
            "https://api.deepgram.com/v1/listen",
            params=params,
            headers=headers,
            content=audio.data,
        )
        if resp.status_code != 200:
            raise RuntimeError(f"Deepgram API error ({resp.status_code}): {resp.text}")

        data = resp.json()
        # Take the first channel's first alternative (best hypothesis).
        channel = data.get("results", {}).get("channels", [{}])[0]
        alt = channel.get("alternatives", [{}])[0]

        # Convert Deepgram's seconds-based word timings to milliseconds.
        words: list[Word] = []
        for w in alt.get("words", []):
            words.append(
                Word(
                    text=w.get("word", ""),
                    start_ms=int(w.get("start", 0) * 1000),
                    end_ms=int(w.get("end", 0) * 1000),
                    confidence=w.get("confidence"),
                )
            )

        result = Transcription(
            text=alt.get("transcript", ""),
            language=channel.get("detected_language"),
            confidence=alt.get("confidence"),
            words=words if words else None,
        )
        logger.info("{}: completed in {:.0f}ms, result={} chars", self.name, (time.perf_counter() - _t0) * 1000, len(result.text))
        return result

    async def transcribe_stream(
        self, stream: AsyncIterator[bytes]
    ) -> AsyncIterator[Any]:
        """Stream audio chunks to Deepgram via WebSocket and yield transcriptions.

        Deepgram streaming protocol (``interim_results=true``):
        - Each ``Results`` message contains an utterance-level transcript with
          ``is_final`` (bool) indicating whether the utterance is finalized.
        - ``speech_final`` (bool) indicates end-of-speech (VAD silence).
        - This implementation accumulates finalized utterances and appends the
          latest interim text so each yield is a **full-text snapshot** that
          the frontend can display directly via ``streamTextSnapshot``.
        - ``is_partial`` on the yielded ``Transcription`` is set according to
          ``speech_final`` so the server can forward ``"type": "final"`` and
          trigger auto-stop on the client.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")

        # Imported lazily so the provider works without websockets installed
        # when only batch transcription is used.
        import websockets

        _t0 = time.perf_counter()
        _frames_sent = 0

        # NOTE(review): streaming assumes 16 kHz 16-bit linear PCM input
        # (encoding/sample_rate below) — confirm upstream audio matches.
        params = (
            f"model={self.settings.model}"
            f"&language={self.settings.language}"
            f"&punctuate={'true' if self.settings.punctuate else 'false'}"
            f"&smart_format={'true' if self.settings.smart_format else 'false'}"
            f"&encoding=linear16&sample_rate=16000"
            f"&interim_results=true"
            f"&vad_events=true"
        )
        url = f"wss://api.deepgram.com/v1/listen?{params}"
        headers = {"Authorization": f"Token {self.settings.api_key}"}

        # Receiver pushes Transcriptions here; None is the end-of-stream
        # sentinel consumed by the yield loop below.
        results: asyncio.Queue[Transcription | None] = asyncio.Queue()
        # Set when the receiver decides no more audio should be sent
        # (speech_final seen, connection closed, or receiver exiting).
        _sender_stop = asyncio.Event()

        logger.debug("{}: connecting to Deepgram WebSocket...", self.name)
        async with _ws_connect_with_headers(websockets, url, headers) as ws:
            _t_connected = time.perf_counter()
            logger.info("{}: WS connected in {:.0f}ms", self.name,
                        (_t_connected - _t0) * 1000)

            async def send_audio() -> None:
                # Forward caller-provided chunks to the socket until the
                # input stream ends or the receiver asks us to stop.
                nonlocal _frames_sent
                try:
                    async for chunk in stream:
                        if _sender_stop.is_set():
                            break
                        if chunk:
                            await ws.send(chunk)
                            _frames_sent += 1
                            if _frames_sent == 1:
                                logger.debug("{}: first frame sent at {:.0f}ms",
                                             self.name, (time.perf_counter() - _t0) * 1000)
                    # Graceful end-of-input: tell Deepgram to finalize.
                    if not _sender_stop.is_set():
                        await ws.send(json.dumps({"type": "CloseStream"}))
                except websockets.exceptions.ConnectionClosed:
                    # Server closed first; receiver handles the final state.
                    pass
                finally:
                    logger.debug(
                        "{}: stream sender done, sent {} frames in {:.0f}ms",
                        self.name, _frames_sent, (time.perf_counter() - _t0) * 1000,
                    )

            async def receive_results() -> None:
                # Accumulate finalized utterance parts so each yield is a
                # full-text snapshot (not just the latest utterance fragment).
                confirmed_parts: list[str] = []
                _resp_count = 0

                try:
                    async for msg in ws:
                        data = json.loads(msg)
                        # Ignore non-transcript messages (Metadata, VAD events).
                        if data.get("type") != "Results":
                            continue

                        _resp_count += 1
                        channel = data.get("channel", {})
                        alts = channel.get("alternatives", [])
                        if not alts:
                            continue
                        transcript = alts[0].get("transcript", "").strip()
                        is_final = data.get("is_final", False)
                        speech_final = data.get("speech_final", False)
                        detected_language = channel.get(
                            "detected_language", self.settings.language
                        )
                        confidence = alts[0].get("confidence")

                        if _resp_count == 1:
                            logger.debug("{}: first response at {:.0f}ms is_final={} speech_final={}",
                                         self.name, (time.perf_counter() - _t0) * 1000,
                                         is_final, speech_final)

                        if is_final and transcript:
                            confirmed_parts.append(transcript)

                        # Build full-text snapshot: confirmed + current interim
                        if is_final:
                            snapshot = " ".join(confirmed_parts)
                        else:
                            parts = list(confirmed_parts)
                            if transcript:
                                parts.append(transcript)
                            snapshot = " ".join(parts)

                        if not snapshot:
                            continue

                        # speech_final = Deepgram VAD detected end of speech
                        is_partial = not speech_final
                        await results.put(Transcription(
                            text=snapshot,
                            confidence=confidence,
                            language=detected_language or self.settings.language,
                            is_partial=is_partial,
                        ))

                        if speech_final:
                            # Stop sending audio and end the stream.
                            _sender_stop.set()
                            logger.info("{}: final result at {:.0f}ms responses={} text='{}'",
                                        self.name, (time.perf_counter() - _t0) * 1000,
                                        _resp_count, snapshot[:60])
                            break
                except websockets.exceptions.ConnectionClosed:
                    # Emit whatever we have as final
                    snapshot = " ".join(confirmed_parts).strip()
                    if snapshot:
                        await results.put(Transcription(
                            text=snapshot, is_partial=False,
                            language=self.settings.language,
                        ))
                    _sender_stop.set()
                finally:
                    # Always unblock the sender and the consumer loop.
                    _sender_stop.set()
                    await results.put(None)

            send_task = asyncio.create_task(send_audio())
            recv_task = asyncio.create_task(receive_results())

            # Drain the queue until the receiver posts the None sentinel.
            while True:
                item = await results.get()
                if item is None:
                    break
                yield item

            logger.info(
                "{}: stream completed in {:.0f}ms, frames={}",
                self.name, (time.perf_counter() - _t0) * 1000, _frames_sent,
            )
            # Tear down: cancel the sender (it may be blocked on the input
            # stream), then wait for the receiver to finish cleanly.
            send_task.cancel()
            try:
                await send_task
            except asyncio.CancelledError:
                pass
            await recv_task
|