openspeechapi 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openspeech/__init__.py +75 -0
- openspeech/__main__.py +5 -0
- openspeech/cli.py +413 -0
- openspeech/client/__init__.py +4 -0
- openspeech/client/client.py +145 -0
- openspeech/config.py +212 -0
- openspeech/core/__init__.py +0 -0
- openspeech/core/base.py +75 -0
- openspeech/core/enums.py +39 -0
- openspeech/core/models.py +61 -0
- openspeech/core/registry.py +37 -0
- openspeech/core/settings.py +8 -0
- openspeech/demo.py +675 -0
- openspeech/dispatch/__init__.py +0 -0
- openspeech/dispatch/context.py +34 -0
- openspeech/dispatch/dispatcher.py +661 -0
- openspeech/dispatch/executors/__init__.py +0 -0
- openspeech/dispatch/executors/base.py +34 -0
- openspeech/dispatch/executors/in_process.py +66 -0
- openspeech/dispatch/executors/remote.py +64 -0
- openspeech/dispatch/executors/subprocess_exec.py +446 -0
- openspeech/dispatch/fanout.py +95 -0
- openspeech/dispatch/filters.py +73 -0
- openspeech/dispatch/lifecycle.py +178 -0
- openspeech/dispatch/watcher.py +82 -0
- openspeech/engine_catalog.py +236 -0
- openspeech/engine_registry.yaml +347 -0
- openspeech/exceptions.py +51 -0
- openspeech/factory.py +325 -0
- openspeech/local_engines/__init__.py +12 -0
- openspeech/local_engines/aim_resolver.py +91 -0
- openspeech/local_engines/backends/__init__.py +1 -0
- openspeech/local_engines/backends/docker_backend.py +490 -0
- openspeech/local_engines/backends/native_backend.py +902 -0
- openspeech/local_engines/base.py +30 -0
- openspeech/local_engines/engines/__init__.py +1 -0
- openspeech/local_engines/engines/faster_whisper.py +36 -0
- openspeech/local_engines/engines/fish_speech.py +33 -0
- openspeech/local_engines/engines/sherpa_onnx.py +56 -0
- openspeech/local_engines/engines/whisper.py +41 -0
- openspeech/local_engines/engines/whisperlivekit.py +60 -0
- openspeech/local_engines/manager.py +208 -0
- openspeech/local_engines/models.py +50 -0
- openspeech/local_engines/progress.py +69 -0
- openspeech/local_engines/registry.py +19 -0
- openspeech/local_engines/task_store.py +52 -0
- openspeech/local_engines/tasks.py +71 -0
- openspeech/logging_config.py +607 -0
- openspeech/observe/__init__.py +0 -0
- openspeech/observe/base.py +79 -0
- openspeech/observe/debug.py +44 -0
- openspeech/observe/latency.py +19 -0
- openspeech/observe/metrics.py +47 -0
- openspeech/observe/tracing.py +44 -0
- openspeech/observe/usage.py +27 -0
- openspeech/providers/__init__.py +0 -0
- openspeech/providers/_template.py +101 -0
- openspeech/providers/stt/__init__.py +0 -0
- openspeech/providers/stt/alibaba.py +86 -0
- openspeech/providers/stt/assemblyai.py +135 -0
- openspeech/providers/stt/azure_speech.py +99 -0
- openspeech/providers/stt/baidu.py +135 -0
- openspeech/providers/stt/deepgram.py +311 -0
- openspeech/providers/stt/elevenlabs.py +385 -0
- openspeech/providers/stt/faster_whisper.py +211 -0
- openspeech/providers/stt/google_cloud.py +106 -0
- openspeech/providers/stt/iflytek.py +427 -0
- openspeech/providers/stt/macos_speech.py +226 -0
- openspeech/providers/stt/openai.py +84 -0
- openspeech/providers/stt/sherpa_onnx.py +353 -0
- openspeech/providers/stt/tencent.py +212 -0
- openspeech/providers/stt/volcengine.py +107 -0
- openspeech/providers/stt/whisper.py +153 -0
- openspeech/providers/stt/whisperlivekit.py +530 -0
- openspeech/providers/stt/windows_speech.py +249 -0
- openspeech/providers/tts/__init__.py +0 -0
- openspeech/providers/tts/alibaba.py +95 -0
- openspeech/providers/tts/azure_speech.py +123 -0
- openspeech/providers/tts/baidu.py +143 -0
- openspeech/providers/tts/coqui.py +64 -0
- openspeech/providers/tts/cosyvoice.py +90 -0
- openspeech/providers/tts/deepgram.py +174 -0
- openspeech/providers/tts/elevenlabs.py +311 -0
- openspeech/providers/tts/fish_speech.py +158 -0
- openspeech/providers/tts/google_cloud.py +107 -0
- openspeech/providers/tts/iflytek.py +209 -0
- openspeech/providers/tts/macos_say.py +251 -0
- openspeech/providers/tts/minimax.py +122 -0
- openspeech/providers/tts/openai.py +104 -0
- openspeech/providers/tts/piper.py +104 -0
- openspeech/providers/tts/tencent.py +189 -0
- openspeech/providers/tts/volcengine.py +117 -0
- openspeech/providers/tts/windows_sapi.py +234 -0
- openspeech/server/__init__.py +1 -0
- openspeech/server/app.py +72 -0
- openspeech/server/auth.py +42 -0
- openspeech/server/middleware.py +75 -0
- openspeech/server/routes/__init__.py +1 -0
- openspeech/server/routes/management.py +848 -0
- openspeech/server/routes/stt.py +121 -0
- openspeech/server/routes/tts.py +159 -0
- openspeech/server/routes/webui.py +29 -0
- openspeech/server/webui/app.js +2649 -0
- openspeech/server/webui/index.html +216 -0
- openspeech/server/webui/styles.css +617 -0
- openspeech/server/ws/__init__.py +1 -0
- openspeech/server/ws/stt_stream.py +263 -0
- openspeech/server/ws/tts_stream.py +207 -0
- openspeech/telemetry/__init__.py +21 -0
- openspeech/telemetry/perf.py +307 -0
- openspeech/utils/__init__.py +5 -0
- openspeech/utils/audio_converter.py +406 -0
- openspeech/utils/audio_playback.py +156 -0
- openspeech/vendor_registry.yaml +74 -0
- openspeechapi-0.1.0.dist-info/METADATA +101 -0
- openspeechapi-0.1.0.dist-info/RECORD +118 -0
- openspeechapi-0.1.0.dist-info/WHEEL +4 -0
- openspeechapi-0.1.0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""Google Cloud STT provider adapter (batch, httpx)."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import base64
|
|
5
|
+
from openspeech.logging_config import logger
|
|
6
|
+
import time
|
|
7
|
+
from collections.abc import AsyncIterator
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
import httpx
|
|
12
|
+
|
|
13
|
+
from openspeech.core.base import STTProvider
|
|
14
|
+
|
|
15
|
+
from openspeech.core.enums import Capability, ExecMode, ProviderType
|
|
16
|
+
from openspeech.core.models import AudioData, STTOptions, Transcription
|
|
17
|
+
from openspeech.core.settings import BaseSettings
|
|
18
|
+
|
|
19
|
+
@dataclass
class GoogleCloudSTTSettings(BaseSettings):
    """Configuration for the Google Cloud Speech-to-Text provider."""

    # API key appended to the recognize endpoint URL as ?key=...
    api_key: str = ""
    # Recognition model sent in the request "config.model" field.
    model: str = "latest_long"
    # Default BCP-47 language code; overridable per request via STTOptions.
    language: str = "en-US"
|
|
24
|
+
|
|
25
|
+
class GoogleCloudSTT(STTProvider):
    """Google Cloud STT adapter using the v1 ``speech:recognize`` REST API.

    Batch-only: the whole audio buffer is base64-encoded and submitted in a
    single HTTPS request as LINEAR16 PCM via ``httpx``. Authentication uses
    an API key passed as a URL query parameter.
    """

    name = "google-stt"
    provider_type = ProviderType.STT
    execution_mode = ExecMode.REMOTE
    settings_cls = GoogleCloudSTTSettings
    capabilities = {Capability.BATCH, Capability.MULTILINGUAL}
    field_options = {"model": ["latest_long", "latest_short", "telephony", "command_and_search"], "language": ["en-US", "zh-CN", "ja-JP", "ko-KR", "es-ES", "fr-FR", "de-DE", "pt-BR", "it-IT", "ru-RU", "ar-SA", "hi-IN"]}

    def __init__(self, settings: GoogleCloudSTTSettings | None = None) -> None:
        self.settings = settings or GoogleCloudSTTSettings()
        self._client: httpx.AsyncClient | None = None
        self._owns_client: bool = True

    def set_http_client(self, client) -> None:
        """Inject a shared ``httpx.AsyncClient``; the provider will not close it."""
        self._client = client
        self._owns_client = False

    async def start(self) -> None:
        """Create an owned HTTP client unless one was injected."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=60.0)
            self._owns_client = True
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Close the HTTP client only if this provider created it."""
        if self._client and self._owns_client:
            await self._client.aclose()
        self._client = None
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        """Healthy when an API key is configured (no network round-trip)."""
        return bool(self.settings.api_key)

    async def transcribe(
        self, audio: AudioData, opts: STTOptions | None = None
    ) -> Transcription:
        """Transcribe a complete audio buffer in one REST call.

        Args:
            audio: Raw PCM audio; ``audio.sample_rate`` is forwarded as
                ``sampleRateHertz`` and the payload is sent as LINEAR16.
            opts: Optional per-request options; ``opts.language`` overrides
                the configured default language.

        Returns:
            A ``Transcription`` with the joined text of every recognized
            segment, the effective language, and the first segment's
            confidence (if reported).

        Raises:
            RuntimeError: if the provider was not started, or the API
                responds with a non-200 status.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, audio={} bytes", self.name, len(audio.data))
        _t0 = time.perf_counter()
        opts = opts or STTOptions()
        language = opts.language or self.settings.language
        model = self.settings.model

        b64_audio = base64.b64encode(audio.data).decode("utf-8")
        url = (
            "https://speech.googleapis.com/v1/speech:recognize"
            f"?key={self.settings.api_key}"
        )
        body = {
            "config": {
                "encoding": "LINEAR16",
                "sampleRateHertz": audio.sample_rate,
                "languageCode": language,
                "model": model,
            },
            "audio": {"content": b64_audio},
        }
        response = await self._client.post(url, json=body)
        if response.status_code != 200:
            raise RuntimeError(
                f"Google Cloud STT API error {response.status_code}: {response.text}"
            )
        data = response.json()
        results = data.get("results", [])
        if not results:
            return Transcription(text="", language=language)
        # BUG FIX: the API returns one result per recognized segment, and
        # long audio (the default "latest_long" model targets exactly that)
        # yields several. Previously only results[0] was read, silently
        # dropping every later segment. Join each segment's top alternative;
        # keep the first reported confidence (single-segment behavior is
        # unchanged). Also tolerate a segment with no "alternatives".
        texts: list[str] = []
        confidence: Any = None
        for segment in results:
            alternatives = segment.get("alternatives") or []
            if not alternatives:
                continue
            top = alternatives[0]
            texts.append(top.get("transcript", ""))
            if confidence is None:
                confidence = top.get("confidence")
        result = Transcription(
            text="".join(texts),
            language=language,
            confidence=confidence,
        )
        logger.info("{}: completed in {:.0f}ms, result={} chars", self.name, (time.perf_counter() - _t0) * 1000, len(result.text))
        return result

    async def transcribe_stream(
        self, stream: AsyncIterator[bytes]
    ) -> AsyncIterator[Any]:
        """Not supported — this adapter is batch-only.

        The unreachable ``yield`` keeps this a generator function, so callers
        receive an async iterator that raises on first iteration instead of
        raising at call time.
        """
        raise NotImplementedError(
            "Google Cloud STT batch provider does not support streaming input"
        )
        yield  # pragma: no cover
|
|
@@ -0,0 +1,427 @@
|
|
|
1
|
+
"""iFlytek (讯飞) STT provider adapter — WebSocket-based."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import asyncio
|
|
5
|
+
import base64
|
|
6
|
+
import hashlib
|
|
7
|
+
import hmac
|
|
8
|
+
import json
|
|
9
|
+
from openspeech.logging_config import logger
|
|
10
|
+
import time
|
|
11
|
+
import urllib.parse
|
|
12
|
+
from collections.abc import AsyncIterator
|
|
13
|
+
from dataclasses import dataclass
|
|
14
|
+
from datetime import datetime, timezone
|
|
15
|
+
from email.utils import formatdate
|
|
16
|
+
from typing import Any
|
|
17
|
+
|
|
18
|
+
import httpx
|
|
19
|
+
import websockets
|
|
20
|
+
|
|
21
|
+
from openspeech.core.base import STTProvider
|
|
22
|
+
|
|
23
|
+
from openspeech.core.enums import Capability, ExecMode, ProviderType
|
|
24
|
+
from openspeech.core.models import AudioData, STTOptions, Transcription
|
|
25
|
+
from openspeech.core.settings import BaseSettings
|
|
26
|
+
|
|
27
|
+
@dataclass
class IflytekSTTSettings(BaseSettings):
    """Credentials and defaults for the iFlytek (讯飞) IAT STT service."""

    # Application ID from the iFlytek open platform console.
    app_id: str = ""
    # API key placed in the signed "authorization" query parameter.
    api_key: str = ""
    # API secret used as the HMAC-SHA256 signing key for the WebSocket URL.
    api_secret: str = ""
    # Recognition language identifier (e.g. "zh_cn", "en_us").
    language: str = "zh_cn"
|
|
33
|
+
|
|
34
|
+
class IflytekSTT(STTProvider):
    """iFlytek (讯飞) IAT speech-to-text adapter over a signed WebSocket.

    Supports batch transcription (``transcribe``) and streaming
    transcription (``transcribe_stream``) against the ``/v2/iat`` endpoint.
    Audio is expected to be raw PCM, 16 kHz, 16-bit, mono.
    """

    name = "iflytek-stt"
    provider_type = ProviderType.STT
    execution_mode = ExecMode.IN_PROCESS
    settings_cls = IflytekSTTSettings
    capabilities = {Capability.BATCH, Capability.STREAMING, Capability.MULTILINGUAL}
    field_options = {"language": ["zh_cn", "en_us"]}

    _WS_HOST = "iat-api.xfyun.cn"
    _WS_PATH = "/v2/iat"

    def __init__(self, settings: IflytekSTTSettings | None = None) -> None:
        self.settings = settings or IflytekSTTSettings()
        # The httpx client exists for lifecycle parity with the other
        # providers; the actual transport here is the websockets library.
        self._client: httpx.AsyncClient | None = None
        self._owns_client: bool = True

    def set_http_client(self, client) -> None:
        """Inject a shared ``httpx.AsyncClient``; the provider will not close it."""
        self._client = client
        self._owns_client = False

    async def start(self) -> None:
        """Create an owned HTTP client unless one was injected."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=60.0)
            self._owns_client = True
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Close the HTTP client only if this provider created it."""
        if self._client and self._owns_client:
            await self._client.aclose()
        self._client = None
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        """Healthy when all three credentials are configured."""
        return bool(self.settings.app_id) and bool(self.settings.api_key) and bool(self.settings.api_secret)

    def _build_auth_url(self) -> str:
        """Build HMAC-SHA256 signed WebSocket URL."""
        now = datetime.now(tz=timezone.utc)
        # RFC 1123 date string — must match the "date" header in the signature.
        date = formatdate(timeval=now.timestamp(), localtime=False, usegmt=True)

        signature_origin = (
            f"host: {self._WS_HOST}\n"
            f"date: {date}\n"
            f"GET {self._WS_PATH} HTTP/1.1"
        )
        signature_sha = hmac.new(
            self.settings.api_secret.encode("utf-8"),
            signature_origin.encode("utf-8"),
            hashlib.sha256,
        ).digest()
        signature = base64.b64encode(signature_sha).decode("utf-8")

        authorization_origin = (
            f'api_key="{self.settings.api_key}", '
            f'algorithm="hmac-sha256", '
            f'headers="host date request-line", '
            f'signature="{signature}"'
        )
        authorization = base64.b64encode(
            authorization_origin.encode("utf-8")
        ).decode("utf-8")

        params = urllib.parse.urlencode(
            {"authorization": authorization, "date": date, "host": self._WS_HOST}
        )
        return f"wss://{self._WS_HOST}{self._WS_PATH}?{params}"

    async def transcribe(
        self, audio: AudioData, opts: STTOptions | None = None
    ) -> Transcription:
        """Transcribe a complete audio buffer over one WebSocket session.

        Args:
            audio: Raw 16 kHz 16-bit mono PCM audio.
            opts: Accepted for interface parity but currently unused;
                the language comes from the provider settings.

        Returns:
            A ``Transcription`` with the concatenated recognized text.

        Raises:
            RuntimeError: if the provider was not started or iFlytek
                returns a non-zero error code.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, audio={} bytes", self.name, len(audio.data))
        _t0 = time.perf_counter()

        url = self._build_auth_url()
        audio_bytes = audio.data
        # iFlytek recommends ~40ms per frame at 16kHz 16bit mono = 1280 bytes.
        # Use larger frames (8000 bytes = ~250ms) with pacing to avoid server
        # read-timeout when sending pre-recorded audio faster than real-time.
        frame_size = 8000  # bytes per chunk (~250ms of 16kHz 16bit mono)
        # Pacing: 10ms between frames => ~25x real-time.
        frame_interval = 0.01

        result_texts: list[str] = []

        async with websockets.connect(url) as ws:
            total = len(audio_bytes)
            offset = 0
            frames_sent = 0
            is_first = True

            while offset < total:
                end = min(offset + frame_size, total)
                chunk = audio_bytes[offset:end]
                frame_data = base64.b64encode(chunk).decode("utf-8")

                if is_first:
                    # BUG FIX: the first frame must always carry the
                    # "common"/"business" sections with status 0. Previously,
                    # audio shorter than one frame was classified as a last
                    # frame (status=2) and sent without them, which the
                    # server rejects.
                    msg = {
                        "common": {"app_id": self.settings.app_id},
                        "business": {
                            "language": self.settings.language,
                            "domain": "iat",
                            "accent": "mandarin",
                            "vad_eos": 2000,
                        },
                        "data": {
                            "status": 0,
                            "format": "audio/L16;rate=16000",
                            "encoding": "raw",
                            "audio": frame_data,
                        },
                    }
                    is_first = False
                else:
                    msg = {
                        "data": {
                            "status": 1,
                            "format": "audio/L16;rate=16000",
                            "encoding": "raw",
                            "audio": frame_data,
                        }
                    }

                await ws.send(json.dumps(msg))
                frames_sent += 1
                offset = end

                # Pacing: small delay between frames to avoid server timeout
                if offset < total and frame_interval > 0:
                    await asyncio.sleep(frame_interval)

            # Terminate the session with an explicit empty status=2 frame,
            # mirroring the streaming sender below. If no audio frame was
            # sent at all (empty input), the final frame must still carry
            # the common/business sections so the server accepts it.
            last_msg: dict[str, Any] = {
                "data": {
                    "status": 2,
                    "format": "audio/L16;rate=16000",
                    "encoding": "raw",
                    "audio": "",
                }
            }
            if is_first:
                last_msg["common"] = {"app_id": self.settings.app_id}
                last_msg["business"] = {
                    "language": self.settings.language,
                    "domain": "iat",
                    "accent": "mandarin",
                    "vad_eos": 2000,
                }
            await ws.send(json.dumps(last_msg))
            frames_sent += 1

            logger.debug("{}: sent {} frames in {:.0f}ms", self.name, frames_sent,
                         (time.perf_counter() - _t0) * 1000)

            # Receive results until the server reports final status (2).
            async for message in ws:
                resp = json.loads(message)
                code = resp.get("code", -1)
                if code != 0:
                    raise RuntimeError(
                        f"iFlytek STT error [{code}]: {resp.get('message', 'unknown')}"
                    )

                data = resp.get("data", {})
                result = data.get("result", {})
                ws_items = result.get("ws", [])
                for ws_item in ws_items:
                    cw_list = ws_item.get("cw", [])
                    for cw in cw_list:
                        result_texts.append(cw.get("w", ""))

                if data.get("status") == 2:
                    break

        result = Transcription(text="".join(result_texts))
        logger.info("{}: completed in {:.0f}ms, result={} chars", self.name, (time.perf_counter() - _t0) * 1000, len(result.text))
        return result

    @staticmethod
    def _extract_segment_text(ws_items: list[dict]) -> str:
        """Extract text from a single response's ws array."""
        parts: list[str] = []
        for ws_item in ws_items:
            for cw in ws_item.get("cw", []):
                parts.append(cw.get("w", ""))
        return "".join(parts)

    async def transcribe_stream(
        self, stream: AsyncIterator[bytes]
    ) -> AsyncIterator[Transcription]:
        """Stream audio chunks to iFlytek via WebSocket and yield transcriptions.

        Each bytes chunk from *stream* is a raw PCM frame (16kHz 16bit mono).
        A ``None`` or empty chunk signals end-of-stream (VAD end).

        The implementation uses two concurrent coroutines:
        - **sender**: reads chunks from *stream* and forwards them to iFlytek
          with natural pacing (frames arrive at ~real-time from the mic).
        - **receiver**: reads iFlytek responses, parses ``dwa=wpgs`` dynamic
          correction fields (``pgs``/``rg``), maintains a segment array, and
          yields partial ``Transcription`` on every response plus a final one
          on ``status == 2``.

        wpgs protocol:
        - ``pgs="apd"``: append — new segment at index ``sn``
        - ``pgs="rpl"``: replace — replace segments ``rg[0]..rg[1]`` with
          new content, effectively a correction of earlier partial results
        - No ``pgs`` field: legacy mode (no dynamic correction) — accumulate
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")

        url = self._build_auth_url()
        # Receiver pushes Transcription objects here; None is the sentinel
        # that tells the consumer loop below to stop.
        results: asyncio.Queue[Transcription | None] = asyncio.Queue()
        _t0 = time.perf_counter()
        _frames_sent = 0

        # Event to signal sender to stop (set by receiver when iFlytek
        # returns status=2 or the connection closes). This handles the case
        # where the user doesn't click stop — iFlytek's VAD triggers a final
        # result, and we need the sender to stop consuming the frame queue.
        _sender_stop = asyncio.Event()

        logger.debug("{}: connecting to iFlytek WebSocket...", self.name)
        async with websockets.connect(url) as ws:
            _t_connected = time.perf_counter()
            logger.info("{}: WS connected in {:.0f}ms", self.name,
                        (_t_connected - _t0) * 1000)

            async def sender() -> None:
                nonlocal _frames_sent
                is_first = True
                try:
                    async for chunk in stream:
                        # Check if receiver signaled us to stop
                        if _sender_stop.is_set():
                            break
                        if chunk is None or len(chunk) == 0:
                            # End-of-stream sentinel — send last frame
                            break
                        frame_data = base64.b64encode(chunk).decode("utf-8")
                        if is_first:
                            msg = {
                                "common": {"app_id": self.settings.app_id},
                                "business": {
                                    "language": self.settings.language,
                                    "domain": "iat",
                                    "accent": "mandarin",
                                    "dwa": "wpgs",
                                    "vad_eos": 2000,
                                },
                                "data": {
                                    "status": 0,
                                    "format": "audio/L16;rate=16000",
                                    "encoding": "raw",
                                    "audio": frame_data,
                                },
                            }
                            is_first = False
                        else:
                            msg = {
                                "data": {
                                    "status": 1,
                                    "format": "audio/L16;rate=16000",
                                    "encoding": "raw",
                                    "audio": frame_data,
                                }
                            }
                        await ws.send(json.dumps(msg))
                        _frames_sent += 1
                        if _frames_sent == 1:
                            logger.debug("{}: first frame sent at {:.0f}ms",
                                         self.name, (time.perf_counter() - _t0) * 1000)

                    # Send empty last frame to signal end (only if WS still open)
                    if not _sender_stop.is_set():
                        last_msg = {
                            "data": {
                                "status": 2,
                                "format": "audio/L16;rate=16000",
                                "encoding": "raw",
                                "audio": "",
                            }
                        }
                        await ws.send(json.dumps(last_msg))
                except websockets.exceptions.ConnectionClosed:
                    # iFlytek closed the connection (e.g. server read timeout
                    # or VAD-triggered close). This is expected when the user
                    # doesn't click stop — just exit silently.
                    pass
                finally:
                    logger.debug(
                        "{}: stream sender done, sent {} frames in {:.0f}ms",
                        self.name, _frames_sent, (time.perf_counter() - _t0) * 1000,
                    )

            async def receiver() -> None:
                # Segment array for wpgs dynamic correction.
                # Index = sn (sentence number from iFlytek).
                # Each element is the text for that segment.
                segments: list[str] = []
                _resp_count = 0

                try:
                    async for message in ws:
                        resp = json.loads(message)
                        code = resp.get("code", -1)
                        if code != 0:
                            raise RuntimeError(
                                f"iFlytek STT error [{code}]: {resp.get('message', 'unknown')}"
                            )

                        _resp_count += 1
                        data = resp.get("data", {})
                        resp_status = data.get("status", 0)
                        result = data.get("result", {})
                        ws_items = result.get("ws", [])
                        pgs = result.get("pgs", "")  # "apd" or "rpl"
                        rg = result.get("rg", [])  # [start, end] for rpl
                        sn = result.get("sn", 0)  # segment number

                        seg_text = self._extract_segment_text(ws_items)

                        if _resp_count == 1:
                            logger.debug("{}: first response at {:.0f}ms sn={} pgs={}",
                                         self.name, (time.perf_counter() - _t0) * 1000,
                                         sn, pgs or "none")

                        if pgs == "rpl" and len(rg) == 2:
                            # Replace: clear segments rg[0]..rg[1], put new
                            # text at rg[0], remove the rest in range.
                            start, end = rg[0], rg[1]
                            # Ensure segments list is large enough
                            while len(segments) <= end:
                                segments.append("")
                            # Clear the replaced range
                            for i in range(start, end + 1):
                                segments[i] = ""
                            # Put new text at start position
                            segments[start] = seg_text
                        elif pgs == "apd":
                            # Append: add/overwrite segment at index sn
                            while len(segments) <= sn:
                                segments.append("")
                            segments[sn] = seg_text
                        else:
                            # No pgs (legacy / non-wpgs fallback): append
                            while len(segments) <= sn:
                                segments.append("")
                            segments[sn] = seg_text

                        # Build current full text from all segments
                        current_text = "".join(segments).strip()

                        if resp_status == 2:
                            # Final result — stop sender
                            _sender_stop.set()
                            logger.info("{}: final result at {:.0f}ms responses={} text='{}'",
                                        self.name, (time.perf_counter() - _t0) * 1000,
                                        _resp_count, current_text[:60])
                            if current_text:
                                await results.put(
                                    Transcription(text=current_text, is_partial=False)
                                )
                            break
                        else:
                            # Partial result — yield for real-time display
                            if current_text:
                                await results.put(
                                    Transcription(text=current_text, is_partial=True)
                                )

                except websockets.exceptions.ConnectionClosed:
                    # Connection closed by server — stop sender, emit whatever we have
                    _sender_stop.set()
                    current_text = "".join(segments).strip()
                    if current_text:
                        await results.put(
                            Transcription(text=current_text, is_partial=False)
                        )
                finally:
                    _sender_stop.set()  # ensure sender stops in all cases
                    await results.put(None)  # sentinel

            send_task = asyncio.create_task(sender())
            recv_task = asyncio.create_task(receiver())

            while True:
                item = await results.get()
                if item is None:
                    break
                yield item

            logger.info(
                "{}: stream completed in {:.0f}ms, frames={}",
                self.name, (time.perf_counter() - _t0) * 1000, _frames_sent,
            )
            # Wait for tasks; suppress sender errors (e.g. ConnectionClosed
            # that slipped past the try/except if timing was tight).
            for task in (send_task, recv_task):
                try:
                    await task
                except websockets.exceptions.ConnectionClosed:
                    pass
|