@agentunion/kite 1.0.6 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli.js +127 -25
- package/core/event_hub/entry.py +384 -61
- package/core/event_hub/hub.py +8 -0
- package/core/event_hub/module.md +0 -1
- package/core/event_hub/server.py +169 -38
- package/core/kite_log.py +241 -0
- package/core/launcher/entry.py +1306 -425
- package/core/launcher/module_scanner.py +10 -9
- package/core/launcher/process_manager.py +555 -121
- package/core/registry/entry.py +335 -30
- package/core/registry/server.py +339 -256
- package/core/registry/store.py +13 -2
- package/extensions/agents/__init__.py +1 -0
- package/extensions/agents/assistant/__init__.py +1 -0
- package/extensions/agents/assistant/entry.py +380 -0
- package/extensions/agents/assistant/module.md +22 -0
- package/extensions/agents/assistant/server.py +236 -0
- package/extensions/channels/__init__.py +1 -0
- package/extensions/channels/acp_channel/__init__.py +1 -0
- package/extensions/channels/acp_channel/entry.py +380 -0
- package/extensions/channels/acp_channel/module.md +22 -0
- package/extensions/channels/acp_channel/server.py +236 -0
- package/{core → extensions}/event_hub_bench/entry.py +664 -371
- package/{core → extensions}/event_hub_bench/module.md +4 -2
- package/extensions/services/backup/__init__.py +1 -0
- package/extensions/services/backup/entry.py +380 -0
- package/extensions/services/backup/module.md +22 -0
- package/extensions/services/backup/server.py +244 -0
- package/extensions/services/model_service/__init__.py +1 -0
- package/extensions/services/model_service/entry.py +380 -0
- package/extensions/services/model_service/module.md +22 -0
- package/extensions/services/model_service/server.py +236 -0
- package/extensions/services/watchdog/entry.py +460 -143
- package/extensions/services/watchdog/module.md +3 -0
- package/extensions/services/watchdog/monitor.py +128 -13
- package/extensions/services/watchdog/server.py +75 -13
- package/extensions/services/web/__init__.py +1 -0
- package/extensions/services/web/config.yaml +149 -0
- package/extensions/services/web/entry.py +487 -0
- package/extensions/services/web/module.md +24 -0
- package/extensions/services/web/routes/__init__.py +1 -0
- package/extensions/services/web/routes/routes_call.py +189 -0
- package/extensions/services/web/routes/routes_config.py +512 -0
- package/extensions/services/web/routes/routes_contacts.py +98 -0
- package/extensions/services/web/routes/routes_devlog.py +99 -0
- package/extensions/services/web/routes/routes_phone.py +81 -0
- package/extensions/services/web/routes/routes_sms.py +48 -0
- package/extensions/services/web/routes/routes_stats.py +17 -0
- package/extensions/services/web/routes/routes_voicechat.py +554 -0
- package/extensions/services/web/routes/schemas.py +216 -0
- package/extensions/services/web/server.py +332 -0
- package/extensions/services/web/static/css/style.css +1064 -0
- package/extensions/services/web/static/index.html +1445 -0
- package/extensions/services/web/static/js/app.js +4671 -0
- package/extensions/services/web/vendor/__init__.py +1 -0
- package/extensions/services/web/vendor/bluetooth/audio.py +348 -0
- package/extensions/services/web/vendor/bluetooth/contacts.py +251 -0
- package/extensions/services/web/vendor/bluetooth/manager.py +395 -0
- package/extensions/services/web/vendor/bluetooth/sms.py +290 -0
- package/extensions/services/web/vendor/bluetooth/telephony.py +274 -0
- package/extensions/services/web/vendor/config.py +139 -0
- package/extensions/services/web/vendor/conversation/__init__.py +0 -0
- package/extensions/services/web/vendor/conversation/asr.py +936 -0
- package/extensions/services/web/vendor/conversation/engine.py +548 -0
- package/extensions/services/web/vendor/conversation/llm.py +534 -0
- package/extensions/services/web/vendor/conversation/mcp_tools.py +190 -0
- package/extensions/services/web/vendor/conversation/tts.py +322 -0
- package/extensions/services/web/vendor/conversation/vad.py +138 -0
- package/extensions/services/web/vendor/storage/__init__.py +1 -0
- package/extensions/services/web/vendor/storage/identity.py +312 -0
- package/extensions/services/web/vendor/storage/store.py +507 -0
- package/extensions/services/web/vendor/task/__init__.py +0 -0
- package/extensions/services/web/vendor/task/manager.py +864 -0
- package/extensions/services/web/vendor/task/models.py +45 -0
- package/extensions/services/web/vendor/task/webhook.py +263 -0
- package/extensions/services/web/vendor/tools/__init__.py +0 -0
- package/extensions/services/web/vendor/tools/registry.py +321 -0
- package/main.py +344 -4
- package/package.json +11 -2
- package/core/__pycache__/__init__.cpython-313.pyc +0 -0
- package/core/__pycache__/data_dir.cpython-313.pyc +0 -0
- package/core/data_dir.py +0 -62
- package/core/event_hub/__pycache__/__init__.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/bench.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/bench_perf.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/dedup.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/entry.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/hub.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/router.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/server.cpython-313.pyc +0 -0
- package/core/event_hub/bench_results/2026-02-28_13-26-48.json +0 -51
- package/core/event_hub/bench_results/2026-02-28_13-44-45.json +0 -51
- package/core/event_hub/bench_results/2026-02-28_13-45-39.json +0 -51
- package/core/launcher/__pycache__/__init__.cpython-313.pyc +0 -0
- package/core/launcher/__pycache__/entry.cpython-313.pyc +0 -0
- package/core/launcher/__pycache__/module_scanner.cpython-313.pyc +0 -0
- package/core/launcher/__pycache__/process_manager.cpython-313.pyc +0 -0
- package/core/launcher/data/log/lifecycle.jsonl +0 -1158
- package/core/launcher/data/token.txt +0 -1
- package/core/registry/__pycache__/__init__.cpython-313.pyc +0 -0
- package/core/registry/__pycache__/entry.cpython-313.pyc +0 -0
- package/core/registry/__pycache__/server.cpython-313.pyc +0 -0
- package/core/registry/__pycache__/store.cpython-313.pyc +0 -0
- package/core/registry/data/port.txt +0 -1
- package/core/registry/data/port_484.txt +0 -1
- package/extensions/__pycache__/__init__.cpython-313.pyc +0 -0
- package/extensions/services/__pycache__/__init__.cpython-313.pyc +0 -0
- package/extensions/services/watchdog/__pycache__/__init__.cpython-313.pyc +0 -0
- package/extensions/services/watchdog/__pycache__/entry.cpython-313.pyc +0 -0
- package/extensions/services/watchdog/__pycache__/monitor.cpython-313.pyc +0 -0
- package/extensions/services/watchdog/__pycache__/server.cpython-313.pyc +0 -0
- /package/{core/event_hub/bench_results/.gitkeep → extensions/services/web/vendor/bluetooth/__init__.py} +0 -0
|
@@ -0,0 +1,936 @@
|
|
|
1
|
+
"""ASR (Automatic Speech Recognition) abstraction and implementations."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import base64
|
|
7
|
+
import hashlib
|
|
8
|
+
import hmac
|
|
9
|
+
import io
|
|
10
|
+
import json
|
|
11
|
+
import logging
|
|
12
|
+
import random
|
|
13
|
+
import time
|
|
14
|
+
import uuid
|
|
15
|
+
from abc import ABC, abstractmethod
|
|
16
|
+
from datetime import datetime
|
|
17
|
+
from time import mktime
|
|
18
|
+
from urllib.parse import quote, urlencode, urlparse
|
|
19
|
+
from wsgiref.handlers import format_date_time
|
|
20
|
+
|
|
21
|
+
import httpx
|
|
22
|
+
import websockets
|
|
23
|
+
|
|
24
|
+
from .. import config as cfg
|
|
25
|
+
|
|
26
|
+
logger = logging.getLogger(__name__)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# ---------------------------------------------------------------------------
|
|
30
|
+
# Abstract base
|
|
31
|
+
# ---------------------------------------------------------------------------
|
|
32
|
+
|
|
33
|
+
class ASRProvider(ABC):
    """Interface shared by every speech-to-text backend in this module."""

    @abstractmethod
    async def start_stream(self, language: str = "zh") -> None:
        """Get the provider ready to recognise a new stream in *language*."""

    @abstractmethod
    async def feed_audio(self, chunk: bytes) -> None:
        """Hand over one chunk of PCM S16LE, 16 kHz, mono audio."""

    @abstractmethod
    async def get_result(self) -> str:
        """Return the text transcribed so far and reset the buffer."""

    @abstractmethod
    async def stop_stream(self) -> None:
        """Shut the recognition stream down and free its resources."""

    async def get_interim_result(self) -> str:
        """Return partial text without stopping the stream.

        The base implementation has nothing to report and yields an empty
        string; streaming providers may override it.
        """
        interim = ""
        return interim
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# ---------------------------------------------------------------------------
|
|
65
|
+
# OpenAI Whisper API
|
|
66
|
+
# ---------------------------------------------------------------------------
|
|
67
|
+
|
|
68
|
+
class WhisperASR(ASRProvider):
    """OpenAI Whisper API -- accumulates audio chunks, sends for transcription.

    Audio passed to :meth:`feed_audio` is only buffered locally.  When
    :meth:`get_result` is called the buffered PCM is wrapped in a WAV header
    and uploaded in a single ``POST {base_url}/audio/transcriptions`` request.
    """

    def __init__(
        self,
        base_url: str,
        api_key: str,
        model: str = "whisper-1",
    ) -> None:
        """Store the endpoint settings; no network resources are created yet.

        Args:
            base_url: API root; a trailing ``/`` is stripped.
            api_key: Sent as a ``Bearer`` token on every request.
            model: Value of the ``model`` form field.
        """
        self.base_url = base_url.rstrip("/")
        self.api_key = api_key
        self.model = model
        self._buffer = bytearray()
        self._language = "zh"
        self._client: httpx.AsyncClient | None = None

    async def start_stream(self, language: str = "zh") -> None:
        """Reset the audio buffer and open a fresh HTTP client.

        A client left over from an earlier stream is closed first, so calling
        this method repeatedly without ``stop_stream`` does not leak
        connections.
        """
        await self._close_client()
        self._language = language
        self._buffer = bytearray()
        self._client = httpx.AsyncClient(timeout=30.0)

    async def feed_audio(self, chunk: bytes) -> None:
        """Append *chunk* (raw PCM) to the pending audio buffer."""
        self._buffer.extend(chunk)

    async def get_result(self) -> str:
        """Transcribe everything buffered so far and return the text.

        Returns an empty string when there is no buffered audio, when
        ``start_stream`` has not been called, or when the request fails (the
        failure is logged).  The buffer is cleared before the request is
        sent, so audio from a failed request is not retried.
        """
        if not self._buffer:
            return ""

        client = self._client
        if client is None:
            logger.warning("WhisperASR: client not initialized, call start_stream first")
            return ""

        # Build a minimal WAV header so Whisper can accept the raw PCM.
        audio_data = self._build_wav(bytes(self._buffer))
        self._buffer = bytearray()

        url = f"{self.base_url}/audio/transcriptions"
        files = {
            "file": ("audio.wav", io.BytesIO(audio_data), "audio/wav"),
        }
        data = {
            "model": self.model,
            "language": self._language,
            "response_format": "json",
        }
        headers = {"Authorization": f"Bearer {self.api_key}"}

        try:
            resp = await client.post(url, headers=headers, files=files, data=data)
            resp.raise_for_status()
            result = resp.json()
            return result.get("text", "")
        except Exception:
            # Best-effort boundary: a failed transcription yields no text
            # instead of propagating into the caller.
            logger.exception("WhisperASR: transcription request failed")
            return ""

    async def stop_stream(self) -> None:
        """Discard buffered audio and close the HTTP client."""
        self._buffer = bytearray()
        await self._close_client()

    # -- helpers --

    async def _close_client(self) -> None:
        """Close the current HTTP client, if any, and forget it.

        The reference is cleared before closing so the instance never keeps
        pointing at a client whose ``aclose()`` raised.
        """
        client, self._client = self._client, None
        if client is not None:
            await client.aclose()

    @staticmethod
    def _build_wav(pcm: bytes, sample_rate: int = 16000, channels: int = 1, bits: int = 16) -> bytes:
        """Wrap raw PCM S16LE data in a minimal 44-byte WAV header.

        Args:
            pcm: Raw sample data placed after the header unchanged.
            sample_rate: Samples per second written to the ``fmt`` chunk.
            channels: Channel count written to the ``fmt`` chunk.
            bits: Bits per sample written to the ``fmt`` chunk.

        Returns:
            The header followed by *pcm*.
        """
        data_size = len(pcm)
        byte_rate = sample_rate * channels * bits // 8
        block_align = channels * bits // 8
        header = b"".join((
            # RIFF header
            b"RIFF",
            (data_size + 36).to_bytes(4, "little"),  # bytes following this field
            b"WAVE",
            # fmt chunk
            b"fmt ",
            (16).to_bytes(4, "little"),  # chunk size
            (1).to_bytes(2, "little"),  # PCM format
            channels.to_bytes(2, "little"),
            sample_rate.to_bytes(4, "little"),
            byte_rate.to_bytes(4, "little"),
            block_align.to_bytes(2, "little"),
            bits.to_bytes(2, "little"),
            # data chunk
            b"data",
            data_size.to_bytes(4, "little"),
        ))
        return header + pcm
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
# ---------------------------------------------------------------------------
|
|
159
|
+
# iFlytek (Xunfei) streaming ASR via WebSocket
|
|
160
|
+
# ---------------------------------------------------------------------------
|
|
161
|
+
|
|
162
|
+
class XunfeiASR(ASRProvider):
    """iFlytek streaming ASR via WebSocket (wss://iat-api.xfyun.cn/v2/iat).

    Audio is forwarded to the service in ``FRAME_SIZE``-byte frames while it
    is being fed; :meth:`get_result` flushes the remainder, marks the end of
    the utterance and collects the recognised text.
    """

    XUNFEI_IAT_URL = "wss://iat-api.xfyun.cn/v2/iat"
    FRAME_SIZE = 8000  # 250ms at 16kHz S16LE

    def __init__(self, app_id: str, api_key: str, api_secret: str) -> None:
        """Store the credentials; the WebSocket is opened by ``start_stream``."""
        self.app_id = app_id
        self.api_key = api_key
        self.api_secret = api_secret
        self._ws: websockets.WebSocketClientProtocol | None = None
        self._buffer = bytearray()
        self._result_text = ""
        self._language = "zh_cn"
        self._stream_started = False
        # True once the opening frame (which carries the session parameters)
        # has been sent on the current connection.
        self._first_frame_sent = False

    async def start_stream(self, language: str = "zh") -> None:
        """Open a new authenticated WebSocket for a recognition session.

        Any connection left over from a previous stream is closed first so a
        repeated call does not leak the old socket.  Languages starting with
        ``"zh"`` are mapped to ``"zh_cn"``; other values are passed through.
        """
        if self._ws is not None:
            await self.stop_stream()

        self._language = "zh_cn" if language.startswith("zh") else language
        self._buffer = bytearray()
        self._result_text = ""
        self._stream_started = False
        self._first_frame_sent = False
        self._ws = await websockets.connect(self._build_auth_url())
        self._stream_started = True

    async def feed_audio(self, chunk: bytes) -> None:
        """Buffer *chunk* and send every complete frame that is available.

        Before ``start_stream`` has connected, audio is only buffered.
        """
        self._buffer.extend(chunk)

        if self._ws is None or not self._stream_started:
            return

        # Send accumulated frames when we have enough data
        while len(self._buffer) >= self.FRAME_SIZE:
            frame = bytes(self._buffer[: self.FRAME_SIZE])
            del self._buffer[: self.FRAME_SIZE]
            # _send_frame promotes the very first frame to status 0.
            await self._send_frame(frame, status=1)  # 1 = continue

    async def get_result(self) -> str:
        """Finish the utterance and return the recognised text.

        Sends the remaining buffered audio as the last frame, then reads
        responses until the service reports the final result, reports an
        error, or closes the connection.  Returns ``""`` when no stream is
        open.
        """
        if self._ws is None:
            return ""

        tail = bytes(self._buffer)
        self._buffer = bytearray()
        if not self._first_frame_sent:
            # Less than one frame was fed: the session still has to be opened
            # with a status-0 frame carrying the parameters before it can be
            # closed.
            await self._send_frame(tail, status=0)
            tail = b""
        # Send remaining audio (possibly empty) as the end-of-stream marker
        await self._send_frame(tail, status=2)  # 2 = last frame

        # Results keyed by their sequence number so that dynamic-correction
        # updates can replace earlier partial results.
        segments: dict[int, str] = {}
        try:
            async for message in self._ws:
                data = json.loads(message)
                code = data.get("code", -1)
                if code != 0:
                    logger.error("XunfeiASR: error code=%s, message=%s", code, data.get("message"))
                    break
                body = data.get("data") or {}
                self._merge_result(segments, body.get("result") or {})
                # The completion status is reported on the "data" object,
                # next to "result" (not inside it).
                if body.get("status") == 2:
                    break
        except websockets.exceptions.ConnectionClosed:
            # The server closing the socket simply ends the result stream.
            pass

        return "".join(segments[sn] for sn in sorted(segments))

    async def stop_stream(self) -> None:
        """Reset all per-stream state and close the WebSocket, if open."""
        self._buffer = bytearray()
        self._result_text = ""
        self._stream_started = False
        self._first_frame_sent = False
        ws, self._ws = self._ws, None
        if ws is not None:
            try:
                await ws.close()
            except Exception:
                # Closing is best-effort; the socket is being discarded anyway.
                logger.debug("XunfeiASR: error while closing WebSocket", exc_info=True)

    # -- helpers --

    @staticmethod
    def _merge_result(segments: dict[int, str], result: dict) -> None:
        """Fold one ``result`` object into *segments* (keyed by ``sn``).

        With dynamic correction (``dwa=wpgs``) a result whose ``pgs`` is
        ``"rpl"`` supersedes the earlier results whose sequence numbers fall
        in the inclusive range given by ``rg``; those are dropped before the
        new text is stored.  Results without ``sn`` are appended in arrival
        order.
        """
        if not result:
            return

        words = "".join(
            cw.get("w", "")
            for ws_item in result.get("ws", [])
            for cw in ws_item.get("cw", [])
        )

        if result.get("pgs") == "rpl":
            span = result.get("rg") or []
            if len(span) == 2:
                for stale in range(span[0], span[1] + 1):
                    segments.pop(stale, None)

        sn = result.get("sn")
        if sn is None:
            sn = max(segments, default=0) + 1
        segments[sn] = words

    async def _send_frame(self, audio: bytes, status: int) -> None:
        """Send a single audio frame to the Xunfei WebSocket.

        status: 0 = first frame, 1 = continue, 2 = last frame.

        The first frame sent on a connection is promoted from ``1`` to ``0``
        and carries the ``common`` and ``business`` parameters; later frames
        carry audio only.  Does nothing when no socket is open.
        """
        if self._ws is None:
            return

        opening = not self._first_frame_sent
        if opening and status == 1:
            status = 0

        payload: dict = {
            "data": {
                "status": status,
                "format": "audio/L16;rate=16000",
                "encoding": "raw",
                "audio": base64.b64encode(audio).decode("utf-8"),
            },
        }

        # The first frame must also carry common and business params
        if opening or status == 0:
            payload["common"] = {"app_id": self.app_id}
            payload["business"] = {
                "language": self._language,
                "domain": "iat",
                "accent": "mandarin",
                "vad_eos": 3000,
                "dwa": "wpgs",
            }

        await self._ws.send(json.dumps(payload))
        # Only mark the session as opened once the frame really went out.
        self._first_frame_sent = True

    def _build_auth_url(self) -> str:
        """Build the signed WebSocket URL for Xunfei authentication (HMAC-SHA256).

        The signature covers the ``host``, ``date`` and request line; the
        resulting authorization string, the date and the host are appended to
        the endpoint URL as query parameters.
        """
        parsed = urlparse(self.XUNFEI_IAT_URL)
        # Use the epoch clock directly: converting a naive local datetime
        # through mktime() is ambiguous during a DST fold.
        date_str = format_date_time(time.time())

        signature_origin = (
            f"host: {parsed.hostname}\n"
            f"date: {date_str}\n"
            f"GET {parsed.path} HTTP/1.1"
        )
        signature_sha = hmac.new(
            self.api_secret.encode("utf-8"),
            signature_origin.encode("utf-8"),
            hashlib.sha256,
        ).digest()
        signature = base64.b64encode(signature_sha).decode("utf-8")

        authorization_origin = (
            f'api_key="{self.api_key}", '
            f'algorithm="hmac-sha256", '
            f'headers="host date request-line", '
            f'signature="{signature}"'
        )
        authorization = base64.b64encode(authorization_origin.encode("utf-8")).decode("utf-8")

        params = {
            "authorization": authorization,
            "date": date_str,
            "host": parsed.hostname,
        }
        return f"{self.XUNFEI_IAT_URL}?{urlencode(params)}"
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
# ---------------------------------------------------------------------------
|
|
311
|
+
# Volcengine (ByteDance) streaming ASR via WebSocket binary protocol
|
|
312
|
+
# ---------------------------------------------------------------------------
|
|
313
|
+
|
|
314
|
+
class VolcengineASR(ASRProvider):
    """Volcengine (Doubao) large-model streaming speech recognition.

    Uses the binary WebSocket protocol described at:
    https://www.volcengine.com/docs/6561/1354869

    Requires ``app_id`` and ``access_token`` from the Volcengine console.
    """

    DEFAULT_WS_URL = "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel_async"
    FRAME_DURATION_MS = 200  # recommended 200ms per packet
    SAMPLE_RATE = 16000
    BYTES_PER_FRAME = SAMPLE_RATE * 2 * FRAME_DURATION_MS // 1000  # 6400 bytes

    def __init__(
        self,
        app_id: str,
        access_token: str,
        resource_id: str = "volc.bigasr.sauc.duration",
        ws_url: str = "",
    ) -> None:
        """Store credentials and endpoint; the socket is opened by ``start_stream``.

        Args:
            app_id: Sent as the ``X-Api-App-Key`` header.
            access_token: Sent as the ``X-Api-Access-Key`` header.
            resource_id: Sent as the ``X-Api-Resource-Id`` header.
            ws_url: Endpoint override; empty means ``DEFAULT_WS_URL``.
        """
        self.app_id = app_id
        self.access_token = access_token
        self.resource_id = resource_id
        self.ws_url = ws_url or self.DEFAULT_WS_URL
        self._ws = None
        self._buffer = bytearray()
        self._language = "zh"
        self._recv_text = ""
        self._recv_done = False
        self._recv_task: asyncio.Task | None = None

    # -- protocol constants ---------------------------------------------------

    # Message flags (low nibble of header byte 1).
    _NO_SEQUENCE = 0b0000
    _NEG_SEQUENCE = 0b0010

    # Message types (high nibble of header byte 1).
    _FULL_CLIENT_REQUEST = 0b0001
    _AUDIO_ONLY_REQUEST = 0b0010
    _FULL_SERVER_RESPONSE = 0b1001
    _SERVER_ACK = 0b1011
    _SERVER_ERROR = 0b1111

    # Serialization (high nibble of header byte 2).
    _JSON = 0b0001
    _NO_SERIAL = 0b0000
    # Compression (low nibble of header byte 2).
    _GZIP = 0b0001

    # -- protocol helpers ---------------------------------------------------

    @staticmethod
    def _build_header(
        msg_type: int,
        msg_flags: int,
        serial: int = 0x01,
        compress: int = 0x01,
    ) -> bytes:
        """Build the 4-byte binary header."""
        b0 = (0x1 << 4) | 0x1  # version=1, header_size=1 (x4 = 4 bytes)
        b1 = (msg_type << 4) | msg_flags
        b2 = (serial << 4) | compress
        b3 = 0x00
        return bytes([b0, b1, b2, b3])

    @staticmethod
    def _gzip(data: bytes) -> bytes:
        """Return *data* gzip-compressed."""
        import gzip
        return gzip.compress(data)

    @staticmethod
    def _gunzip(data: bytes) -> bytes:
        """Return *data* gzip-decompressed; raises on malformed input."""
        import gzip
        return gzip.decompress(data)

    @classmethod
    def _try_gunzip(cls, data: bytes) -> bytes | None:
        """Decompress *data*, returning ``None`` when it is not valid gzip."""
        import zlib
        try:
            return cls._gunzip(data)
        except (OSError, EOFError, zlib.error):
            return None

    def _build_full_client_request(self) -> bytes:
        """Build the initial request frame describing the audio and options.

        Layout: 4-byte header, 4-byte big-endian payload size, gzip-compressed
        JSON payload.
        """
        payload = json.dumps({
            "user": {"uid": "ai-phone-agent"},
            "audio": {
                "format": "pcm",
                "rate": self.SAMPLE_RATE,
                "bits": 16,
                "channel": 1,
                "codec": "raw",
            },
            "request": {
                "model_name": "bigmodel",
                "enable_itn": True,
                "enable_punc": True,
                "enable_ddc": False,
                "result_type": "full",
            },
        }).encode("utf-8")
        compressed = self._gzip(payload)
        header = self._build_header(
            msg_type=self._FULL_CLIENT_REQUEST,
            msg_flags=self._NO_SEQUENCE,
            serial=self._JSON,
            compress=self._GZIP,
        )
        size = len(compressed).to_bytes(4, "big")
        return header + size + compressed

    def _build_audio_packet(self, audio: bytes, last: bool = False) -> bytes:
        """Build an audio-only frame; ``last=True`` flags the final packet.

        Layout: 4-byte header, 4-byte big-endian payload size, gzip-compressed
        audio.
        """
        compressed = self._gzip(audio)
        flags = self._NEG_SEQUENCE if last else self._NO_SEQUENCE
        header = self._build_header(
            msg_type=self._AUDIO_ONLY_REQUEST,
            msg_flags=flags,
            serial=self._NO_SERIAL,
            compress=self._GZIP,
        )
        size = len(compressed).to_bytes(4, "big")
        return header + size + compressed

    def _parse_response(self, data: bytes) -> tuple[str, bool]:
        """Parse a server response binary frame.

        Returns ``(text, is_last)``.  Frames that are too short, ACKs, unknown
        message types and frames whose payload cannot be decoded yield an
        empty text; an error frame is logged and reported as the last frame.
        Malformed payloads never raise, so one bad frame cannot end the
        receiver loop.
        """
        if len(data) < 4:
            return "", False

        header_size = data[0] & 0x0F
        msg_type = (data[1] >> 4) & 0x0F
        flags = data[1] & 0x0F
        compress = data[2] & 0x0F

        # header_size counts 4-byte words.
        payload = data[header_size * 4:]

        if msg_type == self._SERVER_ERROR:
            # Error payload: 4-byte code, 4-byte message size, message bytes.
            if len(payload) >= 8:
                err_code = int.from_bytes(payload[:4], "big")
                err_size = int.from_bytes(payload[4:8], "big")
                err_msg = payload[8:8 + err_size]
                if compress == self._GZIP:
                    # Fall back to the raw bytes if they are not valid gzip.
                    unpacked = self._try_gunzip(err_msg)
                    if unpacked is not None:
                        err_msg = unpacked
                logger.error("VolcengineASR error: code=%s msg=%s",
                             err_code, err_msg.decode("utf-8", errors="replace"))
            return "", True

        # ACKs and any other non-response frame carry no text.
        if msg_type != self._FULL_SERVER_RESPONSE:
            return "", False

        has_sequence = (flags & 0x01) != 0
        # The "last" indicator can be bit 1 (0x02) or bit 2 (0x04) depending
        # on endpoint; treat any of these as the final response.
        is_last = (flags & 0x02) != 0 or (flags & 0x04) != 0

        offset = 4 if has_sequence else 0  # skip sequence number when present

        if offset + 4 > len(payload):
            return "", is_last
        payload_size = int.from_bytes(payload[offset:offset + 4], "big")
        offset += 4
        payload_bytes = payload[offset:offset + payload_size]

        if compress == self._GZIP:
            unpacked = self._try_gunzip(payload_bytes)
            if unpacked is None:
                logger.debug("VolcengineASR: dropping response with invalid gzip payload")
                return "", is_last
            payload_bytes = unpacked

        try:
            result = json.loads(payload_bytes)
            text = result.get("result", {}).get("text", "")
        except (ValueError, AttributeError, TypeError):
            # Not JSON, not UTF-8, or not the expected object shape.
            return "", is_last

        if text:
            logger.debug("VolcengineASR: text=%r is_last=%s", text, is_last)
        return text, is_last

    # -- background receiver --------------------------------------------------

    async def _recv_loop(self) -> None:
        """Background task: read all responses from the Volcengine WebSocket.

        Keeps ``_recv_text`` updated with the latest non-empty text and stops
        after the last frame, when the connection closes, or when the socket
        is cleared.  Cancellation is deliberately not swallowed, so that
        ``asyncio.wait_for`` in ``get_result`` can report its timeout.
        """
        try:
            while self._ws:
                try:
                    resp = await self._ws.recv()
                except websockets.exceptions.ConnectionClosed:
                    logger.debug("VolcengineASR: recv_loop connection closed")
                    break
                if not isinstance(resp, bytes):
                    continue
                text, is_last = self._parse_response(resp)
                if text:
                    self._recv_text = text
                if is_last:
                    self._recv_done = True
                    break
        except Exception:
            logger.debug("VolcengineASR: recv_loop ended", exc_info=True)

    # -- ASRProvider interface -----------------------------------------------

    async def start_stream(self, language: str = "zh") -> None:
        """Connect, send the initial request and start the receiver task.

        A socket or receiver task left over from a previous stream is shut
        down first.

        Raises:
            ConnectionError: The server rejected the WebSocket handshake.
        """
        if self._ws is not None or self._recv_task is not None:
            await self.stop_stream()

        self._language = language
        self._buffer = bytearray()
        self._recv_text = ""
        self._recv_done = False

        connect_id = str(uuid.uuid4())
        extra_headers = {
            "X-Api-App-Key": self.app_id,
            "X-Api-Access-Key": self.access_token,
            "X-Api-Resource-Id": self.resource_id,
            "X-Api-Connect-Id": connect_id,
        }
        # Never log the full token; show at most its last four characters.
        masked_token = ("****" + self.access_token[-4:]) if len(self.access_token) > 4 else "****"
        logger.info(
            "VolcengineASR: connecting app_id=%s access_token=%s resource_id=%s connect_id=%s url=%s",
            self.app_id, masked_token, self.resource_id, connect_id, self.ws_url,
        )

        try:
            self._ws = await websockets.connect(
                self.ws_url,
                additional_headers=extra_headers,
            )
        except websockets.exceptions.InvalidStatus as e:
            body = getattr(e.response, "body", b"")
            if isinstance(body, (bytes, bytearray)):
                body = body.decode("utf-8", errors="replace")
            logger.error("VolcengineASR: WebSocket rejected: HTTP %s, %s", e.response.status_code, body)
            raise ConnectionError(
                f"火山引擎ASR连接失败 (HTTP {e.response.status_code}): {body}. "
                f"请检查 app_id、access_token、resource_id 配置,"
                f"并确认已在火山引擎控制台开通对应的语音识别服务。"
            ) from e

        # Log X-Tt-Logid from response headers
        resp_headers = getattr(self._ws, "response_headers", None)
        if resp_headers is None:
            resp_obj = getattr(self._ws, "response", None)
            resp_headers = getattr(resp_obj, "headers", {}) if resp_obj else {}
        logid = resp_headers.get("X-Tt-Logid", "")
        if logid:
            logger.info("VolcengineASR: X-Tt-Logid=%s", logid)

        # Send full client request
        try:
            await self._ws.send(self._build_full_client_request())
        except Exception:
            # Do not leave a half-initialised socket behind.
            await self.stop_stream()
            raise
        logger.info("VolcengineASR: stream started, listening for responses")

        # Start background receiver task
        self._recv_task = asyncio.create_task(self._recv_loop())

    async def feed_audio(self, chunk: bytes) -> None:
        """Buffer *chunk* and send every complete 200 ms frame.

        Without an open socket the audio is only buffered.  A send failure is
        logged and stops sending for this call; the failed frame is dropped.
        """
        self._buffer.extend(chunk)

        if not self._ws:
            return

        # Send complete frames (200ms each)
        while len(self._buffer) >= self.BYTES_PER_FRAME:
            frame = bytes(self._buffer[:self.BYTES_PER_FRAME])
            del self._buffer[:self.BYTES_PER_FRAME]
            try:
                await self._ws.send(self._build_audio_packet(frame, last=False))
            except Exception:
                logger.debug("VolcengineASR: send failed", exc_info=True)
                return

    async def get_result(self) -> str:
        """Flush remaining audio, wait for the final response, return the text.

        Waits up to five seconds for the receiver task to finish, then
        returns (and clears) the latest received text.  Without an open
        socket the text received so far is returned as-is.
        """
        if not self._ws:
            return self._recv_text or ""

        # Send remaining audio as the last packet
        remaining = bytes(self._buffer)
        self._buffer = bytearray()
        try:
            await self._ws.send(self._build_audio_packet(remaining, last=True))
            logger.debug("VolcengineASR: sent last audio packet (%d bytes)", len(remaining))
        except Exception:
            logger.debug("VolcengineASR: failed to send last packet", exc_info=True)

        # Wait for the receiver to get the final response
        if self._recv_task and not self._recv_task.done():
            try:
                await asyncio.wait_for(self._recv_task, timeout=5.0)
            except asyncio.TimeoutError:
                logger.warning("VolcengineASR: timeout waiting for final response")
            except Exception:
                pass

        result = self._recv_text
        self._recv_text = ""
        return result

    async def stop_stream(self) -> None:
        """Cancel the receiver task, reset state and close the socket."""
        self._buffer = bytearray()
        # Cancel the receiver task
        if self._recv_task and not self._recv_task.done():
            self._recv_task.cancel()
            try:
                await self._recv_task
            except (asyncio.CancelledError, Exception):
                pass
        self._recv_task = None
        self._recv_text = ""
        self._recv_done = False
        if self._ws:
            try:
                await self._ws.close()
            except Exception:
                # Closing is best-effort; the socket is being discarded anyway.
                pass
            self._ws = None

    async def get_interim_result(self) -> str:
        """Return the latest text received so far without ending the stream."""
        return self._recv_text
|
|
633
|
+
|
|
634
|
+
|
|
635
|
+
# ---------------------------------------------------------------------------
|
|
636
|
+
# Tencent Cloud real-time ASR via WebSocket
|
|
637
|
+
# ---------------------------------------------------------------------------
|
|
638
|
+
|
|
639
|
+
class TencentASR(ASRProvider):
    """Tencent Cloud real-time speech recognition over WebSocket.

    Audio is pushed to the service as fixed-size binary frames while a
    background task reads the JSON recognition messages and keeps
    ``_recv_text`` up to date with the transcript recognised so far.

    Protocol doc: https://cloud.tencent.com/document/product/1093/48982
    """

    WS_BASE = "wss://asr.cloud.tencent.com/asr/v2/"
    SIGN_BASE = "asr.cloud.tencent.com/asr/v2/"
    # 40ms per chunk at 16kHz S16LE mono = 1280 bytes
    FRAME_DURATION_MS = 40
    SAMPLE_RATE = 16000
    BYTES_PER_FRAME = SAMPLE_RATE * 2 * FRAME_DURATION_MS // 1000  # 1280

    def __init__(
        self,
        app_id: str,
        secret_id: str,
        secret_key: str,
        engine_model_type: str = "16k_zh_large",
    ) -> None:
        """Store the credentials and initialise an idle (unconnected) state.

        Args:
            app_id: Tencent Cloud application id; becomes part of the URL path.
            secret_id: API secret id, sent as the ``secretid`` query parameter.
            secret_key: API secret key, used only to sign the request.
            engine_model_type: Recognition engine name passed to the service.
        """
        self.app_id = app_id
        self.secret_id = secret_id
        self.secret_key = secret_key
        self.engine_model_type = engine_model_type
        self._ws = None
        # Audio received from the caller that has not been framed and sent yet.
        self._buffer = bytearray()
        self._language = "zh"
        # Transcript so far: finished sentences plus the current interim text.
        self._recv_text = ""
        # Text of every sentence the service has already finalised.
        self._sentence_parts: list[str] = []
        # Set once the service reports an error or the end of recognition.
        self._recv_done = False
        self._recv_task: asyncio.Task | None = None

    # -- signing ---------------------------------------------------------------

    def _build_ws_url(self) -> str:
        """Build the signed WebSocket URL for a new recognition session.

        The query parameters are sorted by key, prefixed with the host/path
        and app id, signed with HMAC-SHA1 using the secret key, and the
        base64 signature is appended URL-encoded.

        Returns:
            The complete ``wss://`` URL including the ``signature`` parameter.
        """
        now = int(time.time())
        params = {
            "secretid": self.secret_id,
            "timestamp": now,
            "expired": now + 86400,
            "nonce": random.randint(10000, 99999),
            "engine_model_type": self.engine_model_type,
            "voice_id": str(uuid.uuid4()),
            "voice_format": 1,  # PCM
            "needvad": 1,
            "filter_dirty": 0,
            "filter_modal": 0,
            "filter_punc": 0,
            "convert_num_mode": 1,
        }
        if self.SAMPLE_RATE == 8000:
            params["input_sample_rate"] = 8000

        # The signature covers the parameters in sorted-key order.
        query_str = "&".join(f"{key}={params[key]}" for key in sorted(params))

        # Build sign string
        sign_str = f"{self.SIGN_BASE}{self.app_id}?{query_str}"

        # HMAC-SHA1 → base64
        signature = base64.b64encode(
            hmac.new(
                self.secret_key.encode("utf-8"),
                sign_str.encode("utf-8"),
                hashlib.sha1,
            ).digest()
        ).decode("utf-8")

        # URL-encode signature
        encoded_sig = quote(signature, safe="")

        return f"{self.WS_BASE}{self.app_id}?{query_str}&signature={encoded_sig}"

    # -- connection helpers ----------------------------------------------------

    async def _close_ws(self) -> None:
        """Close the WebSocket, if any, and forget it; close errors are only logged."""
        ws, self._ws = self._ws, None
        if not ws:
            return
        try:
            await ws.close()
        except Exception:
            logger.debug("TencentASR: error while closing WebSocket", exc_info=True)

    # -- background receiver ---------------------------------------------------

    async def _recv_loop(self) -> None:
        """Read recognition messages until the stream ends.

        Updates ``_recv_text`` from interim (``slice_type == 1``) and final
        (``slice_type == 2``) sentence results. The loop stops when the
        connection closes, the service reports a non-zero ``code``, or the
        end-of-recognition flag is received. Cancellation is absorbed so the
        task finishes quietly.
        """
        try:
            while self._ws:
                try:
                    msg = await self._ws.recv()
                except websockets.exceptions.ConnectionClosed:
                    # ConnectionClosedOK / ConnectionClosedError are subclasses.
                    logger.debug("TencentASR: recv_loop connection closed")
                    break

                if not isinstance(msg, str):
                    continue

                try:
                    data = json.loads(msg)
                except json.JSONDecodeError:
                    continue
                if not isinstance(data, dict):
                    # Valid JSON but not a message object; nothing to read.
                    continue

                code = data.get("code", -1)
                if code != 0:
                    logger.error("TencentASR: error code=%s msg=%s",
                                 code, data.get("message", ""))
                    self._recv_done = True
                    break

                # ``or`` guards against explicit JSON nulls, which would
                # otherwise kill the loop with an AttributeError/TypeError.
                result = data.get("result") or {}
                slice_type = result.get("slice_type")
                voice_text = result.get("voice_text_str") or ""

                if slice_type == 1:
                    # Interim result for the current sentence.
                    self._recv_text = "".join(self._sentence_parts) + voice_text
                elif slice_type == 2:
                    # Sentence end: finalize this sentence.
                    self._sentence_parts.append(voice_text)
                    self._recv_text = "".join(self._sentence_parts)
                # slice_type == 0 marks a sentence begin; nothing to accumulate.

                # NOTE(review): the protocol doc names the end-of-stream flag
                # ``final``; ``final_speech`` is still honoured for
                # compatibility with the previous check.
                if data.get("final") == 1 or data.get("final_speech") == 1:
                    self._recv_done = True
                    break

        except asyncio.CancelledError:
            pass
        except Exception:
            logger.debug("TencentASR: recv_loop ended", exc_info=True)

    # -- ASRProvider interface -------------------------------------------------

    async def start_stream(self, language: str = "zh") -> None:
        """Open a recognition session and start the background receiver.

        Args:
            language: Stored on the instance; the recognition engine itself
                is selected by ``engine_model_type``.

        Raises:
            ConnectionError: If the server rejects the WebSocket upgrade or
                answers the handshake with a non-zero ``code``.
        """
        self._language = language
        self._buffer = bytearray()
        self._recv_text = ""
        self._sentence_parts = []
        self._recv_done = False

        ws_url = self._build_ws_url()
        masked_key = ("****" + self.secret_key[-4:]) if len(self.secret_key) > 4 else "****"
        logger.info(
            "TencentASR: connecting app_id=%s engine=%s secret_key=%s",
            self.app_id, self.engine_model_type, masked_key,
        )

        try:
            self._ws = await websockets.connect(ws_url)
        except websockets.exceptions.InvalidStatus as e:
            body = getattr(e.response, "body", b"")
            if isinstance(body, (bytes, bytearray)):
                body = body.decode("utf-8", errors="replace")
            logger.error("TencentASR: WebSocket rejected: HTTP %s, %s",
                         e.response.status_code, body)
            raise ConnectionError(
                f"腾讯云ASR连接失败 (HTTP {e.response.status_code}): {body}. "
                f"请检查 app_id、secret_id、secret_key 配置。"
            ) from e

        # Wait for the initial handshake response
        try:
            init_msg = await asyncio.wait_for(self._ws.recv(), timeout=5.0)
            init_data = json.loads(init_msg) if isinstance(init_msg, str) else {}
            if init_data.get("code", -1) != 0:
                raise ConnectionError(
                    f"腾讯云ASR握手失败: code={init_data.get('code')}, "
                    f"msg={init_data.get('message', '')}"
                )
            logger.info("TencentASR: handshake ok, voice_id=%s", init_data.get("voice_id", ""))
        except asyncio.TimeoutError:
            logger.warning("TencentASR: handshake timeout, proceeding anyway")
        except Exception:
            # A failed handshake must not leave an open socket behind or let
            # later feed_audio() calls write to a rejected session.
            await self._close_ws()
            raise

        # Start background receiver
        self._recv_task = asyncio.create_task(self._recv_loop())
        logger.info("TencentASR: stream started")

    async def feed_audio(self, chunk: bytes) -> None:
        """Buffer ``chunk`` and send every complete frame to the service.

        Bytes that do not fill a whole frame stay buffered for the next call
        (or for ``get_result``). If a send fails, that frame is dropped and
        sending stops for this call.
        """
        self._buffer.extend(chunk)
        if not self._ws:
            return

        frame_size = self.BYTES_PER_FRAME
        while len(self._buffer) >= frame_size:
            frame = bytes(self._buffer[:frame_size])
            # Trim in place instead of copying the whole tail for each frame.
            del self._buffer[:frame_size]
            try:
                await self._ws.send(frame)
            except Exception:
                logger.debug("TencentASR: send failed", exc_info=True)
                return

    async def get_result(self) -> str:
        """Flush pending audio, signal end of stream and return the transcript.

        Waits up to 10 seconds for the receiver to finish. The stored
        transcript is cleared once it has been returned.

        Returns:
            The recognised text, or ``""`` if nothing was recognised.
        """
        if not self._ws:
            return self._recv_text or ""

        # Send remaining audio. The buffer is detached before awaiting so
        # audio fed while the send is in flight is not discarded.
        if self._buffer:
            remaining = bytes(self._buffer)
            self._buffer = bytearray()
            try:
                await self._ws.send(remaining)
            except Exception:
                logger.debug("TencentASR: failed to send remaining audio", exc_info=True)

        # Send end signal
        try:
            await self._ws.send(json.dumps({"type": "end"}))
        except Exception:
            logger.debug("TencentASR: failed to send end signal", exc_info=True)

        # Wait for the receiver to see the end of recognition.
        if self._recv_task and not self._recv_task.done():
            try:
                await asyncio.wait_for(self._recv_task, timeout=10.0)
            except asyncio.TimeoutError:
                logger.warning("TencentASR: timeout waiting for final response")
            except Exception:
                logger.debug("TencentASR: receiver failed while finishing", exc_info=True)

        result = self._recv_text
        self._recv_text = ""
        return result

    async def stop_stream(self) -> None:
        """Abort the session: cancel the receiver, clear state, close the socket."""
        self._buffer = bytearray()
        if self._recv_task and not self._recv_task.done():
            self._recv_task.cancel()
            try:
                await self._recv_task
            except (asyncio.CancelledError, Exception):
                # Teardown is best-effort; the receiver's outcome is irrelevant.
                pass
        self._recv_task = None
        self._recv_text = ""
        self._sentence_parts = []
        self._recv_done = False
        await self._close_ws()

    async def get_interim_result(self) -> str:
        """Return the transcript recognised so far without waiting or resetting."""
        return self._recv_text
|
|
879
|
+
|
|
880
|
+
|
|
881
|
+
# ---------------------------------------------------------------------------
|
|
882
|
+
# Factory
|
|
883
|
+
# ---------------------------------------------------------------------------
|
|
884
|
+
|
|
885
|
+
def create_asr_provider(
    provider_override: str | None = None,
    resource_id_override: str | None = None,
    sample_rate_override: int | None = None,
) -> ASRProvider:
    """Build the ASR provider selected by configuration or by the overrides.

    The overrides let the ASR test UI switch provider, resource id and
    sample rate on the fly without touching the saved configuration.

    Args:
        provider_override: Provider name to use instead of ``asr.provider``.
        resource_id_override: Replaces the Volcengine resource id or, for
            Tencent, the engine model type.
        sample_rate_override: Sample rate for the Volcengine and Tencent
            providers; 16000 is used when omitted.

    Returns:
        A freshly constructed provider instance.

    Raises:
        ValueError: If the provider name is not recognised.
    """
    name = provider_override if provider_override else cfg.get("asr.provider", "whisper")
    rate = sample_rate_override if sample_rate_override else 16000

    def _with_rate(instance):
        # A non-default rate is set on the instance together with the
        # matching frame size (2 bytes per sample).
        if rate != 16000:
            instance.SAMPLE_RATE = rate
            instance.BYTES_PER_FRAME = rate * 2 * instance.FRAME_DURATION_MS // 1000
        return instance

    if name == "whisper":
        return WhisperASR(
            base_url=cfg.get("asr.whisper.base_url", "https://api.openai.com/v1"),
            api_key=cfg.get("asr.whisper.api_key", ""),
            model=cfg.get("asr.whisper.model", "whisper-1"),
        )

    if name == "xunfei":
        return XunfeiASR(
            app_id=cfg.get("asr.xunfei.app_id", ""),
            api_key=cfg.get("asr.xunfei.api_key", ""),
            api_secret=cfg.get("asr.xunfei.api_secret", ""),
        )

    if name == "volcengine":
        if resource_id_override:
            volc_resource = resource_id_override
        else:
            volc_resource = cfg.get("asr.volcengine.resource_id", "volc.bigasr.sauc.duration")
        return _with_rate(
            VolcengineASR(
                app_id=cfg.get("asr.volcengine.app_id", ""),
                access_token=cfg.get("asr.volcengine.access_token", ""),
                resource_id=volc_resource,
                ws_url=cfg.get("asr.volcengine.ws_url", ""),
            )
        )

    if name == "tencent":
        if resource_id_override:
            tencent_engine = resource_id_override
        else:
            tencent_engine = cfg.get("asr.tencent.engine_model_type", "16k_zh_large")
        return _with_rate(
            TencentASR(
                app_id=cfg.get("asr.tencent.app_id", ""),
                secret_id=cfg.get("asr.tencent.secret_id", ""),
                secret_key=cfg.get("asr.tencent.secret_key", ""),
                engine_model_type=tencent_engine,
            )
        )

    raise ValueError(f"Unknown ASR provider: {name}")
|