@agentunion/kite 1.0.6 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112) hide show
  1. package/cli.js +127 -25
  2. package/core/event_hub/entry.py +384 -61
  3. package/core/event_hub/hub.py +8 -0
  4. package/core/event_hub/module.md +0 -1
  5. package/core/event_hub/server.py +169 -38
  6. package/core/kite_log.py +241 -0
  7. package/core/launcher/entry.py +1306 -425
  8. package/core/launcher/module_scanner.py +10 -9
  9. package/core/launcher/process_manager.py +555 -121
  10. package/core/registry/entry.py +335 -30
  11. package/core/registry/server.py +339 -256
  12. package/core/registry/store.py +13 -2
  13. package/extensions/agents/__init__.py +1 -0
  14. package/extensions/agents/assistant/__init__.py +1 -0
  15. package/extensions/agents/assistant/entry.py +380 -0
  16. package/extensions/agents/assistant/module.md +22 -0
  17. package/extensions/agents/assistant/server.py +236 -0
  18. package/extensions/channels/__init__.py +1 -0
  19. package/extensions/channels/acp_channel/__init__.py +1 -0
  20. package/extensions/channels/acp_channel/entry.py +380 -0
  21. package/extensions/channels/acp_channel/module.md +22 -0
  22. package/extensions/channels/acp_channel/server.py +236 -0
  23. package/{core → extensions}/event_hub_bench/entry.py +664 -371
  24. package/{core → extensions}/event_hub_bench/module.md +4 -2
  25. package/extensions/services/backup/__init__.py +1 -0
  26. package/extensions/services/backup/entry.py +380 -0
  27. package/extensions/services/backup/module.md +22 -0
  28. package/extensions/services/backup/server.py +244 -0
  29. package/extensions/services/model_service/__init__.py +1 -0
  30. package/extensions/services/model_service/entry.py +380 -0
  31. package/extensions/services/model_service/module.md +22 -0
  32. package/extensions/services/model_service/server.py +236 -0
  33. package/extensions/services/watchdog/entry.py +460 -143
  34. package/extensions/services/watchdog/module.md +3 -0
  35. package/extensions/services/watchdog/monitor.py +128 -13
  36. package/extensions/services/watchdog/server.py +75 -13
  37. package/extensions/services/web/__init__.py +1 -0
  38. package/extensions/services/web/config.yaml +149 -0
  39. package/extensions/services/web/entry.py +487 -0
  40. package/extensions/services/web/module.md +24 -0
  41. package/extensions/services/web/routes/__init__.py +1 -0
  42. package/extensions/services/web/routes/routes_call.py +189 -0
  43. package/extensions/services/web/routes/routes_config.py +512 -0
  44. package/extensions/services/web/routes/routes_contacts.py +98 -0
  45. package/extensions/services/web/routes/routes_devlog.py +99 -0
  46. package/extensions/services/web/routes/routes_phone.py +81 -0
  47. package/extensions/services/web/routes/routes_sms.py +48 -0
  48. package/extensions/services/web/routes/routes_stats.py +17 -0
  49. package/extensions/services/web/routes/routes_voicechat.py +554 -0
  50. package/extensions/services/web/routes/schemas.py +216 -0
  51. package/extensions/services/web/server.py +332 -0
  52. package/extensions/services/web/static/css/style.css +1064 -0
  53. package/extensions/services/web/static/index.html +1445 -0
  54. package/extensions/services/web/static/js/app.js +4671 -0
  55. package/extensions/services/web/vendor/__init__.py +1 -0
  56. package/extensions/services/web/vendor/bluetooth/audio.py +348 -0
  57. package/extensions/services/web/vendor/bluetooth/contacts.py +251 -0
  58. package/extensions/services/web/vendor/bluetooth/manager.py +395 -0
  59. package/extensions/services/web/vendor/bluetooth/sms.py +290 -0
  60. package/extensions/services/web/vendor/bluetooth/telephony.py +274 -0
  61. package/extensions/services/web/vendor/config.py +139 -0
  62. package/extensions/services/web/vendor/conversation/__init__.py +0 -0
  63. package/extensions/services/web/vendor/conversation/asr.py +936 -0
  64. package/extensions/services/web/vendor/conversation/engine.py +548 -0
  65. package/extensions/services/web/vendor/conversation/llm.py +534 -0
  66. package/extensions/services/web/vendor/conversation/mcp_tools.py +190 -0
  67. package/extensions/services/web/vendor/conversation/tts.py +322 -0
  68. package/extensions/services/web/vendor/conversation/vad.py +138 -0
  69. package/extensions/services/web/vendor/storage/__init__.py +1 -0
  70. package/extensions/services/web/vendor/storage/identity.py +312 -0
  71. package/extensions/services/web/vendor/storage/store.py +507 -0
  72. package/extensions/services/web/vendor/task/__init__.py +0 -0
  73. package/extensions/services/web/vendor/task/manager.py +864 -0
  74. package/extensions/services/web/vendor/task/models.py +45 -0
  75. package/extensions/services/web/vendor/task/webhook.py +263 -0
  76. package/extensions/services/web/vendor/tools/__init__.py +0 -0
  77. package/extensions/services/web/vendor/tools/registry.py +321 -0
  78. package/main.py +344 -4
  79. package/package.json +11 -2
  80. package/core/__pycache__/__init__.cpython-313.pyc +0 -0
  81. package/core/__pycache__/data_dir.cpython-313.pyc +0 -0
  82. package/core/data_dir.py +0 -62
  83. package/core/event_hub/__pycache__/__init__.cpython-313.pyc +0 -0
  84. package/core/event_hub/__pycache__/bench.cpython-313.pyc +0 -0
  85. package/core/event_hub/__pycache__/bench_perf.cpython-313.pyc +0 -0
  86. package/core/event_hub/__pycache__/dedup.cpython-313.pyc +0 -0
  87. package/core/event_hub/__pycache__/entry.cpython-313.pyc +0 -0
  88. package/core/event_hub/__pycache__/hub.cpython-313.pyc +0 -0
  89. package/core/event_hub/__pycache__/router.cpython-313.pyc +0 -0
  90. package/core/event_hub/__pycache__/server.cpython-313.pyc +0 -0
  91. package/core/event_hub/bench_results/2026-02-28_13-26-48.json +0 -51
  92. package/core/event_hub/bench_results/2026-02-28_13-44-45.json +0 -51
  93. package/core/event_hub/bench_results/2026-02-28_13-45-39.json +0 -51
  94. package/core/launcher/__pycache__/__init__.cpython-313.pyc +0 -0
  95. package/core/launcher/__pycache__/entry.cpython-313.pyc +0 -0
  96. package/core/launcher/__pycache__/module_scanner.cpython-313.pyc +0 -0
  97. package/core/launcher/__pycache__/process_manager.cpython-313.pyc +0 -0
  98. package/core/launcher/data/log/lifecycle.jsonl +0 -1158
  99. package/core/launcher/data/token.txt +0 -1
  100. package/core/registry/__pycache__/__init__.cpython-313.pyc +0 -0
  101. package/core/registry/__pycache__/entry.cpython-313.pyc +0 -0
  102. package/core/registry/__pycache__/server.cpython-313.pyc +0 -0
  103. package/core/registry/__pycache__/store.cpython-313.pyc +0 -0
  104. package/core/registry/data/port.txt +0 -1
  105. package/core/registry/data/port_484.txt +0 -1
  106. package/extensions/__pycache__/__init__.cpython-313.pyc +0 -0
  107. package/extensions/services/__pycache__/__init__.cpython-313.pyc +0 -0
  108. package/extensions/services/watchdog/__pycache__/__init__.cpython-313.pyc +0 -0
  109. package/extensions/services/watchdog/__pycache__/entry.cpython-313.pyc +0 -0
  110. package/extensions/services/watchdog/__pycache__/monitor.cpython-313.pyc +0 -0
  111. package/extensions/services/watchdog/__pycache__/server.cpython-313.pyc +0 -0
  112. /package/{core/event_hub/bench_results/.gitkeep → extensions/services/web/vendor/bluetooth/__init__.py} +0 -0
@@ -0,0 +1,936 @@
1
+ """ASR (Automatic Speech Recognition) abstraction and implementations."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import base64
7
+ import hashlib
8
+ import hmac
9
+ import io
10
+ import json
11
+ import logging
12
+ import random
13
+ import time
14
+ import uuid
15
+ from abc import ABC, abstractmethod
16
+ from datetime import datetime
17
+ from time import mktime
18
+ from urllib.parse import quote, urlencode, urlparse
19
+ from wsgiref.handlers import format_date_time
20
+
21
+ import httpx
22
+ import websockets
23
+
24
+ from .. import config as cfg
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
+ # ---------------------------------------------------------------------------
30
+ # Abstract base
31
+ # ---------------------------------------------------------------------------
32
+
33
class ASRProvider(ABC):
    """Interface every ASR (speech-to-text) backend in this module implements."""

    @abstractmethod
    async def start_stream(self, language: str = "zh") -> None:
        """Get the provider ready for a new recognition stream in *language*."""

    @abstractmethod
    async def feed_audio(self, chunk: bytes) -> None:
        """Hand one chunk of PCM S16LE 16 kHz mono audio to the provider."""

    @abstractmethod
    async def get_result(self) -> str:
        """Return the text transcribed so far and reset the provider's buffer."""

    @abstractmethod
    async def stop_stream(self) -> None:
        """Shut the recognition stream down and free its resources."""

    async def get_interim_result(self) -> str:
        """Return partial text without ending the stream.

        This base implementation always yields an empty string; providers
        that receive partial hypotheses can override it.
        """
        no_interim_text = ""
        return no_interim_text
62
+
63
+
64
+ # ---------------------------------------------------------------------------
65
+ # OpenAI Whisper API
66
+ # ---------------------------------------------------------------------------
67
+
68
class WhisperASR(ASRProvider):
    """OpenAI Whisper API -- accumulates audio chunks, sends for transcription.

    Audio passed to ``feed_audio`` is only buffered in memory. ``get_result``
    wraps the buffered PCM in a WAV header and POSTs it to
    ``{base_url}/audio/transcriptions``.
    """

    def __init__(
        self,
        base_url: str,
        api_key: str,
        model: str = "whisper-1",
    ) -> None:
        """Store endpoint settings; no network activity happens here.

        Args:
            base_url: API root; a trailing ``/`` is stripped.
            api_key: Sent as a ``Bearer`` token on every request.
            model: Value of the ``model`` form field.
        """
        self.base_url = base_url.rstrip("/")
        self.api_key = api_key
        self.model = model
        self._buffer = bytearray()
        self._language = "zh"
        self._client: httpx.AsyncClient | None = None

    async def start_stream(self, language: str = "zh") -> None:
        """Reset the audio buffer and open a fresh HTTP client."""
        # Starting twice without stop_stream() used to overwrite (and thereby
        # leak) the previous AsyncClient; release it before creating a new one.
        try:
            await self._close_client()
        except Exception:
            logger.warning(
                "WhisperASR: failed to close previous HTTP client", exc_info=True
            )
        self._language = language
        self._buffer = bytearray()
        self._client = httpx.AsyncClient(timeout=30.0)

    async def feed_audio(self, chunk: bytes) -> None:
        """Append *chunk* to the in-memory audio buffer."""
        self._buffer.extend(chunk)

    async def get_result(self) -> str:
        """Transcribe the buffered audio and return the recognised text.

        Returns an empty string when nothing is buffered, when
        ``start_stream`` has not been called, or when the request fails (the
        failure is logged). The buffer is cleared before the request is sent,
        so audio from a failed request is not retried.
        """
        if not self._buffer:
            return ""

        if self._client is None:
            logger.warning("WhisperASR: client not initialized, call start_stream first")
            return ""

        # Build a minimal WAV header so Whisper can accept the raw PCM.
        audio_data = self._build_wav(bytes(self._buffer))
        self._buffer = bytearray()

        url = f"{self.base_url}/audio/transcriptions"
        files = {
            "file": ("audio.wav", io.BytesIO(audio_data), "audio/wav"),
        }
        data = {
            "model": self.model,
            "language": self._language,
            "response_format": "json",
        }
        headers = {"Authorization": f"Bearer {self.api_key}"}

        try:
            resp = await self._client.post(url, headers=headers, files=files, data=data)
            resp.raise_for_status()
            result = resp.json()
            return result.get("text", "")
        except Exception:
            logger.exception("WhisperASR: transcription request failed")
            return ""

    async def stop_stream(self) -> None:
        """Drop buffered audio and close the HTTP client, if any."""
        self._buffer = bytearray()
        await self._close_client()

    # -- helpers --

    async def _close_client(self) -> None:
        """Close the current HTTP client and forget it.

        The reference is cleared before awaiting ``aclose`` so that a failing
        close cannot leave a half-closed client attached to the instance.
        """
        client, self._client = self._client, None
        if client is not None:
            await client.aclose()

    @staticmethod
    def _build_wav(pcm: bytes, sample_rate: int = 16000, channels: int = 1, bits: int = 16) -> bytes:
        """Wrap raw PCM S16LE data in a minimal 44-byte WAV header."""
        data_size = len(pcm)
        byte_rate = sample_rate * channels * bits // 8
        block_align = channels * bits // 8
        header = bytearray(44)
        # RIFF header
        header[0:4] = b"RIFF"
        header[4:8] = (data_size + 36).to_bytes(4, "little")
        header[8:12] = b"WAVE"
        # fmt chunk
        header[12:16] = b"fmt "
        header[16:20] = (16).to_bytes(4, "little")  # chunk size
        header[20:22] = (1).to_bytes(2, "little")  # PCM format
        header[22:24] = channels.to_bytes(2, "little")
        header[24:28] = sample_rate.to_bytes(4, "little")
        header[28:32] = byte_rate.to_bytes(4, "little")
        header[32:34] = block_align.to_bytes(2, "little")
        header[34:36] = bits.to_bytes(2, "little")
        # data chunk
        header[36:40] = b"data"
        header[40:44] = data_size.to_bytes(4, "little")
        return bytes(header) + pcm
156
+
157
+
158
+ # ---------------------------------------------------------------------------
159
+ # iFlytek (Xunfei) streaming ASR via WebSocket
160
+ # ---------------------------------------------------------------------------
161
+
162
class XunfeiASR(ASRProvider):
    """iFlytek streaming ASR via WebSocket (wss://iat-api.xfyun.cn/v2/iat)."""

    XUNFEI_IAT_URL = "wss://iat-api.xfyun.cn/v2/iat"
    FRAME_SIZE = 8000  # 250ms at 16kHz S16LE

    def __init__(self, app_id: str, api_key: str, api_secret: str) -> None:
        """Store credentials; the socket is opened by ``start_stream``."""
        self.app_id = app_id
        self.api_key = api_key
        self.api_secret = api_secret
        self._ws: websockets.WebSocketClientProtocol | None = None
        self._buffer = bytearray()
        self._result_text = ""
        self._language = "zh_cn"
        self._stream_started = False
        # True once the opening frame (status 0 + common/business) went out.
        self._first_frame_sent = False

    async def start_stream(self, language: str = "zh") -> None:
        """Open a signed WebSocket connection and reset per-stream state."""
        # A restart without stop_stream() used to orphan the old socket.
        await self._close_ws()
        self._language = "zh_cn" if language.startswith("zh") else language
        self._buffer = bytearray()
        self._result_text = ""
        self._stream_started = False
        self._first_frame_sent = False
        self._ws = await websockets.connect(self._build_auth_url())
        self._stream_started = True

    async def feed_audio(self, chunk: bytes) -> None:
        """Buffer *chunk* and forward every complete ``FRAME_SIZE`` frame."""
        self._buffer.extend(chunk)

        if not self._ws or not self._stream_started:
            return

        while len(self._buffer) >= self.FRAME_SIZE:
            frame = bytes(self._buffer[: self.FRAME_SIZE])
            # In-place trim: avoids copying the whole remaining buffer.
            del self._buffer[: self.FRAME_SIZE]
            await self._send_frame(frame, status=1)  # 1 = continue

    async def get_result(self) -> str:
        """Flush remaining audio, mark end-of-stream and collect the transcript.

        Returns an empty string when no socket is open. Reading stops at the
        first message flagged as final, at a non-zero error code (logged), or
        when the server closes the connection.
        """
        if not self._ws:
            return ""

        # Send remaining audio (possibly empty) with the end-of-stream marker.
        tail = bytes(self._buffer)
        self._buffer = bytearray()
        await self._send_frame(tail, status=2)  # 2 = last frame

        segments: dict = {}
        try:
            async for message in self._ws:
                data = json.loads(message)
                code = data.get("code", -1)
                if code != 0:
                    logger.error("XunfeiASR: error code=%s, message=%s", code, data.get("message"))
                    break
                body = data.get("data") or {}
                result = body.get("result") or {}
                self._merge_result(segments, result)
                # The final-result flag is ``data.status``; the nested
                # ``result.status`` check is kept for backward compatibility.
                if body.get("status") == 2 or result.get("status") == 2:
                    break
        except websockets.exceptions.ConnectionClosed:
            pass

        return "".join(segments[sn] for sn in sorted(segments))

    async def stop_stream(self) -> None:
        """Discard per-stream state and close the socket (best effort)."""
        self._buffer = bytearray()
        self._result_text = ""
        self._stream_started = False
        self._first_frame_sent = False
        await self._close_ws()

    # -- helpers --

    async def _close_ws(self) -> None:
        """Close and forget the current socket, logging (not raising) errors."""
        ws, self._ws = self._ws, None
        if ws is None:
            return
        try:
            await ws.close()
        except Exception:
            logger.debug("XunfeiASR: error while closing WebSocket", exc_info=True)

    @staticmethod
    def _merge_result(segments: dict, result: dict) -> None:
        """Fold one ``data.result`` object into *segments* (keyed by ``sn``).

        The request enables ``dwa=wpgs`` (dynamic correction), so a result
        whose ``pgs`` is ``"rpl"`` replaces the earlier results whose sequence
        numbers fall inside ``rg``; anything else is appended.
        """
        if not result:
            return
        words = []
        for ws_item in result.get("ws", []):
            for cw in ws_item.get("cw", []):
                words.append(cw.get("w", ""))
        sn = result.get("sn")
        if not isinstance(sn, int):
            # No sequence number: fall back to arrival order.
            sn = max(segments, default=0) + 1
        if result.get("pgs") == "rpl":
            rg = result.get("rg")
            if isinstance(rg, list) and len(rg) == 2:
                for stale in range(rg[0], rg[1] + 1):
                    segments.pop(stale, None)
        segments[sn] = "".join(words)

    @staticmethod
    def _frame_payload(audio: bytes, status: int) -> dict:
        """Return the ``data`` envelope for one audio frame."""
        return {
            "data": {
                "status": status,
                "format": "audio/L16;rate=16000",
                "encoding": "raw",
                "audio": base64.b64encode(audio).decode("utf-8"),
            },
        }

    async def _send_frame(self, audio: bytes, status: int) -> None:
        """Send a single audio frame to the Xunfei WebSocket.

        status: 0 = first frame, 1 = continue, 2 = last frame.

        The very first frame of a stream is always sent with status 0 and
        carries the ``common``/``business`` parameters, whatever *status* the
        caller passed. If that first frame is also the last one, an empty
        status-2 frame follows so the end-of-stream marker is still delivered.
        """
        if not self._ws:
            return

        if not self._first_frame_sent:
            opening = self._frame_payload(audio, 0)
            opening["common"] = {"app_id": self.app_id}
            opening["business"] = {
                "language": self._language,
                "domain": "iat",
                "accent": "mandarin",
                "vad_eos": 3000,
                "dwa": "wpgs",
            }
            await self._ws.send(json.dumps(opening))
            self._first_frame_sent = True
            if status != 2:
                return
            audio = b""  # audio already delivered with the opening frame

        wire_status = 2 if status == 2 else 1
        await self._ws.send(json.dumps(self._frame_payload(audio, wire_status)))

    def _build_auth_url(self) -> str:
        """Build the signed WebSocket URL for Xunfei authentication (HMAC-SHA256)."""
        parsed = urlparse(self.XUNFEI_IAT_URL)
        now = datetime.now()
        # format_date_time renders the epoch timestamp as an RFC 1123 GMT date.
        date_str = format_date_time(mktime(now.timetuple()))

        signature_origin = (
            f"host: {parsed.hostname}\n"
            f"date: {date_str}\n"
            f"GET {parsed.path} HTTP/1.1"
        )
        signature_sha = hmac.new(
            self.api_secret.encode("utf-8"),
            signature_origin.encode("utf-8"),
            hashlib.sha256,
        ).digest()
        signature = base64.b64encode(signature_sha).decode("utf-8")

        authorization_origin = (
            f'api_key="{self.api_key}", '
            f'algorithm="hmac-sha256", '
            f'headers="host date request-line", '
            f'signature="{signature}"'
        )
        authorization = base64.b64encode(authorization_origin.encode("utf-8")).decode("utf-8")

        params = {
            "authorization": authorization,
            "date": date_str,
            "host": parsed.hostname,
        }
        return f"{self.XUNFEI_IAT_URL}?{urlencode(params)}"
308
+
309
+
310
+ # ---------------------------------------------------------------------------
311
+ # Volcengine (ByteDance) streaming ASR via WebSocket binary protocol
312
+ # ---------------------------------------------------------------------------
313
+
314
class VolcengineASR(ASRProvider):
    """Volcengine (Doubao) large-model streaming speech recognition.

    Uses the binary WebSocket protocol described at:
    https://www.volcengine.com/docs/6561/1354869

    Requires ``app_id`` and ``access_token`` from the Volcengine console.
    """

    DEFAULT_WS_URL = "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel_async"
    FRAME_DURATION_MS = 200  # recommended 200ms per packet
    SAMPLE_RATE = 16000
    BYTES_PER_FRAME = SAMPLE_RATE * 2 * FRAME_DURATION_MS // 1000  # 6400 bytes

    def __init__(
        self,
        app_id: str,
        access_token: str,
        resource_id: str = "volc.bigasr.sauc.duration",
        ws_url: str = "",
    ) -> None:
        """Store credentials; an empty *ws_url* selects ``DEFAULT_WS_URL``."""
        self.app_id = app_id
        self.access_token = access_token
        self.resource_id = resource_id
        self.ws_url = ws_url or self.DEFAULT_WS_URL
        self._ws = None
        self._buffer = bytearray()
        self._language = "zh"
        self._recv_text = ""
        self._recv_done = False
        self._recv_task: asyncio.Task | None = None

    # -- protocol constants ---------------------------------------------------

    # Message-type-specific flags (low nibble of header byte 1).
    _NO_SEQUENCE = 0b0000
    _NEG_SEQUENCE = 0b0010

    # Message types (high nibble of header byte 1).
    _FULL_CLIENT_REQUEST = 0b0001
    _AUDIO_ONLY_REQUEST = 0b0010
    _FULL_SERVER_RESPONSE = 0b1001
    _SERVER_ACK = 0b1011
    _SERVER_ERROR = 0b1111

    # Serialization (high nibble) and compression (low nibble) of header byte 2.
    _JSON = 0b0001
    _NO_SERIAL = 0b0000
    _GZIP = 0b0001

    # -- protocol helpers ---------------------------------------------------

    @staticmethod
    def _build_header(
        msg_type: int,
        msg_flags: int,
        serial: int = 0x01,
        compress: int = 0x01,
    ) -> bytes:
        """Build the 4-byte binary header."""
        b0 = (0x1 << 4) | 0x1  # version=1, header_size=1 (x4 = 4 bytes)
        b1 = (msg_type << 4) | msg_flags
        b2 = (serial << 4) | compress
        b3 = 0x00
        return bytes([b0, b1, b2, b3])

    @staticmethod
    def _gzip(data: bytes) -> bytes:
        """Gzip-compress *data*."""
        import gzip
        return gzip.compress(data)

    @staticmethod
    def _gunzip(data: bytes) -> bytes:
        """Gzip-decompress *data*; raises on malformed input."""
        import gzip
        return gzip.decompress(data)

    def _build_full_client_request(self) -> bytes:
        """Return the initial frame: header + 4-byte size + gzipped JSON config."""
        payload = json.dumps({
            "user": {"uid": "ai-phone-agent"},
            "audio": {
                "format": "pcm",
                "rate": self.SAMPLE_RATE,
                "bits": 16,
                "channel": 1,
                "codec": "raw",
            },
            "request": {
                "model_name": "bigmodel",
                "enable_itn": True,
                "enable_punc": True,
                "enable_ddc": False,
                "result_type": "full",
            },
        }).encode("utf-8")
        compressed = self._gzip(payload)
        header = self._build_header(
            msg_type=self._FULL_CLIENT_REQUEST,
            msg_flags=self._NO_SEQUENCE,
            serial=self._JSON,
            compress=self._GZIP,
        )
        size = len(compressed).to_bytes(4, "big")
        return header + size + compressed

    def _build_audio_packet(self, audio: bytes, last: bool = False) -> bytes:
        """Return an audio-only frame; *last* sets the final-packet flag."""
        compressed = self._gzip(audio)
        flags = self._NEG_SEQUENCE if last else self._NO_SEQUENCE
        header = self._build_header(
            msg_type=self._AUDIO_ONLY_REQUEST,
            msg_flags=flags,
            serial=self._NO_SERIAL,
            compress=self._GZIP,
        )
        size = len(compressed).to_bytes(4, "big")
        return header + size + compressed

    def _parse_response(self, data: bytes) -> tuple[str, bool]:
        """Parse a server response binary frame.

        Returns ``(text, is_last)``. Frames that are too short, are ACKs, or
        are of an unknown type yield ``("", False)``; error frames yield
        ``("", True)``. A payload that cannot be decompressed or decoded is
        logged and reported as empty text instead of raising, so one bad frame
        does not terminate the background receiver.
        """
        import zlib

        if len(data) < 4:
            return "", False

        header_size = data[0] & 0x0F
        msg_type = (data[1] >> 4) & 0x0F
        flags = data[1] & 0x0F
        compress = data[2] & 0x0F
        payload = data[header_size * 4:]

        if msg_type == self._SERVER_ERROR:
            if len(payload) >= 8:
                err_code = int.from_bytes(payload[:4], "big")
                err_size = int.from_bytes(payload[4:8], "big")
                err_msg = payload[8:8 + err_size]
                if compress == self._GZIP:
                    try:
                        err_msg = self._gunzip(err_msg)
                    except Exception:
                        # Keep the raw bytes; they are still worth logging.
                        pass
                logger.error("VolcengineASR error: code=%s msg=%s",
                             err_code, err_msg.decode("utf-8", errors="replace"))
            return "", True

        # ACKs and any other non-response message carry no transcript.
        if msg_type != self._FULL_SERVER_RESPONSE:
            return "", False

        has_sequence = (flags & 0x01) != 0
        # The "last" indicator can be bit 1 (0x02) or bit 2 (0x04) depending
        # on endpoint; treat any of these as the final response.
        is_last = (flags & 0x02) != 0 or (flags & 0x04) != 0

        offset = 4 if has_sequence else 0  # skip sequence number when present
        if offset + 4 > len(payload):
            return "", is_last
        payload_size = int.from_bytes(payload[offset:offset + 4], "big")
        offset += 4
        payload_bytes = payload[offset:offset + payload_size]

        try:
            if compress == self._GZIP:
                payload_bytes = self._gunzip(payload_bytes)
            result = json.loads(payload_bytes)
        except (OSError, EOFError, zlib.error, ValueError):
            # OSError/EOFError/zlib.error: bad gzip data;
            # ValueError: invalid JSON or undecodable bytes.
            logger.debug("VolcengineASR: undecodable response payload", exc_info=True)
            return "", is_last

        inner = result.get("result") if isinstance(result, dict) else None
        text = inner.get("text", "") if isinstance(inner, dict) else ""
        if text:
            logger.debug("VolcengineASR: text=%r is_last=%s", text, is_last)
        return text, is_last

    # -- background receiver --------------------------------------------------

    async def _recv_loop(self) -> None:
        """Background task: read all responses from the Volcengine WebSocket.

        Each non-empty transcript overwrites ``_recv_text``. The loop ends on
        the final response, on connection close, or on cancellation.
        """
        try:
            while self._ws:
                try:
                    resp = await self._ws.recv()
                except websockets.exceptions.ConnectionClosed:
                    # ConnectionClosedOK / ConnectionClosedError are subclasses.
                    logger.debug("VolcengineASR: recv_loop connection closed")
                    break
                if isinstance(resp, bytes):
                    text, is_last = self._parse_response(resp)
                    if text:
                        self._recv_text = text
                    if is_last:
                        self._recv_done = True
                        break
        except asyncio.CancelledError:
            pass
        except Exception:
            logger.debug("VolcengineASR: recv_loop ended", exc_info=True)

    async def _teardown(self) -> None:
        """Cancel the receiver task and close the socket (best effort)."""
        task, self._recv_task = self._recv_task, None
        if task and not task.done():
            task.cancel()
            try:
                await task
            except (asyncio.CancelledError, Exception):
                pass
        ws, self._ws = self._ws, None
        if ws:
            try:
                await ws.close()
            except Exception:
                logger.debug("VolcengineASR: error while closing WebSocket", exc_info=True)

    # -- ASRProvider interface -----------------------------------------------

    async def start_stream(self, language: str = "zh") -> None:
        """Connect, send the configuration frame and start the receiver task.

        Raises:
            ConnectionError: the server rejected the WebSocket handshake.
        """
        # Release anything left from a previous stream; a second start used to
        # orphan the old socket and receiver task.
        await self._teardown()

        self._language = language
        self._buffer = bytearray()
        self._recv_text = ""
        self._recv_done = False

        connect_id = str(uuid.uuid4())
        extra_headers = {
            "X-Api-App-Key": self.app_id,
            "X-Api-Access-Key": self.access_token,
            "X-Api-Resource-Id": self.resource_id,
            "X-Api-Connect-Id": connect_id,
        }
        # Never log the full token; keep only the last four characters.
        masked_token = ("****" + self.access_token[-4:]) if len(self.access_token) > 4 else "****"
        logger.info(
            "VolcengineASR: connecting app_id=%s access_token=%s resource_id=%s connect_id=%s url=%s",
            self.app_id, masked_token, self.resource_id, connect_id, self.ws_url,
        )

        try:
            self._ws = await websockets.connect(
                self.ws_url,
                additional_headers=extra_headers,
            )
        except websockets.exceptions.InvalidStatus as e:
            body = getattr(e.response, "body", b"")
            if isinstance(body, (bytes, bytearray)):
                body = body.decode("utf-8", errors="replace")
            logger.error("VolcengineASR: WebSocket rejected: HTTP %s, %s", e.response.status_code, body)
            raise ConnectionError(
                f"火山引擎ASR连接失败 (HTTP {e.response.status_code}): {body}. "
                f"请检查 app_id、access_token、resource_id 配置,"
                f"并确认已在火山引擎控制台开通对应的语音识别服务。"
            ) from e

        # Log X-Tt-Logid from response headers; the attribute holding them
        # differs between connection implementations, so probe both.
        resp_headers = getattr(self._ws, "response_headers", None)
        if resp_headers is None:
            resp_obj = getattr(self._ws, "response", None)
            resp_headers = getattr(resp_obj, "headers", {}) if resp_obj else {}
        logid = resp_headers.get("X-Tt-Logid", "")
        if logid:
            logger.info("VolcengineASR: X-Tt-Logid=%s", logid)

        # Send full client request; do not leave the socket open if it fails.
        try:
            await self._ws.send(self._build_full_client_request())
        except Exception:
            await self._teardown()
            raise
        logger.info("VolcengineASR: stream started, listening for responses")

        # Start background receiver task
        self._recv_task = asyncio.create_task(self._recv_loop())

    async def feed_audio(self, chunk: bytes) -> None:
        """Buffer *chunk* and send every complete 200 ms frame."""
        self._buffer.extend(chunk)

        if not self._ws:
            return

        while len(self._buffer) >= self.BYTES_PER_FRAME:
            frame = bytes(self._buffer[:self.BYTES_PER_FRAME])
            # In-place trim: avoids copying the whole remaining buffer.
            del self._buffer[:self.BYTES_PER_FRAME]
            try:
                await self._ws.send(self._build_audio_packet(frame, last=False))
            except Exception:
                logger.debug("VolcengineASR: send failed", exc_info=True)
                return

    async def get_result(self) -> str:
        """Send the final packet, wait up to 5 s for the last response, return text.

        The stored transcript is cleared after it is returned.
        """
        if not self._ws:
            return self._recv_text or ""

        # Send remaining audio as the last packet
        remaining = bytes(self._buffer) if self._buffer else b""
        self._buffer = bytearray()
        try:
            await self._ws.send(self._build_audio_packet(remaining, last=True))
            logger.debug("VolcengineASR: sent last audio packet (%d bytes)", len(remaining))
        except Exception:
            logger.debug("VolcengineASR: failed to send last packet", exc_info=True)

        # Wait for the receiver to get the final response. Note that
        # asyncio.wait_for cancels the receiver task when the timeout expires.
        if self._recv_task and not self._recv_task.done():
            try:
                await asyncio.wait_for(self._recv_task, timeout=5.0)
            except asyncio.TimeoutError:
                logger.warning("VolcengineASR: timeout waiting for final response")
            except Exception:
                logger.debug("VolcengineASR: receiver task failed", exc_info=True)

        result = self._recv_text
        self._recv_text = ""
        return result

    async def stop_stream(self) -> None:
        """Cancel the receiver, close the socket and reset per-stream state."""
        self._buffer = bytearray()
        await self._teardown()
        self._recv_text = ""
        self._recv_done = False

    async def get_interim_result(self) -> str:
        """Return the most recent transcript received so far."""
        return self._recv_text
633
+
634
+
635
+ # ---------------------------------------------------------------------------
636
+ # Tencent Cloud real-time ASR via WebSocket
637
+ # ---------------------------------------------------------------------------
638
+
639
+ class TencentASR(ASRProvider):
640
+ """腾讯云实时语音识别 (WebSocket).
641
+
642
+ Protocol doc: https://cloud.tencent.com/document/product/1093/48982
643
+ """
644
+
645
+ WS_BASE = "wss://asr.cloud.tencent.com/asr/v2/"
646
+ SIGN_BASE = "asr.cloud.tencent.com/asr/v2/"
647
+ # 40ms per chunk at 16kHz S16LE mono = 1280 bytes
648
+ FRAME_DURATION_MS = 40
649
+ SAMPLE_RATE = 16000
650
+ BYTES_PER_FRAME = SAMPLE_RATE * 2 * FRAME_DURATION_MS // 1000 # 1280
651
+
652
def __init__(
    self,
    app_id: str,
    secret_id: str,
    secret_key: str,
    engine_model_type: str = "16k_zh_large",
) -> None:
    """Record credentials and engine choice and initialise stream state.

    Nothing is connected here; all stream fields start out empty.
    """
    # Account / engine settings supplied by the caller.
    self.app_id, self.secret_id, self.secret_key = app_id, secret_id, secret_key
    self.engine_model_type = engine_model_type

    # Connection and audio buffering.
    self._ws = None
    self._buffer = bytearray()
    self._language = "zh"

    # Receiver state: latest text, finished sentences, completion flag, task.
    self._recv_text = ""
    self._sentence_parts: list[str] = []
    self._recv_done = False
    self._recv_task: asyncio.Task | None = None
670
+
671
+ # -- signing ---------------------------------------------------------------
672
+
673
def _build_ws_url(self) -> str:
    """Return the signed WebSocket URL for a new recognition session.

    The query parameters are sorted by key and joined unencoded, prefixed
    with ``SIGN_BASE`` + app id, signed with HMAC-SHA1 using the secret key,
    and the URL-quoted base64 signature is appended as ``signature``.
    """
    issued_at = int(time.time())
    query = {
        "secretid": self.secret_id,
        "timestamp": issued_at,
        "expired": issued_at + 86400,
        "nonce": random.randint(10000, 99999),
        "engine_model_type": self.engine_model_type,
        "voice_id": str(uuid.uuid4()),
        "voice_format": 1,  # PCM
        "needvad": 1,
        "filter_dirty": 0,
        "filter_modal": 0,
        "filter_punc": 0,
        "convert_num_mode": 1,
    }
    if self.SAMPLE_RATE == 8000:
        query["input_sample_rate"] = 8000

    # Canonical query string: key-sorted "k=v" pairs.
    canonical = "&".join(f"{name}={query[name]}" for name in sorted(query))

    # String to sign has no scheme: host/path + app id + "?" + query.
    to_sign = f"{self.SIGN_BASE}{self.app_id}?{canonical}"
    mac = hmac.new(
        self.secret_key.encode("utf-8"),
        to_sign.encode("utf-8"),
        hashlib.sha1,
    )
    signature = base64.b64encode(mac.digest()).decode("utf-8")

    # Base64 may contain "+", "/" and "=", so quote everything.
    encoded_sig = quote(signature, safe="")
    return f"{self.WS_BASE}{self.app_id}?{canonical}&signature={encoded_sig}"
713
+
714
+ # -- background receiver ---------------------------------------------------
715
+
716
async def _recv_loop(self) -> None:
    """Background task: consume recognition messages until the stream ends.

    Updates ``_recv_text`` with the running transcript (finished sentences
    plus the current interim one) and sets ``_recv_done`` when the server
    reports an error or signals the end of recognition. Binary frames,
    invalid JSON and non-object JSON messages are ignored.
    """
    try:
        while self._ws:
            try:
                msg = await self._ws.recv()
            except websockets.exceptions.ConnectionClosed:
                # ConnectionClosedOK / ConnectionClosedError are subclasses.
                logger.debug("TencentASR: recv_loop connection closed")
                break

            if not isinstance(msg, str):
                continue

            try:
                data = json.loads(msg)
            except json.JSONDecodeError:
                continue
            if not isinstance(data, dict):
                continue

            code = data.get("code", -1)
            if code != 0:
                logger.error("TencentASR: error code=%s msg=%s",
                             code, data.get("message", ""))
                self._recv_done = True
                break

            # ``result`` may be absent or null (e.g. on the closing message).
            result = data.get("result")
            if not isinstance(result, dict):
                result = {}
            slice_type = result.get("slice_type")
            voice_text = result.get("voice_text_str", "")

            # slice_type 0 marks a sentence start: nothing to accumulate yet.
            if slice_type == 1:
                # Interim result for current sentence
                self._recv_text = "".join(self._sentence_parts) + voice_text
            elif slice_type == 2:
                # Sentence end: finalize this sentence
                self._sentence_parts.append(voice_text)
                self._recv_text = "".join(self._sentence_parts)

            # The protocol announces completion with ``final: 1``; the
            # previously checked ``final_speech`` key is still honoured.
            if data.get("final") == 1 or data.get("final_speech") == 1:
                self._recv_done = True
                break

    except asyncio.CancelledError:
        pass
    except Exception:
        logger.debug("TencentASR: recv_loop ended", exc_info=True)
766
+
767
+ # -- ASRProvider interface -------------------------------------------------
768
+
769
async def start_stream(self, language: str = "zh") -> None:
    """Open a recognition session and start the background receiver.

    Resets the per-stream state, connects to the signed WebSocket URL,
    waits up to five seconds for the server's first message and then
    schedules ``_recv_loop`` as a task.

    Args:
        language: Stored on ``self._language``; not otherwise used here.

    Raises:
        ConnectionError: The HTTP upgrade was rejected, or the first
            message carried a non-zero ``code``.

    Any failure while reading the first message (other than the tolerated
    timeout) closes the socket and clears ``self._ws`` before propagating,
    so a failed start does not leave an open connection behind.
    """
    self._language = language
    self._buffer = bytearray()
    self._recv_text = ""
    self._sentence_parts = []
    self._recv_done = False

    ws_url = self._build_ws_url()
    # Only the last four characters of the secret are ever logged.
    masked_key = ("****" + self.secret_key[-4:]) if len(self.secret_key) > 4 else "****"
    logger.info(
        "TencentASR: connecting app_id=%s engine=%s secret_key=%s",
        self.app_id, self.engine_model_type, masked_key,
    )

    try:
        self._ws = await websockets.connect(ws_url)
    except websockets.exceptions.InvalidStatus as e:
        body = getattr(e.response, "body", b"")
        if isinstance(body, (bytes, bytearray)):
            body = body.decode("utf-8", errors="replace")
        logger.error("TencentASR: WebSocket rejected: HTTP %s, %s",
                     e.response.status_code, body)
        raise ConnectionError(
            f"腾讯云ASR连接失败 (HTTP {e.response.status_code}): {body}. "
            f"请检查 app_id、secret_id、secret_key 配置。"
        ) from e

    # Wait for the initial handshake response
    try:
        init_msg = await asyncio.wait_for(self._ws.recv(), timeout=5.0)
        init_data = json.loads(init_msg) if isinstance(init_msg, str) else {}
        if init_data.get("code", -1) != 0:
            raise ConnectionError(
                f"腾讯云ASR握手失败: code={init_data.get('code')}, "
                f"msg={init_data.get('message', '')}"
            )
        logger.info("TencentASR: handshake ok, voice_id=%s", init_data.get("voice_id", ""))
    except asyncio.TimeoutError:
        # A silent server is tolerated; audio is sent regardless.
        logger.warning("TencentASR: handshake timeout, proceeding anyway")
    except Exception:
        # Handshake failed: release the socket instead of leaking it, then
        # let the original exception reach the caller unchanged.
        failed_ws, self._ws = self._ws, None
        try:
            await failed_ws.close()
        except Exception:
            logger.debug("TencentASR: close after failed handshake errored",
                         exc_info=True)
        raise

    # Start background receiver
    self._recv_task = asyncio.create_task(self._recv_loop())
    logger.info("TencentASR: stream started")
812
+
813
async def feed_audio(self, chunk: bytes) -> None:
    """Buffer *chunk* and forward every complete frame to the server.

    Audio is appended to ``self._buffer``. While a connection exists, the
    buffer is drained in ``BYTES_PER_FRAME``-sized frames; a trailing
    partial frame stays buffered for the next call. Without a connection
    the audio is only buffered.

    Sending is best-effort: the first failed send is logged at debug level
    and ends the call (the frame being sent is dropped, the rest stays
    buffered).
    """
    self._buffer.extend(chunk)
    if not self._ws:
        return

    frame_size = self.BYTES_PER_FRAME
    if frame_size <= 0:
        # A non-positive frame size would never shrink the buffer and the
        # loop below would send empty frames forever.
        return

    # self._buffer is re-read on every pass because another coroutine may
    # rebind it while this one is suspended in send().
    while len(self._buffer) >= frame_size:
        frame = bytes(self._buffer[:frame_size])
        # In-place front deletion avoids copying the remainder per frame.
        del self._buffer[:frame_size]
        try:
            await self._ws.send(frame)
        except Exception:
            logger.debug("TencentASR: send failed", exc_info=True)
            return
826
+
827
async def get_result(self) -> str:
    """Finish the utterance and return the recognized text.

    With no connection, returns whatever text has been received so far
    (``""`` if none). Otherwise it flushes any buffered audio, sends the
    ``{"type": "end"}`` message, waits up to ten seconds for the receiver
    task to finish, and returns the accumulated text, clearing
    ``self._recv_text`` afterwards.

    Every network step is best-effort: failures are logged at debug level
    and never raised, so the caller always gets the text collected so far.
    """
    if not self._ws:
        return self._recv_text or ""

    # Send remaining audio
    if self._buffer:
        try:
            await self._ws.send(bytes(self._buffer))
        except Exception:
            logger.debug("TencentASR: failed to flush remaining audio",
                         exc_info=True)
        self._buffer = bytearray()

    # Send end signal
    try:
        await self._ws.send(json.dumps({"type": "end"}))
    except Exception:
        logger.debug("TencentASR: failed to send end signal", exc_info=True)

    # Wait for the receiver to see the end of the stream
    if self._recv_task and not self._recv_task.done():
        try:
            await asyncio.wait_for(self._recv_task, timeout=10.0)
        except asyncio.TimeoutError:
            logger.warning("TencentASR: timeout waiting for final response")
        except Exception:
            logger.debug("TencentASR: receiver task failed", exc_info=True)

    result = self._recv_text
    self._recv_text = ""
    return result
857
+
858
async def stop_stream(self) -> None:
    """Tear the session down and reset all per-stream state.

    Drops buffered audio, cancels the receiver task if it is still
    running (waiting for it to unwind), clears the transcript state and
    closes the WebSocket. Errors from the task or from closing the socket
    are ignored.
    """
    self._buffer = bytearray()

    # Stop the receiver first so it is not reading while the socket closes.
    receiver = self._recv_task
    if receiver and not receiver.done():
        receiver.cancel()
        try:
            await receiver
        except (asyncio.CancelledError, Exception):
            pass
    self._recv_task = None

    self._recv_text, self._sentence_parts, self._recv_done = "", [], False

    connection = self._ws
    if connection:
        try:
            await connection.close()
        except Exception:
            pass
        self._ws = None
876
+
877
async def get_interim_result(self) -> str:
    """Return the text received so far, leaving the stream state untouched."""
    text_so_far = self._recv_text
    return text_so_far
879
+
880
+
881
+ # ---------------------------------------------------------------------------
882
+ # Factory
883
+ # ---------------------------------------------------------------------------
884
+
885
def create_asr_provider(
    provider_override: str | None = None,
    resource_id_override: str | None = None,
    sample_rate_override: int | None = None,
) -> ASRProvider:
    """Build the ASR backend chosen by the caller or by saved configuration.

    The three overrides let the ASR test UI switch provider, resource id
    (used as the engine model type for the Tencent backend) and sample rate
    on the fly; the saved configuration is never modified.

    Raises:
        ValueError: The selected provider name is not recognized.
    """

    def _with_sample_rate(instance):
        # 16000 is treated as the default and leaves the instance untouched;
        # any other rate also re-derives the per-frame byte count.
        rate = sample_rate_override or 16000
        if rate and rate != 16000:
            instance.SAMPLE_RATE = rate
            instance.BYTES_PER_FRAME = rate * 2 * instance.FRAME_DURATION_MS // 1000
        return instance

    selected = provider_override or cfg.get("asr.provider", "whisper")

    if selected == "whisper":
        return WhisperASR(
            base_url=cfg.get("asr.whisper.base_url", "https://api.openai.com/v1"),
            api_key=cfg.get("asr.whisper.api_key", ""),
            model=cfg.get("asr.whisper.model", "whisper-1"),
        )

    if selected == "xunfei":
        return XunfeiASR(
            app_id=cfg.get("asr.xunfei.app_id", ""),
            api_key=cfg.get("asr.xunfei.api_key", ""),
            api_secret=cfg.get("asr.xunfei.api_secret", ""),
        )

    if selected == "volcengine":
        volc_resource = resource_id_override or cfg.get(
            "asr.volcengine.resource_id", "volc.bigasr.sauc.duration"
        )
        return _with_sample_rate(
            VolcengineASR(
                app_id=cfg.get("asr.volcengine.app_id", ""),
                access_token=cfg.get("asr.volcengine.access_token", ""),
                resource_id=volc_resource,
                ws_url=cfg.get("asr.volcengine.ws_url", ""),
            )
        )

    if selected == "tencent":
        tencent_engine = resource_id_override or cfg.get(
            "asr.tencent.engine_model_type", "16k_zh_large"
        )
        return _with_sample_rate(
            TencentASR(
                app_id=cfg.get("asr.tencent.app_id", ""),
                secret_id=cfg.get("asr.tencent.secret_id", ""),
                secret_key=cfg.get("asr.tencent.secret_key", ""),
                engine_model_type=tencent_engine,
            )
        )

    raise ValueError(f"Unknown ASR provider: {selected}")