@agentunion/kite 1.0.6 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli.js +127 -25
- package/core/event_hub/entry.py +384 -61
- package/core/event_hub/hub.py +8 -0
- package/core/event_hub/module.md +0 -1
- package/core/event_hub/server.py +169 -38
- package/core/kite_log.py +241 -0
- package/core/launcher/entry.py +1306 -425
- package/core/launcher/module_scanner.py +10 -9
- package/core/launcher/process_manager.py +555 -121
- package/core/registry/entry.py +335 -30
- package/core/registry/server.py +339 -256
- package/core/registry/store.py +13 -2
- package/extensions/agents/__init__.py +1 -0
- package/extensions/agents/assistant/__init__.py +1 -0
- package/extensions/agents/assistant/entry.py +380 -0
- package/extensions/agents/assistant/module.md +22 -0
- package/extensions/agents/assistant/server.py +236 -0
- package/extensions/channels/__init__.py +1 -0
- package/extensions/channels/acp_channel/__init__.py +1 -0
- package/extensions/channels/acp_channel/entry.py +380 -0
- package/extensions/channels/acp_channel/module.md +22 -0
- package/extensions/channels/acp_channel/server.py +236 -0
- package/{core → extensions}/event_hub_bench/entry.py +664 -371
- package/{core → extensions}/event_hub_bench/module.md +4 -2
- package/extensions/services/backup/__init__.py +1 -0
- package/extensions/services/backup/entry.py +380 -0
- package/extensions/services/backup/module.md +22 -0
- package/extensions/services/backup/server.py +244 -0
- package/extensions/services/model_service/__init__.py +1 -0
- package/extensions/services/model_service/entry.py +380 -0
- package/extensions/services/model_service/module.md +22 -0
- package/extensions/services/model_service/server.py +236 -0
- package/extensions/services/watchdog/entry.py +460 -143
- package/extensions/services/watchdog/module.md +3 -0
- package/extensions/services/watchdog/monitor.py +128 -13
- package/extensions/services/watchdog/server.py +75 -13
- package/extensions/services/web/__init__.py +1 -0
- package/extensions/services/web/config.yaml +149 -0
- package/extensions/services/web/entry.py +487 -0
- package/extensions/services/web/module.md +24 -0
- package/extensions/services/web/routes/__init__.py +1 -0
- package/extensions/services/web/routes/routes_call.py +189 -0
- package/extensions/services/web/routes/routes_config.py +512 -0
- package/extensions/services/web/routes/routes_contacts.py +98 -0
- package/extensions/services/web/routes/routes_devlog.py +99 -0
- package/extensions/services/web/routes/routes_phone.py +81 -0
- package/extensions/services/web/routes/routes_sms.py +48 -0
- package/extensions/services/web/routes/routes_stats.py +17 -0
- package/extensions/services/web/routes/routes_voicechat.py +554 -0
- package/extensions/services/web/routes/schemas.py +216 -0
- package/extensions/services/web/server.py +332 -0
- package/extensions/services/web/static/css/style.css +1064 -0
- package/extensions/services/web/static/index.html +1445 -0
- package/extensions/services/web/static/js/app.js +4671 -0
- package/extensions/services/web/vendor/__init__.py +1 -0
- package/extensions/services/web/vendor/bluetooth/audio.py +348 -0
- package/extensions/services/web/vendor/bluetooth/contacts.py +251 -0
- package/extensions/services/web/vendor/bluetooth/manager.py +395 -0
- package/extensions/services/web/vendor/bluetooth/sms.py +290 -0
- package/extensions/services/web/vendor/bluetooth/telephony.py +274 -0
- package/extensions/services/web/vendor/config.py +139 -0
- package/extensions/services/web/vendor/conversation/__init__.py +0 -0
- package/extensions/services/web/vendor/conversation/asr.py +936 -0
- package/extensions/services/web/vendor/conversation/engine.py +548 -0
- package/extensions/services/web/vendor/conversation/llm.py +534 -0
- package/extensions/services/web/vendor/conversation/mcp_tools.py +190 -0
- package/extensions/services/web/vendor/conversation/tts.py +322 -0
- package/extensions/services/web/vendor/conversation/vad.py +138 -0
- package/extensions/services/web/vendor/storage/__init__.py +1 -0
- package/extensions/services/web/vendor/storage/identity.py +312 -0
- package/extensions/services/web/vendor/storage/store.py +507 -0
- package/extensions/services/web/vendor/task/__init__.py +0 -0
- package/extensions/services/web/vendor/task/manager.py +864 -0
- package/extensions/services/web/vendor/task/models.py +45 -0
- package/extensions/services/web/vendor/task/webhook.py +263 -0
- package/extensions/services/web/vendor/tools/__init__.py +0 -0
- package/extensions/services/web/vendor/tools/registry.py +321 -0
- package/main.py +344 -4
- package/package.json +11 -2
- package/core/__pycache__/__init__.cpython-313.pyc +0 -0
- package/core/__pycache__/data_dir.cpython-313.pyc +0 -0
- package/core/data_dir.py +0 -62
- package/core/event_hub/__pycache__/__init__.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/bench.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/bench_perf.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/dedup.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/entry.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/hub.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/router.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/server.cpython-313.pyc +0 -0
- package/core/event_hub/bench_results/2026-02-28_13-26-48.json +0 -51
- package/core/event_hub/bench_results/2026-02-28_13-44-45.json +0 -51
- package/core/event_hub/bench_results/2026-02-28_13-45-39.json +0 -51
- package/core/launcher/__pycache__/__init__.cpython-313.pyc +0 -0
- package/core/launcher/__pycache__/entry.cpython-313.pyc +0 -0
- package/core/launcher/__pycache__/module_scanner.cpython-313.pyc +0 -0
- package/core/launcher/__pycache__/process_manager.cpython-313.pyc +0 -0
- package/core/launcher/data/log/lifecycle.jsonl +0 -1158
- package/core/launcher/data/token.txt +0 -1
- package/core/registry/__pycache__/__init__.cpython-313.pyc +0 -0
- package/core/registry/__pycache__/entry.cpython-313.pyc +0 -0
- package/core/registry/__pycache__/server.cpython-313.pyc +0 -0
- package/core/registry/__pycache__/store.cpython-313.pyc +0 -0
- package/core/registry/data/port.txt +0 -1
- package/core/registry/data/port_484.txt +0 -1
- package/extensions/__pycache__/__init__.cpython-313.pyc +0 -0
- package/extensions/services/__pycache__/__init__.cpython-313.pyc +0 -0
- package/extensions/services/watchdog/__pycache__/__init__.cpython-313.pyc +0 -0
- package/extensions/services/watchdog/__pycache__/entry.cpython-313.pyc +0 -0
- package/extensions/services/watchdog/__pycache__/monitor.cpython-313.pyc +0 -0
- package/extensions/services/watchdog/__pycache__/server.cpython-313.pyc +0 -0
- /package/{core/event_hub/bench_results/.gitkeep → extensions/services/web/vendor/bluetooth/__init__.py} +0 -0
|
@@ -0,0 +1,554 @@
|
|
|
1
|
+
"""Voice Chat WebSocket endpoint — browser-based ASR → LLM → TTS pipeline."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import json
|
|
7
|
+
import logging
|
|
8
|
+
import re
|
|
9
|
+
import struct
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
from fastapi import APIRouter, Request, WebSocket, WebSocketDisconnect
|
|
14
|
+
|
|
15
|
+
from vendor import config as cfg
|
|
16
|
+
from vendor.conversation.asr import create_asr_provider
|
|
17
|
+
from vendor.conversation.llm import create_llm_provider
|
|
18
|
+
from vendor.conversation.tts import create_tts_provider
|
|
19
|
+
from vendor.storage import identity
|
|
20
|
+
from vendor.tools.registry import get_registry
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
router = APIRouter()
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# ---------------------------------------------------------------------------
|
|
28
|
+
# Lightweight energy-based VAD (no C extension needed)
|
|
29
|
+
# ---------------------------------------------------------------------------
|
|
30
|
+
|
|
31
|
+
class EnergyVAD:
    """Simple RMS-energy voice activity detector for S16LE 16 kHz mono PCM.

    Feed raw PCM via :meth:`feed`; it emits ``"speech_start"`` once enough
    consecutive loud frames arrive and ``"speech_end"`` after enough
    consecutive quiet frames.  The detection threshold adapts upward from a
    slowly tracked noise floor so steady background noise does not trigger
    speech detection.
    """

    FRAME_BYTES = 640  # 20 ms at 16 kHz S16LE mono

    # Same underlying logger as the module-level one (same __name__); bound on
    # the class so it is self-contained.
    _log = logging.getLogger(__name__)

    def __init__(
        self,
        energy_threshold: float = 300.0,
        silence_ms: int = 800,
        min_speech_ms: int = 250,
    ) -> None:
        """
        Args:
            energy_threshold: Minimum RMS treated as speech; also the floor
                for the adaptive threshold.
            silence_ms: Trailing silence required to emit ``speech_end``.
            min_speech_ms: Leading speech required to emit ``speech_start``.
        """
        self._base_threshold = energy_threshold
        self._threshold = energy_threshold
        self._silence_frames = silence_ms // 20
        self._speech_frames = max(1, min_speech_ms // 20)
        self._in_speech = False
        self._speech_count = 0
        self._silence_count = 0
        self._buf = bytearray()
        self._last_rms = 0.0
        # Adaptive-noise state: EMA over quiet-frame RMS; once warmup quiet
        # frames are seen, threshold = max(base, noise_floor * margin).
        self._noise_floor = 0.0
        self._noise_alpha = 0.05
        self._noise_margin = 3.0
        self._noise_samples = 0
        self._noise_warmup = 15
        self._log.info(
            "EnergyVAD: base_threshold=%.0f, silence_frames=%d (%dms), speech_frames=%d (%dms), adaptive=on (margin=%.1fx)",
            self._base_threshold, self._silence_frames, silence_ms,
            self._speech_frames, min_speech_ms, self._noise_margin,
        )

    def feed(self, chunk: bytes, external_rms: float | None = None) -> str | None:
        """Consume PCM and return the last state-change event, if any.

        Buffers *chunk* and processes every complete 20 ms frame.  When
        *external_rms* is given (a client-measured, pre-AGC level) it is used
        for every frame in this call instead of computing RMS from the
        samples.  Previously the external-RMS path duplicated the whole
        frame-consuming loop; both paths now share one loop.

        Returns:
            ``"speech_start"``, ``"speech_end"``, or ``None``.
        """
        self._buf.extend(chunk)
        last_event: str | None = None
        while len(self._buf) >= self.FRAME_BYTES:
            frame = self._buf[: self.FRAME_BYTES]
            self._buf = self._buf[self.FRAME_BYTES:]
            rms = external_rms if external_rms is not None else self._rms(bytes(frame))
            ev = self._process_rms(rms)
            if ev is not None:
                last_event = ev
        return last_event

    def reset(self) -> None:
        """Clear all detection and noise-tracking state for a new turn."""
        self._in_speech = False
        self._speech_count = 0
        self._silence_count = 0
        self._buf = bytearray()
        self._noise_samples = 0
        self._noise_floor = 0.0
        self._threshold = self._base_threshold

    @property
    def last_rms(self) -> float:
        """RMS of the most recently processed frame."""
        return self._last_rms

    def _rms(self, frame: bytes) -> float:
        """Root-mean-square of a little-endian signed-16-bit PCM frame."""
        n = len(frame) // 2
        if n == 0:
            return 0.0
        samples = struct.unpack(f"<{n}h", frame)
        return (sum(s * s for s in samples) / n) ** 0.5

    def _process(self, frame: bytes) -> str | None:
        """Process one raw frame by computing its RMS."""
        return self._process_rms(self._rms(frame))

    def _process_rms(self, rms: float) -> str | None:
        """Advance the state machine with one frame's RMS value."""
        self._last_rms = rms

        # Track the noise floor only while quiet and outside speech, so the
        # speaker's own voice never inflates the threshold.
        if not self._in_speech and rms < self._threshold:
            self._noise_samples += 1
            if self._noise_samples == 1:
                self._noise_floor = rms
            else:
                self._noise_floor += self._noise_alpha * (rms - self._noise_floor)
            if self._noise_samples >= self._noise_warmup:
                self._threshold = max(self._base_threshold,
                                      self._noise_floor * self._noise_margin)

        is_speech = rms > self._threshold
        if is_speech:
            self._silence_count = 0
            self._speech_count += 1
            if not self._in_speech and self._speech_count >= self._speech_frames:
                self._in_speech = True
                self._log.debug("VAD: speech_start (rms=%.0f, threshold=%.0f, noise=%.0f)",
                                rms, self._threshold, self._noise_floor)
                return "speech_start"
        else:
            self._speech_count = 0
            if self._in_speech:
                self._silence_count += 1
                if self._silence_count >= self._silence_frames:
                    self._in_speech = False
                    self._silence_count = 0
                    self._log.debug(
                        "VAD: speech_end (rms=%.0f, threshold=%.0f, noise=%.0f)",
                        rms, self._threshold, self._noise_floor,
                    )
                    return "speech_end"
        return None
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
DEFAULT_SYSTEM_PROMPT = (
|
|
146
|
+
"你是一个AI语音助手。用户正在通过语音与你对话。\n"
|
|
147
|
+
"- 用自然、简洁的中文回复\n"
|
|
148
|
+
"- 每次回复控制在1-3句话,适合语音播放\n"
|
|
149
|
+
"- 语气友好自然,像朋友聊天"
|
|
150
|
+
)
|
|
151
|
+
|
|
152
|
+
_SENTENCE_RE = re.compile(r'(?<=[。!?!?\n])\s*')
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def _split_sentences(text: str) -> list[str]:
|
|
156
|
+
parts = _SENTENCE_RE.split(text)
|
|
157
|
+
sentences: list[str] = []
|
|
158
|
+
for p in parts:
|
|
159
|
+
p = p.strip()
|
|
160
|
+
if not p:
|
|
161
|
+
continue
|
|
162
|
+
if sentences and len(p) < 5:
|
|
163
|
+
sentences[-1] += p
|
|
164
|
+
else:
|
|
165
|
+
sentences.append(p)
|
|
166
|
+
return sentences if sentences else [text]
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
async def _send_json(ws: WebSocket, data: dict) -> bool:
|
|
170
|
+
try:
|
|
171
|
+
await ws.send_json(data)
|
|
172
|
+
return True
|
|
173
|
+
except Exception:
|
|
174
|
+
return False
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
async def _send_bytes(ws: WebSocket, data: bytes) -> bool:
|
|
178
|
+
try:
|
|
179
|
+
await ws.send_bytes(data)
|
|
180
|
+
return True
|
|
181
|
+
except Exception:
|
|
182
|
+
return False
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
async def _send_tts_segments(ws: WebSocket, tts, text: str, language: str) -> bool:
    """Synthesize *text* sentence-by-sentence and stream the audio to the client.

    Wire protocol: ``{"type": "speaking"}`` → per sentence: raw audio chunks
    followed by ``{"type": "segment_end"}`` → ``{"type": "audio_done"}``.
    Returns False as soon as a send fails (client gone); a TTS failure on one
    sentence is reported to the client but does not abort the remaining ones.
    """
    if not await _send_json(ws, {"type": "speaking"}):
        return False

    max_chunk = 16 * 1024
    for sentence in _split_sentences(text):
        try:
            audio = await tts.synthesize(sentence, language)
            if audio:
                offset = 0
                while offset < len(audio):
                    if not await _send_bytes(ws, audio[offset:offset + max_chunk]):
                        return False
                    offset += max_chunk
                if not await _send_json(ws, {"type": "segment_end"}):
                    return False
        except Exception as e:
            logger.exception("VoiceChat: TTS segment error for: %s", sentence)
            await _send_json(ws, {"type": "error", "message": f"TTS错误: {e}"})

    await _send_json(ws, {"type": "audio_done"})
    return True
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
@router.post("/api/voicechat/dump-log")
async def dump_log(request: Request):
    """Save voicechat status log to data/voicechat_log.txt for diagnostics."""
    payload = await request.json()
    log_text = payload.get("log", "")
    if not log_text:
        return {"ok": False}
    target = cfg.data_dir() / "voicechat_log.txt"
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(log_text, encoding="utf-8")
    logger.info("VoiceChat: status log saved to %s (%d bytes)", target, len(log_text))
    return {"ok": True}
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
@router.websocket("/ws/voice-chat")
async def voice_chat(ws: WebSocket):
    """Full-duplex voice chat: browser mic → VAD+ASR → LLM → TTS → browser speaker."""
    await ws.accept()
    logger.info("VoiceChat: WebSocket connected")

    asr = None
    llm = None
    tts = None
    vad = None
    messages: list[dict[str, Any]] = []
    running = False
    # Per-session mutable state shared with _conversation_turn; "pending_prompt"
    # holds a system-prompt update that is applied at the next user turn.
    ctx: dict[str, Any] = {"pending_prompt": None, "tools": None, "tool_ctx": None}

    try:
        # Handshake: ignore everything until a {"type": "start"} message arrives.
        while True:
            raw = await ws.receive_text()
            msg = json.loads(raw)
            if msg.get("type") == "start":
                system_prompt = msg.get("system_prompt", "").strip() or DEFAULT_SYSTEM_PROMPT
                break

        # Providers are created per session; VAD parameters come from config
        # with the same defaults EnergyVAD itself uses.
        asr = create_asr_provider()
        llm = create_llm_provider()
        tts = create_tts_provider()
        vad = EnergyVAD(
            energy_threshold=float(cfg.get("vad.energy_threshold", 300)),
            silence_ms=int(cfg.get("vad.silence_threshold_ms", 800)),
            min_speech_ms=int(cfg.get("vad.min_speech_ms", 250)),
        )

        # Identity: owner status requires both a configured user phone and a
        # caller phone number in the start message.
        phone_number = msg.get("phone_number", "")
        user_phone = identity.get_user_phone()
        is_owner = identity.is_owner(user_phone, phone_number) if (user_phone and phone_number) else False

        registry = get_registry()
        active_provider = cfg.get("llm.active_provider", "openai")

        # Tool configs are layered: global → per-user → per-contact; missing
        # layers are passed as None.
        global_tools_cfg = identity.load_tools_config(cfg.data_dir() / "tools.yaml")
        user_tools_cfg = identity.load_tools_config(
            identity.user_dir(user_phone) / "tools.yaml"
        ) if user_phone else None
        contact_tools_cfg = identity.load_tools_config(
            identity.contact_dir(user_phone, phone_number) / "tools.yaml"
        ) if (user_phone and phone_number) else None

        enabled_tools = registry.resolve_enabled(
            global_tools_cfg, user_tools_cfg, contact_tools_cfg, None, is_owner,
        )
        provider_tools = registry.get_tools_for_provider(active_provider, enabled_tools) if enabled_tools else None

        # Context handed to every tool handler; "engine"/"webhook" are unused
        # in voice chat and deliberately None.
        tool_ctx = {
            "task_info": {
                "task_id": f"voicechat-{id(ws)}",
                "phone_number": phone_number,
                "user_phone": user_phone,
            },
            "webhook": None,
            "engine": None,
            "data_dir": cfg.data_dir(),
            "root_dir": cfg.root_dir(),
            "is_owner": is_owner,
        }

        ctx["tools"] = provider_tools
        ctx["tool_ctx"] = tool_ctx
        ctx["enabled_tools"] = enabled_tools
        ctx["registry"] = registry
        ctx["active_provider"] = active_provider

        logger.info(
            "VoiceChat: tools resolved — phone=%s, is_owner=%s, enabled=%s",
            phone_number or "(none)", is_owner, enabled_tools,
        )

        language = cfg.get("asr.language", "zh")

        # The tool summary is appended to the system prompt so the model knows
        # what it can call.
        tools_summary = registry.build_tools_summary(enabled_tools)
        full_prompt = system_prompt
        if tools_summary:
            full_prompt += "\n\n" + tools_summary

        messages = [{"role": "system", "content": full_prompt}]
        running = True

        await _send_json(ws, {"type": "ready"})
        logger.info("VoiceChat: session started")

        # Optional "AI speaks first" mode: generate and speak an opening line
        # before entering the listen loop.
        ai_first = msg.get("ai_first", False)
        if ai_first and system_prompt:
            logger.info("VoiceChat: ai_first mode — generating opening")
            await _send_json(ws, {"type": "thinking"})
            try:
                opening_messages = messages + [
                    {"role": "user", "content": "请根据系统提示词中的任务目标,主动开始对话。直接说第一句话即可,不要解释。"}
                ]
                result = await llm.generate(opening_messages, tools=provider_tools)
                ai_text = result.get("content", "").strip()
            except Exception as e:
                # Opening failure is non-fatal: fall through to normal listening.
                logger.exception("VoiceChat: AI-first LLM error")
                ai_text = ""

            if ai_text:
                # Note: the synthetic "please open the conversation" user
                # message is kept out of the persistent history on purpose.
                messages.append({"role": "assistant", "content": ai_text})
                await _send_json(ws, {"type": "ai_text", "text": ai_text})
                await _send_json(ws, {"type": "debug_messages", "messages": opening_messages + [{"role": "assistant", "content": ai_text}]})
                if not await _send_tts_segments(ws, tts, ai_text, language):
                    # Send failure means the client is gone — stop the loop.
                    running = False

        # Main loop: one full listen → ASR → LLM → TTS turn per iteration;
        # exits via WebSocketDisconnect raised inside the turn.
        while running:
            await _conversation_turn(ws, asr, llm, tts, vad, messages, language, ctx)

    except WebSocketDisconnect:
        logger.info("VoiceChat: client disconnected")
    except Exception as e:
        logger.exception("VoiceChat: unexpected error")
        await _send_json(ws, {"type": "error", "message": str(e)})
    finally:
        running = False
        # Best-effort ASR stream teardown; the provider may already be stopped.
        if asr:
            try:
                await asr.stop_stream()
            except Exception:
                pass
        logger.info("VoiceChat: session ended")
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
async def _conversation_turn(ws, asr, llm, tts, vad, messages, language, ctx):
    """Execute one listen → ASR → LLM → TTS turn."""

    await _send_json(ws, {"type": "listening"})
    vad.reset()

    # Restart the ASR stream for a clean turn (stop first in case a previous
    # stream is still open — assumes the provider tolerates stop-when-stopped).
    await asr.stop_stream()
    await asr.start_stream(language)

    speech_started = False
    audio_chunks: list[bytes] = []
    last_interim = ""
    vad_report_counter = 0

    # --- Listening phase: consume frames until VAD reports end of speech ---
    while True:
        try:
            raw = await ws.receive()
        except RuntimeError:
            # Starlette raises RuntimeError when receiving on a closed socket;
            # normalize to a disconnect for the caller.
            raise WebSocketDisconnect()

        # Text frames carry control messages; binary frames carry audio.
        if "text" in raw:
            msg = json.loads(raw["text"])
            if msg.get("type") == "stop":
                raise WebSocketDisconnect()
            if msg.get("type") == "update_system_prompt":
                # Deferred: applied just before the next LLM call, not mid-turn.
                new_prompt = msg.get("system_prompt", "").strip()
                if new_prompt:
                    ctx["pending_prompt"] = new_prompt
                    logger.info("VoiceChat: system prompt updated (pending)")
            if msg.get("type") == "update_vad":
                # Live-tune the VAD; missing fields default to current values
                # (frame counts converted back to ms via the 20 ms frame size).
                threshold = float(msg.get("energy_threshold", vad._base_threshold))
                silence_ms = int(msg.get("silence_threshold_ms", vad._silence_frames * 20))
                min_speech_ms = int(msg.get("min_speech_ms", vad._speech_frames * 20))
                vad._base_threshold = threshold
                vad._threshold = max(threshold, vad._noise_floor * vad._noise_margin)
                vad._silence_frames = silence_ms // 20
                vad._speech_frames = max(1, min_speech_ms // 20)
                logger.info(
                    "VoiceChat: VAD params updated live — base_threshold=%.0f, effective=%.0f, silence=%dms, min_speech=%dms",
                    threshold, vad._threshold, silence_ms, min_speech_ms,
                )
            continue

        if "bytes" not in raw or raw["bytes"] is None:
            continue

        # Binary frame layout: optional 4-byte little-endian float32 prefix
        # carrying the client's pre-AGC RMS, followed by raw PCM.
        raw_bytes = raw["bytes"]
        if len(raw_bytes) > 4:
            pre_agc_rms = struct.unpack('<f', raw_bytes[:4])[0]
            chunk = raw_bytes[4:]
        else:
            pre_agc_rms = None
            chunk = raw_bytes

        event = vad.feed(chunk, external_rms=pre_agc_rms)

        # Report VAD telemetry to the client every 4th chunk to limit traffic.
        vad_report_counter += 1
        if vad_report_counter % 4 == 0:
            await _send_json(ws, {
                "type": "vad_info",
                "rms": round(vad.last_rms, 1),
                "threshold": round(vad._threshold, 1),
                "noise_floor": round(vad._noise_floor, 1),
                "in_speech": vad._in_speech,
                "silence_count": vad._silence_count,
                "silence_target": vad._silence_frames,
            })

        if event == "speech_start" and not speech_started:
            speech_started = True
            audio_chunks.clear()
            last_interim = ""
            await _send_json(ws, {"type": "speech_start"})

        if speech_started:
            # NOTE(review): audio_chunks is accumulated but never read after
            # the loop — presumably kept for future use/diagnostics.
            audio_chunks.append(chunk)
            await asr.feed_audio(chunk)

            # Push interim transcripts to the client, deduplicated.
            interim = await asr.get_interim_result()
            if interim and interim != last_interim:
                last_interim = interim
                logger.debug("VoiceChat: interim → %s", interim)
                await _send_json(ws, {"type": "interim", "text": interim})

        if event == "speech_end" and speech_started:
            await _send_json(ws, {"type": "speech_end"})
            break

    # --- ASR phase ---
    user_text = await asr.get_result()
    user_text = user_text.strip()

    if not user_text:
        # Nothing recognized — end the turn; the caller's loop listens again.
        logger.info("VoiceChat: empty ASR result, resuming listening")
        return

    logger.info("VoiceChat: user said: %s", user_text)
    await _send_json(ws, {"type": "user_text", "text": user_text})
    messages.append({"role": "user", "content": user_text})

    # Apply a system-prompt update queued during listening (messages[0] is
    # always the system message).
    if ctx.get("pending_prompt"):
        messages[0]["content"] = ctx["pending_prompt"]
        ctx["pending_prompt"] = None
        logger.info("VoiceChat: applied updated system prompt")

    await _send_json(ws, {"type": "thinking"})

    provider_tools = ctx.get("tools")
    registry = ctx.get("registry")
    tool_ctx_base = ctx.get("tool_ctx", {})

    # --- LLM phase ---
    try:
        result = await llm.generate(messages, tools=provider_tools)
    except Exception as e:
        logger.exception("VoiceChat: LLM error")
        await _send_json(ws, {"type": "error", "message": f"LLM错误: {e}"})
        return

    # Record the assistant turn; prefer the provider's raw tool-call objects
    # when available so the history round-trips to the provider unchanged.
    assistant_msg: dict[str, Any] = {"role": "assistant"}
    if result.get("content"):
        assistant_msg["content"] = result["content"]
    if result.get("tool_calls"):
        assistant_msg["tool_calls"] = result.get("raw_tool_calls") or result["tool_calls"]
    messages.append(assistant_msg)

    # --- Tool-call loop: execute tool calls and re-query the LLM until it
    # stops requesting tools (no iteration cap — relies on the model halting).
    while result.get("tool_calls") and registry:
        tool_calls = result["tool_calls"]

        for tc in tool_calls:
            tc_name = tc.get("name", "")
            tc_args = tc.get("arguments", {})
            tc_id = tc.get("id", "")

            await _send_json(ws, {
                "type": "tool_call",
                "id": tc_id,
                "name": tc_name,
                "arguments": tc_args,
            })
            logger.info("VoiceChat: tool call: %s(%s)", tc_name, tc_args)

            handler = registry.get_handler(tc_name)
            if handler:
                perm_files = registry.get_permission_files(
                    tc_name,
                    tool_ctx_base.get("task_info", {}).get("user_phone", ""),
                    tool_ctx_base.get("task_info", {}).get("phone_number", ""),
                )
                handler_ctx = {
                    **tool_ctx_base,
                    "permission_files": perm_files,
                }
                try:
                    tool_result = await handler(tc_args, handler_ctx)
                except Exception as exc:
                    # Tool failures are fed back to the model as text, not raised.
                    logger.exception("VoiceChat: tool execution failed: %s", tc_name)
                    tool_result = f"Error: {exc}"
            else:
                tool_result = f"Unknown tool: {tc_name}"

            # NOTE(review): tool_result[:500] assumes handlers return str —
            # confirm against the registry's handler contract.
            await _send_json(ws, {
                "type": "tool_result",
                "id": tc_id,
                "name": tc_name,
                "result": tool_result[:500],
            })
            logger.info("VoiceChat: tool result: %s → %s", tc_name, tool_result[:100])

            messages.append({
                "role": "tool",
                "tool_call_id": tc_id,
                "name": tc_name,
                "content": tool_result,
            })

        # Ask the model again with the tool results in context.
        try:
            result = await llm.generate(messages, tools=provider_tools)
        except Exception as e:
            logger.exception("VoiceChat: LLM error after tool call")
            await _send_json(ws, {"type": "error", "message": f"LLM错误: {e}"})
            return

        assistant_msg = {"role": "assistant"}
        if result.get("content"):
            assistant_msg["content"] = result["content"]
        if result.get("tool_calls"):
            assistant_msg["tool_calls"] = result.get("raw_tool_calls") or result["tool_calls"]
        messages.append(assistant_msg)

    ai_text = (result.get("content") or "").strip()

    # Fallback phrase is spoken to the user but NOT added to the history
    # (the contentless assistant message above is what the history keeps).
    if not ai_text:
        ai_text = "抱歉,我没有理解你的意思,能再说一遍吗?"

    logger.info("VoiceChat: AI reply: %s", ai_text)
    await _send_json(ws, {"type": "ai_text", "text": ai_text})

    await _send_json(ws, {"type": "debug_messages", "messages": [m for m in messages]})

    # --- TTS phase ---
    await _send_tts_segments(ws, tts, ai_text, language)

    # Recycle the ASR stream so the next turn starts clean; failure here is
    # non-fatal because the next turn restarts the stream anyway.
    try:
        await asr.stop_stream()
        await asr.start_stream(language)
    except Exception:
        logger.debug("VoiceChat: ASR re-init failed", exc_info=True)