myagent-ai 1.13.7 → 1.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -42,7 +42,7 @@ class MainAgent(BaseAgent):
42
42
  严格以XML格式化输出以下内容:
43
43
  <output>
44
44
  <response>直接回复用户的内容。这是一段友好、自然的话语,用于向用户说明你正在做什么,或者回应用户的问题/问候。要求简洁、有礼貌、符合对话场景。如果用户只是问候,简单回应即可;如果用户有具体任务,要说明你的计划。</response>
45
- <usersays_correct>根据用户输入的"usersays"内容,结合上下文优化为新的用户输入,如果"usersays"为空,这里输出为空。</usersays_correct>
45
+ <usersays_correct>根据用户输入的"usersays"内容(语音转写文本),结合对话语境优化为更准确的用户意图表达(修正识别错误、补充标点、口语转书面语)。如果"usersays"为空,这里输出为空。</usersays_correct>
46
46
  <task_plan>任务计划(仅复杂任务使用):如"context"包含非空"task_plan",则更新它。否则,先评估任务复杂度——如果预计操作步骤不超过3步(如:单次查询、简单问答、格式转换、单文件修改、简单计算等简单任务),则<task_plan>输出为空,不要创建任务列表;只有当任务较复杂(预计超过3步操作,如:多文件修改、需要调研+实现+测试、涉及多个模块联动等),才以Markdown列表格式制定新任务列表。格式:每项用 "- [ ] 任务描述" 或 "- [x] 已完成任务",含完成状态标记。</task_plan>
47
47
 
48
48
  <toolstocal>
@@ -62,7 +62,7 @@ class MainAgent(BaseAgent):
62
62
  ## 核心规则
63
63
  1. 你必须且只能输出 <output> XML 结构,不要输出任何其他文本
64
64
  2. <response>: 必须输出一段直接回复用户的话语(这是用户实际看到的回复),要求简洁友好、自然流畅。不要只输出任务计划而不说话!
65
- 3. <usersays_correct>: 如果 context 中 usersays 非空,则根据对话语境优化为更准确的用户意图表达
65
+ 3. <usersays_correct>: 如果 context 中 usersays 非空(说明用户通过语音输入),则根据对话语境将语音转写文本优化为更准确的用户意图表达,修正识别错误、补充标点、口语转书面语。如果 usersays 为空,这里输出为空。
66
66
  4. <task_plan>: 仅用于复杂任务(预计超过3步操作)。简单任务(≤3步)输出为空。复杂任务使用 Markdown 列表格式,每项包含任务描述和完成状态标记 [x]/[ ]
67
67
  5. <toolstocal>: 列出所有需要执行的工具调用,每个工具包含完整的参数说明
68
68
  6. <parms>: **必须使用严格合法的JSON格式**,例如 {"query": "关键词", "num": 10},不要使用其他格式
@@ -514,7 +514,7 @@ class MainAgent(BaseAgent):
514
514
  session_id=context.session_id,
515
515
  conversation_history=conversation_history,
516
516
  user_typed_text=context.user_message,
517
- user_voice_text="",
517
+ user_voice_text=context.metadata.get("user_voice_text", ""),
518
518
  task_plan=current_task_plan,
519
519
  agent_override_prompt=agent_override_prompt,
520
520
  get_knowledge=get_knowledge_content,
@@ -605,8 +605,15 @@ class ContextBuilder:
605
605
  Returns:
606
606
  <userprint> 和 <usersays> XML 段落字符串
607
607
  """
608
- safe_typed = _xml_escape(user_typed_text.strip()) if user_typed_text else ""
609
- safe_voice = _xml_escape(user_voice_text.strip()) if user_voice_text else ""
608
+ # 语音输入时:userprint 为空,usersays 存原始语音文本
609
+ # 键盘输入时:userprint 存文本,usersays 为空
610
+ # 两者互斥
611
+ if user_voice_text and user_voice_text.strip():
612
+ safe_typed = ""
613
+ safe_voice = _xml_escape(user_voice_text.strip())
614
+ else:
615
+ safe_typed = _xml_escape(user_typed_text.strip()) if user_typed_text else ""
616
+ safe_voice = ""
610
617
 
611
618
  lines = [
612
619
  f"<userprint>",
@@ -485,6 +485,11 @@ def _fallback_regex_parse(raw_text: str) -> ParsedOutput:
485
485
  }
486
486
  )
487
487
 
488
+ # 如果正则回退成功提取到了关键内容(response、工具调用等),
489
+ # 则标记为解析成功,避免主循环误判为解析失败而中断执行
490
+ if parsed.response or parsed.tools_to_call or parsed.ask_user:
491
+ parsed.parse_success = True
492
+
488
493
  return parsed
489
494
 
490
495
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "myagent-ai",
3
- "version": "1.13.7",
3
+ "version": "1.14.1",
4
4
  "description": "本地桌面端执行型AI助手 - Open Interpreter 风格 | Local Desktop Execution-Oriented AI Assistant",
5
5
  "main": "main.py",
6
6
  "bin": {
package/requirements.txt CHANGED
@@ -50,6 +50,11 @@ discord.py>=2.3.0
50
50
  # ============================================================
51
51
  edge-tts>=6.1.0
52
52
 
53
+ # ============================================================
54
+ # 语音识别 (本地 STT,默认启用)
55
+ # ============================================================
56
+ faster-whisper>=1.0.0
57
+
53
58
  # ============================================================
54
59
  # Anthropic Claude (可选)
55
60
  # ============================================================
package/setup.py CHANGED
@@ -37,6 +37,8 @@ setup(
37
37
  "Pillow>=10.0.0",
38
38
  # 语音合成
39
39
  "edge-tts>=6.1.0",
40
+ # 语音识别 (本地 STT)
41
+ "faster-whisper>=1.0.0",
40
42
  # 浏览器自动化 (ChromeDev MCP, 无需 Playwright)
41
43
  # 桌面 GUI 自动化 (内置技能)
42
44
  "pynput>=1.7.6",
@@ -48,12 +50,14 @@ setup(
48
50
  "discord": ["discord.py>=2.3.0"],
49
51
  "anthropic": ["anthropic>=0.18.0"],
50
52
  "communication": ["cryptography>=41.0.0", "websockets>=12.0"],
53
+ "voice": ["faster-whisper>=1.0.0"],
51
54
  "all": [
52
55
  "python-telegram-bot>=21.0",
53
56
  "discord.py>=2.3.0",
54
57
  "anthropic>=0.18.0",
55
58
  "cryptography>=41.0.0",
56
59
  "websockets>=12.0",
60
+ "faster-whisper>=1.0.0",
57
61
  ],
58
62
  },
59
63
  entry_points={
package/web/api_server.py CHANGED
@@ -313,6 +313,7 @@ class ApiServer:
313
313
  r.add_post("/api/chat/stream", self.handle_chat_stream)
314
314
  r.add_post("/api/chat/inject", self.handle_chat_inject)
315
315
  r.add_post("/api/voice-optimize", self.handle_voice_optimize)
316
+ r.add_post("/api/voice-stt", self.handle_voice_stt)
316
317
  r.add_get("/chat", self.handle_chat_page)
317
318
  r.add_get("/api/execution/progress", self.handle_execution_progress)
318
319
  # ── 组织管理 ──
@@ -614,6 +615,7 @@ class ApiServer:
614
615
  session_id = f"{agent_path}_{raw_session_id}"
615
616
  chat_mode = data.get("mode", "")
616
617
  escalated = data.get("escalated", False)
618
+ voice_text = data.get("voice_text", "").strip() # 语音转文字原始文本(用于 usersays_correct)
617
619
 
618
620
  # ── 检查是否有正在运行的同一会话任务 ──
619
621
  running_info = self._running_sessions.get(session_id)
@@ -718,12 +720,13 @@ class ApiServer:
718
720
  model_chain, clean_message, session_id,
719
721
  agent_path=agent_path, agent_system_prompt=agent_system_prompt,
720
722
  chat_mode=chat_mode, stream_response=proxy,
723
+ voice_text=voice_text,
721
724
  )
722
725
  elif self.core.main_agent and self.core.llm:
723
726
  full_response = await self._stream_process_message(
724
727
  clean_message, session_id, proxy,
725
728
  agent_path=agent_path, agent_system_prompt=agent_system_prompt,
726
- chat_mode=chat_mode,
729
+ chat_mode=chat_mode, voice_text=voice_text,
727
730
  )
728
731
  else:
729
732
  full_response = await self.core.process_message(clean_message, session_id)
@@ -768,12 +771,13 @@ class ApiServer:
768
771
  model_chain, clean_message_q, session_id,
769
772
  agent_path=agent_path, agent_system_prompt=agent_system_prompt_q,
770
773
  chat_mode=chat_mode, stream_response=proxy,
774
+ voice_text="",
771
775
  )
772
776
  elif self.core.main_agent and self.core.llm:
773
777
  full_response = await self._stream_process_message(
774
778
  clean_message_q, session_id, proxy,
775
779
  agent_path=agent_path, agent_system_prompt=agent_system_prompt_q,
776
- chat_mode=chat_mode,
780
+ chat_mode=chat_mode, voice_text="",
777
781
  )
778
782
  else:
779
783
  full_response = await self.core.process_message(clean_message_q, session_id)
@@ -1026,6 +1030,145 @@ class ApiServer:
1026
1030
  logger.error(f"Voice optimize failed: {e}")
1027
1031
  return web.json_response({"error": str(e)}, status=500)
1028
1032
 
1033
+ async def handle_voice_stt(self, request):
1034
+ """POST /api/voice-stt - 轻量级本地语音转文字
1035
+
1036
+ 接受音频文件(WAV/WEBM/OGG),使用本地 STT 引擎转录。
1037
+ 支持的引擎(按优先级):
1038
+ 1. faster-whisper(推荐,需安装:pip install faster-whisper)
1039
+ 2. vosk(备选,需安装:pip install vosk)
1040
+ 如果都未安装,返回错误提示。
1041
+ """
1042
+ try:
1043
+ reader = await request.multipart()
1044
+ audio_data = None
1045
+ audio_format = None
1046
+
1047
+ while True:
1048
+ field = await reader.next()
1049
+ if field is None:
1050
+ break
1051
+ if field.name == 'audio':
1052
+ audio_data = await field.read(decode=False)
1053
+ elif field.name == 'format':
1054
+ audio_format = (await field.read(decode=True)).decode('utf-8').strip()
1055
+
1056
+ if not audio_data:
1057
+ # 也支持 JSON body:{"audio": "base64...", "format": "wav"}
1058
+ try:
1059
+ data = await request.json()
1060
+ audio_b64 = data.get("audio", "")
1061
+ audio_format = data.get("format", "wav")
1062
+ if audio_b64:
1063
+ import base64
1064
+ audio_data = base64.b64decode(audio_b64)
1065
+ except Exception:
1066
+ pass
1067
+
1068
+ if not audio_data:
1069
+ return web.json_response({"error": "未收到音频数据"}, status=400)
1070
+
1071
+ import io
1072
+
1073
+ # ── 尝试 faster-whisper ──
1074
+ try:
1075
+ from faster_whisper import WhisperModel
1076
+ whisper_model = getattr(self, '_whisper_model', None)
1077
+ if whisper_model is None:
1078
+ import os
1079
+ model_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'models', 'whisper')
1080
+ # 使用 tiny 模型(最轻量,~39MB),CPU int8 量化
1081
+ self._whisper_model = WhisperModel("tiny", device="cpu", compute_type="int8",
1082
+ download_root=model_dir)
1083
+ whisper_model = self._whisper_model
1084
+ logger.info("faster-whisper tiny 模型已加载 (CPU int8)")
1085
+
1086
+ # faster-whisper 需要 16kHz WAV
1087
+ import wave
1088
+ audio_buf = io.BytesIO(audio_data)
1089
+
1090
+ # 转换为 WAV 16kHz mono
1091
+ wav_buf = io.BytesIO()
1092
+ with wave.open(wav_buf, 'wb') as wf:
1093
+ # 尝试读取原始音频
1094
+ try:
1095
+ with wave.open(audio_buf, 'rb') as rf:
1096
+ wf.setnchannels(1)
1097
+ wf.setsampwidth(2)
1098
+ wf.setframerate(16000)
1099
+ # 读取所有帧并重采样
1100
+ frames = rf.readframes(rf.getnframes())
1101
+ wf.writeframes(frames)
1102
+ except Exception:
1103
+ # 非 WAV 格式,尝试通过 pydub 或直接写入
1104
+ wf.setnchannels(1)
1105
+ wf.setsampwidth(2)
1106
+ wf.setframerate(16000)
1107
+ wf.writeframes(audio_data)
1108
+
1109
+ wav_buf.seek(0)
1110
+ segments, info = whisper_model.transcribe(wav_buf, language="zh", beam_size=1,
1111
+ vad_filter=True, vad_parameters=dict(
1112
+ min_silence_duration_ms=300))
1113
+ text = "".join(seg.text for seg in segments).strip()
1114
+
1115
+ if text:
1116
+ return web.json_response({"text": text, "engine": "faster-whisper"})
1117
+ except ImportError:
1118
+ logger.debug("faster-whisper 未安装,跳过")
1119
+ except Exception as e:
1120
+ logger.warning(f"faster-whisper 转录失败: {e}")
1121
+
1122
+ # ── 尝试 vosk ──
1123
+ try:
1124
+ import vosk
1125
+ model = getattr(self, '_vosk_model', None)
1126
+ if model is None:
1127
+ import os, zipfile
1128
+ model_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'models', 'vosk', 'vosk-model-cn')
1129
+ if not os.path.exists(model_dir):
1130
+ # 自动下载 vosk 小型中文模型
1131
+ logger.info("正在下载 vosk 中文模型...")
1132
+ import urllib.request
1133
+ url = "https://alphacephei.com/vosk/models/vosk-model-small-cn-0.22.zip"
1134
+ zip_path = model_dir + ".zip"
1135
+ os.makedirs(os.path.dirname(model_dir), exist_ok=True)
1136
+ try:
1137
+ urllib.request.urlretrieve(url, zip_path)
1138
+ with zipfile.ZipFile(zip_path, 'r') as zf:
1139
+ zf.extractall(os.path.dirname(model_dir))
1140
+ os.remove(zip_path)
1141
+ except Exception as de:
1142
+ logger.warning(f"vosk 模型下载失败: {de}")
1143
+ if os.path.exists(model_dir):
1144
+ self._vosk_model = vosk.Model(model_dir)
1145
+ model = self._vosk_model
1146
+
1147
+ if model:
1148
+ import json as _json
1149
+ rec = vosk.KaldiRecognizer(model, 16000)
1150
+ rec.AcceptWaveform(audio_data)
1151
+ result = _json.loads(rec.Result())
1152
+ text = result.get("text", "").strip()
1153
+ if text:
1154
+ return web.json_response({"text": text, "engine": "vosk"})
1155
+ except ImportError:
1156
+ logger.debug("vosk 未安装,跳过")
1157
+ except Exception as e:
1158
+ logger.warning(f"vosk 转录失败: {e}")
1159
+
1160
+ # ── 没有可用的 STT 引擎 ──
1161
+ return web.json_response({
1162
+ "error": "未检测到本地 STT 引擎。请安装 faster-whisper(推荐)或 vosk:\n"
1163
+ " pip install faster-whisper (首次使用会自动下载 tiny 模型 ~39MB)\n"
1164
+ " 或 pip install vosk",
1165
+ "available": False,
1166
+ }, status=503)
1167
+
1168
+ except Exception as e:
1169
+ logger.error(f"Voice STT failed: {e}", exc_info=True)
1170
+ return web.json_response({"error": str(e)}, status=500)
1171
+
1029
1172
  def _build_task_plan_context(self, agent_path: str, chat_mode: str, original_message: str, session_id: str = "") -> str:
1030
1173
  """构建任务规划上下文(仅 exec 模式,注入到 system_prompt 中)"""
1031
1174
  if chat_mode != "exec":
@@ -3168,7 +3311,8 @@ class ApiServer:
3168
3311
 
3169
3312
  async def _try_model_chain_stream(self, model_chain, message, session_id,
3170
3313
  agent_path=None, agent_system_prompt=None,
3171
- chat_mode="", stream_response=None):
3314
+ chat_mode="", stream_response=None,
3315
+ voice_text=""):
3172
3316
  """流式版本的模型链调用,逐token输出到SSE
3173
3317
 
3174
3318
  使用 asyncio.Lock 保护共享的 self.core.llm,防止并发请求互相干扰。
@@ -3183,11 +3327,13 @@ class ApiServer:
3183
3327
  model_chain, message, session_id,
3184
3328
  agent_path=agent_path, agent_system_prompt=agent_system_prompt,
3185
3329
  chat_mode=chat_mode, stream_response=stream_response,
3330
+ voice_text=voice_text,
3186
3331
  )
3187
3332
 
3188
3333
  async def _try_model_chain_stream_inner(self, model_chain, message, session_id,
3189
3334
  agent_path=None, agent_system_prompt=None,
3190
- chat_mode="", stream_response=None):
3335
+ chat_mode="", stream_response=None,
3336
+ voice_text=""):
3191
3337
  """_try_model_chain_stream 的实际执行体(已在 _model_chain_lock 保护下)"""
3192
3338
  llm = self.core.llm
3193
3339
  full_text = ""
@@ -3212,7 +3358,7 @@ class ApiServer:
3212
3358
  result = await self._stream_process_message(
3213
3359
  message, session_id, stream_response,
3214
3360
  agent_path=agent_path, agent_system_prompt=agent_system_prompt,
3215
- chat_mode=chat_mode,
3361
+ chat_mode=chat_mode, voice_text=voice_text,
3216
3362
  )
3217
3363
  if result and not result.startswith("⚠️") and not result.startswith("❌"):
3218
3364
  return result
@@ -3243,7 +3389,8 @@ class ApiServer:
3243
3389
  await asyncio.sleep(delay)
3244
3390
 
3245
3391
  async def _stream_process_message(self, user_message, session_id, stream_response,
3246
- agent_path=None, agent_system_prompt=None, chat_mode=""):
3392
+ agent_path=None, agent_system_prompt=None, chat_mode="",
3393
+ voice_text=""):
3247
3394
  """使用流式LLM调用处理消息,支持完整的agent循环(工具调用/操作执行)+ 实时流式输出
3248
3395
 
3249
3396
  核心改进:
@@ -3269,6 +3416,7 @@ class ApiServer:
3269
3416
  context.metadata["agent_override_prompt"] = agent_system_prompt
3270
3417
  context.metadata["agent_override_path"] = agent_path
3271
3418
  context.metadata["chat_mode"] = chat_mode
3419
+ context.metadata["user_voice_text"] = voice_text # 语音输入原始文本(用于 usersays_correct)
3272
3420
 
3273
3421
  # ── 根据 Agent 配置设置执行引擎参数(execution_mode 等)──
3274
3422
  agent_cfg_for_exec = self._read_agent_config(agent_path)
@@ -455,7 +455,9 @@ input,textarea,select{font:inherit}
455
455
 
456
456
  /* ── Message Content Smooth Render ── */
457
457
  .message-content{
458
- flex:1;min-width:0;
458
+ flex:1;min-width:0;width:100%;
459
+ /* 确保所有子元素(thought-block, bubble 等)撑满宽度 */
460
+ display:flex;flex-direction:column;align-items:stretch;
459
461
  }
460
462
  .stream-text-node{
461
463
  display:inline;
@@ -469,7 +471,7 @@ input,textarea,select{font:inherit}
469
471
  }
470
472
 
471
473
  /* ── Thought Block (Agent Thinking) ── */
472
- .thought-block{width:100%;display:block;margin:0 0 10px 0;border:1px solid var(--border-light);border-radius:var(--radius-sm);overflow:hidden;background:linear-gradient(135deg,var(--accent-light),var(--bg2));animation:thoughtFadeIn .4s ease-out}
474
+ .thought-block{width:100%;max-width:100%;display:block;margin:0 0 10px 0;border:1px solid var(--border-light);border-radius:var(--radius-sm);overflow:hidden;background:linear-gradient(135deg,var(--accent-light),var(--bg2));animation:thoughtFadeIn .4s ease-out;flex-shrink:0;box-sizing:border-box}
473
475
  .thought-block.streaming{border-color:var(--accent);box-shadow:0 0 12px rgba(99,102,241,.15)}
474
476
  @keyframes thoughtFadeIn{from{opacity:0;transform:translateY(-6px)}to{opacity:1;transform:translateY(0)}}
475
477
  .thought-block summary{display:flex;align-items:center;gap:8px;padding:8px 14px;cursor:pointer;font-size:12px;font-weight:600;color:var(--text2);user-select:none;transition:var(--transition);text-transform:uppercase;letter-spacing:.3px}
@@ -353,14 +353,17 @@ function initChat() {
353
353
  }
354
354
 
355
355
  // 如果 URL 指定了 agent 或 session,等 agent 列表加载后自动选中
356
- if (urlAgent || urlSession) {
356
+ // 注意:loadSessions() 内部会检查 URL session 参数并自动恢复
357
+ if (urlAgent) {
357
358
  const targetAgent = urlAgent || (urlSession ? urlSession.split('_web_')[0] || 'default' : null);
358
359
  setTimeout(function() {
359
360
  if (targetAgent) selectAgent(targetAgent);
360
- // 如果指定了 session,等会话列表加载后自动选中
361
- if (urlSession) {
362
- setTimeout(function() { selectSession(urlSession); }, 800);
363
- }
361
+ }, 500);
362
+ } else if (urlSession) {
363
+ // 只有 session 没有 agent,尝试从 session ID 推断 agent
364
+ const targetAgent = urlSession.split('_web_')[0] || 'default';
365
+ setTimeout(function() {
366
+ selectAgent(targetAgent);
364
367
  }, 500);
365
368
  }
366
369
  }
@@ -1581,7 +1584,13 @@ async function loadSessions() {
1581
1584
  updateSidebarAgentIndicator();
1582
1585
 
1583
1586
  // Auto-select most recent session if none selected
1584
- if (!state.activeSessionId && state.sessions.length > 0) {
1587
+ // 优先检查 URL 参数指定的 session(页面刷新恢复)
1588
+ const urlParams = new URLSearchParams(window.location.search);
1589
+ const urlSession = urlParams.get('session');
1590
+ if (urlSession && state.sessions.some(s => s.id === urlSession)) {
1591
+ // URL 指定了有效的 session ID,直接选中(刷新恢复)
1592
+ await selectSession(urlSession);
1593
+ } else if (!state.activeSessionId && state.sessions.length > 0) {
1585
1594
  await selectSession(state.sessions[0].id);
1586
1595
  }
1587
1596
  }
@@ -4016,28 +4025,38 @@ if (document.readyState === 'loading') {
4016
4025
  var VoiceInput = {
4017
4026
  mode: 'text', // 'text' | 'voice'
4018
4027
  isRecording: false,
4019
- recognition: null,
4028
+ mediaRecorder: null,
4029
+ audioChunks: [],
4020
4030
  rawText: '',
4021
- optimizedText: '',
4022
- isOptimizing: false,
4023
4031
  _micPermissionGranted: false, // 麦克风权限是否已确认
4024
- _startRetries: 0, // 启动重试计数
4025
-
4026
- /** 检查是否在安全上下文中(HTTPS 或 localhost) */
4027
- _isSecureContext: function() {
4028
- if (window.isSecureContext) return true;
4029
- // 某些浏览器不支持 isSecureContext,手动检查
4030
- var protocol = window.location.protocol;
4031
- var hostname = window.location.hostname;
4032
- return protocol === 'https:' || hostname === 'localhost' || hostname === '127.0.0.1';
4032
+ _audioStream: null, // 当前活跃的音频流
4033
+ _sttEngine: null, // 检测到的STT引擎名称
4034
+
4035
+ /** 检查STT引擎是否可用 */
4036
+ checkSTTAvailable: async function() {
4037
+ try {
4038
+ var resp = await fetch('/api/voice-stt', { method: 'OPTIONS' }).catch(function() { return { ok: false }; });
4039
+ // OPTIONS might not be supported, try a small test
4040
+ var testData = new FormData();
4041
+ testData.append('audio', new Blob([], { type: 'audio/wav' }));
4042
+ var testResp = await fetch('/api/voice-stt', {
4043
+ method: 'POST',
4044
+ body: testData,
4045
+ });
4046
+ if (testResp.status === 400) {
4047
+ // 400 means "no audio data" — endpoint exists and works
4048
+ return true;
4049
+ }
4050
+ return testResp.ok;
4051
+ } catch (e) {
4052
+ return false;
4053
+ }
4033
4054
  },
4034
4055
 
4035
- /** 主动请求麦克风权限(通过 getUserMedia 确认权限状态) */
4056
+ /** 主动请求麦克风权限 */
4036
4057
  _ensureMicPermission: async function() {
4037
- // 如果已经确认有权限,跳过
4038
4058
  if (this._micPermissionGranted) return true;
4039
4059
 
4040
- // 检查 navigator.permissions API
4041
4060
  if (navigator.permissions && navigator.permissions.query) {
4042
4061
  try {
4043
4062
  var result = await navigator.permissions.query({ name: 'microphone' });
@@ -4048,16 +4067,12 @@ var VoiceInput = {
4048
4067
  if (result.state === 'denied') {
4049
4068
  return false;
4050
4069
  }
4051
- } catch (_) {
4052
- // permissions.query 可能不支持 microphone,继续尝试 getUserMedia
4053
- }
4070
+ } catch (_) {}
4054
4071
  }
4055
4072
 
4056
- // 通过 getUserMedia 主动请求麦克风权限
4057
4073
  if (navigator.mediaDevices && navigator.mediaDevices.getUserMedia) {
4058
4074
  try {
4059
4075
  var stream = await navigator.mediaDevices.getUserMedia({ audio: true });
4060
- // 获取成功,立即释放(SpeechRecognition 会自己管理音频流)
4061
4076
  stream.getTracks().forEach(function(t) { t.stop(); });
4062
4077
  this._micPermissionGranted = true;
4063
4078
  return true;
@@ -4067,88 +4082,6 @@ var VoiceInput = {
4067
4082
  return false;
4068
4083
  }
4069
4084
  }
4070
-
4071
- // 没有 mediaDevices API(HTTP 环境),但 SpeechRecognition 可能仍可用
4072
- return true;
4073
- },
4074
-
4075
- /** Initialize Web Speech API */
4076
- init: function() {
4077
- var SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
4078
- if (!SpeechRecognition) {
4079
- console.warn('Web Speech API not supported');
4080
- return false;
4081
- }
4082
- this.recognition = new SpeechRecognition();
4083
- this.recognition.continuous = true;
4084
- this.recognition.interimResults = true;
4085
- this.recognition.lang = 'zh-CN';
4086
- this.recognition.maxAlternatives = 1;
4087
-
4088
- var self = this;
4089
- this.recognition.onresult = function(event) {
4090
- var transcript = '';
4091
- for (var i = 0; i < event.results.length; i++) {
4092
- transcript += event.results[i][0].transcript;
4093
- }
4094
- self.rawText = transcript;
4095
- self._startRetries = 0; // 成功获取结果,重置重试计数
4096
- var statusEl = document.getElementById('voiceStatus');
4097
- if (statusEl) {
4098
- statusEl.textContent = transcript || '正在聆听...';
4099
- statusEl.style.color = transcript ? 'var(--text)' : 'var(--text3)';
4100
- }
4101
- };
4102
-
4103
- this.recognition.onerror = function(event) {
4104
- console.error('Speech recognition error:', event.error);
4105
- var statusEl = document.getElementById('voiceStatus');
4106
- var errMsg = '';
4107
-
4108
- switch (event.error) {
4109
- case 'no-speech':
4110
- // 用户没有说话,静默处理
4111
- self._showStatus('未检测到语音,请重试', 'var(--text3)');
4112
- break;
4113
- case 'aborted':
4114
- // 用户取消,不显示错误
4115
- break;
4116
- case 'not-allowed':
4117
- errMsg = '麦克风权限被拒绝,请在浏览器地址栏左侧点击权限图标允许麦克风访问';
4118
- self._micPermissionGranted = false;
4119
- break;
4120
- case 'service-not-available':
4121
- errMsg = '语音识别服务不可用,请检查网络连接或尝试刷新页面';
4122
- break;
4123
- case 'service-not-allowed':
4124
- errMsg = '语音识别服务未授权,请检查浏览器设置是否允许语音识别';
4125
- break;
4126
- case 'audio-capture':
4127
- errMsg = '未找到麦克风设备,请确认已连接麦克风';
4128
- break;
4129
- case 'network':
4130
- errMsg = '语音识别网络错误,请检查网络连接后重试';
4131
- break;
4132
- default:
4133
- errMsg = '语音识别出错 (' + event.error + '),请重试';
4134
- break;
4135
- }
4136
-
4137
- if (errMsg) {
4138
- self._showStatus(errMsg, 'var(--danger)');
4139
- }
4140
- self.stopRecording();
4141
- };
4142
-
4143
- this.recognition.onend = function() {
4144
- if (self.isRecording) {
4145
- self.stopRecording();
4146
- }
4147
- };
4148
-
4149
- // Set up press-and-hold for the record button
4150
- this._setupHoldButton();
4151
-
4152
4085
  return true;
4153
4086
  },
4154
4087
 
@@ -4161,6 +4094,12 @@ var VoiceInput = {
4161
4094
  }
4162
4095
  },
4163
4096
 
4097
+ /** 初始化(设置按钮事件) */
4098
+ init: function() {
4099
+ this._setupHoldButton();
4100
+ return true;
4101
+ },
4102
+
4164
4103
  /** Set up press-and-hold behavior on the voice record button */
4165
4104
  _setupHoldButton: function() {
4166
4105
  var btn = document.getElementById('voiceRecordBtn');
@@ -4215,45 +4154,38 @@ var VoiceInput = {
4215
4154
  if (textBtn) textBtn.classList.remove('active');
4216
4155
  if (voiceBtn) voiceBtn.classList.add('active');
4217
4156
  if (inputBox) inputBox.style.borderColor = '';
4218
- // Initialize speech recognition if not done
4219
- if (!this.recognition) {
4157
+ // Init if not done
4158
+ if (!this._setupDone) {
4220
4159
  this.init();
4160
+ this._setupDone = true;
4221
4161
  }
4162
+ // Check STT availability
4163
+ this._showStatus('按住麦克风开始录音', 'var(--text3)');
4222
4164
  } else {
4223
4165
  if (textArea) textArea.style.display = 'flex';
4224
4166
  if (voiceArea) voiceArea.style.display = 'none';
4225
4167
  if (voicePreview) voicePreview.style.display = 'none';
4226
4168
  if (textBtn) textBtn.classList.add('active');
4227
4169
  if (voiceBtn) voiceBtn.classList.remove('active');
4228
- // Cancel any ongoing recording
4229
4170
  if (this.isRecording) {
4230
4171
  this.cancelRecording();
4231
4172
  }
4232
4173
  }
4233
4174
  },
4234
4175
 
4235
- /** Start recording(异步,先检查权限) */
4176
+ /** Start recording(使用 MediaRecorder) */
4236
4177
  startRecording: async function() {
4237
- if (this.isRecording || !this.recognition) return;
4238
- if (this.isOptimizing) return;
4239
-
4240
- // ── Step 1: 检查安全上下文 ──
4241
- if (!this._isSecureContext()) {
4242
- this._showStatus('语音识别需要 HTTPS 环境,当前页面不安全', 'var(--danger)');
4243
- if (typeof toast === 'function') {
4244
- toast('语音输入需要 HTTPS 环境,请通过 HTTPS 地址访问', 'error');
4245
- }
4246
- return;
4247
- }
4178
+ if (this.isRecording) return;
4248
4179
 
4249
4180
  this.isRecording = true;
4181
+ this.audioChunks = [];
4250
4182
  this.rawText = '';
4251
4183
 
4252
4184
  var btn = document.getElementById('voiceRecordBtn');
4253
4185
  if (btn) btn.classList.add('recording');
4254
4186
  this._showStatus('正在请求麦克风权限...', 'var(--text3)');
4255
4187
 
4256
- // ── Step 2: 主动请求麦克风权限 ──
4188
+ // 获取麦克风权限
4257
4189
  var hasPermission = await this._ensureMicPermission();
4258
4190
  if (!hasPermission) {
4259
4191
  this.isRecording = false;
@@ -4265,121 +4197,179 @@ var VoiceInput = {
4265
4197
  return;
4266
4198
  }
4267
4199
 
4268
- this._showStatus('正在聆听...', 'var(--text3)');
4269
-
4270
- // ── Step 3: 启动语音识别(带重试) ──
4271
4200
  try {
4272
- this.recognition.start();
4273
- } catch (e) {
4274
- // recognition 可能处于中间状态(stopped 但未完全重置),先 stop 再重试
4275
- console.warn('Recognition start failed, retrying:', e.message);
4276
- try { this.recognition.stop(); } catch(_) {}
4277
- if (this._startRetries < 2) {
4278
- this._startRetries++;
4279
- setTimeout(function() {
4280
- if (VoiceInput.isRecording) {
4281
- try { VoiceInput.recognition.start(); } catch(_) {
4282
- VoiceInput.isRecording = false;
4283
- if (btn) btn.classList.remove('recording');
4284
- VoiceInput._showStatus('语音识别启动失败,请重试', 'var(--danger)');
4285
- }
4286
- }
4287
- }, 150);
4288
- } else {
4289
- this.isRecording = false;
4290
- if (btn) btn.classList.remove('recording');
4291
- this._showStatus('语音识别启动失败,请刷新页面后重试', 'var(--danger)');
4292
- this._startRetries = 0;
4201
+ // 创建音频流
4202
+ this._audioStream = await navigator.mediaDevices.getUserMedia({
4203
+ audio: {
4204
+ channelCount: 1,
4205
+ sampleRate: 16000,
4206
+ echoCancellation: true,
4207
+ noiseSuppression: true,
4208
+ }
4209
+ });
4210
+
4211
+ // 创建 MediaRecorder(优先使用 WAV 格式,回退到 WEBM)
4212
+ var mimeType = 'audio/webm;codecs=opus';
4213
+ if (typeof MediaRecorder !== 'undefined' && MediaRecorder.isTypeSupported) {
4214
+ if (MediaRecorder.isTypeSupported('audio/webm;codecs=opus')) {
4215
+ mimeType = 'audio/webm;codecs=opus';
4216
+ } else if (MediaRecorder.isTypeSupported('audio/webm')) {
4217
+ mimeType = 'audio/webm';
4218
+ } else if (MediaRecorder.isTypeSupported('audio/ogg;codecs=opus')) {
4219
+ mimeType = 'audio/ogg;codecs=opus';
4220
+ }
4293
4221
  }
4222
+
4223
+ this.mediaRecorder = new MediaRecorder(this._audioStream, { mimeType: mimeType });
4224
+ var self = this;
4225
+
4226
+ this.mediaRecorder.ondataavailable = function(e) {
4227
+ if (e.data && e.data.size > 0) {
4228
+ self.audioChunks.push(e.data);
4229
+ }
4230
+ };
4231
+
4232
+ this.mediaRecorder.onstop = function() {
4233
+ self._processAudio();
4234
+ };
4235
+
4236
+ this.mediaRecorder.onerror = function(e) {
4237
+ console.error('MediaRecorder error:', e.error);
4238
+ self.isRecording = false;
4239
+ if (btn) btn.classList.remove('recording');
4240
+ self._showStatus('录音出错,请重试', 'var(--danger)');
4241
+ self._cleanupStream();
4242
+ };
4243
+
4244
+ // 开始录音(每100ms收集一次数据)
4245
+ this.mediaRecorder.start(100);
4246
+ this._showStatus('正在录音...', 'var(--text3)');
4247
+ this._recordingStartTime = Date.now();
4248
+
4249
+ } catch (e) {
4250
+ this.isRecording = false;
4251
+ if (btn) btn.classList.remove('recording');
4252
+ this._showStatus('无法启动录音: ' + (e.message || '未知错误'), 'var(--danger)');
4253
+ this._cleanupStream();
4294
4254
  }
4295
4255
  },
4296
4256
 
4297
- /** Stop recording and trigger optimization */
4257
+ /** Stop recording and process audio */
4298
4258
  stopRecording: function() {
4299
- if (!this.isRecording || !this.recognition) return;
4300
- this.isRecording = false;
4301
- this._startRetries = 0;
4259
+ if (!this.isRecording || !this.mediaRecorder) return;
4302
4260
 
4303
4261
  var btn = document.getElementById('voiceRecordBtn');
4304
4262
  if (btn) btn.classList.remove('recording');
4305
4263
 
4264
+ // 检查录音时长(太短则忽略)
4265
+ var duration = Date.now() - (this._recordingStartTime || 0);
4266
+ if (duration < 500) {
4267
+ this._showStatus('录音时间太短,请按住麦克风说话', 'var(--text3)');
4268
+ this.isRecording = false;
4269
+ try { this.mediaRecorder.stop(); } catch (e) {}
4270
+ this._cleanupStream();
4271
+ return;
4272
+ }
4273
+
4274
+ this._showStatus('正在识别...', 'var(--text3)');
4275
+ this.isRecording = false;
4276
+
4306
4277
  try {
4307
- this.recognition.stop();
4278
+ this.mediaRecorder.stop();
4308
4279
  } catch (e) {}
4309
-
4310
- // Only proceed if we have some text
4311
- if (this.rawText && this.rawText.trim()) {
4312
- this.optimizeAndPreview();
4313
- } else {
4314
- this._showStatus('未检测到语音,请重试', 'var(--text3)');
4315
- }
4316
4280
  },
4317
4281
 
4318
- /** Cancel recording without processing */
4319
- cancelRecording: function() {
4320
- this.isRecording = false;
4321
- if (this.recognition) {
4322
- try { this.recognition.abort(); } catch(e) {}
4323
- }
4324
- var btn = document.getElementById('voiceRecordBtn');
4325
- if (btn) btn.classList.remove('recording');
4326
- var statusEl = document.getElementById('voiceStatus');
4327
- if (statusEl) {
4328
- statusEl.textContent = '';
4282
+ /** 清理音频流 */
4283
+ _cleanupStream: function() {
4284
+ if (this._audioStream) {
4285
+ this._audioStream.getTracks().forEach(function(t) { t.stop(); });
4286
+ this._audioStream = null;
4329
4287
  }
4330
- this.rawText = '';
4331
4288
  },
4332
4289
 
4333
- /** Send raw text to backend for LLM optimization, then show preview */
4334
- optimizeAndPreview: async function() {
4335
- if (!this.rawText.trim()) return;
4290
+ /** 处理录音数据:发送到后端 STT */
4291
+ _processAudio: async function() {
4292
+ if (this.audioChunks.length === 0) {
4293
+ this._showStatus('未检测到语音,请重试', 'var(--text3)');
4294
+ this._cleanupStream();
4295
+ return;
4296
+ }
4336
4297
 
4337
- this.isOptimizing = true;
4338
4298
  var voiceArea = document.getElementById('voiceInputArea');
4339
4299
  var voicePreview = document.getElementById('voicePreview');
4340
4300
  var previewText = document.getElementById('voicePreviewText');
4341
4301
  var previewHint = document.getElementById('voicePreviewHint');
4342
4302
  var previewSend = document.getElementById('voicePreviewSend');
4343
4303
 
4344
- // Show preview area with raw text first
4304
+ // 显示预览区域
4345
4305
  if (voiceArea) voiceArea.style.display = 'none';
4346
4306
  if (voicePreview) voicePreview.style.display = 'block';
4347
- if (previewText) previewText.textContent = this.rawText;
4348
- if (previewHint) previewHint.textContent = '优化中...';
4307
+ if (previewText) previewText.textContent = '识别中...';
4308
+ if (previewHint) previewHint.textContent = '正在发送到本地STT引擎';
4349
4309
  if (previewSend) previewSend.disabled = true;
4350
4310
 
4351
- // Show raw text in input for now
4352
- this.optimizedText = this.rawText;
4353
-
4354
4311
  try {
4355
- var sessionId = state.activeSessionId || '';
4356
- var data = await api('/api/voice-optimize', {
4312
+ var audioBlob = new Blob(this.audioChunks, { type: this.mediaRecorder ? this.mediaRecorder.mimeType : 'audio/webm' });
4313
+
4314
+ // 发送音频到后端 STT 端点
4315
+ var formData = new FormData();
4316
+ formData.append('audio', audioBlob, 'recording.webm');
4317
+ formData.append('format', 'webm');
4318
+
4319
+ var resp = await fetch('/api/voice-stt', {
4357
4320
  method: 'POST',
4358
- body: JSON.stringify({
4359
- text: this.rawText,
4360
- agent_path: state.activeAgent,
4361
- session_id: sessionId,
4362
- mode: state.chatMode,
4363
- })
4321
+ body: formData,
4364
4322
  });
4365
4323
 
4366
- if (data && data.optimized) {
4367
- this.optimizedText = data.optimized;
4368
- if (previewText) previewText.textContent = data.optimized;
4369
- if (previewHint) previewHint.textContent = '已优化';
4324
+ var data = await resp.json();
4325
+
4326
+ if (data && data.text && data.text.trim()) {
4327
+ this.rawText = data.text.trim();
4328
+ this._sttEngine = data.engine || 'unknown';
4329
+ if (previewText) previewText.textContent = this.rawText;
4330
+ if (previewHint) previewHint.textContent = '已识别 (' + (this._sttEngine || 'local') + ')';
4370
4331
  } else if (data && data.error) {
4371
- if (previewHint) previewHint.textContent = '优化失败,使用原文';
4372
- console.warn('Voice optimization error:', data.error);
4332
+ if (previewHint) previewHint.textContent = '识别失败';
4333
+ if (previewText) previewText.textContent = data.error;
4334
+ console.warn('Voice STT error:', data.error);
4335
+ // 如果没有STT引擎,给出提示
4336
+ if (data.available === false) {
4337
+ if (previewHint) previewHint.textContent = '未检测到STT引擎';
4338
+ if (typeof toast === 'function') {
4339
+ toast('请安装语音识别引擎: pip install faster-whisper', 'error');
4340
+ }
4341
+ }
4342
+ } else {
4343
+ if (previewHint) previewHint.textContent = '未识别到文字';
4344
+ if (previewText) previewText.textContent = '未识别到文字,请重试';
4373
4345
  }
4374
4346
  } catch (e) {
4375
- if (previewHint) previewHint.textContent = '网络错误,使用原文';
4376
- console.error('Voice optimize API error:', e);
4347
+ console.error('Voice STT request error:', e);
4348
+ if (previewHint) previewHint.textContent = '网络错误';
4349
+ if (previewText) previewText.textContent = 'STT请求失败,请重试';
4377
4350
  }
4378
4351
 
4379
- this.isOptimizing = false;
4352
+ this._cleanupStream();
4380
4353
  if (previewSend) previewSend.disabled = false;
4381
4354
  },
4382
4355
 
4356
+ /** Cancel recording without processing */
4357
+ cancelRecording: function() {
4358
+ this.isRecording = false;
4359
+ if (this.mediaRecorder && this.mediaRecorder.state !== 'inactive') {
4360
+ try { this.mediaRecorder.abort(); } catch(e) {}
4361
+ }
4362
+ this._cleanupStream();
4363
+ var btn = document.getElementById('voiceRecordBtn');
4364
+ if (btn) btn.classList.remove('recording');
4365
+ var statusEl = document.getElementById('voiceStatus');
4366
+ if (statusEl) {
4367
+ statusEl.textContent = '';
4368
+ }
4369
+ this.rawText = '';
4370
+ this.audioChunks = [];
4371
+ },
4372
+
4383
4373
  /** Cancel voice preview and return to voice input mode */
4384
4374
  cancelPreview: function() {
4385
4375
  var voiceArea = document.getElementById('voiceInputArea');
@@ -4391,34 +4381,32 @@ var VoiceInput = {
4391
4381
  if (statusEl) statusEl.textContent = '';
4392
4382
 
4393
4383
  this.rawText = '';
4394
- this.optimizedText = '';
4395
- this.isOptimizing = false;
4384
+ this.audioChunks = [];
4396
4385
  },
4397
4386
 
4398
- /** Send the optimized voice text as a regular message */
4387
+ /** Send the voice text as a message(直接发送原始STT文本,由后端usersays_correct纠错) */
4399
4388
  sendMessage: function() {
4400
- if (!this.optimizedText || !this.optimizedText.trim()) return;
4389
+ if (!this.rawText || !this.rawText.trim()) return;
4401
4390
 
4402
- var text = this.optimizedText.trim();
4391
+ var text = this.rawText.trim();
4403
4392
 
4404
- // Switch back to text mode
4393
+ // 切回文本模式
4405
4394
  this.switchMode('text');
4406
4395
 
4407
- // Set the text in the textarea and trigger send
4396
+ // 将文本放入输入框并触发发送
4408
4397
  var input = document.getElementById('userInput');
4409
4398
  if (input) {
4410
4399
  input.value = text;
4411
4400
  input.dispatchEvent(new Event('input'));
4412
4401
  }
4413
4402
 
4414
- // Reset voice state
4403
+ // 重置语音状态
4415
4404
  this.rawText = '';
4416
- this.optimizedText = '';
4417
- this.isOptimizing = false;
4405
+ this.audioChunks = [];
4418
4406
 
4419
- // Send the message using the existing sendMessage function
4407
+ // 使用 sendMessage 发送(附带 voice_text 标记)
4420
4408
  if (typeof sendMessage === 'function') {
4421
- sendMessage();
4409
+ sendMessage({ voiceText: text });
4422
4410
  }
4423
4411
  }
4424
4412
  };
@@ -4440,18 +4428,8 @@ function sendVoiceMessage() {
4440
4428
 
4441
4429
  // Initialize voice input on DOM ready
4442
4430
  (function() {
4443
- var hasSpeechAPI = !!(window.SpeechRecognition || window.webkitSpeechRecognition);
4444
- var voiceBtn = document.getElementById('inputModeVoiceBtn');
4445
- if (!hasSpeechAPI && voiceBtn) {
4446
- voiceBtn.style.opacity = '0.3';
4447
- voiceBtn.style.cursor = 'not-allowed';
4448
- voiceBtn.title = '当前浏览器不支持语音输入';
4449
- voiceBtn.onclick = function(e) {
4450
- e.preventDefault();
4451
- if (typeof toast === 'function') {
4452
- toast('当前浏览器不支持语音识别,请使用 Chrome 或 Edge', 'error');
4453
- }
4454
- };
4455
- }
4431
+ // Voice input now uses MediaRecorder (always available) + backend STT
4432
+ // No need to check for SpeechRecognition API
4433
+ // The voice button is always enabled; STT engine availability is checked when recording
4456
4434
  })();
4457
4435
 
@@ -1124,16 +1124,63 @@ function _assembleV2Content(msg, msgParts) {
1124
1124
  return '(无回复)';
1125
1125
  }
1126
1126
 
1127
+ // ══════════════════════════════════════════════════════
1128
+ // ── Voice Input: User Bubble Replacement ──
1129
+ // ══════════════════════════════════════════════════════
1130
+
1131
/**
 * Swap the text shown in one user bubble (used for usersays_correct fixes).
 *
 * Works directly on the DOM and does not call renderMessages().
 * The bubble is dimmed, its content replaced after 200 ms, then restored to
 * full opacity. Nothing happens when the container, the message at `idx`,
 * the matching row, or its bubble cannot be found.
 *
 * @param {number} idx - index of the user message inside state.messages
 * @param {string} newText - corrected text to display
 */
function _replaceUserBubble(idx, newText) {
  var list = document.getElementById('messagesInner');
  if (!list) return;

  var rows = list.querySelectorAll('.message-row.user');

  // Dim the bubble, then replace its content and bring it back.
  function fadeSwap(target) {
    target.style.transition = 'opacity 0.2s ease';
    target.style.opacity = '0.4';
    setTimeout(function() {
      // Prefer the markdown renderer when the page provides one.
      if (typeof renderMarkdown === 'function') {
        target.innerHTML = renderMarkdown(newText);
      } else {
        target.textContent = newText;
      }
      target.style.opacity = '1';
    }, 200);
  }

  // Only user messages have a '.message-row.user' element, so the row to
  // update is the one whose position equals the number of user messages
  // that precede index `idx`.
  var ordinal = -1;
  for (var k = 0; k < state.messages.length && k <= idx; k++) {
    if (state.messages[k].role !== 'user') continue;
    ordinal += 1;
    if (k !== idx) continue;

    var row = ordinal < rows.length ? rows[ordinal] : null;
    var target = row ? row.querySelector('.message-bubble') : null;
    if (target) fadeSwap(target);
    return;
  }
}
1172
+
1127
1173
  // ══════════════════════════════════════════════════════
1128
1174
  // ── Send Message (核心 SSE 流式消息发送) ──
1129
1175
  // ══════════════════════════════════════════════════════
1130
1176
 
1131
- async function sendMessage() {
1177
+ async function sendMessage(opts) {
1132
1178
  if (currentView === 'group') {
1133
1179
  return sendGroupChat();
1134
1180
  }
1135
1181
  const input = document.getElementById('userInput');
1136
1182
  const text = input.value.trim();
1183
+ const voiceText = (opts && opts.voiceText) ? opts.voiceText : ''; // 语音输入原始文本
1137
1184
  // ── 如果正在生成,弹出处理选择框 ──
1138
1185
  if (state.isGenerating) {
1139
1186
  state.tempInputText = text;
@@ -1173,7 +1220,7 @@ async function sendMessage() {
1173
1220
  }
1174
1221
 
1175
1222
  // Add user message
1176
- state.messages.push({ role: 'user', content: text, time: new Date().toISOString() });
1223
+ state.messages.push({ role: 'user', content: text, time: new Date().toISOString(), _voiceText: voiceText });
1177
1224
  renderMessages();
1178
1225
 
1179
1226
  // Clear input
@@ -1207,6 +1254,7 @@ async function sendMessage() {
1207
1254
  agent_path: state.activeAgent,
1208
1255
  mode: state.chatMode,
1209
1256
  escalated: state.escalated,
1257
+ voice_text: voiceText, // 语音转文字原始文本(用于后端 usersays_correct)
1210
1258
  }),
1211
1259
  signal: state.abortController.signal,
1212
1260
  });
@@ -1402,6 +1450,31 @@ async function sendMessage() {
1402
1450
  // evt.data contains: {usersays_correct, task_plan, tools_to_call, remember, recall, ask_user, finish}
1403
1451
  // Store for rendering
1404
1452
  state.messages[msgIdx]._v2Parsed = evt.data;
1453
+ // ── usersays_correct:语音输入纠错 — 替换用户气泡文本 ──
1454
+ if (evt.data && evt.data.usersays_correct && evt.data.usersays_correct.trim()) {
1455
+ var correctedText = evt.data.usersays_correct.trim();
1456
+ // 找到对应的用户消息(当前消息的前一条)
1457
+ var userMsgIdx = msgIdx - 1;
1458
+ // 确认是语音消息(有 _voiceText 标记)
1459
+ if (userMsgIdx >= 0 && state.messages[userMsgIdx] &&
1460
+ state.messages[userMsgIdx].role === 'user' &&
1461
+ state.messages[userMsgIdx]._voiceText) {
1462
+ var oldContent = state.messages[userMsgIdx].content;
1463
+ if (oldContent !== correctedText) {
1464
+ state.messages[userMsgIdx].content = correctedText;
1465
+ state.messages[userMsgIdx]._voiceCorrected = true;
1466
+ // 更新用户气泡的 DOM(不重绘整个列表,直接替换文本)
1467
+ _replaceUserBubble(userMsgIdx, correctedText);
1468
+ // 更新侧边栏会话预览
1469
+ if (state.sessions && state.sessions.length > 0) {
1470
+ state.sessions[0].preview = correctedText.length > 40 ? correctedText.slice(0, 40) + '...' : correctedText;
1471
+ if (typeof renderSessions === 'function') {
1472
+ renderSessions();
1473
+ }
1474
+ }
1475
+ }
1476
+ }
1477
+ }
1405
1478
  // Render task plan if updated
1406
1479
  if (evt.data && evt.data.task_plan) {
1407
1480
  state.messages[msgIdx]._v2TaskPlan = evt.data.task_plan;
@@ -134,7 +134,7 @@
134
134
  </div>
135
135
  <!-- Voice preview area (shown after recording, before sending) -->
136
136
  <div class="voice-preview" id="voicePreview" style="display:none">
137
- <div class="voice-preview-label">语音输入 · <span id="voicePreviewHint">优化中...</span></div>
137
+ <div class="voice-preview-label">语音输入 · <span id="voicePreviewHint">识别中...</span></div>
138
138
  <div class="voice-preview-text" id="voicePreviewText"></div>
139
139
  <div class="voice-preview-actions">
140
140
  <button class="voice-preview-cancel" onclick="cancelVoicePreview()">取消</button>