myagent-ai 1.14.0 → 1.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,6 +6,7 @@ agents/main_agent.py - 主 Agent
6
6
  from __future__ import annotations
7
7
 
8
8
  import asyncio
9
+ import re
9
10
  from typing import Any, Callable, Dict, List, Optional
10
11
 
11
12
  from core.logger import get_logger
@@ -442,6 +443,8 @@ class MainAgent(BaseAgent):
442
443
  get_knowledge_content = ""
443
444
  # 追踪流式推送的 reasoning 文本(用于构建有意义的最终回复)
444
445
  _v2_reasoning_collected: List[str] = []
446
+ # XML 解析失败时的 LLM 修正重试计数
447
+ _xml_correction_retries: int = 0
445
448
 
446
449
  conversation_history = list(context.conversation_history or [])
447
450
 
@@ -587,31 +590,54 @@ class MainAgent(BaseAgent):
587
590
  "finish": parsed.finish,
588
591
  "finish_reason": truncate_str(parsed.finish_reason, 200),
589
592
  "next_step": truncate_str(parsed.next_step, 200),
593
+ "response": truncate_str(parsed.response, 500),
590
594
  "parse_success": parsed.parse_success,
595
+ "needs_correction": parsed.needs_correction,
591
596
  }},
592
597
  stream_callback,
593
598
  )
594
599
 
600
+ # Step 4.5: 解析失败处理 — 回退给 LLM 修正或提取周边文本
595
601
  if not parsed.parse_success:
596
- logger.warning(f"[{task_id}] XML 解析失败,尝试提取周边文本")
597
- before, after = extract_surrounding_text(llm_raw)
598
- if before.strip() or after.strip():
599
- final_text = (before + "\n" + after).strip()
600
- context.working_memory["final_response"] = final_text
601
- await self._emit_v2_event("v2_reasoning", {"content": final_text}, stream_callback)
602
- if self.memory:
603
- self.memory.add_session(
604
- session_id=context.session_id,
605
- role="assistant",
606
- content=final_text,
607
- )
608
- break
602
+ if parsed.needs_correction and _xml_correction_retries < 1:
603
+ # XML 完全无法解析,让 LLM 重新格式化输出
604
+ _xml_correction_retries += 1
605
+ logger.warning(
606
+ f"[{task_id}] XML 解析完全失败,回退给 LLM 修正 "
607
+ f"(重试 {_xml_correction_retries}/1)"
608
+ )
609
+ correction_prompt = (
610
+ "你上一次的输出格式有误,XML解析器无法识别。"
611
+ "请严格按照 <output>...</output> 格式重新输出你的回答。"
612
+ "注意:不要在 <output> 标签前后输出任何其他文字。\n\n"
613
+ f"你上一次的原始输出如下:\n{llm_raw}"
614
+ )
615
+ conversation_history.append(
616
+ Message(role="assistant", content=llm_raw)
617
+ )
618
+ conversation_history.append(
619
+ Message(role="user", content=correction_prompt)
620
+ )
621
+ await self._emit_v2_event(
622
+ "v2_reasoning",
623
+ {"content": "⚠️ 模型输出格式异常,正在自动修正..."},
624
+ stream_callback,
625
+ )
626
+ continue # 重新进入循环,让 LLM 重新生成
609
627
  else:
610
- # XML 解析失败且无法提取文本,发送原始输出作为备选
611
- logger.warning(f"[{task_id}] 无法提取文本,发送原始 LLM 输出")
612
- final_text = llm_raw.strip() if llm_raw.strip() else "处理完毕。"
628
+ # 已重试过或不需要修正,提取周边文本作为备选
629
+ logger.warning(f"[{task_id}] XML 解析失败,提取周边文本作为备选")
630
+ before, after = extract_surrounding_text(llm_raw)
631
+ if before.strip() or after.strip():
632
+ final_text = (before + "\n" + after).strip()
633
+ else:
634
+ # 清除残余 XML 标签后作为纯文本
635
+ final_text = re.sub(r"<[^>]+>", "", llm_raw).strip()
636
+ final_text = final_text if final_text else "处理完毕。"
613
637
  context.working_memory["final_response"] = final_text
614
- await self._emit_v2_event("v2_reasoning", {"content": final_text}, stream_callback)
638
+ await self._emit_v2_event(
639
+ "v2_reasoning", {"content": final_text}, stream_callback
640
+ )
615
641
  if self.memory:
616
642
  self.memory.add_session(
617
643
  session_id=context.session_id,
@@ -1,11 +1,14 @@
1
1
  """
2
- Structured Output Parser Module.
2
+ Custom Fault-Tolerant XML Output Parser Module.
3
3
 
4
4
  Parses the XML ``<output>`` block generated by the LLM in response to the
5
5
  system prompt. The LLM produces structured XML that drives the agent's
6
6
  execution loop — including tool calls, memory operations, user interaction
7
7
  hints, and loop-control flags.
8
8
 
9
+ **This module does NOT use xml.etree.ElementTree.** All parsing is done with
10
+ pure Python + regex to achieve maximum fault tolerance.
11
+
9
12
  Expected XML schema produced by the LLM::
10
13
 
11
14
  <output>
@@ -20,26 +23,31 @@ Expected XML schema produced by the LLM::
20
23
  <callback>true/false</callback>
21
24
  </tool>
22
25
  </toolstocal>
23
- <remember>仅从最新用户输入中提炼的记忆,无新信息则为空</remember>
26
+ <remember>
27
+ <type>global|session</type>
28
+ <content>记忆内容</content>
29
+ </remember>
24
30
  <recall>下一轮需要调取的记忆</recall>
25
31
  <askuser>需要询问用户的内容</askuser>
26
32
  <get_knowledge>下一轮需要搜索获得的知识</get_knowledge>
27
33
  <finish>true/false</finish>
34
+ <response>模型对用户的直接回复</response>
28
35
  </output>
29
36
 
30
- The parser is deliberately robust:
37
+ Fault-tolerance features:
31
38
 
32
- * XML parsing errors are caught and a regex-based fallback is attempted.
33
- * Missing optional fields are defaulted (``finish`` → ``False``,
34
- ``timeout`` → ``120``, ``callback`` → ``True``).
35
- * All text fields are stripped of leading/trailing whitespace.
36
- * Empty tags resolve to empty strings rather than raising errors.
39
+ * Text before ``<output>`` or after ``</output>`` is silently stripped.
40
+ * Unclosed tags are auto-closed at the next sibling tag boundary.
41
+ * Self-closing tags (``<tag/>``) resolve to empty strings.
42
+ * Case-insensitive tag matching (``<OUTPUT>`` == ``<output>``).
43
+ * Tag-name aliases: ``<ask_user>`` → ``askuser``.
44
+ * If extraction yields nothing meaningful, ``needs_correction`` is set to
45
+ ``True`` so the caller can ask the LLM to re-format.
37
46
  """
38
47
 
39
48
  from __future__ import annotations
40
49
 
41
50
  import re
42
- import xml.etree.ElementTree as ET
43
51
  from dataclasses import dataclass, field
44
52
  from typing import Any, Dict, List
45
53
 
@@ -54,16 +62,46 @@ logger = get_logger("myagent.output_parser")
54
62
  _DEFAULT_TIMEOUT: int = 120
55
63
  _DEFAULT_CALLBACK: bool = True
56
64
 
57
- # Regex patterns used for extraction and fallback parsing.
58
- _OUTPUT_BLOCK_RE = re.compile(
59
- r"<output\s*>(.*?)</output\s*>",
60
- re.DOTALL | re.IGNORECASE,
61
- )
65
+ # All top-level tags we recognise inside <output>.
66
+ KNOWN_TOP_LEVEL_TAGS = [
67
+ "usersays_correct",
68
+ "task_plan",
69
+ "toolstocal",
70
+ "remember",
71
+ "recall",
72
+ "knowledge",
73
+ "askuser",
74
+ "ask_user", # alias for askuser
75
+ "get_knowledge",
76
+ "finish",
77
+ "finish_reason",
78
+ "next_step",
79
+ "response",
80
+ ]
81
+
82
+ # Inner tags inside each <tool>.
83
+ TOOL_INNER_TAGS = [
84
+ "beforecalltext",
85
+ "toolname",
86
+ "parms",
87
+ "timeout",
88
+ "callback",
89
+ ]
90
+
91
+ # Inner tags inside <remember>.
92
+ REMEMBER_INNER_TAGS = ["type", "content"]
93
+
94
+ # Tag aliases: canonical name -> list of aliases.
95
+ _TAG_ALIASES: Dict[str, List[str]] = {
96
+ "askuser": ["ask_user"],
97
+ }
98
+
99
+ # Build reverse lookup: alias -> canonical.
100
+ _ALIAS_TO_CANONICAL: Dict[str, str] = {}
101
+ for _canonical, _aliases in _TAG_ALIASES.items():
102
+ for _alias in _aliases:
103
+ _ALIAS_TO_CANONICAL[_alias.lower()] = _canonical
62
104
 
63
- _TAG_CONTENT_RE = re.compile(
64
- r"<(\w+)\s*>(.*?)</\1\s*>",
65
- re.DOTALL,
66
- )
67
105
 
68
106
  # ---------------------------------------------------------------------------
69
107
  # Data classes
@@ -80,29 +118,27 @@ class ParsedOutput:
80
118
  task_plan: Updated or new task plan (may contain Markdown).
81
119
  tools_to_call: Ordered list of tool descriptors to execute.
82
120
  remember: Content that should be persisted to the agent's memory.
83
- Structured as ``<type>global|session</type><content>...</content>``
84
- in the LLM output, but parsed into separate fields.
85
121
  remember_type: "global" (cross-session) or "session" (current session only).
86
122
  recall: Memory keys / descriptions to retrieve for the next loop
87
123
  iteration.
88
- knowledge: Knowledge content the LLM wants to persist to the
89
- knowledge base (analogous to remember → memory).
124
+ knowledge: Knowledge content the LLM wants to persist.
90
125
  ask_user: Free-form question the agent should pose to the user.
91
126
  get_knowledge: Knowledge search keywords for the next loop iteration.
92
- The ContextBuilder will use this to perform RAG retrieval.
93
127
  finish: When ``True`` the execution loop should terminate.
94
128
  finish_reason: When finish=True, explains why the task is ending.
95
129
  next_step: When finish=False, describes what to do next.
130
+ response: Model's direct reply to the user (friendly natural language).
96
131
  raw_text: The verbatim raw text returned by the LLM.
97
- parse_success: Whether the XML was parsed successfully (``True``)
98
- or the regex fallback was used (``False``).
132
+ parse_success: Whether parsing extracted at least one meaningful field.
133
+ needs_correction: When ``True``, the caller should send the raw text
134
+ back to the LLM for re-formatting.
99
135
  """
100
136
 
101
137
  usersays_correct: str = ""
102
138
  task_plan: str = ""
103
139
  tools_to_call: List[Dict[str, Any]] = field(default_factory=list)
104
140
  remember: str = ""
105
- remember_type: str = "" # "global" | "session" (default: "session")
141
+ remember_type: str = ""
106
142
  recall: str = ""
107
143
  knowledge: str = ""
108
144
  ask_user: str = ""
@@ -110,37 +146,24 @@ class ParsedOutput:
110
146
  finish: bool = False
111
147
  finish_reason: str = ""
112
148
  next_step: str = ""
113
- response: str = "" # 模型对用户的直接回复(友好自然的话语)
149
+ response: str = ""
114
150
  raw_text: str = ""
115
151
  parse_success: bool = False
152
+ needs_correction: bool = False
116
153
 
117
154
 
118
155
  # ---------------------------------------------------------------------------
119
- # Helper utilities
156
+ # Low-level extraction helpers
120
157
  # ---------------------------------------------------------------------------
121
158
 
122
159
 
123
- def _extract_xml_block(text: str) -> str | None:
124
- """Return the content between ``<output>`` and ``</output>``, or ``None``.
125
-
126
- The search is case-insensitive and tolerant of whitespace around the tag
127
- names.
128
- """
129
- match = _OUTPUT_BLOCK_RE.search(text)
130
- if match:
131
- return match.group(1)
132
- return None
133
-
134
-
135
160
  def _safe_strip(value: str | None) -> str:
136
- """Return the stripped string, defaulting to ``""`` for ``None``."""
137
161
  if value is None:
138
162
  return ""
139
163
  return value.strip()
140
164
 
141
165
 
142
166
  def _parse_bool(value: str | None, default: bool) -> bool:
143
- """Parse a boolean from a string, returning *default* on failure."""
144
167
  if value is None:
145
168
  return default
146
169
  stripped = value.strip().lower()
@@ -152,7 +175,6 @@ def _parse_bool(value: str | None, default: bool) -> bool:
152
175
 
153
176
 
154
177
  def _parse_int(value: str | None, default: int) -> int:
155
- """Parse an integer from a string, returning *default* on failure."""
156
178
  if value is None:
157
179
  return default
158
180
  try:
@@ -161,331 +183,366 @@ def _parse_int(value: str | None, default: int) -> int:
161
183
  return default
162
184
 
163
185
 
164
- def _parse_remember_tag(element: ET.Element | None) -> tuple[str, str]:
165
- """Parse the ``<remember>`` element which may contain ``<type>`` and
166
- ``<content>`` sub-tags, or plain text (legacy format).
186
+ def _canonical_tag(tag_name: str) -> str:
187
+ """Return the canonical tag name for *tag_name* (alias-aware, lowercased)."""
188
+ lower = tag_name.strip().lower()
189
+ return _ALIAS_TO_CANONICAL.get(lower, lower)
167
190
 
168
- Returns:
169
- ``(content, remember_type)`` — *content* is the memory text,
170
- *remember_type* is ``"global"`` or ``"session"`` (default).
191
+
192
+ def _extract_tag_content(text: str, tag_name: str, stop_tags: List[str] | None = None) -> str:
193
+ """Extract the text content of ``<tag_name>…</tag_name>`` from *text*.
194
+
195
+ Fault-tolerant strategies tried in order:
196
+
197
+ 1. **Properly closed**: ``<tag>content</tag>``
198
+ 2. **Unclosed at next sibling opening tag**: ``<tag>content<next_tag>…``
199
+ 3. **Unclosed at ``</output>``**: ``<tag>content</output>``
200
+ 4. **Self-closing**: ``<tag/>``
201
+ 5. **Opening tag at end of string**: ``<tag>content$``
202
+
203
+ Parameters:
204
+ text: The text to search within (typically the body of ``<output>``).
205
+ tag_name: The tag name to extract (case-insensitive).
206
+ stop_tags: Sibling tag names that signal the end of this tag's
207
+ content (used for unclosed-tag detection). Defaults to
208
+ ``KNOWN_TOP_LEVEL_TAGS``.
171
209
  """
172
- if element is None:
173
- return "", ""
210
+ if not text or not tag_name:
211
+ return ""
174
212
 
175
- # Try structured format: <remember><type>global</type><content>...</content></remember>
176
- type_elem = element.findtext("type")
177
- content_elem = element.findtext("content")
178
- if content_elem is not None and content_elem.strip():
179
- mem_type = _safe_strip(type_elem) if type_elem else "session"
180
- if mem_type not in ("global", "session"):
181
- mem_type = "session"
182
- return _safe_strip(content_elem), mem_type
213
+ if stop_tags is None:
214
+ stop_tags = KNOWN_TOP_LEVEL_TAGS
183
215
 
184
- # Legacy format: <remember>plain text</remember>
185
- text = element.text or ""
186
- return _safe_strip(text), "session"
216
+ tag_esc = re.escape(tag_name)
187
217
 
218
+ # Strategy 1: Properly closed <tag>content</tag>
219
+ m = re.search(
220
+ rf"<{tag_esc}[^>]*>(.*?)</{tag_esc}\s*>",
221
+ text,
222
+ re.DOTALL | re.IGNORECASE,
223
+ )
224
+ if m:
225
+ return m.group(1)
226
+
227
+ # Strategy 2: Unclosed — content runs until the next opening/closing
228
+ # sibling tag or </output>.
229
+ sibling_names = [t for t in stop_tags if t.lower() != tag_name.lower()]
230
+ if sibling_names:
231
+ sibling_pat = "|".join(re.escape(t) for t in sibling_names)
232
+ # CRITICAL: Wrap sibling_pat in (?:...) so that | doesn't split the
233
+ # leading < or </ from the alternation. Without this, e.g.
234
+ # "<a|b|c" is parsed as "<a" OR "b" OR "c" — NOT "<a" OR "<b" OR "<c".
235
+ boundary = rf"(?:</output\s*>|<(?:{sibling_pat})\b|</(?:{sibling_pat})\s*>)"
236
+ else:
237
+ boundary = r"</output\s*>"
188
238
 
189
- # ---------------------------------------------------------------------------
190
- # Core parser
191
- # ---------------------------------------------------------------------------
239
+ m = re.search(
240
+ rf"<{tag_esc}[^>]*>(.*?)({boundary})",
241
+ text,
242
+ re.DOTALL | re.IGNORECASE,
243
+ )
244
+ if m:
245
+ return m.group(1)
192
246
 
247
+ # Strategy 3: Self-closing <tag/> or <tag />
248
+ m = re.search(rf"<{tag_esc}[^>]*/\s*>", text, re.IGNORECASE)
249
+ if m:
250
+ return ""
193
251
 
194
- def extract_surrounding_text(full_text: str) -> tuple[str, str]:
195
- """Split *full_text* around the ``<output>…</output>`` block.
252
+ # Strategy 4: Opening tag at end of text with no closing
253
+ m = re.search(
254
+ rf"<{tag_esc}[^>]*>(.*?)$",
255
+ text,
256
+ re.DOTALL | re.IGNORECASE,
257
+ )
258
+ if m:
259
+ content = m.group(1).strip()
260
+ # Only return if there's actual content (not just whitespace)
261
+ if content:
262
+ return content
196
263
 
197
- Returns:
198
- A ``(text_before_xml, text_after_xml)`` tuple. Both parts are
199
- stripped. If no ``<output>`` block is found the original text
200
- becomes *text_before_xml* and *text_after_xml* is ``""``.
264
+ return ""
265
+
266
+
267
+ def _extract_all_tag_blocks(
268
+ text: str,
269
+ tag_name: str,
270
+ parent_close_tag: str | None = None,
271
+ ) -> List[str]:
272
+ """Extract all ``<tag_name>…`` blocks from *text*.
273
+
274
+ Used for extracting multiple ``<tool>`` blocks from ``<toolstocal>``
275
+ content. Handles both properly closed and unclosed blocks.
276
+
277
+ Returns a list of content strings, one per block.
201
278
  """
202
- match = re.search(
203
- r"<output\s*>",
204
- full_text,
205
- re.IGNORECASE,
279
+ if not text:
280
+ return []
281
+
282
+ tag_esc = re.escape(tag_name)
283
+ blocks: List[str] = []
284
+
285
+ # Strategy 1: Find all properly closed <tag>content</tag> blocks
286
+ properly_closed = re.findall(
287
+ rf"<{tag_esc}[^>]*>(.*?)</{tag_esc}\s*>",
288
+ text,
289
+ re.DOTALL | re.IGNORECASE,
206
290
  )
207
- if match is None:
208
- return full_text.strip(), ""
291
+ if properly_closed:
292
+ return properly_closed
293
+
294
+ # Strategy 2: Split by <tag> openings — each segment is a block
295
+ positions = [
296
+ m.end() for m in re.finditer(rf"<{tag_esc}[^>]*>", text, re.IGNORECASE)
297
+ ]
298
+
299
+ for i, content_start in enumerate(positions):
300
+ if i + 1 < len(positions):
301
+ # Block ends at next <tag> opening
302
+ content_end = positions[i + 1]
303
+ elif parent_close_tag:
304
+ # Last block — ends at parent close tag
305
+ close_m = re.search(
306
+ re.escape(parent_close_tag),
307
+ text[content_start:],
308
+ re.IGNORECASE,
309
+ )
310
+ content_end = content_start + close_m.start() if close_m else len(text)
311
+ else:
312
+ content_end = len(text)
313
+
314
+ blocks.append(text[content_start:content_end])
315
+
316
+ return blocks
209
317
 
210
- text_before = full_text[: match.start()].strip()
211
318
 
212
- end_match = re.search(
319
+ def _extract_output_body(raw_text: str) -> str | None:
320
+ """Extract the content between ``<output>`` and ``</output>``.
321
+
322
+ If ``</output>`` is missing (unclosed), returns everything after the
323
+ opening ``<output>`` tag.
324
+
325
+ Returns ``None`` if no ``<output>`` opening tag is found at all.
326
+ """
327
+ open_match = re.search(r"<output[^>]*>", raw_text, re.IGNORECASE)
328
+ if open_match is None:
329
+ return None
330
+
331
+ content_start = open_match.end()
332
+
333
+ close_match = re.search(
213
334
  r"</output\s*>",
214
- full_text[match.start() :],
335
+ raw_text[content_start:],
215
336
  re.IGNORECASE,
216
337
  )
217
- if end_match is None:
218
- # Opening tag found but no closing tag — everything after opening
219
- # tag is considered "after".
220
- text_after = full_text[match.end() :].strip()
221
- else:
222
- absolute_end = match.start() + end_match.end()
223
- text_after = full_text[absolute_end:].strip()
338
+ if close_match:
339
+ return raw_text[content_start : content_start + close_match.start()]
224
340
 
225
- return text_before, text_after
341
+ # Unclosed <output> — take everything after it
342
+ return raw_text[content_start:]
226
343
 
227
344
 
228
- def _parse_tools_element(tools_element: ET.Element | None) -> List[Dict[str, Any]]:
229
- """Parse the ``<toolstocal>`` element into a list of tool dicts."""
230
- tools: List[Dict[str, Any]] = []
231
- if tools_element is None:
232
- return tools
345
+ def _strip_outer_noise(text: str) -> str:
346
+ """Remove text that is outside any recognised XML tags.
233
347
 
234
- for tool_el in tools_element.findall("tool"):
235
- tool: Dict[str, Any] = {
236
- "beforecalltext": _safe_strip(tool_el.findtext("beforecalltext")),
237
- "toolname": _safe_strip(tool_el.findtext("toolname")),
238
- "parms": _safe_strip(tool_el.findtext("parms")),
239
- "timeout": _parse_int(
240
- tool_el.findtext("timeout"), _DEFAULT_TIMEOUT
241
- ),
242
- "callback": _parse_bool(
243
- tool_el.findtext("callback"), _DEFAULT_CALLBACK
244
- ),
245
- }
246
- tools.append(tool)
247
-
248
- return tools
348
+ This handles the case where the LLM outputs plain text before or
349
+ after the ``<output>`` block, e.g.::
249
350
 
351
+ "我来使用 Python 脚本下载... <output>...</output>"
250
352
 
251
- def _fix_incomplete_xml(xml_content: str) -> str:
252
- """修复不完整的 XML,使用正则表达式补齐格式。
253
-
254
- 支持修复的问题类型:
255
- 1. 自闭合标签误用:<tag /> → <tag></tag>
256
- 2. 缺少闭合标签:<tag>value → <tag>value</tag>
257
- 3. 标签大小写混乱:<TAG>value</TAG> → <tag>value</tag>
258
- 4. 空白字符问题:< tag >value</ tag >
259
- 5. 嵌套标签未闭合
260
- 6. 特殊字符转义:& → &amp; (在属性值中)
353
+ The function returns the ``<output>…</output>`` body, or the original
354
+ text if no output block is found.
261
355
  """
262
- if not xml_content:
263
- return xml_content
264
-
265
- # 标准标签列表(用于修复缺少闭合标签)
266
- STANDARD_TAGS = [
267
- "usersays_correct", "task_plan", "toolstocal", "remember",
268
- "recall", "knowledge", "askuser", "get_knowledge", "finish",
269
- "finish_reason", "next_step", "response",
270
- "tool", "beforecalltext", "toolname", "parms", "timeout", "callback",
271
- "type", "content",
272
- ]
273
-
274
- # 1. 修复自闭合标签为普通标签
275
- # <tag /> → <tag></tag>
276
- # <tag/> → <tag></tag>
277
- xml_content = re.sub(r'<(\w+)\s*/\s*>', r'<\1></\1>', xml_content)
278
-
279
- # 2. 修复空白字符在标签内的问题:< tag > → <tag>
280
- # 同时规范化大小写
281
- def normalize_tag(match):
282
- inner = match.group(1).strip()
283
- tag_name = inner.split()[0].lower()
284
- rest = ' '.join(inner.split()[1:]) # 保留可能的属性
285
- if rest:
286
- return f'<{tag_name} {rest}>'
287
- return f'<{tag_name}>'
288
- xml_content = re.sub(r'<([a-z_][a-z_0-9]*)[^>]*>', normalize_tag, xml_content, flags=re.IGNORECASE)
289
-
290
- # 3. 修复闭合标签的大小写和空白:</ TAG > → </tag>
291
- def normalize_close_tag(match):
292
- tag_name = match.group(1).strip().lower()
293
- return f'</{tag_name}>'
294
- xml_content = re.sub(r'</\s*([a-z_][a-z_0-9]*)\s*>', normalize_close_tag, xml_content, flags=re.IGNORECASE)
295
-
296
- # 4. 修复缺少闭合标签的问题
297
- # 策略:对于标准标签,如果后面跟着另一个标签或 </output>,则添加闭合标签
298
- for tag in STANDARD_TAGS:
299
- # 修复 <tag>value<下一个标签> 格式(缺少 </tag>)
300
- # 例如:<finish>true<task_plan> → <finish>true</finish><task_plan>
301
- next_tag_pattern = '|'.join([re.escape(t) for t in STANDARD_TAGS if t != tag])
302
- if next_tag_pattern:
303
- # 匹配 <tag>...后面跟着其他标签或 </output>
304
- pattern = rf'(<{tag}>)([^<]*?)(?=(?:<{next_tag_pattern}|</output>))'
305
- replacement = rf'\1\2</{tag}>'
306
- xml_content = re.sub(pattern, replacement, xml_content, flags=re.IGNORECASE | re.DOTALL)
307
-
308
- # 5. 修复 <toolstocal> 和 </toolstocal> 标签
309
- xml_content = re.sub(r'<toolstocal\s*>', '<toolstocal>', xml_content, flags=re.IGNORECASE)
310
- xml_content = re.sub(r'</toolstocal\s*>', '</toolstocal>', xml_content, flags=re.IGNORECASE)
311
-
312
- # 6. 修复 askuser 标签(系统提示中用的是 askuser,但有时可能写成 ask_user)
313
- xml_content = re.sub(r'<ask_user\s*>', '<askuser>', xml_content, flags=re.IGNORECASE)
314
- xml_content = re.sub(r'</ask_user\s*>', '</askuser>', xml_content, flags=re.IGNORECASE)
315
-
316
- # 7. 修复单独的 <tool> 块中的标签
317
- # 匹配 <tool>... 中缺少闭合标签的情况
318
- tool_tags = ["beforecalltext", "toolname", "parms", "timeout", "callback"]
319
- tool_next_pattern = '|'.join([re.escape(t) for t in tool_tags])
320
-
321
- for tag in tool_tags:
322
- pattern = rf'(<{tag}>)([^<]*?)(?=(?:<{tool_next_pattern}|</tool>|</toolstocal>|</output>))'
323
- replacement = rf'\1\2</{tag}>'
324
- xml_content = re.sub(pattern, replacement, xml_content, flags=re.IGNORECASE | re.DOTALL)
325
-
326
- # 8. 处理文本内容中的特殊 XML 字符(简单处理 &)
327
- # 只在标签外的内容中处理
328
- # 这个比较复杂,简单处理:在文本内容中 & 后没有 ; 的转为 &amp;
329
- # 但更安全的做法是只在必要时处理
330
-
331
- # 9. 确保 <output> 标签周围没有多余空白
332
- xml_content = xml_content.strip()
333
-
334
- return xml_content
335
-
336
-
337
- def _aggressive_clean_xml(xml_content: str) -> str:
338
- """激进清理 XML 内容,移除可能导致解析失败的字符。"""
339
- # 移除控制字符(除了换行和 tab)
340
- xml_content = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', xml_content)
341
-
342
- # 修复 & 字符(确保它是有效的 XML 实体)
343
- # 匹配 & 不在有效实体前的情况
344
- xml_content = re.sub(r'&(?!amp;|lt;|gt;|quot;|apos;|#\d+;|#x[0-9a-fA-F]+;)', '&amp;', xml_content)
345
-
346
- # 移除多余的空白(连续多个空白合并为一个)
347
- xml_content = re.sub(r'>\s+<', '><', xml_content)
348
- xml_content = re.sub(r'\s{2,}', ' ', xml_content)
349
-
350
- return xml_content
351
-
352
-
353
- def _parse_xml_content(xml_content: str) -> ParsedOutput:
354
- """Attempt to parse *xml_content* (the inner body of ``<output>``) as XML.
355
-
356
- Assumes *xml_content* has already been extracted from the surrounding
357
- ``<output>`` tags. If parsing fails a :class:`ParsedOutput` with
358
- ``parse_success=False`` is returned.
359
-
360
- 解析策略:
361
- 1. 首先尝试直接解析
362
- 2. 如果失败,使用正则表达式修复后再解析
363
- 3. 如果仍然失败,尝试激进清理
356
+ if not text:
357
+ return text
358
+
359
+ body = _extract_output_body(text)
360
+ if body is not None:
361
+ return body
362
+
363
+ # No <output> tag at all — check if there are any recognised tags
364
+ has_tags = False
365
+ for tag in KNOWN_TOP_LEVEL_TAGS:
366
+ if re.search(rf"<{re.escape(tag)}[\s>]", text, re.IGNORECASE):
367
+ has_tags = True
368
+ break
369
+
370
+ if has_tags:
371
+ # Tags exist but no <output> wrapper — return as-is
372
+ return text
373
+
374
+ # No tags at all — return original (caller will set needs_correction)
375
+ return text
376
+
377
+
378
+ # ---------------------------------------------------------------------------
379
+ # Core custom parser — NO xml.etree.ElementTree
380
+ # ---------------------------------------------------------------------------
381
+
382
+
383
+ def _custom_parse(raw_text: str) -> ParsedOutput:
384
+ """Fully custom, regex-based XML parser with maximum fault tolerance.
385
+
386
+ This function does NOT use ``xml.etree.ElementTree`` at all. Every
387
+ extraction is done via regex patterns that handle malformed XML
388
+ gracefully.
389
+
390
+ Returns a :class:`ParsedOutput` with ``parse_success=True`` if at least
391
+ one meaningful field was extracted, or ``needs_correction=True`` if
392
+ nothing could be parsed.
364
393
  """
365
- parsed = ParsedOutput(parse_success=False)
394
+ parsed = ParsedOutput(raw_text=raw_text)
395
+
396
+ if not raw_text or not raw_text.strip():
397
+ parsed.needs_correction = True
398
+ return parsed
399
+
400
+ # ── Step 1: Strip non-XML noise (text before/after <output>) ──
401
+ body = _strip_outer_noise(raw_text)
402
+
403
+ # ── Step 2: Extract each known top-level tag ──
404
+
405
+ # usersays_correct
406
+ raw_val = _extract_tag_content(body, "usersays_correct")
407
+ parsed.usersays_correct = _safe_strip(raw_val)
408
+
409
+ # task_plan
410
+ raw_val = _extract_tag_content(body, "task_plan")
411
+ parsed.task_plan = _safe_strip(raw_val)
412
+
413
+ # response
414
+ raw_val = _extract_tag_content(body, "response")
415
+ parsed.response = _safe_strip(raw_val)
416
+
417
+ # recall
418
+ raw_val = _extract_tag_content(body, "recall")
419
+ parsed.recall = _safe_strip(raw_val)
420
+
421
+ # knowledge
422
+ raw_val = _extract_tag_content(body, "knowledge")
423
+ parsed.knowledge = _safe_strip(raw_val)
424
+
425
+ # askuser (also try alias ask_user)
426
+ raw_val = _extract_tag_content(body, "askuser")
427
+ if not raw_val.strip():
428
+ raw_val = _extract_tag_content(body, "ask_user")
429
+ parsed.ask_user = _safe_strip(raw_val)
430
+
431
+ # get_knowledge
432
+ raw_val = _extract_tag_content(body, "get_knowledge")
433
+ parsed.get_knowledge = _safe_strip(raw_val)
434
+
435
+ # finish
436
+ raw_val = _extract_tag_content(body, "finish")
437
+ parsed.finish = _parse_bool(raw_val, False)
438
+
439
+ # finish_reason
440
+ raw_val = _extract_tag_content(body, "finish_reason")
441
+ parsed.finish_reason = _safe_strip(raw_val)
442
+
443
+ # next_step
444
+ raw_val = _extract_tag_content(body, "next_step")
445
+ parsed.next_step = _safe_strip(raw_val)
446
+
447
+ # ── Step 3: Parse <remember> (may contain <type> and <content>) ──
448
+ remember_raw = _extract_tag_content(body, "remember")
449
+ if remember_raw.strip():
450
+ # Try structured format: <type>global</type><content>...</content>
451
+ type_val = _extract_tag_content(remember_raw, "type", REMEMBER_INNER_TAGS)
452
+ content_val = _extract_tag_content(remember_raw, "content", REMEMBER_INNER_TAGS)
453
+
454
+ if content_val.strip():
455
+ mem_type = _safe_strip(type_val) or "session"
456
+ if mem_type not in ("global", "session"):
457
+ mem_type = "session"
458
+ parsed.remember = _safe_strip(content_val)
459
+ parsed.remember_type = mem_type
460
+ else:
461
+ # Legacy plain-text format
462
+ parsed.remember = _safe_strip(remember_raw)
463
+ parsed.remember_type = "session"
366
464
 
367
- # Strategy 1: 尝试直接解析原始 XML
368
- try:
369
- root = ET.fromstring("<output>" + xml_content + "</output>")
465
+ # ── Step 4: Parse <toolstocal> → list of tool dicts ──
466
+ toolstocal_raw = _extract_tag_content(body, "toolstocal")
467
+ if toolstocal_raw.strip():
468
+ parsed.tools_to_call = _parse_toolstocal(toolstocal_raw)
469
+
470
+ # ── Step 5: Determine parse success ──
471
+ has_content = bool(
472
+ parsed.response
473
+ or parsed.usersays_correct
474
+ or parsed.task_plan
475
+ or parsed.tools_to_call
476
+ or parsed.ask_user
477
+ or parsed.remember
478
+ or parsed.recall
479
+ or parsed.knowledge
480
+ or parsed.get_knowledge
481
+ or parsed.finish
482
+ or parsed.finish_reason
483
+ or parsed.next_step
484
+ )
485
+
486
+ if has_content:
370
487
  parsed.parse_success = True
371
- logger.debug(f"XML 直接解析成功")
372
- except ET.ParseError as e1:
373
- logger.debug(f"XML 直接解析失败: {e1},尝试修复...")
374
-
375
- # Strategy 2: 修复不完整的 XML 后再解析
376
- fixed_xml = _fix_incomplete_xml(xml_content)
377
- logger.debug(f"修复后 XML 前200字符: {fixed_xml[:200]}...")
378
-
379
- try:
380
- root = ET.fromstring("<output>" + fixed_xml + "</output>")
488
+ else:
489
+ # Nothing was extracted — check if there's any raw text that could
490
+ # be a response (the LLM might have skipped XML entirely)
491
+ cleaned = raw_text.strip()
492
+ # Remove any residual XML tags
493
+ cleaned_no_tags = re.sub(r"<[^>]+>", "", cleaned).strip()
494
+ if cleaned_no_tags:
495
+ # The LLM output something but not in XML format
496
+ # Treat the entire output as a response
497
+ parsed.response = cleaned_no_tags
381
498
  parsed.parse_success = True
382
- logger.debug(f"XML 修复后解析成功")
383
- except ET.ParseError as e2:
384
- logger.warning(f"XML 修复后仍然解析失败: {e2},尝试激进清理...")
385
- # Strategy 3: 激进清理
386
- cleaned = _aggressive_clean_xml(fixed_xml)
387
- try:
388
- root = ET.fromstring("<output>" + cleaned + "</output>")
389
- parsed.parse_success = True
390
- logger.debug(f"XML 激进清理后解析成功")
391
- except ET.ParseError as e3:
392
- logger.warning(f"XML 解析最终失败: {e3}")
393
- return parsed
394
-
395
- # 提取各字段
396
- parsed.usersays_correct = _safe_strip(root.findtext("usersays_correct"))
397
- parsed.task_plan = _safe_strip(root.findtext("task_plan"))
398
- parsed.tools_to_call = _parse_tools_element(root.find("toolstocal"))
399
- parsed.remember, parsed.remember_type = _parse_remember_tag(root.find("remember"))
400
- parsed.recall = _safe_strip(root.findtext("recall"))
401
- parsed.knowledge = _safe_strip(root.findtext("knowledge"))
402
- parsed.ask_user = _safe_strip(root.findtext("askuser"))
403
- parsed.get_knowledge = _safe_strip(root.findtext("get_knowledge"))
404
- parsed.finish = _parse_bool(root.findtext("finish"), False)
405
- parsed.finish_reason = _safe_strip(root.findtext("finish_reason"))
406
- parsed.next_step = _safe_strip(root.findtext("next_step"))
407
- parsed.response = _safe_strip(root.findtext("response"))
499
+ logger.info(
500
+ f"XML解析未提取到结构化字段,将原始文本(去除标签后)作为response: "
501
+ f"{cleaned_no_tags[:100]}..."
502
+ )
503
+ else:
504
+ # Complete parse failure
505
+ parsed.needs_correction = True
506
+ logger.warning(
507
+ f"XML解析完全失败,需要LLM修正。原始输出前200字符: {raw_text[:200]}"
508
+ )
408
509
 
409
510
  return parsed
410
511
 
411
512
 
412
- def _fallback_regex_parse(raw_text: str) -> ParsedOutput:
413
- """Last-resort parser that uses regex to extract tag contents.
513
+ def _parse_toolstocal(toolstocal_content: str) -> List[Dict[str, Any]]:
514
+ """Parse ``<toolstocal>`` body into a list of tool descriptors."""
515
+ tools: List[Dict[str, Any]] = []
414
516
 
415
- This is used when standard XML parsing fails entirely (e.g. the LLM
416
- produced malformed XML). Only flat text tags are extracted; nested
417
- ``<tool>`` elements inside ``<toolstocal>`` are not resolved.
418
- """
419
- parsed = ParsedOutput(parse_success=False, raw_text=raw_text)
420
-
421
- xml_body = _extract_xml_block(raw_text)
422
- search_text = xml_body if xml_body is not None else raw_text
423
-
424
- # Extract flat tags.
425
- tag_map: Dict[str, str] = {}
426
- for match in _TAG_CONTENT_RE.finditer(search_text):
427
- tag_name = match.group(1).lower()
428
- tag_map[tag_name] = match.group(2)
429
-
430
- parsed.usersays_correct = _safe_strip(tag_map.get("usersays_correct"))
431
- parsed.task_plan = _safe_strip(tag_map.get("task_plan"))
432
- # Regex fallback: 尝试解析结构化 <remember> 或退化到纯文本
433
- _remember_raw = tag_map.get("remember", "")
434
- if "<type>" in _remember_raw and "<content>" in _remember_raw:
435
- try:
436
- _rem_elem = ET.fromstring(f"<remember>{_remember_raw}</remember>")
437
- parsed.remember, parsed.remember_type = _parse_remember_tag(_rem_elem)
438
- except Exception:
439
- parsed.remember = _safe_strip(_remember_raw)
440
- parsed.remember_type = "session"
441
- else:
442
- parsed.remember = _safe_strip(_remember_raw)
443
- parsed.remember_type = "session"
444
- parsed.recall = _safe_strip(tag_map.get("recall"))
445
- parsed.knowledge = _safe_strip(tag_map.get("knowledge"))
446
- parsed.ask_user = _safe_strip(tag_map.get("askuser"))
447
- parsed.get_knowledge = _safe_strip(tag_map.get("get_knowledge"))
448
- parsed.finish = _parse_bool(tag_map.get("finish"), False)
449
- parsed.finish_reason = _safe_strip(tag_map.get("finish_reason"))
450
- parsed.next_step = _safe_strip(tag_map.get("next_step"))
451
- parsed.response = _safe_strip(tag_map.get("response"))
452
-
453
- # For toolstocal we attempt to find individual <tool> blocks.
454
- tools_raw = tag_map.get("toolstocal", "")
455
- tool_blocks = re.findall(
456
- r"<tool\s*>(.*?)</tool\s*>",
457
- tools_raw,
458
- re.DOTALL | re.IGNORECASE,
517
+ tool_blocks = _extract_all_tag_blocks(
518
+ toolstocal_content, "tool", parent_close_tag="</toolstocal>"
459
519
  )
460
- if not tool_blocks:
461
- # Perhaps the LLM placed <tool> tags at the top level.
462
- tool_blocks = re.findall(
463
- r"<tool\s*>(.*?)</tool\s*>",
464
- search_text,
465
- re.DOTALL | re.IGNORECASE,
466
- )
467
520
 
468
521
  for block in tool_blocks:
469
- inner_map: Dict[str, str] = {}
470
- for m in _TAG_CONTENT_RE.finditer(block):
471
- inner_map[m.group(1).lower()] = m.group(2)
472
- parsed.tools_to_call.append(
473
- {
474
- "beforecalltext": _safe_strip(
475
- inner_map.get("beforecalltext")
476
- ),
477
- "toolname": _safe_strip(inner_map.get("toolname")),
478
- "parms": _safe_strip(inner_map.get("parms")),
479
- "timeout": _parse_int(
480
- inner_map.get("timeout"), _DEFAULT_TIMEOUT
481
- ),
482
- "callback": _parse_bool(
483
- inner_map.get("callback"), _DEFAULT_CALLBACK
484
- ),
485
- }
486
- )
522
+ tool: Dict[str, Any] = {
523
+ "beforecalltext": _safe_strip(
524
+ _extract_tag_content(block, "beforecalltext", TOOL_INNER_TAGS)
525
+ ),
526
+ "toolname": _safe_strip(
527
+ _extract_tag_content(block, "toolname", TOOL_INNER_TAGS)
528
+ ),
529
+ "parms": _safe_strip(
530
+ _extract_tag_content(block, "parms", TOOL_INNER_TAGS)
531
+ ),
532
+ "timeout": _parse_int(
533
+ _extract_tag_content(block, "timeout", TOOL_INNER_TAGS),
534
+ _DEFAULT_TIMEOUT,
535
+ ),
536
+ "callback": _parse_bool(
537
+ _extract_tag_content(block, "callback", TOOL_INNER_TAGS),
538
+ _DEFAULT_CALLBACK,
539
+ ),
540
+ }
541
+ # Only add if toolname is present
542
+ if tool["toolname"]:
543
+ tools.append(tool)
487
544
 
488
- return parsed
545
+ return tools
489
546
 
490
547
 
491
548
  # ---------------------------------------------------------------------------
@@ -496,57 +553,48 @@ def _fallback_regex_parse(raw_text: str) -> ParsedOutput:
496
553
  def parse_output(raw_text: str) -> ParsedOutput:
497
554
  """Parse the LLM's raw response into a :class:`ParsedOutput`.
498
555
 
499
- Strategy (in order of preference):
556
+ This function uses a **fully custom regex-based parser** (no
557
+ ``xml.etree.ElementTree``) for maximum fault tolerance.
500
558
 
501
- 1. Locate the ``<output>…</output>`` block and parse its inner XML.
502
- 2. If no ``<output>`` block is found, try to parse the entire *raw_text*
503
- as XML (wrapping it in ``<output>`` tags).
504
- 3. If both XML attempts fail, fall back to regex-based extraction.
559
+ If the custom parser cannot extract any meaningful content, it falls
560
+ back to treating the raw text as a plain response. Only if even that
561
+ fails does it set ``needs_correction=True``, signalling the caller to
562
+ ask the LLM to re-format its output.
505
563
 
506
564
  Parameters:
507
565
  raw_text: The complete text returned by the LLM.
508
566
 
509
567
  Returns:
510
- A :class:`ParsedOutput` instance. ``parse_success`` indicates which
511
- strategy succeeded.
568
+ A :class:`ParsedOutput` instance.
512
569
  """
513
570
  if not raw_text:
514
- return ParsedOutput(raw_text=raw_text)
571
+ return ParsedOutput(raw_text=raw_text, needs_correction=True)
515
572
 
516
- # Strategy 1: Extract <output> block and parse inner XML.
517
- xml_body = _extract_xml_block(raw_text)
518
- if xml_body is not None:
519
- result = _parse_xml_content(xml_body)
520
- if result.parse_success:
521
- result.raw_text = raw_text
522
- return result
573
+ return _custom_parse(raw_text)
523
574
 
524
- # Strategy 2: Try parsing the full text as XML directly.
525
- try:
526
- root = ET.fromstring(raw_text)
527
- # If the root itself is <output>, great.
528
- if root.tag.lower() == "output":
529
- parsed = ParsedOutput(parse_success=True, raw_text=raw_text)
530
- parsed.usersays_correct = _safe_strip(
531
- root.findtext("usersays_correct")
532
- )
533
- parsed.task_plan = _safe_strip(root.findtext("task_plan"))
534
- parsed.tools_to_call = _parse_tools_element(root.find("toolstocal"))
535
- parsed.remember, parsed.remember_type = _parse_remember_tag(root.find("remember"))
536
- parsed.recall = _safe_strip(root.findtext("recall"))
537
- parsed.knowledge = _safe_strip(root.findtext("knowledge"))
538
- parsed.ask_user = _safe_strip(root.findtext("askuser"))
539
- parsed.get_knowledge = _safe_strip(root.findtext("get_knowledge"))
540
- parsed.finish = _parse_bool(root.findtext("finish"), False)
541
- parsed.finish_reason = _safe_strip(root.findtext("finish_reason"))
542
- parsed.next_step = _safe_strip(root.findtext("next_step"))
543
- parsed.response = _safe_strip(root.findtext("response"))
544
- return parsed
545
- except ET.ParseError:
546
- pass
547
-
548
- # Strategy 3: Regex fallback.
549
- return _fallback_regex_parse(raw_text)
575
+
576
def extract_surrounding_text(full_text: str) -> tuple[str, str]:
    """Split *full_text* around the ``<output>…</output>`` block.

    Both tags are matched case-insensitively.  The opening tag may carry
    attributes (``<output id="1">``), but its name must be exactly
    ``output``: look-alike tags such as ``<outputs>`` or
    ``<output_format>`` are treated as ordinary text and never split on.

    Parameters:
        full_text: The complete text returned by the LLM.

    Returns:
        A ``(text_before_xml, text_after_xml)`` tuple.  Both parts are
        stripped.

        * If no ``<output>`` block is found, the original text becomes
          *text_before_xml* and *text_after_xml* is ``""``.
        * If the block is opened but never closed, everything following
          the opening tag is returned as *text_after_xml*.
        * Empty (or ``None``) input yields ``("", "")``.
    """
    if not full_text:
        return "", ""

    # The negative lookahead rejects tag names that merely *start* with
    # "output"; attributes after the name are still accepted by ``[^>]*``.
    open_match = re.search(
        r"<output(?![\w:.\-])[^>]*>", full_text, re.IGNORECASE
    )
    if open_match is None:
        return full_text.strip(), ""

    text_before = full_text[: open_match.start()].strip()

    # Only look for the closing tag after the opening one, so a stray
    # ``</output>`` earlier in the text cannot be picked up.
    rest = full_text[open_match.end():]
    close_match = re.search(r"</output\s*>", rest, re.IGNORECASE)
    if close_match is None:
        # Unclosed block: keep the remainder rather than dropping it.
        text_after = rest.strip()
    else:
        text_after = rest[close_match.end():].strip()

    return text_before, text_after
550
598
 
551
599
 
552
600
  # ---------------------------------------------------------------------------
@@ -559,18 +607,6 @@ def validate_output(parsed: ParsedOutput) -> list[str]:
559
607
 
560
608
  An empty list means no issues were detected. Warnings are non-fatal
561
609
  hints that the calling code may log or present to the user.
562
-
563
- Checks performed:
564
-
565
- * ``tools_to_call`` entries missing ``toolname``.
566
- * ``tools_to_call`` entries with ``timeout`` ≤ 0.
567
- * ``tools_to_call`` entries with empty ``parms`` when ``toolname`` is
568
- present (informational — some tools legitimately need no params).
569
- * ``finish`` is ``True`` but ``ask_user`` is non-empty (possible mixed
570
- intent from the LLM).
571
- * ``usersays_correct`` is empty (may indicate the LLM skipped
572
- correction).
573
- * ``task_plan`` is empty (may indicate the LLM skipped planning).
574
610
  """
575
611
  warnings: list[str] = []
576
612
 
@@ -596,9 +632,7 @@ def validate_output(parsed: ParsedOutput) -> list[str]:
596
632
 
597
633
  # --- Semantic checks ---
598
634
  if parsed.finish and parsed.ask_user.strip():
599
- warnings.append(
600
- "finish=True but ask_user is non-empty — unclear intent"
601
- )
635
+ warnings.append("finish=True but ask_user is non-empty — unclear intent")
602
636
 
603
637
  if not parsed.usersays_correct.strip():
604
638
  warnings.append("usersays_correct is empty")
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "myagent-ai",
3
- "version": "1.14.0",
3
+ "version": "1.15.0",
4
4
  "description": "本地桌面端执行型AI助手 - Open Interpreter 风格 | Local Desktop Execution-Oriented AI Assistant",
5
5
  "main": "main.py",
6
6
  "bin": {
package/requirements.txt CHANGED
@@ -50,6 +50,11 @@ discord.py>=2.3.0
50
50
  # ============================================================
51
51
  edge-tts>=6.1.0
52
52
 
53
+ # ============================================================
54
+ # 语音识别 (本地 STT,默认启用)
55
+ # ============================================================
56
+ faster-whisper>=1.0.0
57
+
53
58
  # ============================================================
54
59
  # Anthropic Claude (可选)
55
60
  # ============================================================
package/setup.py CHANGED
@@ -37,6 +37,8 @@ setup(
37
37
  "Pillow>=10.0.0",
38
38
  # 语音合成
39
39
  "edge-tts>=6.1.0",
40
+ # 语音识别 (本地 STT)
41
+ "faster-whisper>=1.0.0",
40
42
  # 浏览器自动化 (ChromeDev MCP, 无需 Playwright)
41
43
  # 桌面 GUI 自动化 (内置技能)
42
44
  "pynput>=1.7.6",
@@ -455,7 +455,9 @@ input,textarea,select{font:inherit}
455
455
 
456
456
  /* ── Message Content Smooth Render ── */
457
457
  .message-content{
458
- flex:1;min-width:0;
458
+ flex:1;min-width:0;width:100%;
459
+ /* 确保所有子元素(thought-block, bubble 等)撑满宽度 */
460
+ display:flex;flex-direction:column;align-items:stretch;
459
461
  }
460
462
  .stream-text-node{
461
463
  display:inline;
@@ -469,7 +471,7 @@ input,textarea,select{font:inherit}
469
471
  }
470
472
 
471
473
  /* ── Thought Block (Agent Thinking) ── */
472
- .thought-block{width:100%;display:block;margin:0 0 10px 0;border:1px solid var(--border-light);border-radius:var(--radius-sm);overflow:hidden;background:linear-gradient(135deg,var(--accent-light),var(--bg2));animation:thoughtFadeIn .4s ease-out}
474
+ .thought-block{width:100%!important;max-width:100%!important;display:flex;flex-direction:column;margin:0 0 10px 0;border:1px solid var(--border-light);border-radius:var(--radius-sm);overflow:hidden;background:linear-gradient(135deg,var(--accent-light),var(--bg2));animation:thoughtFadeIn .4s ease-out;flex-shrink:0;box-sizing:border-box;align-self:stretch}
473
475
  .thought-block.streaming{border-color:var(--accent);box-shadow:0 0 12px rgba(99,102,241,.15)}
474
476
  @keyframes thoughtFadeIn{from{opacity:0;transform:translateY(-6px)}to{opacity:1;transform:translateY(0)}}
475
477
  .thought-block summary{display:flex;align-items:center;gap:8px;padding:8px 14px;cursor:pointer;font-size:12px;font-weight:600;color:var(--text2);user-select:none;transition:var(--transition);text-transform:uppercase;letter-spacing:.3px}
@@ -1977,26 +1979,7 @@ input,textarea,select{font:inherit}
1977
1979
  [data-theme="dark"] .inline-exec-code{background:var(--bg)}
1978
1980
  [data-theme="dark"] .inline-exec-result-btn:hover{background:var(--bg4)}
1979
1981
 
1980
- .thought-block {
1981
- background: rgba(0, 0, 0, 0.03);
1982
- border-radius: 8px;
1983
- padding: 8px 12px;
1984
- margin-bottom: 8px;
1985
- font-size: 13px;
1986
- color: var(--text2);
1987
- border-left: 3px solid var(--border);
1988
- }
1989
- .thought-block summary {
1990
- cursor: pointer;
1991
- font-weight: bold;
1992
- outline: none;
1993
- user-select: none;
1994
- }
1995
- .thought-content {
1996
- margin-top: 8px;
1997
- font-family: inherit;
1998
- white-space: pre-wrap;
1999
- }
1982
+ /* thought-block orphaned rules removed — see main .thought-block rule */
2000
1983
 
2001
1984
  /* ══════════════════════════════════════════════════════
2002
1985
  ── Popout Mode (独立窗口) ──
@@ -272,6 +272,11 @@ const StatePersistence = {
272
272
  StatePersistence.save('agentPanelOpen', state.agentPanelOpen);
273
273
  StatePersistence.save('rpSections', rpSections);
274
274
  StatePersistence.save('expandedNodes', [...state.expandedNodes]);
275
+ // 持久化当前活跃会话ID,用于页面刷新后恢复
276
+ if (state.activeSessionId && state.activeSessionId !== '__new__') {
277
+ StatePersistence.save('activeSessionId', state.activeSessionId);
278
+ StatePersistence.save('activeSessionAgent', state.activeAgent || 'default');
279
+ }
275
280
  },
276
281
  /** 恢复 UI 状态 */
277
282
  restoreUIState() {
@@ -353,16 +358,38 @@ function initChat() {
353
358
  }
354
359
 
355
360
  // 如果 URL 指定了 agent 或 session,等 agent 列表加载后自动选中
356
- if (urlAgent || urlSession) {
361
+ // 注意:loadSessions() 内部会检查 URL session 参数并自动恢复
362
+ if (urlAgent) {
357
363
  const targetAgent = urlAgent || (urlSession ? urlSession.split('_web_')[0] || 'default' : null);
358
364
  setTimeout(function() {
359
365
  if (targetAgent) selectAgent(targetAgent);
360
- // 如果指定了 session,等会话列表加载后自动选中
361
- if (urlSession) {
362
- setTimeout(function() { selectSession(urlSession); }, 800);
363
- }
364
366
  }, 500);
367
+ } else if (urlSession) {
368
+ // 只有 session 没有 agent,尝试从 session ID 推断 agent
369
+ const targetAgent = urlSession.split('_web_')[0] || 'default';
370
+ setTimeout(function() {
371
+ selectAgent(targetAgent);
372
+ }, 500);
373
+ } else {
374
+ // URL 中没有 session 参数,尝试从 localStorage 恢复上次的会话
375
+ var savedSessionId = StatePersistence.load('activeSessionId', null);
376
+ var savedSessionAgent = StatePersistence.load('activeSessionAgent', null);
377
+ if (savedSessionId && savedSessionAgent) {
378
+ // 确保 agent 一致,然后延迟等待 loadSessions() 完成后恢复
379
+ state._pendingSessionRestore = savedSessionId;
380
+ if (savedSessionAgent !== state.activeAgent) {
381
+ setTimeout(function() {
382
+ selectAgent(savedSessionAgent);
383
+ }, 500);
384
+ }
385
+ // 如果 agent 已经一致,loadSessions() 内部会自动处理
386
+ }
365
387
  }
388
+
389
+ // 页面卸载前保存 UI 状态(包括活跃 session)
390
+ window.addEventListener('beforeunload', function() {
391
+ StatePersistence.saveUIState();
392
+ });
366
393
  }
367
394
 
368
395
  // Run init: if DOMContentLoaded already fired (dynamic script load), run immediately
@@ -1581,8 +1608,25 @@ async function loadSessions() {
1581
1608
  updateSidebarAgentIndicator();
1582
1609
 
1583
1610
  // Auto-select most recent session if none selected
1584
- if (!state.activeSessionId && state.sessions.length > 0) {
1585
- await selectSession(state.sessions[0].id);
1611
+ // 优先级: URL session 参数 > localStorage 持久化的 session > 最新 session
1612
+ const urlParams = new URLSearchParams(window.location.search);
1613
+ const urlSession = urlParams.get('session');
1614
+ var targetSessionId = null;
1615
+
1616
+ if (urlSession && state.sessions.some(s => s.id === urlSession)) {
1617
+ // URL 指定了有效的 session ID,直接选中(刷新恢复)
1618
+ targetSessionId = urlSession;
1619
+ } else if (state._pendingSessionRestore && state.sessions.some(s => s.id === state._pendingSessionRestore)) {
1620
+ // 从 localStorage 恢复的 session(beforeunload 触发的保存)
1621
+ targetSessionId = state._pendingSessionRestore;
1622
+ state._pendingSessionRestore = null; // 清除,防止重复恢复
1623
+ } else if (!state.activeSessionId && state.sessions.length > 0) {
1624
+ // 默认选中最新 session
1625
+ targetSessionId = state.sessions[0].id;
1626
+ }
1627
+
1628
+ if (targetSessionId) {
1629
+ await selectSession(targetSessionId);
1586
1630
  }
1587
1631
  }
1588
1632
 
@@ -1848,6 +1892,8 @@ async function selectSession(id) {
1848
1892
  } catch (_) {}
1849
1893
  document.getElementById('userInput').focus();
1850
1894
  loadDraft();
1895
+ // 保存选中状态到 localStorage(用于页面刷新恢复)
1896
+ StatePersistence.saveUIState();
1851
1897
  if (isMobile()) closeMobileSidebar();
1852
1898
  }
1853
1899