npm - myagent-ai - Versions diffs - 1.14.0 → 1.15.0 - Mend

myagent-ai 1.14.0 → 1.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

package/agents/main_agent.py +43 -17
package/core/__pycache__/output_parser.cpython-312.pyc +0 -0
package/core/output_parser.py +422 -388
package/package.json +1 -1
package/requirements.txt +5 -0
package/setup.py +2 -0
package/web/ui/chat/chat.css +5 -22
package/web/ui/chat/chat_main.js +53 -7

package/agents/main_agent.py CHANGED

@@ -6,6 +6,7 @@ agents/main_agent.py - 主 Agent
 from __future__ import annotations
 import asyncio
+import re
 from typing import Any, Callable, Dict, List, Optional
 from core.logger import get_logger
@@ -442,6 +443,8 @@ class MainAgent(BaseAgent):
         get_knowledge_content = ""
         # 追踪流式推送的 reasoning 文本（用于构建有意义的最终回复）
         _v2_reasoning_collected: List[str] = []
+        # XML 解析失败时的 LLM 修正重试计数
+        _xml_correction_retries: int = 0
         conversation_history = list(context.conversation_history or [])
@@ -587,31 +590,54 @@ class MainAgent(BaseAgent):
                     "finish": parsed.finish,
                     "finish_reason": truncate_str(parsed.finish_reason, 200),
                     "next_step": truncate_str(parsed.next_step, 200),
+                    "response": truncate_str(parsed.response, 500),
                     "parse_success": parsed.parse_success,
+                    "needs_correction": parsed.needs_correction,
                 }},
                 stream_callback,
             )
+            # Step 4.5: 解析失败处理 — 回退给 LLM 修正或提取周边文本
             if not parsed.parse_success:
-                logger.warning(f"[{task_id}] XML 解析失败，尝试提取周边文本")
-                before, after = extract_surrounding_text(llm_raw)
-                if before.strip() or after.strip():
-                    final_text = (before + "\n" + after).strip()
-                    context.working_memory["final_response"] = final_text
-                    await self._emit_v2_event("v2_reasoning", {"content": final_text}, stream_callback)
-                    if self.memory:
-                        self.memory.add_session(
-                            session_id=context.session_id,
-                            role="assistant",
-                            content=final_text,
-                        )
-                    break
+                if parsed.needs_correction and _xml_correction_retries < 1:
+                    # XML 完全无法解析，让 LLM 重新格式化输出
+                    _xml_correction_retries += 1
+                    logger.warning(
+                        f"[{task_id}] XML 解析完全失败，回退给 LLM 修正 "
+                        f"(重试 {_xml_correction_retries}/1)"
+                    )
+                    correction_prompt = (
+                        "你上一次的输出格式有误，XML解析器无法识别。"
+                        "请严格按照 <output>...</output> 格式重新输出你的回答。"
+                        "注意：不要在 <output> 标签前后输出任何其他文字。\n\n"
+                        f"你上一次的原始输出如下：\n{llm_raw}"
+                    )
+                    conversation_history.append(
+                        Message(role="assistant", content=llm_raw)
+                    )
+                    conversation_history.append(
+                        Message(role="user", content=correction_prompt)
+                    )
+                    await self._emit_v2_event(
+                        "v2_reasoning",
+                        {"content": "⚠️ 模型输出格式异常，正在自动修正..."},
+                        stream_callback,
+                    )
+                    continue  # 重新进入循环，让 LLM 重新生成
                 else:
-                    # XML 解析失败且无法提取文本，发送原始输出作为备选
-                    logger.warning(f"[{task_id}] 无法提取文本，发送原始 LLM 输出")
-                    final_text = llm_raw.strip() if llm_raw.strip() else "处理完毕。"
+                    # 已重试过或不需要修正，提取周边文本作为备选
+                    logger.warning(f"[{task_id}] XML 解析失败，提取周边文本作为备选")
+                    before, after = extract_surrounding_text(llm_raw)
+                    if before.strip() or after.strip():
+                        final_text = (before + "\n" + after).strip()
+                    else:
+                        # 清除残余 XML 标签后作为纯文本
+                        final_text = re.sub(r"<[^>]+>", "", llm_raw).strip()
+                        final_text = final_text if final_text else "处理完毕。"
                     context.working_memory["final_response"] = final_text
-                    await self._emit_v2_event("v2_reasoning", {"content": final_text}, stream_callback)
+                    await self._emit_v2_event(
+                        "v2_reasoning", {"content": final_text}, stream_callback
+                    )
                     if self.memory:
                         self.memory.add_session(
                             session_id=context.session_id,

package/core/__pycache__/output_parser.cpython-312.pyc CHANGED

Binary file

package/core/output_parser.py CHANGED

@@ -1,11 +1,14 @@
 """
-Structured Output Parser Module.
+Custom Fault-Tolerant XML Output Parser Module.
 Parses the XML ``<output>`` block generated by the LLM in response to the
 system prompt.  The LLM produces structured XML that drives the agent's
 execution loop — including tool calls, memory operations, user interaction
 hints, and loop-control flags.
+**This module does NOT use xml.etree.ElementTree.**  All parsing is done with
+pure Python + regex to achieve maximum fault tolerance.
 Expected XML schema produced by the LLM::
     <output>
@@ -20,26 +23,31 @@ Expected XML schema produced by the LLM::
                 <callback>true/false</callback>
             </tool>
         </toolstocal>
-        <remember>仅从最新用户输入中提炼的记忆，无新信息则为空</remember>
+        <remember>
+            <type>global|session</type>
+            <content>记忆内容</content>
+        </remember>
         <recall>下一轮需要调取的记忆</recall>
         <askuser>需要询问用户的内容</askuser>
         <get_knowledge>下一轮需要搜索获得的知识</get_knowledge>
         <finish>true/false</finish>
+        <response>模型对用户的直接回复</response>
     </output>
-The parser is deliberately robust:
+Fault-tolerance features:
-* XML parsing errors are caught and a regex-based fallback is attempted.
-* Missing optional fields are defaulted (``finish`` → ``False``,
-  ``timeout`` → ``120``, ``callback`` → ``True``).
-* All text fields are stripped of leading/trailing whitespace.
-* Empty tags resolve to empty strings rather than raising errors.
+* Text before ``<output>`` or after ``</output>`` is silently stripped.
+* Unclosed tags are auto-closed at the next sibling tag boundary.
+* Self-closing tags (``<tag/>``) resolve to empty strings.
+* Case-insensitive tag matching (``<OUTPUT>`` == ``<output>``).
+* Tag-name aliases: ``<ask_user>`` → ``askuser``.
+* If extraction yields nothing meaningful, ``needs_correction`` is set to
+  ``True`` so the caller can ask the LLM to re-format.
 """
 from __future__ import annotations
 import re
-import xml.etree.ElementTree as ET
 from dataclasses import dataclass, field
 from typing import Any, Dict, List
@@ -54,16 +62,46 @@ logger = get_logger("myagent.output_parser")
 _DEFAULT_TIMEOUT: int = 120
 _DEFAULT_CALLBACK: bool = True
-# Regex patterns used for extraction and fallback parsing.
-_OUTPUT_BLOCK_RE = re.compile(
-    r"<output\s*>(.*?)</output\s*>",
-    re.DOTALL | re.IGNORECASE,
-)
+# All top-level tags we recognise inside <output>.
+KNOWN_TOP_LEVEL_TAGS = [
+    "usersays_correct",
+    "task_plan",
+    "toolstocal",
+    "remember",
+    "recall",
+    "knowledge",
+    "askuser",
+    "ask_user",  # alias for askuser
+    "get_knowledge",
+    "finish",
+    "finish_reason",
+    "next_step",
+    "response",
+]
+# Inner tags inside each <tool>.
+TOOL_INNER_TAGS = [
+    "beforecalltext",
+    "toolname",
+    "parms",
+    "timeout",
+    "callback",
+]
+# Inner tags inside <remember>.
+REMEMBER_INNER_TAGS = ["type", "content"]
+# Tag aliases: canonical name -> list of aliases.
+_TAG_ALIASES: Dict[str, List[str]] = {
+    "askuser": ["ask_user"],
+}
+# Build reverse lookup: alias -> canonical.
+_ALIAS_TO_CANONICAL: Dict[str, str] = {}
+for _canonical, _aliases in _TAG_ALIASES.items():
+    for _alias in _aliases:
+        _ALIAS_TO_CANONICAL[_alias.lower()] = _canonical
-_TAG_CONTENT_RE = re.compile(
-    r"<(\w+)\s*>(.*?)</\1\s*>",
-    re.DOTALL,
-)
 # ---------------------------------------------------------------------------
 # Data classes
@@ -80,29 +118,27 @@ class ParsedOutput:
         task_plan: Updated or new task plan (may contain Markdown).
         tools_to_call: Ordered list of tool descriptors to execute.
         remember: Content that should be persisted to the agent's memory.
-            Structured as ``<type>global|session</type><content>...</content>``
-            in the LLM output, but parsed into separate fields.
         remember_type: "global" (cross-session) or "session" (current session only).
         recall: Memory keys / descriptions to retrieve for the next loop
             iteration.
-        knowledge: Knowledge content the LLM wants to persist to the
-            knowledge base (analogous to remember → memory).
+        knowledge: Knowledge content the LLM wants to persist.
         ask_user: Free-form question the agent should pose to the user.
         get_knowledge: Knowledge search keywords for the next loop iteration.
-            The ContextBuilder will use this to perform RAG retrieval.
         finish: When ``True`` the execution loop should terminate.
         finish_reason: When finish=True, explains why the task is ending.
         next_step: When finish=False, describes what to do next.
+        response: Model's direct reply to the user (friendly natural language).
         raw_text: The verbatim raw text returned by the LLM.
-        parse_success: Whether the XML was parsed successfully (``True``)
-            or the regex fallback was used (``False``).
+        parse_success: Whether parsing extracted at least one meaningful field.
+        needs_correction: When ``True``, the caller should send the raw text
+            back to the LLM for re-formatting.
     """
     usersays_correct: str = ""
     task_plan: str = ""
     tools_to_call: List[Dict[str, Any]] = field(default_factory=list)
     remember: str = ""
-    remember_type: str = ""   # "global" | "session" (default: "session")
+    remember_type: str = ""
     recall: str = ""
     knowledge: str = ""
     ask_user: str = ""
@@ -110,37 +146,24 @@ class ParsedOutput:
     finish: bool = False
     finish_reason: str = ""
     next_step: str = ""
-    response: str = ""  # 模型对用户的直接回复（友好自然的话语）
+    response: str = ""
     raw_text: str = ""
     parse_success: bool = False
+    needs_correction: bool = False
 # ---------------------------------------------------------------------------
-# Helper utilities
+# Low-level extraction helpers
 # ---------------------------------------------------------------------------
-def _extract_xml_block(text: str) -> str | None:
-    """Return the content between ``<output>`` and ``</output>``, or ``None``.
-    The search is case-insensitive and tolerant of whitespace around the tag
-    names.
-    """
-    match = _OUTPUT_BLOCK_RE.search(text)
-    if match:
-        return match.group(1)
-    return None
 def _safe_strip(value: str | None) -> str:
-    """Return the stripped string, defaulting to ``""`` for ``None``."""
     if value is None:
         return ""
     return value.strip()
 def _parse_bool(value: str | None, default: bool) -> bool:
-    """Parse a boolean from a string, returning *default* on failure."""
     if value is None:
         return default
     stripped = value.strip().lower()
@@ -152,7 +175,6 @@ def _parse_bool(value: str | None, default: bool) -> bool:
 def _parse_int(value: str | None, default: int) -> int:
-    """Parse an integer from a string, returning *default* on failure."""
     if value is None:
         return default
     try:
@@ -161,331 +183,366 @@ def _parse_int(value: str | None, default: int) -> int:
         return default
-def _parse_remember_tag(element: ET.Element | None) -> tuple[str, str]:
-    """Parse the ``<remember>`` element which may contain ``<type>`` and
-    ``<content>`` sub-tags, or plain text (legacy format).
+def _canonical_tag(tag_name: str) -> str:
+    """Return the canonical tag name for *tag_name* (alias-aware, lowercased)."""
+    lower = tag_name.strip().lower()
+    return _ALIAS_TO_CANONICAL.get(lower, lower)
-    Returns:
-        ``(content, remember_type)`` — *content* is the memory text,
-        *remember_type* is ``"global"`` or ``"session"`` (default).
+def _extract_tag_content(text: str, tag_name: str, stop_tags: List[str] | None = None) -> str:
+    """Extract the text content of ``<tag_name>…</tag_name>`` from *text*.
+    Fault-tolerant strategies tried in order:
+    1. **Properly closed**: ``<tag>content</tag>``
+    2. **Unclosed at next sibling opening tag**: ``<tag>content<next_tag>…``
+    3. **Unclosed at ``</output>``**: ``<tag>content</output>``
+    4. **Self-closing**: ``<tag/>``
+    5. **Opening tag at end of string**: ``<tag>content$``
+    Parameters:
+        text: The text to search within (typically the body of ``<output>``).
+        tag_name: The tag name to extract (case-insensitive).
+        stop_tags: Sibling tag names that signal the end of this tag's
+            content (used for unclosed-tag detection).  Defaults to
+            ``KNOWN_TOP_LEVEL_TAGS``.
     """
-    if element is None:
-        return "", ""
+    if not text or not tag_name:
+        return ""
-    # Try structured format: <remember><type>global</type><content>...</content></remember>
-    type_elem = element.findtext("type")
-    content_elem = element.findtext("content")
-    if content_elem is not None and content_elem.strip():
-        mem_type = _safe_strip(type_elem) if type_elem else "session"
-        if mem_type not in ("global", "session"):
-            mem_type = "session"
-        return _safe_strip(content_elem), mem_type
+    if stop_tags is None:
+        stop_tags = KNOWN_TOP_LEVEL_TAGS
-    # Legacy format: <remember>plain text</remember>
-    text = element.text or ""
-    return _safe_strip(text), "session"
+    tag_esc = re.escape(tag_name)
+    # Strategy 1: Properly closed <tag>content</tag>
+    m = re.search(
+        rf"<{tag_esc}[^>]*>(.*?)</{tag_esc}\s*>",
+        text,
+        re.DOTALL | re.IGNORECASE,
+    )
+    if m:
+        return m.group(1)
+    # Strategy 2: Unclosed — content runs until the next opening/closing
+    # sibling tag or </output>.
+    sibling_names = [t for t in stop_tags if t.lower() != tag_name.lower()]
+    if sibling_names:
+        sibling_pat = "|".join(re.escape(t) for t in sibling_names)
+        # CRITICAL: Wrap sibling_pat in (?:...) so that | doesn't split the
+        # leading < or </ from the alternation.  Without this, e.g.
+        # "<a|b|c" is parsed as "<a" OR "b" OR "c" — NOT "<a" OR "<b" OR "<c".
+        boundary = rf"(?:</output\s*>|<(?:{sibling_pat})\b|</(?:{sibling_pat})\s*>)"
+    else:
+        boundary = r"</output\s*>"
-# ---------------------------------------------------------------------------
-# Core parser
-# ---------------------------------------------------------------------------
+    m = re.search(
+        rf"<{tag_esc}[^>]*>(.*?)({boundary})",
+        text,
+        re.DOTALL | re.IGNORECASE,
+    )
+    if m:
+        return m.group(1)
+    # Strategy 3: Self-closing <tag/> or <tag />
+    m = re.search(rf"<{tag_esc}[^>]*/\s*>", text, re.IGNORECASE)
+    if m:
+        return ""
-def extract_surrounding_text(full_text: str) -> tuple[str, str]:
-    """Split *full_text* around the ``<output>…</output>`` block.
+    # Strategy 4: Opening tag at end of text with no closing
+    m = re.search(
+        rf"<{tag_esc}[^>]*>(.*?)$",
+        text,
+        re.DOTALL | re.IGNORECASE,
+    )
+    if m:
+        content = m.group(1).strip()
+        # Only return if there's actual content (not just whitespace)
+        if content:
+            return content
-    Returns:
-        A ``(text_before_xml, text_after_xml)`` tuple.  Both parts are
-        stripped.  If no ``<output>`` block is found the original text
-        becomes *text_before_xml* and *text_after_xml* is ``""``.
+    return ""
+def _extract_all_tag_blocks(
+    text: str,
+    tag_name: str,
+    parent_close_tag: str | None = None,
+) -> List[str]:
+    """Extract all ``<tag_name>…`` blocks from *text*.
+    Used for extracting multiple ``<tool>`` blocks from ``<toolstocal>``
+    content.  Handles both properly closed and unclosed blocks.
+    Returns a list of content strings, one per block.
     """
-    match = re.search(
-        r"<output\s*>",
-        full_text,
-        re.IGNORECASE,
+    if not text:
+        return []
+    tag_esc = re.escape(tag_name)
+    blocks: List[str] = []
+    # Strategy 1: Find all properly closed <tag>content</tag> blocks
+    properly_closed = re.findall(
+        rf"<{tag_esc}[^>]*>(.*?)</{tag_esc}\s*>",
+        text,
+        re.DOTALL | re.IGNORECASE,
     )
-    if match is None:
-        return full_text.strip(), ""
+    if properly_closed:
+        return properly_closed
+    # Strategy 2: Split by <tag> openings — each segment is a block
+    positions = [
+        m.end() for m in re.finditer(rf"<{tag_esc}[^>]*>", text, re.IGNORECASE)
+    ]
+    for i, content_start in enumerate(positions):
+        if i + 1 < len(positions):
+            # Block ends at next <tag> opening
+            content_end = positions[i + 1]
+        elif parent_close_tag:
+            # Last block — ends at parent close tag
+            close_m = re.search(
+                re.escape(parent_close_tag),
+                text[content_start:],
+                re.IGNORECASE,
+            )
+            content_end = content_start + close_m.start() if close_m else len(text)
+        else:
+            content_end = len(text)
+        blocks.append(text[content_start:content_end])
+    return blocks
-    text_before = full_text[: match.start()].strip()
-    end_match = re.search(
+def _extract_output_body(raw_text: str) -> str | None:
+    """Extract the content between ``<output>`` and ``</output>``.
+    If ``</output>`` is missing (unclosed), returns everything after the
+    opening ``<output>`` tag.
+    Returns ``None`` if no ``<output>`` opening tag is found at all.
+    """
+    open_match = re.search(r"<output[^>]*>", raw_text, re.IGNORECASE)
+    if open_match is None:
+        return None
+    content_start = open_match.end()
+    close_match = re.search(
         r"</output\s*>",
-        full_text[match.start() :],
+        raw_text[content_start:],
         re.IGNORECASE,
     )
-    if end_match is None:
-        # Opening tag found but no closing tag — everything after opening
-        # tag is considered "after".
-        text_after = full_text[match.end() :].strip()
-    else:
-        absolute_end = match.start() + end_match.end()
-        text_after = full_text[absolute_end:].strip()
+    if close_match:
+        return raw_text[content_start : content_start + close_match.start()]
-    return text_before, text_after
+    # Unclosed <output> — take everything after it
+    return raw_text[content_start:]
-def _parse_tools_element(tools_element: ET.Element | None) -> List[Dict[str, Any]]:
-    """Parse the ``<toolstocal>`` element into a list of tool dicts."""
-    tools: List[Dict[str, Any]] = []
-    if tools_element is None:
-        return tools
+def _strip_outer_noise(text: str) -> str:
+    """Remove text that is outside any recognised XML tags.
-    for tool_el in tools_element.findall("tool"):
-        tool: Dict[str, Any] = {
-            "beforecalltext": _safe_strip(tool_el.findtext("beforecalltext")),
-            "toolname": _safe_strip(tool_el.findtext("toolname")),
-            "parms": _safe_strip(tool_el.findtext("parms")),
-            "timeout": _parse_int(
-                tool_el.findtext("timeout"), _DEFAULT_TIMEOUT
-            ),
-            "callback": _parse_bool(
-                tool_el.findtext("callback"), _DEFAULT_CALLBACK
-            ),
-        }
-        tools.append(tool)
-    return tools
+    This handles the case where the LLM outputs plain text before or
+    after the ``<output>`` block, e.g.::
+        "我来使用 Python 脚本下载... <output>...</output>"
-def _fix_incomplete_xml(xml_content: str) -> str:
-    """修复不完整的 XML，使用正则表达式补齐格式。
-    支持修复的问题类型：
-    1. 自闭合标签误用：<tag /> → <tag></tag>
-    2. 缺少闭合标签：<tag>value → <tag>value</tag>
-    3. 标签大小写混乱：<TAG>value</TAG> → <tag>value</tag>
-    4. 空白字符问题：< tag >value</ tag >
-    5. 嵌套标签未闭合
-    6. 特殊字符转义：& → &amp; (在属性值中)
+    The function returns the ``<output>…</output>`` body, or the original
+    text if no output block is found.
     """
-    if not xml_content:
-        return xml_content
-    # 标准标签列表（用于修复缺少闭合标签）
-    STANDARD_TAGS = [
-        "usersays_correct", "task_plan", "toolstocal", "remember",
-        "recall", "knowledge", "askuser", "get_knowledge", "finish",
-        "finish_reason", "next_step", "response",
-        "tool", "beforecalltext", "toolname", "parms", "timeout", "callback",
-        "type", "content",
-    ]
-    # 1. 修复自闭合标签为普通标签
-    # <tag /> → <tag></tag>
-    # <tag/> → <tag></tag>
-    xml_content = re.sub(r'<(\w+)\s*/\s*>', r'<\1></\1>', xml_content)
-    # 2. 修复空白字符在标签内的问题：< tag > → <tag>
-    # 同时规范化大小写
-    def normalize_tag(match):
-        inner = match.group(1).strip()
-        tag_name = inner.split()[0].lower()
-        rest = ' '.join(inner.split()[1:])  # 保留可能的属性
-        if rest:
-            return f'<{tag_name} {rest}>'
-        return f'<{tag_name}>'
-    xml_content = re.sub(r'<([a-z_][a-z_0-9]*)[^>]*>', normalize_tag, xml_content, flags=re.IGNORECASE)
-    # 3. 修复闭合标签的大小写和空白：</ TAG > → </tag>
-    def normalize_close_tag(match):
-        tag_name = match.group(1).strip().lower()
-        return f'</{tag_name}>'
-    xml_content = re.sub(r'</\s*([a-z_][a-z_0-9]*)\s*>', normalize_close_tag, xml_content, flags=re.IGNORECASE)
-    # 4. 修复缺少闭合标签的问题
-    # 策略：对于标准标签，如果后面跟着另一个标签或 </output>，则添加闭合标签
-    for tag in STANDARD_TAGS:
-        # 修复 <tag>value<下一个标签> 格式（缺少 </tag>）
-        # 例如：<finish>true<task_plan> → <finish>true</finish><task_plan>
-        next_tag_pattern = '|'.join([re.escape(t) for t in STANDARD_TAGS if t != tag])
-        if next_tag_pattern:
-            # 匹配 <tag>...后面跟着其他标签或 </output>
-            pattern = rf'(<{tag}>)([^<]*?)(?=(?:<{next_tag_pattern}|</output>))'
-            replacement = rf'\1\2</{tag}>'
-            xml_content = re.sub(pattern, replacement, xml_content, flags=re.IGNORECASE | re.DOTALL)
-    # 5. 修复 <toolstocal> 和 </toolstocal> 标签
-    xml_content = re.sub(r'<toolstocal\s*>', '<toolstocal>', xml_content, flags=re.IGNORECASE)
-    xml_content = re.sub(r'</toolstocal\s*>', '</toolstocal>', xml_content, flags=re.IGNORECASE)
-    # 6. 修复 askuser 标签（系统提示中用的是 askuser，但有时可能写成 ask_user）
-    xml_content = re.sub(r'<ask_user\s*>', '<askuser>', xml_content, flags=re.IGNORECASE)
-    xml_content = re.sub(r'</ask_user\s*>', '</askuser>', xml_content, flags=re.IGNORECASE)
-    # 7. 修复单独的 <tool> 块中的标签
-    # 匹配 <tool>... 中缺少闭合标签的情况
-    tool_tags = ["beforecalltext", "toolname", "parms", "timeout", "callback"]
-    tool_next_pattern = '|'.join([re.escape(t) for t in tool_tags])
-    for tag in tool_tags:
-        pattern = rf'(<{tag}>)([^<]*?)(?=(?:<{tool_next_pattern}|</tool>|</toolstocal>|</output>))'
-        replacement = rf'\1\2</{tag}>'
-        xml_content = re.sub(pattern, replacement, xml_content, flags=re.IGNORECASE | re.DOTALL)
-    # 8. 处理文本内容中的特殊 XML 字符（简单处理 &）
-    # 只在标签外的内容中处理
-    # 这个比较复杂，简单处理：在文本内容中 & 后没有 ; 的转为 &amp;
-    # 但更安全的做法是只在必要时处理
-    # 9. 确保 <output> 标签周围没有多余空白
-    xml_content = xml_content.strip()
-    return xml_content
-def _aggressive_clean_xml(xml_content: str) -> str:
-    """激进清理 XML 内容，移除可能导致解析失败的字符。"""
-    # 移除控制字符（除了换行和 tab）
-    xml_content = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', xml_content)
-    # 修复 & 字符（确保它是有效的 XML 实体）
-    # 匹配 & 不在有效实体前的情况
-    xml_content = re.sub(r'&(?!amp;|lt;|gt;|quot;|apos;|#\d+;|#x[0-9a-fA-F]+;)', '&amp;', xml_content)
-    # 移除多余的空白（连续多个空白合并为一个）
-    xml_content = re.sub(r'>\s+<', '><', xml_content)
-    xml_content = re.sub(r'\s{2,}', ' ', xml_content)
-    return xml_content
-def _parse_xml_content(xml_content: str) -> ParsedOutput:
-    """Attempt to parse *xml_content* (the inner body of ``<output>``) as XML.
-    Assumes *xml_content* has already been extracted from the surrounding
-    ``<output>`` tags.  If parsing fails a :class:`ParsedOutput` with
-    ``parse_success=False`` is returned.
-    解析策略：
-    1. 首先尝试直接解析
-    2. 如果失败，使用正则表达式修复后再解析
-    3. 如果仍然失败，尝试激进清理
+    if not text:
+        return text
+    body = _extract_output_body(text)
+    if body is not None:
+        return body
+    # No <output> tag at all — check if there are any recognised tags
+    has_tags = False
+    for tag in KNOWN_TOP_LEVEL_TAGS:
+        if re.search(rf"<{re.escape(tag)}[\s>]", text, re.IGNORECASE):
+            has_tags = True
+            break
+    if has_tags:
+        # Tags exist but no <output> wrapper — return as-is
+        return text
+    # No tags at all — return original (caller will set needs_correction)
+    return text
+# ---------------------------------------------------------------------------
+# Core custom parser — NO xml.etree.ElementTree
+# ---------------------------------------------------------------------------
+def _custom_parse(raw_text: str) -> ParsedOutput:
+    """Fully custom, regex-based XML parser with maximum fault tolerance.
+    This function does NOT use ``xml.etree.ElementTree`` at all.  Every
+    extraction is done via regex patterns that handle malformed XML
+    gracefully.
+    Returns a :class:`ParsedOutput` with ``parse_success=True`` if at least
+    one meaningful field was extracted, or ``needs_correction=True`` if
+    nothing could be parsed.
     """
-    parsed = ParsedOutput(parse_success=False)
+    parsed = ParsedOutput(raw_text=raw_text)
+    if not raw_text or not raw_text.strip():
+        parsed.needs_correction = True
+        return parsed
+    # ── Step 1: Strip non-XML noise (text before/after <output>) ──
+    body = _strip_outer_noise(raw_text)
+    # ── Step 2: Extract each known top-level tag ──
+    # usersays_correct
+    raw_val = _extract_tag_content(body, "usersays_correct")
+    parsed.usersays_correct = _safe_strip(raw_val)
+    # task_plan
+    raw_val = _extract_tag_content(body, "task_plan")
+    parsed.task_plan = _safe_strip(raw_val)
+    # response
+    raw_val = _extract_tag_content(body, "response")
+    parsed.response = _safe_strip(raw_val)
+    # recall
+    raw_val = _extract_tag_content(body, "recall")
+    parsed.recall = _safe_strip(raw_val)
+    # knowledge
+    raw_val = _extract_tag_content(body, "knowledge")
+    parsed.knowledge = _safe_strip(raw_val)
+    # askuser (also try alias ask_user)
+    raw_val = _extract_tag_content(body, "askuser")
+    if not raw_val.strip():
+        raw_val = _extract_tag_content(body, "ask_user")
+    parsed.ask_user = _safe_strip(raw_val)
+    # get_knowledge
+    raw_val = _extract_tag_content(body, "get_knowledge")
+    parsed.get_knowledge = _safe_strip(raw_val)
+    # finish
+    raw_val = _extract_tag_content(body, "finish")
+    parsed.finish = _parse_bool(raw_val, False)
+    # finish_reason
+    raw_val = _extract_tag_content(body, "finish_reason")
+    parsed.finish_reason = _safe_strip(raw_val)
+    # next_step
+    raw_val = _extract_tag_content(body, "next_step")
+    parsed.next_step = _safe_strip(raw_val)
+    # ── Step 3: Parse <remember> (may contain <type> and <content>) ──
+    remember_raw = _extract_tag_content(body, "remember")
+    if remember_raw.strip():
+        # Try structured format: <type>global</type><content>...</content>
+        type_val = _extract_tag_content(remember_raw, "type", REMEMBER_INNER_TAGS)
+        content_val = _extract_tag_content(remember_raw, "content", REMEMBER_INNER_TAGS)
+        if content_val.strip():
+            mem_type = _safe_strip(type_val) or "session"
+            if mem_type not in ("global", "session"):
+                mem_type = "session"
+            parsed.remember = _safe_strip(content_val)
+            parsed.remember_type = mem_type
+        else:
+            # Legacy plain-text format
+            parsed.remember = _safe_strip(remember_raw)
+            parsed.remember_type = "session"
-    # Strategy 1: 尝试直接解析原始 XML
-    try:
-        root = ET.fromstring("<output>" + xml_content + "</output>")
+    # ── Step 4: Parse <toolstocal> → list of tool dicts ──
+    toolstocal_raw = _extract_tag_content(body, "toolstocal")
+    if toolstocal_raw.strip():
+        parsed.tools_to_call = _parse_toolstocal(toolstocal_raw)
+    # ── Step 5: Determine parse success ──
+    has_content = bool(
+        parsed.response
+        or parsed.usersays_correct
+        or parsed.task_plan
+        or parsed.tools_to_call
+        or parsed.ask_user
+        or parsed.remember
+        or parsed.recall
+        or parsed.knowledge
+        or parsed.get_knowledge
+        or parsed.finish
+        or parsed.finish_reason
+        or parsed.next_step
+    )
+    if has_content:
         parsed.parse_success = True
-        logger.debug(f"XML 直接解析成功")
-    except ET.ParseError as e1:
-        logger.debug(f"XML 直接解析失败: {e1}，尝试修复...")
-        # Strategy 2: 修复不完整的 XML 后再解析
-        fixed_xml = _fix_incomplete_xml(xml_content)
-        logger.debug(f"修复后 XML 前200字符: {fixed_xml[:200]}...")
-        try:
-            root = ET.fromstring("<output>" + fixed_xml + "</output>")
+    else:
+        # Nothing was extracted — check if there's any raw text that could
+        # be a response (the LLM might have skipped XML entirely)
+        cleaned = raw_text.strip()
+        # Remove any residual XML tags
+        cleaned_no_tags = re.sub(r"<[^>]+>", "", cleaned).strip()
+        if cleaned_no_tags:
+            # The LLM output something but not in XML format
+            # Treat the entire output as a response
+            parsed.response = cleaned_no_tags
             parsed.parse_success = True
-            logger.debug(f"XML 修复后解析成功")
-        except ET.ParseError as e2:
-            logger.warning(f"XML 修复后仍然解析失败: {e2}，尝试激进清理...")
-            # Strategy 3: 激进清理
-            cleaned = _aggressive_clean_xml(fixed_xml)
-            try:
-                root = ET.fromstring("<output>" + cleaned + "</output>")
-                parsed.parse_success = True
-                logger.debug(f"XML 激进清理后解析成功")
-            except ET.ParseError as e3:
-                logger.warning(f"XML 解析最终失败: {e3}")
-                return parsed
-    # 提取各字段
-    parsed.usersays_correct = _safe_strip(root.findtext("usersays_correct"))
-    parsed.task_plan = _safe_strip(root.findtext("task_plan"))
-    parsed.tools_to_call = _parse_tools_element(root.find("toolstocal"))
-    parsed.remember, parsed.remember_type = _parse_remember_tag(root.find("remember"))
-    parsed.recall = _safe_strip(root.findtext("recall"))
-    parsed.knowledge = _safe_strip(root.findtext("knowledge"))
-    parsed.ask_user = _safe_strip(root.findtext("askuser"))
-    parsed.get_knowledge = _safe_strip(root.findtext("get_knowledge"))
-    parsed.finish = _parse_bool(root.findtext("finish"), False)
-    parsed.finish_reason = _safe_strip(root.findtext("finish_reason"))
-    parsed.next_step = _safe_strip(root.findtext("next_step"))
-    parsed.response = _safe_strip(root.findtext("response"))
+            logger.info(
+                f"XML解析未提取到结构化字段，将原始文本（去除标签后）作为response: "
+                f"{cleaned_no_tags[:100]}..."
+            )
+        else:
+            # Complete parse failure
+            parsed.needs_correction = True
+            logger.warning(
+                f"XML解析完全失败，需要LLM修正。原始输出前200字符: {raw_text[:200]}"
+            )
     return parsed
-def _fallback_regex_parse(raw_text: str) -> ParsedOutput:
-    """Last-resort parser that uses regex to extract tag contents.
+def _parse_toolstocal(toolstocal_content: str) -> List[Dict[str, Any]]:
+    """Parse ``<toolstocal>`` body into a list of tool descriptors."""
+    tools: List[Dict[str, Any]] = []
-    This is used when standard XML parsing fails entirely (e.g. the LLM
-    produced malformed XML).  Only flat text tags are extracted; nested
-    ``<tool>`` elements inside ``<toolstocal>`` are not resolved.
-    """
-    parsed = ParsedOutput(parse_success=False, raw_text=raw_text)
-    xml_body = _extract_xml_block(raw_text)
-    search_text = xml_body if xml_body is not None else raw_text
-    # Extract flat tags.
-    tag_map: Dict[str, str] = {}
-    for match in _TAG_CONTENT_RE.finditer(search_text):
-        tag_name = match.group(1).lower()
-        tag_map[tag_name] = match.group(2)
-    parsed.usersays_correct = _safe_strip(tag_map.get("usersays_correct"))
-    parsed.task_plan = _safe_strip(tag_map.get("task_plan"))
-    # Regex fallback: 尝试解析结构化 <remember> 或退化到纯文本
-    _remember_raw = tag_map.get("remember", "")
-    if "<type>" in _remember_raw and "<content>" in _remember_raw:
-        try:
-            _rem_elem = ET.fromstring(f"<remember>{_remember_raw}</remember>")
-            parsed.remember, parsed.remember_type = _parse_remember_tag(_rem_elem)
-        except Exception:
-            parsed.remember = _safe_strip(_remember_raw)
-            parsed.remember_type = "session"
-    else:
-        parsed.remember = _safe_strip(_remember_raw)
-        parsed.remember_type = "session"
-    parsed.recall = _safe_strip(tag_map.get("recall"))
-    parsed.knowledge = _safe_strip(tag_map.get("knowledge"))
-    parsed.ask_user = _safe_strip(tag_map.get("askuser"))
-    parsed.get_knowledge = _safe_strip(tag_map.get("get_knowledge"))
-    parsed.finish = _parse_bool(tag_map.get("finish"), False)
-    parsed.finish_reason = _safe_strip(tag_map.get("finish_reason"))
-    parsed.next_step = _safe_strip(tag_map.get("next_step"))
-    parsed.response = _safe_strip(tag_map.get("response"))
-    # For toolstocal we attempt to find individual <tool> blocks.
-    tools_raw = tag_map.get("toolstocal", "")
-    tool_blocks = re.findall(
-        r"<tool\s*>(.*?)</tool\s*>",
-        tools_raw,
-        re.DOTALL | re.IGNORECASE,
+    tool_blocks = _extract_all_tag_blocks(
+        toolstocal_content, "tool", parent_close_tag="</toolstocal>"
     )
-    if not tool_blocks:
-        # Perhaps the LLM placed <tool> tags at the top level.
-        tool_blocks = re.findall(
-            r"<tool\s*>(.*?)</tool\s*>",
-            search_text,
-            re.DOTALL | re.IGNORECASE,
-        )
     for block in tool_blocks:
-        inner_map: Dict[str, str] = {}
-        for m in _TAG_CONTENT_RE.finditer(block):
-            inner_map[m.group(1).lower()] = m.group(2)
-        parsed.tools_to_call.append(
-            {
-                "beforecalltext": _safe_strip(
-                    inner_map.get("beforecalltext")
-                ),
-                "toolname": _safe_strip(inner_map.get("toolname")),
-                "parms": _safe_strip(inner_map.get("parms")),
-                "timeout": _parse_int(
-                    inner_map.get("timeout"), _DEFAULT_TIMEOUT
-                ),
-                "callback": _parse_bool(
-                    inner_map.get("callback"), _DEFAULT_CALLBACK
-                ),
-            }
-        )
+        tool: Dict[str, Any] = {
+            "beforecalltext": _safe_strip(
+                _extract_tag_content(block, "beforecalltext", TOOL_INNER_TAGS)
+            ),
+            "toolname": _safe_strip(
+                _extract_tag_content(block, "toolname", TOOL_INNER_TAGS)
+            ),
+            "parms": _safe_strip(
+                _extract_tag_content(block, "parms", TOOL_INNER_TAGS)
+            ),
+            "timeout": _parse_int(
+                _extract_tag_content(block, "timeout", TOOL_INNER_TAGS),
+                _DEFAULT_TIMEOUT,
+            ),
+            "callback": _parse_bool(
+                _extract_tag_content(block, "callback", TOOL_INNER_TAGS),
+                _DEFAULT_CALLBACK,
+            ),
+        }
+        # Only add if toolname is present
+        if tool["toolname"]:
+            tools.append(tool)
-    return parsed
+    return tools
 # ---------------------------------------------------------------------------
@@ -496,57 +553,48 @@ def _fallback_regex_parse(raw_text: str) -> ParsedOutput:
 def parse_output(raw_text: str) -> ParsedOutput:
     """Parse the LLM's raw response into a :class:`ParsedOutput`.
-    Strategy (in order of preference):
+    This function uses a **fully custom regex-based parser** (no
+    ``xml.etree.ElementTree``) for maximum fault tolerance.
-    1. Locate the ``<output>…</output>`` block and parse its inner XML.
-    2. If no ``<output>`` block is found, try to parse the entire *raw_text*
-       as XML (wrapping it in ``<output>`` tags).
-    3. If both XML attempts fail, fall back to regex-based extraction.
+    If the custom parser cannot extract any meaningful content, it falls
+    back to treating the raw text as a plain response.  Only if even that
+    fails does it set ``needs_correction=True``, signalling the caller to
+    ask the LLM to re-format its output.
     Parameters:
         raw_text: The complete text returned by the LLM.
     Returns:
-        A :class:`ParsedOutput` instance.  ``parse_success`` indicates which
-        strategy succeeded.
+        A :class:`ParsedOutput` instance.
     """
     if not raw_text:
-        return ParsedOutput(raw_text=raw_text)
+        return ParsedOutput(raw_text=raw_text, needs_correction=True)
-    # Strategy 1: Extract <output> block and parse inner XML.
-    xml_body = _extract_xml_block(raw_text)
-    if xml_body is not None:
-        result = _parse_xml_content(xml_body)
-        if result.parse_success:
-            result.raw_text = raw_text
-            return result
+    return _custom_parse(raw_text)
-    # Strategy 2: Try parsing the full text as XML directly.
-    try:
-        root = ET.fromstring(raw_text)
-        # If the root itself is <output>, great.
-        if root.tag.lower() == "output":
-            parsed = ParsedOutput(parse_success=True, raw_text=raw_text)
-            parsed.usersays_correct = _safe_strip(
-                root.findtext("usersays_correct")
-            )
-            parsed.task_plan = _safe_strip(root.findtext("task_plan"))
-            parsed.tools_to_call = _parse_tools_element(root.find("toolstocal"))
-            parsed.remember, parsed.remember_type = _parse_remember_tag(root.find("remember"))
-            parsed.recall = _safe_strip(root.findtext("recall"))
-            parsed.knowledge = _safe_strip(root.findtext("knowledge"))
-            parsed.ask_user = _safe_strip(root.findtext("askuser"))
-            parsed.get_knowledge = _safe_strip(root.findtext("get_knowledge"))
-            parsed.finish = _parse_bool(root.findtext("finish"), False)
-            parsed.finish_reason = _safe_strip(root.findtext("finish_reason"))
-            parsed.next_step = _safe_strip(root.findtext("next_step"))
-            parsed.response = _safe_strip(root.findtext("response"))
-            return parsed
-    except ET.ParseError:
-        pass
-    # Strategy 3: Regex fallback.
-    return _fallback_regex_parse(raw_text)
+def extract_surrounding_text(full_text: str) -> tuple[str, str]:
+    """Split *full_text* around the ``<output>…</output>`` block.
+    Returns:
+        A ``(text_before_xml, text_after_xml)`` tuple.  Both parts are
+        stripped.  If no ``<output>`` block is found the original text
+        becomes *text_before_xml* and *text_after_xml* is ``""``.
+    """
+    open_match = re.search(r"<output[^>]*>", full_text, re.IGNORECASE)
+    if open_match is None:
+        return full_text.strip(), ""
+    text_before = full_text[: open_match.start()].strip()
+    rest = full_text[open_match.end() :]
+    close_match = re.search(r"</output\s*>", rest, re.IGNORECASE)
+    if close_match is None:
+        text_after = rest.strip()
+    else:
+        text_after = rest[close_match.end() :].strip()
+    return text_before, text_after
 # ---------------------------------------------------------------------------
@@ -559,18 +607,6 @@ def validate_output(parsed: ParsedOutput) -> list[str]:
     An empty list means no issues were detected.  Warnings are non-fatal
     hints that the calling code may log or present to the user.
-    Checks performed:
-    * ``tools_to_call`` entries missing ``toolname``.
-    * ``tools_to_call`` entries with ``timeout`` ≤ 0.
-    * ``tools_to_call`` entries with empty ``parms`` when ``toolname`` is
-      present (informational — some tools legitimately need no params).
-    * ``finish`` is ``True`` but ``ask_user`` is non-empty (possible mixed
-      intent from the LLM).
-    * ``usersays_correct`` is empty (may indicate the LLM skipped
-      correction).
-    * ``task_plan`` is empty (may indicate the LLM skipped planning).
     """
     warnings: list[str] = []
@@ -596,9 +632,7 @@ def validate_output(parsed: ParsedOutput) -> list[str]:
     # --- Semantic checks ---
     if parsed.finish and parsed.ask_user.strip():
-        warnings.append(
-            "finish=True but ask_user is non-empty — unclear intent"
-        )
+        warnings.append("finish=True but ask_user is non-empty — unclear intent")
     if not parsed.usersays_correct.strip():
         warnings.append("usersays_correct is empty")

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "myagent-ai",
-  "version": "1.14.0",
+  "version": "1.15.0",
   "description": "本地桌面端执行型AI助手 - Open Interpreter 风格 | Local Desktop Execution-Oriented AI Assistant",
   "main": "main.py",
   "bin": {

package/requirements.txt CHANGED Viewed

@@ -50,6 +50,11 @@ discord.py>=2.3.0
 # ============================================================
 edge-tts>=6.1.0
+# ============================================================
+# 语音识别 (本地 STT，默认启用)
+# ============================================================
+faster-whisper>=1.0.0
 # ============================================================
 # Anthropic Claude (可选)
 # ============================================================

package/setup.py CHANGED Viewed

@@ -37,6 +37,8 @@ setup(
         "Pillow>=10.0.0",
         # 语音合成
         "edge-tts>=6.1.0",
+        # 语音识别 (本地 STT)
+        "faster-whisper>=1.0.0",
         # 浏览器自动化 (ChromeDev MCP, 无需 Playwright)
         # 桌面 GUI 自动化 (内置技能)
         "pynput>=1.7.6",

package/web/ui/chat/chat.css CHANGED Viewed

@@ -455,7 +455,9 @@ input,textarea,select{font:inherit}
 /* ── Message Content Smooth Render ── */
 .message-content{
-  flex:1;min-width:0;
+  flex:1;min-width:0;width:100%;
+  /* 确保所有子元素（thought-block, bubble 等）撑满宽度 */
+  display:flex;flex-direction:column;align-items:stretch;
 }
 .stream-text-node{
   display:inline;
@@ -469,7 +471,7 @@ input,textarea,select{font:inherit}
 }
 /* ── Thought Block (Agent Thinking) ── */
-.thought-block{width:100%;display:block;margin:0 0 10px 0;border:1px solid var(--border-light);border-radius:var(--radius-sm);overflow:hidden;background:linear-gradient(135deg,var(--accent-light),var(--bg2));animation:thoughtFadeIn .4s ease-out}
+.thought-block{width:100%!important;max-width:100%!important;display:flex;flex-direction:column;margin:0 0 10px 0;border:1px solid var(--border-light);border-radius:var(--radius-sm);overflow:hidden;background:linear-gradient(135deg,var(--accent-light),var(--bg2));animation:thoughtFadeIn .4s ease-out;flex-shrink:0;box-sizing:border-box;align-self:stretch}
 .thought-block.streaming{border-color:var(--accent);box-shadow:0 0 12px rgba(99,102,241,.15)}
 @keyframes thoughtFadeIn{from{opacity:0;transform:translateY(-6px)}to{opacity:1;transform:translateY(0)}}
 .thought-block summary{display:flex;align-items:center;gap:8px;padding:8px 14px;cursor:pointer;font-size:12px;font-weight:600;color:var(--text2);user-select:none;transition:var(--transition);text-transform:uppercase;letter-spacing:.3px}
@@ -1977,26 +1979,7 @@ input,textarea,select{font:inherit}
 [data-theme="dark"] .inline-exec-code{background:var(--bg)}
 [data-theme="dark"] .inline-exec-result-btn:hover{background:var(--bg4)}
-  .thought-block {
-    background: rgba(0, 0, 0, 0.03);
-    border-radius: 8px;
-    padding: 8px 12px;
-    margin-bottom: 8px;
-    font-size: 13px;
-    color: var(--text2);
-    border-left: 3px solid var(--border);
-  }
-  .thought-block summary {
-    cursor: pointer;
-    font-weight: bold;
-    outline: none;
-    user-select: none;
-  }
-  .thought-content {
-    margin-top: 8px;
-    font-family: inherit;
-    white-space: pre-wrap;
-  }
+  /* thought-block orphaned rules removed — see main .thought-block rule */
 /* ══════════════════════════════════════════════════════
    ── Popout Mode (独立窗口) ──

package/web/ui/chat/chat_main.js CHANGED Viewed

@@ -272,6 +272,11 @@ const StatePersistence = {
     StatePersistence.save('agentPanelOpen', state.agentPanelOpen);
     StatePersistence.save('rpSections', rpSections);
     StatePersistence.save('expandedNodes', [...state.expandedNodes]);
+    // 持久化当前活跃会话ID，用于页面刷新后恢复
+    if (state.activeSessionId && state.activeSessionId !== '__new__') {
+      StatePersistence.save('activeSessionId', state.activeSessionId);
+      StatePersistence.save('activeSessionAgent', state.activeAgent || 'default');
+    }
   },
   /** 恢复 UI 状态 */
   restoreUIState() {
@@ -353,16 +358,38 @@ function initChat() {
   }
   // 如果 URL 指定了 agent 或 session，等 agent 列表加载后自动选中
-  if (urlAgent || urlSession) {
+  // 注意：loadSessions() 内部会检查 URL session 参数并自动恢复
+  if (urlAgent) {
     const targetAgent = urlAgent || (urlSession ? urlSession.split('_web_')[0] || 'default' : null);
     setTimeout(function() {
       if (targetAgent) selectAgent(targetAgent);
-      // 如果指定了 session，等会话列表加载后自动选中
-      if (urlSession) {
-        setTimeout(function() { selectSession(urlSession); }, 800);
-      }
     }, 500);
+  } else if (urlSession) {
+    // 只有 session 没有 agent，尝试从 session ID 推断 agent
+    const targetAgent = urlSession.split('_web_')[0] || 'default';
+    setTimeout(function() {
+      selectAgent(targetAgent);
+    }, 500);
+  } else {
+    // URL 中没有 session 参数，尝试从 localStorage 恢复上次的会话
+    var savedSessionId = StatePersistence.load('activeSessionId', null);
+    var savedSessionAgent = StatePersistence.load('activeSessionAgent', null);
+    if (savedSessionId && savedSessionAgent) {
+      // 确保 agent 一致，然后延迟等待 loadSessions() 完成后恢复
+      state._pendingSessionRestore = savedSessionId;
+      if (savedSessionAgent !== state.activeAgent) {
+        setTimeout(function() {
+          selectAgent(savedSessionAgent);
+        }, 500);
+      }
+      // 如果 agent 已经一致，loadSessions() 内部会自动处理
+    }
   }
+  // 页面卸载前保存 UI 状态（包括活跃 session）
+  window.addEventListener('beforeunload', function() {
+    StatePersistence.saveUIState();
+  });
 }
 // Run init: if DOMContentLoaded already fired (dynamic script load), run immediately
@@ -1581,8 +1608,25 @@ async function loadSessions() {
   updateSidebarAgentIndicator();
   // Auto-select most recent session if none selected
-  if (!state.activeSessionId && state.sessions.length > 0) {
-    await selectSession(state.sessions[0].id);
+  // 优先级: URL session 参数 > localStorage 持久化的 session > 最新 session
+  const urlParams = new URLSearchParams(window.location.search);
+  const urlSession = urlParams.get('session');
+  var targetSessionId = null;
+  if (urlSession && state.sessions.some(s => s.id === urlSession)) {
+    // URL 指定了有效的 session ID，直接选中（刷新恢复）
+    targetSessionId = urlSession;
+  } else if (state._pendingSessionRestore && state.sessions.some(s => s.id === state._pendingSessionRestore)) {
+    // 从 localStorage 恢复的 session（beforeunload 触发的保存）
+    targetSessionId = state._pendingSessionRestore;
+    state._pendingSessionRestore = null;  // 清除，防止重复恢复
+  } else if (!state.activeSessionId && state.sessions.length > 0) {
+    // 默认选中最新 session
+    targetSessionId = state.sessions[0].id;
+  }
+  if (targetSessionId) {
+    await selectSession(targetSessionId);
   }
 }
@@ -1848,6 +1892,8 @@ async function selectSession(id) {
   } catch (_) {}
   document.getElementById('userInput').focus();
   loadDraft();
+  // 保存选中状态到 localStorage（用于页面刷新恢复）
+  StatePersistence.saveUIState();
   if (isMobile()) closeMobileSidebar();
 }