matrix-for-agents 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. agentmatrix/__init__.py +20 -0
  2. agentmatrix/agents/__init__.py +1 -0
  3. agentmatrix/agents/base.py +572 -0
  4. agentmatrix/agents/claude_coder.py +10 -0
  5. agentmatrix/agents/data_crawler.py +14 -0
  6. agentmatrix/agents/post_office.py +212 -0
  7. agentmatrix/agents/report_writer.py +14 -0
  8. agentmatrix/agents/secretary.py +10 -0
  9. agentmatrix/agents/stateful.py +10 -0
  10. agentmatrix/agents/user_proxy.py +82 -0
  11. agentmatrix/agents/worker.py +30 -0
  12. agentmatrix/backends/__init__.py +1 -0
  13. agentmatrix/backends/llm_client.py +414 -0
  14. agentmatrix/backends/mock_llm.py +35 -0
  15. agentmatrix/cli_runner.py +94 -0
  16. agentmatrix/core/__init__.py +0 -0
  17. agentmatrix/core/action.py +50 -0
  18. agentmatrix/core/browser/bing.py +208 -0
  19. agentmatrix/core/browser/browser_adapter.py +298 -0
  20. agentmatrix/core/browser/browser_common.py +85 -0
  21. agentmatrix/core/browser/drission_page_adapter.py +1296 -0
  22. agentmatrix/core/browser/google.py +230 -0
  23. agentmatrix/core/cerebellum.py +121 -0
  24. agentmatrix/core/events.py +22 -0
  25. agentmatrix/core/loader.py +185 -0
  26. agentmatrix/core/loader_v1.py +146 -0
  27. agentmatrix/core/log_util.py +158 -0
  28. agentmatrix/core/message.py +32 -0
  29. agentmatrix/core/prompt_engine.py +30 -0
  30. agentmatrix/core/runtime.py +211 -0
  31. agentmatrix/core/session.py +20 -0
  32. agentmatrix/db/__init__.py +1 -0
  33. agentmatrix/db/database.py +79 -0
  34. agentmatrix/db/vector_db.py +213 -0
  35. agentmatrix/docs/Design.md +109 -0
  36. agentmatrix/docs/Framework Capbilities.md +105 -0
  37. agentmatrix/docs/Planner Design.md +148 -0
  38. agentmatrix/docs/crawler_flow.md +110 -0
  39. agentmatrix/docs/report_writer.md +83 -0
  40. agentmatrix/docs/review.md +99 -0
  41. agentmatrix/docs/skill_design.md +23 -0
  42. agentmatrix/profiles/claude_coder.yml +40 -0
  43. agentmatrix/profiles/mark.yml +26 -0
  44. agentmatrix/profiles/planner.yml +21 -0
  45. agentmatrix/profiles/prompts/base.txt +88 -0
  46. agentmatrix/profiles/prompts/base_v1.txt +101 -0
  47. agentmatrix/profiles/prompts/base_v2.txt +94 -0
  48. agentmatrix/profiles/tom_the_data_crawler.yml +38 -0
  49. agentmatrix/profiles/user_proxy.yml +17 -0
  50. agentmatrix/skills/__init__.py +1 -0
  51. agentmatrix/skills/crawler_helpers.py +315 -0
  52. agentmatrix/skills/data_crawler.py +777 -0
  53. agentmatrix/skills/filesystem.py +204 -0
  54. agentmatrix/skills/notebook.py +158 -0
  55. agentmatrix/skills/project_management.py +114 -0
  56. agentmatrix/skills/report_writer.py +194 -0
  57. agentmatrix/skills/report_writer_utils.py +379 -0
  58. agentmatrix/skills/search_tool.py +383 -0
  59. agentmatrix/skills/terminal_ctrl.py +122 -0
  60. agentmatrix/skills/utils.py +33 -0
  61. agentmatrix/skills/web_searcher.py +1107 -0
  62. matrix_for_agents-0.1.2.dist-info/METADATA +44 -0
  63. matrix_for_agents-0.1.2.dist-info/RECORD +66 -0
  64. matrix_for_agents-0.1.2.dist-info/WHEEL +5 -0
  65. matrix_for_agents-0.1.2.dist-info/licenses/LICENSE +190 -0
  66. matrix_for_agents-0.1.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1107 @@
1
+ import asyncio
2
+ import time
3
+ import os
4
+ import json
5
+ import textwrap
6
+ import re
7
+ from typing import List, Set, Dict, Optional, Any, Deque
8
+ from collections import deque
9
+ from dataclasses import dataclass, field
10
+
11
+ from ..core.browser.google import search_google
12
+ from ..core.browser.bing import search_bing
13
+ from ..core.browser.browser_adapter import (
14
+ BrowserAdapter, TabHandle, PageElement, PageSnapshot, PageType
15
+ )
16
+ from ..core.browser.browser_common import TabSession, BaseCrawlerContext
17
+ from ..skills.crawler_helpers import CrawlerHelperMixin
18
+ from ..core.browser.drission_page_adapter import DrissionPageAdapter
19
+ from ..core.action import register_action
20
+
21
+ search_func = search_google
22
+
23
+ # ==========================================
24
+ # Prompt 集中管理
25
+ # ==========================================
26
+
27
+ class WebSearcherPrompts:
28
+ """Web Searcher Prompt 集中管理"""
29
+
30
+ # ==========================================
31
+ # 1. 章节选择
32
+ # ==========================================
33
+
34
+ CHAPTER_SELECTION = """You are searching for information to answer: "{question}"
35
+
36
+ Below is the table of contents for a document:
37
+
38
+ {toc_list}
39
+
40
+ [Task]
41
+ Select the chapters that are MOST LIKELY to contain information relevant to answering the question.
42
+
43
+ [Rules]
44
+ 1. You can select multiple chapters
45
+ 2. Be conservative - only select chapters that seem directly relevant
46
+ 3. If unsure, you can select multiple chapters to be safe
47
+
48
+ [Output Format]
49
+
50
+ First, explain your reasoning (why you selected these chapters).
51
+
52
+ Then, output your selections using following format:
53
+
54
+ ====章节选择====
55
+ 你选择的章节名称1(replace with your choice)
56
+ 你选择的章节名称2(replace with your choice)
57
+ ...
58
+ ====章节选择结束====
59
+
60
+ One chapter name per line. The chapter names must EXACTLY match the names shown in the TOC above."""
61
+
62
+ CHAPTER_ERROR_HALLUCINATION = """Your selection contains chapters that don't exist in the TOC:
63
+
64
+ Invalid chapters:
65
+ {invalid_chapters}
66
+
67
+ Please select ONLY from the available chapters listed in the TOC. Try again."""
68
+
69
+ CHAPTER_ERROR_FORMAT = """Your output format is incorrect.
70
+
71
+ Please use this EXACT format:
72
+
73
+ ====章节选择====
74
+ 章节名称1
75
+ 章节名称2
76
+ ====章节选择结束====
77
+
78
+ Make sure:
79
+ 1. The markers are EXACTLY '====章节选择====' and '====章节选择结束===='
80
+ 2. One chapter name per line
81
+ 3. Chapter names EXACTLY match the TOC
82
+
83
+ Try again."""
84
+
85
+ # ==========================================
86
+ # 2. 批处理
87
+ # ==========================================
88
+
89
+ BATCH_PROCESSING = """You are reading a document to answer: "{question}"
90
+
91
+ [Document Info]
92
+ - Title: {doc_title}
93
+ - Source URL: {url}
94
+ - Progress: Page {current_batch} of {total_batches} ({progress_pct}% complete)
95
+
96
+ [Notebook - What We Already Know]
97
+ {notebook}
98
+
99
+ [Current Page Content - Page {current_batch}]
100
+ {batch_text}
101
+
102
+ [Task]
103
+ Based on the Notebook, Current Page, AND your reading progress, provide a brief summary.
104
+
105
+ Consider your progress:
106
+ - If you're early in the document (first 20%), keep exploring even if this page is weak
107
+ - If you're late in the document (last 30%) and found nothing useful, consider skipping
108
+ - If you're in the middle, continue unless the content is completely irrelevant
109
+
110
+ Your response MUST start with ONE of these four headings:
111
+
112
+ ##对问题的回答
113
+ If you can provide a clear, complete answer based on the Notebook and Current Page:
114
+ - Use this heading
115
+ - Provide your answer below
116
+ - Keep it concise but complete
117
+ - Keep key references (urls) for key information
118
+
119
+ ##值得记录的笔记
120
+ If you cannot answer yet, but found NEW and USEFUL information:
121
+ - Use this heading
122
+ - Provide a concise summary (2-5 sentences)
123
+ - Focus on facts, data, definitions, explanations
124
+ - Only extract information NOT already in Notebook
125
+ - Always include the source URL
126
+
127
+ ##没有值得记录的笔记继续阅读
128
+ If the page doesn't contain new or useful information, but the document still shows promise:
129
+ - Use this heading
130
+ - Briefly explain why (1 sentence)
131
+ - Consider: If you're late in the document (>70%), you might want to skip
132
+
133
+ ##完全不相关的文档应该放弃
134
+ If the page is completely irrelevant to the question (navigation, ads, unrelated topics):
135
+ - Use this heading
136
+ - Explain why (1 sentence)
137
+ - Skip the rest of this document
138
+ - Especially consider this if you're already deep into the document (>50%) and found nothing useful
139
+
140
+ [Output Format]
141
+
142
+ ##对问题的回答 (or one of the other three headings)
143
+
144
+ Your content here...
145
+
146
+ [Important]
147
+ - Start with ONE of the four headings above (EXACTLY as shown)
148
+ - Provide your content below the heading
149
+ - Consider your reading progress when deciding whether to continue or skip"""
150
+
151
+ BATCH_ERROR_FORMAT = """Your output format is incorrect.
152
+
153
+ Please start your response with ONE of these four headings (EXACTLY as shown):
154
+
155
+ ##对问题的回答
156
+ ##值得记录的笔记
157
+ ##没有值得记录的笔记继续阅读
158
+ ##完全不相关的文档应该放弃
159
+
160
+ Then provide your content below the heading.
161
+
162
+ Examples:
163
+
164
+ Example 1 (can answer):
165
+ ##对问题的回答
166
+ Python装饰器是一种...
167
+
168
+ Example 2 (useful info):
169
+ ##值得记录的笔记
170
+ 装饰器使用@符号语法...
171
+
172
+ Example 3 (no new info):
173
+ ##没有值得记录的笔记继续阅读
174
+ 这段内容介绍了网站导航,但没有新的有用信息。
175
+
176
+ Example 4 (irrelevant):
177
+ ##完全不相关的文档应该放弃
178
+ 这是一段购物网站的广告内容,完全与装饰器无关。
179
+
180
+ Try again."""
181
+
182
+
183
+ # ==========================================
184
+ # 1. 状态与上下文定义
185
+ # ==========================================
186
+
187
+ class WebSearcherContext(BaseCrawlerContext):
188
+ """
189
+ Web 搜索任务上下文
190
+ 用于回答问题的搜索任务,带有"小本本"机制记录有用信息
191
+ """
192
+
193
+ def __init__(self, purpose: str, deadline: float, chunk_threshold: int = 5000,
194
+ temp_file_dir: Optional[str] = None):
195
+ super().__init__(deadline)
196
+ self.purpose = purpose # 改名:question -> purpose
197
+ self.notebook = ""
198
+ self.chunk_threshold = chunk_threshold
199
+ self.temp_file_dir = temp_file_dir
200
+
201
+ def add_to_notebook(self, info: str):
202
+ """添加信息到小本本"""
203
+ if info:
204
+ timestamp = time.strftime("%H:%M:%S")
205
+ self.notebook += f"\n\n[{timestamp}] {info}\n"
206
+
207
+
208
+ # ==========================================
209
+ # 2. Web Searcher 核心逻辑
210
+ # ==========================================
211
+
212
+ class WebSearcherMixin(CrawlerHelperMixin):
213
+ """
214
+ Web 搜索器技能
215
+ 用于回答问题的网络搜索
216
+ """
217
+
218
+ @register_action(
219
+ "针对一个问题上网搜索答案,提供要解决的问题和(可选)搜索关键字词",
220
+ param_infos={
221
+ "purpose": "要回答的问题(或研究目标)",
222
+ "search_phrase": "可选,初始搜索关键词",
223
+ "max_time": "可选,最大搜索分钟,默认20",
224
+ "max_search_pages": "可选,最大搜索页数(默认5)",
225
+
226
+ }
227
+ )
228
+ async def web_search(
229
+ self,
230
+ purpose: str,
231
+ search_phrase: str = None,
232
+ max_time: int = 20,
233
+ max_search_pages: int = 5,
234
+ temp_file_dir: Optional[str] = None
235
+ ):
236
+ """
237
+ [Entry Point] 上网搜索回答问题(流式处理版本)
238
+
239
+ Args:
240
+ purpose: 要回答的问题(或研究目标)
241
+ search_phrase: 初始搜索关键词
242
+ max_time: 最大搜索时间(分钟)
243
+ max_search_pages: 最大搜索页数(默认5)
244
+ chunk_threshold: 分段阈值(字符数)
245
+ temp_file_dir: 临时文件保存目录(可选,用于调试)
246
+ """
247
+ # 1. 准备环境
248
+ profile_path = os.path.join(self.workspace_root, ".matrix", "browser_profile", self.name)
249
+ download_path = os.path.join(self.current_workspace, "downloads")
250
+ chunk_threshold = 5000
251
+
252
+ if not search_phrase:
253
+ resp = await self.brain.think(f"""
254
+ 现在我们要研究个新问题:{purpose},打算上网搜索一下,需要你设计一下最合适的关键词或者关键字组合。输出的时候可以先简单解释一下这么设计的理由,但是最后一行必须是也只能是要搜索的内容(也就是输入到搜索引擎搜索栏的内容)。例如你认为应该搜索"Keyword",那么最后一行就只能是"Keyword"
255
+ """)
256
+ reply = resp['reply']
257
+ #get last line of reply
258
+ if '\n' in reply:
259
+ search_phrase = reply.split('\n')[-1].strip()
260
+ #如果还是有问题,我们直接搜索问题:
261
+ if not search_phrase:
262
+ search_phrase = purpose
263
+ self.logger.info(f"🔍 准备搜索: {search_phrase}")
264
+
265
+ self.browser = DrissionPageAdapter(
266
+ profile_path=profile_path,
267
+ download_path=download_path
268
+ )
269
+
270
+ ctx = WebSearcherContext(
271
+ purpose=purpose,
272
+ deadline=time.time() + int(max_time) * 60,
273
+ chunk_threshold=chunk_threshold,
274
+ temp_file_dir=temp_file_dir
275
+ )
276
+
277
+ self.logger.info(f"🔍 Web Search Start: {purpose}")
278
+ self.logger.info(f"🔍 Initial search phrase: {search_phrase}")
279
+ self.logger.info(f"🔍 Max search pages: {max_search_pages}")
280
+
281
+ # 2. 启动浏览器
282
+ await self.browser.start(headless=False)
283
+
284
+ try:
285
+ # 3. 创建 Tab 和 Session
286
+ tab = await self.browser.get_tab()
287
+ session = TabSession(handle=tab, current_url="")
288
+
289
+ # 4. 外层循环:逐页处理搜索结果
290
+ for page_num in range(1, max_search_pages + 1):
291
+ self.logger.info(f"\n{'='*60}")
292
+ self.logger.info(f"🔍 Fetching search results page {page_num}/{max_search_pages}")
293
+ self.logger.info(f"{'='*60}\n")
294
+
295
+ # 4.1 获取第 page_num 页的搜索结果
296
+ search_result = await search_func(
297
+ self.browser,
298
+ tab,
299
+ search_phrase,
300
+ max_pages=max_search_pages,
301
+ page=page_num # 指定只获取第 page_num 页
302
+ )
303
+
304
+ if not search_result:
305
+ self.logger.warning(f"⚠️ No results found on page {page_num}")
306
+ break
307
+
308
+ # 4.2 将 URL 添加到 pending_link_queue
309
+ added_count = 0
310
+ for result in search_result:
311
+ url = result['url']
312
+ if not ctx.has_visited(url):
313
+ session.pending_link_queue.append(url)
314
+ added_count += 1
315
+
316
+ self.logger.info(f"✓ Added {added_count} URLs from page {page_num} to queue")
317
+
318
+ # 4.3 运行 _run_search_lifecycle 处理这些 URL
319
+ self.logger.info(f"\n🌐 Processing URLs from page {page_num}...")
320
+ answer = await self._run_search_lifecycle(session, ctx)
321
+
322
+ # 4.4 如果找到答案,提前返回
323
+ if answer:
324
+ self.logger.info(f"✅ Found answer on page {page_num}!")
325
+ return f"Answer: {answer}\n\n---\nNotebook:\n{ctx.notebook}"
326
+
327
+ # 4.5 检查时间和资源限制
328
+ if ctx.is_time_up():
329
+ self.logger.info("⏰ Time up!")
330
+ break
331
+
332
+ self.logger.info(f"✓ Completed page {page_num}, continuing to next page...")
333
+
334
+ # 5. 未找到答案,返回 notebook
335
+ self.logger.info("⏸ Exhausted all search pages without finding complete answer")
336
+ return f"Could not find a complete answer.\n\nHere's what I found:\n{ctx.notebook}"
337
+
338
+ except Exception as e:
339
+ self.logger.exception("Web searcher crashed")
340
+ return f"Search failed with error: {e}"
341
+ finally:
342
+ self.logger.info("🛑 Closing browser...")
343
+ await self.browser.close()
344
+
345
+ # ==========================================
346
+ # 2. 获取完整页面内容
347
+ # ==========================================
348
+
349
+ async def _get_full_page_markdown(self, tab: TabHandle, ctx: WebSearcherContext) -> str:
350
+ """
351
+ 获取完整页面的 Markdown,无字符限制
352
+ - HTML: 使用 trafilatura 提取完整 Markdown
353
+ - PDF: 使用 pdf_to_markdown 转换完整文档
354
+ """
355
+ content_type = await self.browser.analyze_page_type(tab)
356
+
357
+ if content_type == PageType.STATIC_ASSET:
358
+ return await self._pdf_to_full_markdown(tab, ctx)
359
+ else:
360
+ return await self._html_to_full_markdown(tab)
361
+
362
+ async def _html_to_full_markdown(self, tab: TabHandle) -> str:
363
+ """将 HTML 页面转换为完整 Markdown"""
364
+ import trafilatura
365
+
366
+ raw_html = tab.html
367
+ url = self.browser.get_tab_url(tab)
368
+
369
+ # 使用 trafilatura 提取完整 Markdown
370
+ markdown = trafilatura.extract(
371
+ raw_html,
372
+ include_links=True,
373
+ include_formatting=True,
374
+ output_format='markdown',
375
+ url=url
376
+ )
377
+
378
+ # 备选方案
379
+ if not markdown or len(markdown) < 50:
380
+ markdown = tab.text
381
+
382
+ return markdown or ""
383
+
384
+ async def _pdf_to_full_markdown(self, tab: TabHandle, ctx: WebSearcherContext) -> str:
385
+ """将 PDF 转换为完整 Markdown(独立实现,便于后续优化)"""
386
+ from skills.report_writer_utils import pdf_to_markdown
387
+
388
+ # 下载 PDF 到本地
389
+ pdf_path = await self.browser.save_static_asset(tab)
390
+
391
+ # 转换完整 PDF 为 Markdown
392
+ markdown = pdf_to_markdown(pdf_path)
393
+
394
+ # 可选:保存到临时文件(调试用)
395
+ if ctx.temp_file_dir:
396
+ import os
397
+ from slugify import slugify
398
+ os.makedirs(ctx.temp_file_dir, exist_ok=True)
399
+ filename = slugify(f"pdf_{os.path.basename(pdf_path)}") + ".md"
400
+ temp_path = os.path.join(ctx.temp_file_dir, filename)
401
+ with open(temp_path, "w", encoding="utf-8") as f:
402
+ f.write(markdown)
403
+ self.logger.info(f"📄 Saved markdown to: {temp_path}")
404
+
405
+ return markdown
406
+
407
+ # ==========================================
408
+ # 3. 辅助方法(目录、选择章节、分段)
409
+ # ==========================================
410
+
411
+ def _generate_document_toc(self, markdown: str) -> List[Dict[str, Any]]:
412
+ """
413
+ 从 Markdown 中提取目录结构
414
+ 返回: [
415
+ {"level": 1, "title": "第一章", "start": 0, "end": 1234},
416
+ {"level": 2, "title": "1.1 简介", "start": 1235, "end": 2345},
417
+ ...
418
+ ]
419
+ """
420
+ toc = []
421
+ lines = markdown.split("\n")
422
+ current_pos = 0
423
+
424
+ for line in lines:
425
+ # 匹配 Markdown 标题
426
+ match = re.match(r'^(#{1,6})\s+(.+)$', line)
427
+ if match:
428
+ level = len(match.group(1))
429
+ title = match.group(2).strip()
430
+ toc.append({
431
+ "level": level,
432
+ "title": title,
433
+ "start": current_pos,
434
+ "line": line
435
+ })
436
+
437
+ current_pos += len(line) + 1 # +1 for newline
438
+
439
+ # 计算每个章节的结束位置
440
+ for i in range(len(toc) - 1):
441
+ toc[i]["end"] = toc[i + 1]["start"]
442
+ if toc:
443
+ toc[-1]["end"] = len(markdown)
444
+
445
+ return toc
446
+
447
+ async def _let_llm_select_chapters(
448
+ self,
449
+ toc: List[Dict],
450
+ ctx: WebSearcherContext
451
+ ) -> List[int]:
452
+ """
453
+ 让 LLM 根据问题选择相关章节(带重试机制)
454
+ 返回: 选中的章节索引列表(0-based)
455
+ """
456
+ # 构造 TOC 列表(不用数字编号,保留缩进)
457
+ toc_lines = []
458
+ for chapter in toc:
459
+ indent = " " * (chapter["level"] - 1)
460
+ toc_lines.append(f"{indent}{chapter['title']}")
461
+ toc_list = "\n".join(toc_lines)
462
+
463
+ # 构造章节名字到索引的映射(用于验证)
464
+ chapter_name_to_index = {
465
+ chapter["title"]: i
466
+ for i, chapter in enumerate(toc)
467
+ }
468
+
469
+ # 使用 prompt 模板
470
+ initial_prompt = WebSearcherPrompts.CHAPTER_SELECTION.format(
471
+ question=ctx.purpose,
472
+ toc_list=toc_list
473
+ )
474
+
475
+ # 初始化消息列表
476
+ messages = [{"role": "user", "content": initial_prompt}]
477
+
478
+ # 最大重试次数
479
+ MAX_RETRIES = 5
480
+
481
+ for attempt in range(MAX_RETRIES):
482
+ try:
483
+ # 调用 LLM
484
+ response = await self.cerebellum.backend.think(messages=messages)
485
+ reply = response.get('reply', '').strip()
486
+
487
+ self.logger.debug(f"Chapter selection attempt {attempt + 1}:\n{reply}")
488
+
489
+ # 将 LLM 的回复作为 assistant 消息加入历史
490
+ messages.append({"role": "assistant", "content": reply})
491
+
492
+ # 解析输出
493
+ result = self._parse_chapter_selection(reply, chapter_name_to_index)
494
+
495
+ if result["status"] == "success":
496
+ # 情况 (1): 解析成功,所有章节都是真的
497
+ selected_indices = result["selected_indices"]
498
+ self.logger.info(f"✅ Successfully selected {len(selected_indices)} chapters: {selected_indices}")
499
+ return selected_indices
500
+
501
+ elif result["status"] == "hallucination":
502
+ # 情况 (2): 解析成功,但有些章节是假的(幻觉)
503
+ invalid_chapters = result["invalid_chapters"]
504
+ self.logger.warning(f"⚠️ LLM hallucinated chapters: {invalid_chapters}")
505
+
506
+ # 使用错误提示模板
507
+ invalid_chapters_str = "\n".join(f"- {ch}" for ch in invalid_chapters)
508
+ error_msg = WebSearcherPrompts.CHAPTER_ERROR_HALLUCINATION.format(
509
+ invalid_chapters=invalid_chapters_str
510
+ )
511
+
512
+ messages.append({"role": "user", "content": error_msg})
513
+ continue
514
+
515
+ else: # result["status"] == "parse_error"
516
+ # 情况 (3): 解析失败(格式不对)
517
+ self.logger.warning(f"⚠️ LLM output format incorrect")
518
+
519
+ # 使用错误提示模板
520
+ error_msg = WebSearcherPrompts.CHAPTER_ERROR_FORMAT
521
+
522
+ messages.append({"role": "user", "content": error_msg})
523
+ continue
524
+
525
+ except Exception as e:
526
+ self.logger.error(f"Chapter selection failed: {e}")
527
+
528
+ if attempt < MAX_RETRIES - 1:
529
+ messages.append({
530
+ "role": "user",
531
+ "content": "An error occurred. Please try again. Make sure to follow the output format exactly."
532
+ })
533
+ continue
534
+ else:
535
+ # 所有重试都失败,返回空列表
536
+ return []
537
+
538
+ # 超过最大重试次数,返回空列表(会触发全文处理)
539
+ self.logger.error(f"❌ Max retries ({MAX_RETRIES}) exceeded. Falling back to full text processing.")
540
+ return []
541
+
542
+ def _parse_chapter_selection(
543
+ self,
544
+ llm_output: str,
545
+ chapter_name_to_index: Dict[str, int]
546
+ ) -> Dict[str, Any]:
547
+ """
548
+ 解析 LLM 的章节选择输出
549
+
550
+ 返回:
551
+ {
552
+ "status": "success" | "hallucination" | "parse_error",
553
+ "selected_indices": List[int], # 如果成功
554
+ "invalid_chapters": List[str] # 如果有幻觉
555
+ }
556
+ """
557
+ # 查找分隔符
558
+ start_marker = "====章节选择===="
559
+ end_marker = "====章节选择结束===="
560
+
561
+ start_idx = llm_output.find(start_marker)
562
+ end_idx = llm_output.find(end_marker)
563
+
564
+ # 检查分隔符是否存在
565
+ if start_idx == -1 or end_idx == -1:
566
+ return {"status": "parse_error", "selected_indices": [], "invalid_chapters": []}
567
+
568
+ # 提取章节列表部分
569
+ start_idx += len(start_marker)
570
+ chapter_section = llm_output[start_idx:end_idx].strip()
571
+
572
+ # 按行分割
573
+ chapter_lines = [
574
+ line.strip()
575
+ for line in chapter_section.split('\n')
576
+ if line.strip()
577
+ ]
578
+
579
+ if not chapter_lines:
580
+ return {"status": "parse_error", "selected_indices": [], "invalid_chapters": []}
581
+
582
+ # 验证章节是否存在于 TOC 中
583
+ selected_indices = []
584
+ invalid_chapters = []
585
+
586
+ for chapter_name in chapter_lines:
587
+ if chapter_name in chapter_name_to_index:
588
+ selected_indices.append(chapter_name_to_index[chapter_name])
589
+ else:
590
+ invalid_chapters.append(chapter_name)
591
+
592
+ # 判断结果
593
+ if invalid_chapters:
594
+ # 有幻觉
595
+ return {
596
+ "status": "hallucination",
597
+ "selected_indices": selected_indices,
598
+ "invalid_chapters": invalid_chapters
599
+ }
600
+ elif selected_indices:
601
+ # 成功
602
+ return {
603
+ "status": "success",
604
+ "selected_indices": selected_indices,
605
+ "invalid_chapters": []
606
+ }
607
+ else:
608
+ # 没有选中任何章节(也可能是格式错误)
609
+ return {
610
+ "status": "parse_error",
611
+ "selected_indices": [],
612
+ "invalid_chapters": []
613
+ }
614
+
615
+ def _split_by_paragraph_boundaries(
616
+ self,
617
+ text: str,
618
+ threshold: int
619
+ ) -> List[str]:
620
+ """
621
+ 按段落边界将文本分段,每段 ≤ threshold
622
+
623
+ 策略:
624
+ 1. 按 \\n\\n 分割段落
625
+ 2. 逐步添加段落,直到接近阈值
626
+ 3. 在最近的双换行处断开
627
+ 4. 超长段落按句子(。)细分
628
+ """
629
+ if len(text) <= threshold:
630
+ return [text]
631
+
632
+ # 按双换行分段
633
+ paragraphs = text.split('\n\n')
634
+ chunks = []
635
+ current_chunk = ""
636
+
637
+ for para in paragraphs:
638
+ test_chunk = current_chunk + ("\n\n" if current_chunk else "") + para
639
+
640
+ if len(test_chunk) <= threshold:
641
+ current_chunk = test_chunk
642
+ else:
643
+ # 当前段落会超出阈值
644
+ if current_chunk:
645
+ chunks.append(current_chunk)
646
+
647
+ # 如果单个段落就超过阈值,强制在中间断开
648
+ if len(para) > threshold:
649
+ # 按句子分割
650
+ sentences = para.split('。')
651
+ temp_chunk = ""
652
+ for sent in sentences:
653
+ test_sent = temp_chunk + ('。' if temp_chunk else '') + sent
654
+ if len(test_sent) <= threshold:
655
+ temp_chunk = test_sent
656
+ else:
657
+ if temp_chunk:
658
+ chunks.append(temp_chunk)
659
+ temp_chunk = sent
660
+ current_chunk = temp_chunk
661
+ else:
662
+ current_chunk = para
663
+
664
+ if current_chunk:
665
+ chunks.append(current_chunk)
666
+
667
+ return chunks
668
+
669
+ # ==========================================
670
+ # 4. 核心流式处理
671
+ # ==========================================
672
+
673
+ async def _process_batch(
674
+ self,
675
+ batch_text: str,
676
+ ctx: WebSearcherContext,
677
+ doc_title: str,
678
+ current_batch: int,
679
+ total_batches: int,
680
+ url
681
+ ) -> Dict[str, Any]:
682
+ """
683
+ 统一的批处理函数(带重试机制)
684
+
685
+ 参数:
686
+ batch_text: 当前批次文本
687
+ ctx: 搜索上下文
688
+ doc_title: 文档名称
689
+ current_batch: 当前批次(页码,从 1 开始)
690
+ total_batches: 总批次数(总页数)
691
+
692
+ 返回:
693
+ {
694
+ "heading_type": "answer" | "note" | "continue" | "skip_doc",
695
+ "content": str
696
+ }
697
+ """
698
+ # 计算进度百分比
699
+ progress_pct = int((current_batch / total_batches) * 100)
700
+
701
+ # 初始化消息列表
702
+ messages = [
703
+ {
704
+ "role": "user",
705
+ "content": WebSearcherPrompts.BATCH_PROCESSING.format(
706
+ question=ctx.purpose,
707
+ doc_title=doc_title,
708
+ current_batch=current_batch,
709
+ total_batches=total_batches,
710
+ progress_pct=progress_pct,
711
+ notebook=ctx.notebook,
712
+ batch_text=batch_text,
713
+ url=url
714
+ )
715
+ }
716
+ ]
717
+
718
+ # 最大重试次数
719
+ MAX_RETRIES = 5
720
+
721
+ for attempt in range(MAX_RETRIES):
722
+ try:
723
+ # 调用 LLM
724
+ response = await self.cerebellum.backend.think(messages=messages)
725
+ reply = response.get('reply', '').strip()
726
+
727
+ self.logger.debug(f"Batch processing attempt {attempt + 1}:\n{reply}")
728
+
729
+ # 将 LLM 的回复作为 assistant 消息加入历史
730
+ messages.append({"role": "assistant", "content": reply})
731
+
732
+ # 解析输出
733
+ result = self._parse_batch_output(reply)
734
+
735
+ if result["status"] == "success":
736
+ # 成功
737
+ heading_type = result["heading_type"]
738
+ content = result["content"]
739
+
740
+ # 根据类型记录日志
741
+ if heading_type == "answer":
742
+ self.logger.info(f"✅ Found answer in batch")
743
+ elif heading_type == "note":
744
+ self.logger.info(f"📝 Found useful info in batch")
745
+ elif heading_type == "continue":
746
+ self.logger.debug(f"👀 No new info, continuing")
747
+ else: # skip_doc
748
+ self.logger.warning(f"🚫 Document irrelevant, skipping")
749
+
750
+ return {
751
+ "heading_type": heading_type,
752
+ "content": content
753
+ }
754
+
755
+ else: # result["status"] == "parse_error"
756
+ # 格式错误
757
+ self.logger.warning(f"⚠️ LLM output format incorrect")
758
+
759
+ error_msg = WebSearcherPrompts.BATCH_ERROR_FORMAT
760
+ messages.append({"role": "user", "content": error_msg})
761
+ continue
762
+
763
+ except Exception as e:
764
+ self.logger.error(f"Batch processing failed: {e}")
765
+
766
+ if attempt < MAX_RETRIES - 1:
767
+ messages.append({
768
+ "role": "user",
769
+ "content": "An error occurred. Please try again. Make sure to start with one of the four headings."
770
+ })
771
+ continue
772
+ else:
773
+ # 所有重试都失败,返回默认值
774
+ return {"heading_type": "continue", "content": ""}
775
+
776
+ # 超过最大重试次数,返回默认值(继续阅读)
777
+ self.logger.error(f"❌ Max retries ({MAX_RETRIES}) exceeded. Defaulting to 'continue'.")
778
+ return {"heading_type": "continue", "content": ""}
779
+
780
+ def _parse_batch_output(self, llm_output: str) -> Dict[str, Any]:
781
+ """
782
+ 解析 LLM 的批处理输出
783
+
784
+ 返回:
785
+ {
786
+ "status": "success" | "parse_error",
787
+ "heading_type": "answer" | "note" | "continue" | "skip_doc",
788
+ "content": str
789
+ }
790
+ """
791
+ # 定义四种标题
792
+ HEADINGS = {
793
+ "##对问题的回答": "answer",
794
+ "##值得记录的笔记": "note",
795
+ "##没有值得记录的笔记继续阅读": "continue",
796
+ "##完全不相关的文档应该放弃": "skip_doc"
797
+ }
798
+
799
+ # 检查输出以哪个标题开头
800
+ heading_type = None
801
+ heading_used = None
802
+
803
+ for heading, htype in HEADINGS.items():
804
+ if llm_output.startswith(heading):
805
+ heading_type = htype
806
+ heading_used = heading
807
+ break
808
+
809
+ if heading_type is None:
810
+ # 没有找到任何标题
811
+ return {"status": "parse_error", "heading_type": None, "content": ""}
812
+
813
+ # 提取标题下面的内容
814
+ content_start = len(heading_used)
815
+ content = llm_output[content_start:].strip()
816
+
817
+ # 如果内容为空,也算解析错误
818
+ if not content:
819
+ return {"status": "parse_error", "heading_type": None, "content": ""}
820
+
821
+ # 成功
822
+ return {
823
+ "status": "success",
824
+ "heading_type": heading_type,
825
+ "content": content
826
+ }
827
+
828
+ def _extract_document_title(self, markdown: str) -> str:
829
+ """
830
+ 从 Markdown 中提取文档标题
831
+ 优先级:第一个 # 标题 > 前 50 字符 > "未命名文档"
832
+ """
833
+ # 1. 尝试找到第一个 # 标题
834
+ lines = markdown.split('\n')
835
+ for line in lines:
836
+ if line.startswith('# '):
837
+ return line[2:].strip()
838
+
839
+ # 2. 如果没有标题,使用前 50 字符作为标题
840
+ if len(markdown) > 50:
841
+ return markdown[:50].strip()
842
+
843
+ # 3. 默认标题
844
+ return "未命名文档"
845
+
846
+ async def _stream_process_markdown(
847
+ self,
848
+ markdown: str,
849
+ ctx: WebSearcherContext,
850
+ url: str
851
+ ) -> Optional[str]:
852
+ """
853
+ 流式处理 Markdown 文档(统一入口)
854
+
855
+ 流程:
856
+ 1. 判断长度 → 决定是否需要选章节
857
+ 2. 准备待处理内容(全文 OR 选中章节)
858
+ 3. 按段落边界分成批次
859
+ 4. 逐批流式处理
860
+ """
861
+ # 1. 判断长度
862
+ is_long = len(markdown) > ctx.chunk_threshold
863
+
864
+ # 2. 准备待处理内容
865
+ if not is_long:
866
+ # 短文档:全文处理
867
+ self.logger.info(f"📄 Short document ({len(markdown)} chars). Processing full text.")
868
+ content_to_process = markdown
869
+ else:
870
+ # 长文档:生成目录 → 选择章节
871
+ self.logger.info(f"📚 Long document ({len(markdown)} chars). Generating TOC...")
872
+ toc = self._generate_document_toc(markdown)
873
+
874
+ if not toc or len(toc)<2:
875
+ # 无标题结构或者只有一个标题,全文处理
876
+ self.logger.info("📋 No headers found. Processing full text.")
877
+ content_to_process = markdown
878
+ else:
879
+ # 让 LLM 选择章节
880
+ self.logger.info(f"📑 Found {len(toc)} chapters. Asking LLM to select...")
881
+ selected_indices = await self._let_llm_select_chapters(toc, ctx)
882
+
883
+ if not selected_indices:
884
+ self.logger.warning("⚠️ No chapters selected. Processing full text.")
885
+ content_to_process = markdown
886
+ else:
887
+ self.logger.info(f"✅ Selected {len(selected_indices)} chapters")
888
+ # 提取选中章节
889
+ selected_parts = []
890
+ for idx in selected_indices:
891
+ chapter = toc[idx]
892
+ content = markdown[chapter["start"]:chapter["end"]]
893
+ selected_parts.append(f"# {chapter['title']}\n\n{content}")
894
+ content_to_process = "\n\n".join(selected_parts)
895
+
896
+ # 3. 按段落边界分成批次
897
+ self.logger.info(f"🔪 Splitting content into batches (max {ctx.chunk_threshold} chars each)...")
898
+ batches = self._split_by_paragraph_boundaries(content_to_process, ctx.chunk_threshold)
899
+ total_batches = len(batches)
900
+ self.logger.info(f"📊 Split into {total_batches} batches")
901
+
902
+ # 4. 获取文档标题(用于 LLM 上下文)
903
+ doc_title = self._extract_document_title(content_to_process)
904
+
905
+ # 5. 逐批流式处理
906
+ for i, batch in enumerate(batches, start=1): # 从 1 开始计数
907
+ current_batch = i
908
+ progress_pct = int((current_batch / total_batches) * 100)
909
+ self.logger.info(
910
+ f"🔄 Processing batch {current_batch}/{total_batches} "
911
+ f"({progress_pct}%, {len(batch)} chars)..."
912
+ )
913
+
914
+ # 统一的批处理(传入进度信息)
915
+ result = await self._process_batch(
916
+ batch,
917
+ ctx,
918
+ doc_title=doc_title,
919
+ current_batch=current_batch,
920
+ total_batches=total_batches,
921
+ url=url
922
+ )
923
+
924
+ # 处理结果
925
+ if result["heading_type"] == "answer":
926
+ # 找到答案,立即返回
927
+ self.logger.info(f"✅ Answer found in batch {current_batch}!")
928
+ return result["content"]
929
+
930
+ elif result["heading_type"] == "note":
931
+ # 有用信息,添加到小本本
932
+ ctx.add_to_notebook(f"[Batch {current_batch}] {result['content']}")
933
+ self.logger.info(f"📝 Added useful info from batch {current_batch}")
934
+
935
+ elif result["heading_type"] == "skip_doc":
936
+ # 文档不相关,放弃整个文档
937
+ self.logger.warning(f"🚫 Document irrelevant. Skipping rest of document.")
938
+ break
939
+
940
+ # heading_type == "continue": 什么都不做,继续下一批
941
+
942
+ # 未找到答案
943
+ return None
944
+
945
+ async def _run_search_lifecycle(self, session: TabSession, ctx: WebSearcherContext) -> Optional[str]:
946
+ """
947
+ [The Core Loop] 搜索生命周期
948
+ 核心逻辑:访问页面 → 尝试回答问题 → 不能回答则记录信息 → 继续探索
949
+ """
950
+ while not ctx.is_time_up():
951
+ # --- Phase 1: Navigation ---
952
+ if not session.pending_link_queue:
953
+ self.logger.info("Queue empty. Ending search.")
954
+ break
955
+
956
+ next_url = session.pending_link_queue.popleft()
957
+ self.logger.info(f"🔗 Navigating to: {next_url}")
958
+
959
+ # 1.1 门禁检查
960
+ if ctx.has_visited(next_url) or any(bl in next_url for bl in ctx.blacklist):
961
+ continue
962
+
963
+ # 1.2 导航到页面
964
+ nav_report = await self.browser.navigate(session.handle, next_url)
965
+ final_url = self.browser.get_tab_url(session.handle)
966
+ session.current_url = final_url
967
+
968
+ ctx.mark_visited(next_url)
969
+ ctx.mark_visited(final_url)
970
+
971
+ # 1.3 二次黑名单检查
972
+ if any(bl in final_url for bl in ctx.blacklist):
973
+ self.logger.warning(f"🚫 Redirected to blacklisted URL: {final_url}")
974
+ continue
975
+
976
+ # === Phase 2: Identify Page Type ===
977
+ page_type = await self.browser.analyze_page_type(session.handle)
978
+
979
+ if page_type == PageType.ERRO_PAGE:
980
+ self.logger.warning(f"🚫 Error Page: {final_url}")
981
+ continue
982
+
983
+ # === 分支 A: 静态资源 ===
984
+ if page_type == PageType.STATIC_ASSET:
985
+ self.logger.info(f"📄 Static Asset: {final_url}")
986
+
987
+ # 获取完整 Markdown 并流式处理
988
+ markdown = await self._get_full_page_markdown(session.handle, ctx)
989
+ answer = await self._stream_process_markdown(markdown, ctx,final_url)
990
+
991
+ if answer:
992
+ return answer # 找到答案,直接返回
993
+
994
+ continue # 继续处理下一个 URL
995
+
996
+ # === 分支 B: 交互式网页 ===
997
+ elif page_type == PageType.NAVIGABLE:
998
+ self.logger.debug("🌐 Navigable Page. Entering processing loop.")
999
+ page_active = True
1000
+ page_changed = True
1001
+
1002
+ while page_active and not ctx.is_time_up():
1003
+ # 1. Stabilize (滚动加载)
1004
+ if page_changed:
1005
+ await self.browser.stabilize(session.handle)
1006
+
1007
+ # 2. 获取完整 Markdown 并流式处理
1008
+ markdown = await self._get_full_page_markdown(session.handle, ctx)
1009
+ answer = await self._stream_process_markdown(markdown, ctx,final_url)
1010
+
1011
+ # 3. 如果找到答案,直接返回
1012
+ if answer:
1013
+ return answer
1014
+
1015
+ # 4. 生成页面摘要(用于后续链接筛选)
1016
+ page_summary = markdown[:500] if markdown else ""
1017
+
1018
+ # === Phase 4: Scouting ===
1019
+ links, buttons = await self.browser.scan_elements(session.handle)
1020
+ self.logger.debug(f"🔍 Found {len(links)} links and {len(buttons)} buttons")
1021
+
1022
+ # 4.1 处理 Links
1023
+ if page_changed:
1024
+ filtered_links = {}
1025
+ for link in links:
1026
+ if ctx.has_link_assessed(link):
1027
+ continue
1028
+ if ctx.has_visited(link):
1029
+ continue
1030
+ if link in session.pending_link_queue:
1031
+ continue
1032
+ if any(bl in link for bl in ctx.blacklist):
1033
+ continue
1034
+ filtered_links[link] = links[link]
1035
+
1036
+ # 评估链接相关性
1037
+ selected_links = await self._filter_relevant_links(filtered_links, page_summary, ctx)
1038
+
1039
+ # 标记已评估
1040
+ for link in filtered_links:
1041
+ ctx.mark_link_assessed(link)
1042
+
1043
+ # 添加到队列
1044
+ new_links_count = 0
1045
+ for link in selected_links:
1046
+ session.pending_link_queue.append(link)
1047
+ new_links_count += 1
1048
+ self.logger.info(f"👀 Added {new_links_count} links to queue")
1049
+
1050
+ # 4.2 处理 Buttons
1051
+ candidate_buttons = []
1052
+ for button_text in buttons:
1053
+ if not ctx.has_button_assessed(session.current_url, button_text):
1054
+ candidate_buttons.append({button_text: buttons[button_text]})
1055
+
1056
+ # === Phase 5: Execution ===
1057
+ if not candidate_buttons:
1058
+ self.logger.info("🤔 No worthy buttons. Moving to next page.")
1059
+ page_active = False
1060
+ continue
1061
+
1062
+ chosen_button = await self._choose_best_interaction(candidate_buttons, page_summary, ctx)
1063
+
1064
+ # 标记已评估
1065
+ assessed_button_texts = [list(btn.keys())[0] for btn in candidate_buttons]
1066
+ ctx.mark_buttons_assessed(session.current_url, assessed_button_texts)
1067
+
1068
+ if not chosen_button:
1069
+ self.logger.info("🤔 No worthy buttons. Moving to next page.")
1070
+ page_active = False
1071
+ continue
1072
+
1073
+ # 执行点击
1074
+ self.logger.info(f"🖱️ Clicking: [{chosen_button.get_text()}]")
1075
+ ctx.mark_interacted(session.current_url, chosen_button.get_text())
1076
+
1077
+ report = await self.browser.click_and_observe(session.handle, chosen_button)
1078
+
1079
+ # 5.1 处理新 Tab
1080
+ if report.new_tabs:
1081
+ self.logger.info(f"✨ New Tab(s): {len(report.new_tabs)}")
1082
+ for new_tab_handle in report.new_tabs:
1083
+ new_session = TabSession(handle=new_tab_handle, current_url="", depth=session.depth + 1)
1084
+ answer = await self._run_search_lifecycle(new_session, ctx)
1085
+ if answer: # 如果在递归中找到答案,向上传递
1086
+ return answer
1087
+ await self.browser.close_tab(new_tab_handle)
1088
+
1089
+ # 5.2 处理页面变动
1090
+ if report.is_dom_changed or report.is_url_changed:
1091
+ self.logger.info("🔄 Page changed. Re-assessing.")
1092
+ page_changed = True
1093
+ if report.is_url_changed:
1094
+ session.current_url = self.browser.get_tab_url(session.handle)
1095
+ continue
1096
+
1097
+ # 5.3 无变化
1098
+ page_changed = False
1099
+ continue
1100
+
1101
+ # 未找到答案
1102
+ return None
1103
+
1104
+ # ==========================================
1105
+ # 3. Cerebellum decision helpers
1106
+ # ==========================================
1107
+