matrix-for-agents 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentmatrix/__init__.py +20 -0
- agentmatrix/agents/__init__.py +1 -0
- agentmatrix/agents/base.py +572 -0
- agentmatrix/agents/claude_coder.py +10 -0
- agentmatrix/agents/data_crawler.py +14 -0
- agentmatrix/agents/post_office.py +212 -0
- agentmatrix/agents/report_writer.py +14 -0
- agentmatrix/agents/secretary.py +10 -0
- agentmatrix/agents/stateful.py +10 -0
- agentmatrix/agents/user_proxy.py +82 -0
- agentmatrix/agents/worker.py +30 -0
- agentmatrix/backends/__init__.py +1 -0
- agentmatrix/backends/llm_client.py +414 -0
- agentmatrix/backends/mock_llm.py +35 -0
- agentmatrix/cli_runner.py +94 -0
- agentmatrix/core/__init__.py +0 -0
- agentmatrix/core/action.py +50 -0
- agentmatrix/core/browser/bing.py +208 -0
- agentmatrix/core/browser/browser_adapter.py +298 -0
- agentmatrix/core/browser/browser_common.py +85 -0
- agentmatrix/core/browser/drission_page_adapter.py +1296 -0
- agentmatrix/core/browser/google.py +230 -0
- agentmatrix/core/cerebellum.py +121 -0
- agentmatrix/core/events.py +22 -0
- agentmatrix/core/loader.py +185 -0
- agentmatrix/core/loader_v1.py +146 -0
- agentmatrix/core/log_util.py +158 -0
- agentmatrix/core/message.py +32 -0
- agentmatrix/core/prompt_engine.py +30 -0
- agentmatrix/core/runtime.py +211 -0
- agentmatrix/core/session.py +20 -0
- agentmatrix/db/__init__.py +1 -0
- agentmatrix/db/database.py +79 -0
- agentmatrix/db/vector_db.py +213 -0
- agentmatrix/docs/Design.md +109 -0
- agentmatrix/docs/Framework Capbilities.md +105 -0
- agentmatrix/docs/Planner Design.md +148 -0
- agentmatrix/docs/crawler_flow.md +110 -0
- agentmatrix/docs/report_writer.md +83 -0
- agentmatrix/docs/review.md +99 -0
- agentmatrix/docs/skill_design.md +23 -0
- agentmatrix/profiles/claude_coder.yml +40 -0
- agentmatrix/profiles/mark.yml +26 -0
- agentmatrix/profiles/planner.yml +21 -0
- agentmatrix/profiles/prompts/base.txt +88 -0
- agentmatrix/profiles/prompts/base_v1.txt +101 -0
- agentmatrix/profiles/prompts/base_v2.txt +94 -0
- agentmatrix/profiles/tom_the_data_crawler.yml +38 -0
- agentmatrix/profiles/user_proxy.yml +17 -0
- agentmatrix/skills/__init__.py +1 -0
- agentmatrix/skills/crawler_helpers.py +315 -0
- agentmatrix/skills/data_crawler.py +777 -0
- agentmatrix/skills/filesystem.py +204 -0
- agentmatrix/skills/notebook.py +158 -0
- agentmatrix/skills/project_management.py +114 -0
- agentmatrix/skills/report_writer.py +194 -0
- agentmatrix/skills/report_writer_utils.py +379 -0
- agentmatrix/skills/search_tool.py +383 -0
- agentmatrix/skills/terminal_ctrl.py +122 -0
- agentmatrix/skills/utils.py +33 -0
- agentmatrix/skills/web_searcher.py +1107 -0
- matrix_for_agents-0.1.2.dist-info/METADATA +44 -0
- matrix_for_agents-0.1.2.dist-info/RECORD +66 -0
- matrix_for_agents-0.1.2.dist-info/WHEEL +5 -0
- matrix_for_agents-0.1.2.dist-info/licenses/LICENSE +190 -0
- matrix_for_agents-0.1.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1107 @@
import asyncio
import time
import os
import json
import textwrap
import re
from typing import List, Set, Dict, Optional, Any, Deque
from collections import deque
from dataclasses import dataclass, field

from ..core.browser.google import search_google
from ..core.browser.bing import search_bing
from ..core.browser.browser_adapter import (
    BrowserAdapter, TabHandle, PageElement, PageSnapshot, PageType
)
from ..core.browser.browser_common import TabSession, BaseCrawlerContext
from ..skills.crawler_helpers import CrawlerHelperMixin
from ..core.browser.drission_page_adapter import DrissionPageAdapter
from ..core.action import register_action

search_func = search_google
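# search_bing (imported above) can be swapped in here to change the default engine.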

# ==========================================
# Centralized prompt management
# ==========================================

class WebSearcherPrompts:
    """Centralized management of Web Searcher prompts."""

    # ==========================================
    # 1. Chapter selection
    # ==========================================

    CHAPTER_SELECTION = """You are searching for information to answer: "{question}"

Below is the table of contents for a document:

{toc_list}

[Task]
Select the chapters that are MOST LIKELY to contain information relevant to answering the question.

[Rules]
1. You can select multiple chapters
2. Be conservative - only select chapters that seem directly relevant
3. If unsure, you can select multiple chapters to be safe

[Output Format]

First, explain your reasoning (why you selected these chapters).

Then, output your selections using the following format:

====章节选择====
你选择的章节名称1(replace with your choice)
你选择的章节名称2(replace with your choice)
...
====章节选择结束====

One chapter name per line. The chapter names must EXACTLY match the names shown in the TOC above."""

    CHAPTER_ERROR_HALLUCINATION = """Your selection contains chapters that don't exist in the TOC:

Invalid chapters:
{invalid_chapters}

Please select ONLY from the available chapters listed in the TOC. Try again."""

    CHAPTER_ERROR_FORMAT = """Your output format is incorrect.

Please use this EXACT format:

====章节选择====
章节名称1
章节名称2
====章节选择结束====

Make sure:
1. The markers are EXACTLY '====章节选择====' and '====章节选择结束===='
2. One chapter name per line
3. Chapter names EXACTLY match the TOC

Try again."""

    # ==========================================
    # 2. Batch processing
    # ==========================================

    BATCH_PROCESSING = """You are reading a document to answer: "{question}"

[Document Info]
- Title: {doc_title}
- Source URL: {url}
- Progress: Page {current_batch} of {total_batches} ({progress_pct}% complete)

[Notebook - What We Already Know]
{notebook}

[Current Page Content - Page {current_batch}]
{batch_text}

[Task]
Based on the Notebook, Current Page, AND your reading progress, provide a brief summary.

Consider your progress:
- If you're early in the document (first 20%), keep exploring even if this page is weak
- If you're late in the document (last 30%) and found nothing useful, consider skipping
- If you're in the middle, continue unless the content is completely irrelevant

Your response MUST start with ONE of these four headings:

##对问题的回答
If you can provide a clear, complete answer based on the Notebook and Current Page:
- Use this heading
- Provide your answer below
- Keep it concise but complete
- Keep key references (urls) for key information

##值得记录的笔记
If you cannot answer yet, but found NEW and USEFUL information:
- Use this heading
- Provide a concise summary (2-5 sentences)
- Focus on facts, data, definitions, explanations
- Only extract information NOT already in Notebook
- Always include the source URL

##没有值得记录的笔记继续阅读
If the page doesn't contain new or useful information, but the document still shows promise:
- Use this heading
- Briefly explain why (1 sentence)
- Consider: If you're late in the document (>70%), you might want to skip

##完全不相关的文档应该放弃
If the page is completely irrelevant to the question (navigation, ads, unrelated topics):
- Use this heading
- Explain why (1 sentence)
- Skip the rest of this document
- Especially consider this if you're already deep into the document (>50%) and found nothing useful

[Output Format]

##对问题的回答 (or one of the other three headings)

Your content here...

[Important]
- Start with ONE of the four headings above (EXACTLY as shown)
- Provide your content below the heading
- Consider your reading progress when deciding whether to continue or skip"""

    BATCH_ERROR_FORMAT = """Your output format is incorrect.

Please start your response with ONE of these four headings (EXACTLY as shown):

##对问题的回答
##值得记录的笔记
##没有值得记录的笔记继续阅读
##完全不相关的文档应该放弃

Then provide your content below the heading.

Examples:

Example 1 (can answer):
##对问题的回答
Python装饰器是一种...

Example 2 (useful info):
##值得记录的笔记
装饰器使用@符号语法...

Example 3 (no new info):
##没有值得记录的笔记继续阅读
这段内容介绍了网站导航,但没有新的有用信息。

Example 4 (irrelevant):
##完全不相关的文档应该放弃
这是一段购物网站的广告内容,完全与装饰器无关。

Try again."""
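
# A minimal sketch of how these templates are rendered (illustrative values;
# the real call sites are _let_llm_select_chapters and _process_batch below):
#
#   prompt = WebSearcherPrompts.CHAPTER_SELECTION.format(
#       question="What is a Python decorator?",
#       toc_list="Introduction\n  Syntax\n  Examples",
#   )
#   messages = [{"role": "user", "content": prompt}]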


# ==========================================
# 1. State and context definitions
# ==========================================

class WebSearcherContext(BaseCrawlerContext):
    """
    Context for a web search task.
    Holds the state of a question-answering search, including a "notebook"
    that accumulates useful information along the way.
    """

    def __init__(self, purpose: str, deadline: float, chunk_threshold: int = 5000,
                 temp_file_dir: Optional[str] = None):
        super().__init__(deadline)
        self.purpose = purpose  # renamed: question -> purpose
        self.notebook = ""
        self.chunk_threshold = chunk_threshold
        self.temp_file_dir = temp_file_dir

    def add_to_notebook(self, info: str):
        """Append a timestamped entry to the notebook."""
        if info:
            timestamp = time.strftime("%H:%M:%S")
            self.notebook += f"\n\n[{timestamp}] {info}\n"
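    # e.g. (illustrative): ctx.add_to_notebook("Decorators wrap callables.")
    # appends "\n\n[HH:MM:SS] Decorators wrap callables.\n" to ctx.notebook.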


# ==========================================
# 2. Web Searcher core logic
# ==========================================

class WebSearcherMixin(CrawlerHelperMixin):
    """
    Web searcher skill.
    Searches the web to answer a question.
    """

    @register_action(
        "针对一个问题上网搜索答案,提供要解决的问题和(可选)搜索关键字词",
        param_infos={
            "purpose": "要回答的问题(或研究目标)",
            "search_phrase": "可选,初始搜索关键词",
            "max_time": "可选,最大搜索分钟,默认20",
            "max_search_pages": "可选,最大搜索页数(默认5)",
        }
    )
    async def web_search(
        self,
        purpose: str,
        search_phrase: Optional[str] = None,
        max_time: int = 20,
        max_search_pages: int = 5,
        temp_file_dir: Optional[str] = None
    ):
        """
        [Entry Point] Search the web to answer a question (streaming version).

        Args:
            purpose: the question to answer (or research goal)
            search_phrase: initial search keywords
            max_time: maximum search time in minutes
            max_search_pages: maximum number of search result pages (default 5)
            temp_file_dir: directory for temporary files (optional, for debugging)

        The chunking threshold (in characters) is currently fixed at 5000 below.
        """
        # 1. Prepare the environment
        profile_path = os.path.join(self.workspace_root, ".matrix", "browser_profile", self.name)
        download_path = os.path.join(self.current_workspace, "downloads")
        chunk_threshold = 5000

        if not search_phrase:
            resp = await self.brain.think(f"""
现在我们要研究个新问题:{purpose},打算上网搜索一下,需要你设计一下最合适的关键词或者关键字组合。输出的时候可以先简单解释一下这么设计的理由,但是最后一行必须是也只能是要搜索的内容(也就是输入到搜索引擎搜索栏的内容)。例如你认为应该搜索"Keyword",那么最后一行就只能是"Keyword"
""")
            reply = resp['reply']
            # Take the last line of the reply as the search phrase
            if '\n' in reply:
                search_phrase = reply.split('\n')[-1].strip()
            # If that still failed, fall back to searching the question itself
            if not search_phrase:
                search_phrase = purpose
            self.logger.info(f"🔍 准备搜索: {search_phrase}")

        self.browser = DrissionPageAdapter(
            profile_path=profile_path,
            download_path=download_path
        )

        ctx = WebSearcherContext(
            purpose=purpose,
            deadline=time.time() + int(max_time) * 60,
            chunk_threshold=chunk_threshold,
            temp_file_dir=temp_file_dir
        )

        self.logger.info(f"🔍 Web Search Start: {purpose}")
        self.logger.info(f"🔍 Initial search phrase: {search_phrase}")
        self.logger.info(f"🔍 Max search pages: {max_search_pages}")

        # 2. Launch the browser
        await self.browser.start(headless=False)

        try:
            # 3. Create the tab and session
            tab = await self.browser.get_tab()
            session = TabSession(handle=tab, current_url="")

            # 4. Outer loop: process search result pages one by one
            for page_num in range(1, max_search_pages + 1):
                self.logger.info(f"\n{'='*60}")
                self.logger.info(f"🔍 Fetching search results page {page_num}/{max_search_pages}")
                self.logger.info(f"{'='*60}\n")

                # 4.1 Fetch the search results for page page_num
                search_result = await search_func(
                    self.browser,
                    tab,
                    search_phrase,
                    max_pages=max_search_pages,
                    page=page_num  # fetch only page page_num
                )

                if not search_result:
                    self.logger.warning(f"⚠️ No results found on page {page_num}")
                    break

                # 4.2 Add new URLs to pending_link_queue
                added_count = 0
                for result in search_result:
                    url = result['url']
                    if not ctx.has_visited(url):
                        session.pending_link_queue.append(url)
                        added_count += 1

                self.logger.info(f"✓ Added {added_count} URLs from page {page_num} to queue")

                # 4.3 Run _run_search_lifecycle over the queued URLs
                self.logger.info(f"\n🌐 Processing URLs from page {page_num}...")
                answer = await self._run_search_lifecycle(session, ctx)

                # 4.4 If an answer was found, return early
                if answer:
                    self.logger.info(f"✅ Found answer on page {page_num}!")
                    return f"Answer: {answer}\n\n---\nNotebook:\n{ctx.notebook}"

                # 4.5 Check time and resource limits
                if ctx.is_time_up():
                    self.logger.info("⏰ Time up!")
                    break

                self.logger.info(f"✓ Completed page {page_num}, continuing to next page...")

            # 5. No answer found; return the notebook
            self.logger.info("⏸ Exhausted all search pages without finding complete answer")
            return f"Could not find a complete answer.\n\nHere's what I found:\n{ctx.notebook}"

        except Exception as e:
            self.logger.exception("Web searcher crashed")
            return f"Search failed with error: {e}"
        finally:
            self.logger.info("🛑 Closing browser...")
            await self.browser.close()
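    # A minimal usage sketch (illustrative; assumes an agent class that mixes
    # this in and provides workspace_root, current_workspace, brain and logger;
    # "BaseAgent" here is a hypothetical stand-in for such a base class):
    #
    #   class Crawler(WebSearcherMixin, BaseAgent):
    #       ...
    #   result = await crawler.web_search(
    #       purpose="What is a Python decorator?",
    #       max_time=10,
    #       max_search_pages=3,
    #   )
    #   # -> "Answer: ..." on success, otherwise the accumulated notebook text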

    # ==========================================
    # 2. Fetching full page content
    # ==========================================

    async def _get_full_page_markdown(self, tab: TabHandle, ctx: WebSearcherContext) -> str:
        """
        Get the full page as Markdown, with no character limit.
        - HTML: extract full Markdown with trafilatura
        - PDF: convert the whole document with pdf_to_markdown
        """
        content_type = await self.browser.analyze_page_type(tab)

        if content_type == PageType.STATIC_ASSET:
            return await self._pdf_to_full_markdown(tab, ctx)
        else:
            return await self._html_to_full_markdown(tab)

    async def _html_to_full_markdown(self, tab: TabHandle) -> str:
        """Convert an HTML page to full Markdown."""
        import trafilatura

        raw_html = tab.html
        url = self.browser.get_tab_url(tab)

        # Extract full Markdown with trafilatura
        markdown = trafilatura.extract(
            raw_html,
            include_links=True,
            include_formatting=True,
            output_format='markdown',
            url=url
        )

        # Fallback
        if not markdown or len(markdown) < 50:
            markdown = tab.text

        return markdown or ""

    async def _pdf_to_full_markdown(self, tab: TabHandle, ctx: WebSearcherContext) -> str:
        """Convert a PDF to full Markdown (standalone implementation, easy to optimize later)."""
        from ..skills.report_writer_utils import pdf_to_markdown

        # Download the PDF locally
        pdf_path = await self.browser.save_static_asset(tab)

        # Convert the whole PDF to Markdown
        markdown = pdf_to_markdown(pdf_path)

        # Optional: save to a temp file (for debugging)
        if ctx.temp_file_dir:
            from slugify import slugify
            os.makedirs(ctx.temp_file_dir, exist_ok=True)
            filename = slugify(f"pdf_{os.path.basename(pdf_path)}") + ".md"
            temp_path = os.path.join(ctx.temp_file_dir, filename)
            with open(temp_path, "w", encoding="utf-8") as f:
                f.write(markdown)
            self.logger.info(f"📄 Saved markdown to: {temp_path}")

        return markdown

    # ==========================================
    # 3. Helpers (TOC, chapter selection, chunking)
    # ==========================================

    def _generate_document_toc(self, markdown: str) -> List[Dict[str, Any]]:
        """
        Extract the table-of-contents structure from Markdown.
        Returns: [
            {"level": 1, "title": "第一章", "start": 0, "end": 1234},
            {"level": 2, "title": "1.1 简介", "start": 1235, "end": 2345},
            ...
        ]
        """
        toc = []
        lines = markdown.split("\n")
        current_pos = 0

        for line in lines:
            # Match Markdown headings
            match = re.match(r'^(#{1,6})\s+(.+)$', line)
            if match:
                level = len(match.group(1))
                title = match.group(2).strip()
                toc.append({
                    "level": level,
                    "title": title,
                    "start": current_pos,
                    "line": line
                })

            current_pos += len(line) + 1  # +1 for the newline

        # Compute each chapter's end position
        for i in range(len(toc) - 1):
            toc[i]["end"] = toc[i + 1]["start"]
        if toc:
            toc[-1]["end"] = len(markdown)

        return toc
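    # e.g. (illustrative): _generate_document_toc("# A\nintro\n## B\nbody") ->
    #   [{"level": 1, "title": "A", "start": 0,  "end": 10, "line": "# A"},
    #    {"level": 2, "title": "B", "start": 10, "end": 19, "line": "## B"}]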

    async def _let_llm_select_chapters(
        self,
        toc: List[Dict],
        ctx: WebSearcherContext
    ) -> List[int]:
        """
        Let the LLM pick the chapters relevant to the question (with retries).
        Returns: list of selected chapter indices (0-based).
        """
        # Build the TOC listing (no numbering; keep indentation)
        toc_lines = []
        for chapter in toc:
            indent = " " * (chapter["level"] - 1)
            toc_lines.append(f"{indent}{chapter['title']}")
        toc_list = "\n".join(toc_lines)

        # Map chapter names to indices (for validation)
        chapter_name_to_index = {
            chapter["title"]: i
            for i, chapter in enumerate(toc)
        }

        # Use the prompt template
        initial_prompt = WebSearcherPrompts.CHAPTER_SELECTION.format(
            question=ctx.purpose,
            toc_list=toc_list
        )

        # Initialize the message list
        messages = [{"role": "user", "content": initial_prompt}]

        # Maximum number of retries
        MAX_RETRIES = 5

        for attempt in range(MAX_RETRIES):
            try:
                # Call the LLM
                response = await self.cerebellum.backend.think(messages=messages)
                reply = response.get('reply', '').strip()

                self.logger.debug(f"Chapter selection attempt {attempt + 1}:\n{reply}")

                # Append the LLM reply to the history as an assistant message
                messages.append({"role": "assistant", "content": reply})

                # Parse the output
                result = self._parse_chapter_selection(reply, chapter_name_to_index)

                if result["status"] == "success":
                    # Case (1): parsed successfully, and all chapters are real
                    selected_indices = result["selected_indices"]
                    self.logger.info(f"✅ Successfully selected {len(selected_indices)} chapters: {selected_indices}")
                    return selected_indices

                elif result["status"] == "hallucination":
                    # Case (2): parsed successfully, but some chapters are made up
                    invalid_chapters = result["invalid_chapters"]
                    self.logger.warning(f"⚠️ LLM hallucinated chapters: {invalid_chapters}")

                    # Use the error-prompt template
                    invalid_chapters_str = "\n".join(f"- {ch}" for ch in invalid_chapters)
                    error_msg = WebSearcherPrompts.CHAPTER_ERROR_HALLUCINATION.format(
                        invalid_chapters=invalid_chapters_str
                    )

                    messages.append({"role": "user", "content": error_msg})
                    continue

                else:  # result["status"] == "parse_error"
                    # Case (3): parsing failed (wrong format)
                    self.logger.warning("⚠️ LLM output format incorrect")

                    # Use the error-prompt template
                    error_msg = WebSearcherPrompts.CHAPTER_ERROR_FORMAT

                    messages.append({"role": "user", "content": error_msg})
                    continue

            except Exception as e:
                self.logger.error(f"Chapter selection failed: {e}")

                if attempt < MAX_RETRIES - 1:
                    messages.append({
                        "role": "user",
                        "content": "An error occurred. Please try again. Make sure to follow the output format exactly."
                    })
                    continue
                else:
                    # All retries failed; return an empty list
                    return []

        # Max retries exceeded; return an empty list (triggers full-text processing)
        self.logger.error(f"❌ Max retries ({MAX_RETRIES}) exceeded. Falling back to full text processing.")
        return []

    def _parse_chapter_selection(
        self,
        llm_output: str,
        chapter_name_to_index: Dict[str, int]
    ) -> Dict[str, Any]:
        """
        Parse the LLM's chapter-selection output.

        Returns:
        {
            "status": "success" | "hallucination" | "parse_error",
            "selected_indices": List[int],   # on success
            "invalid_chapters": List[str]    # on hallucination
        }
        """
        # Locate the markers
        start_marker = "====章节选择===="
        end_marker = "====章节选择结束===="

        start_idx = llm_output.find(start_marker)
        end_idx = llm_output.find(end_marker)

        # Check that both markers exist
        if start_idx == -1 or end_idx == -1:
            return {"status": "parse_error", "selected_indices": [], "invalid_chapters": []}

        # Extract the chapter-list section
        start_idx += len(start_marker)
        chapter_section = llm_output[start_idx:end_idx].strip()

        # Split into lines
        chapter_lines = [
            line.strip()
            for line in chapter_section.split('\n')
            if line.strip()
        ]

        if not chapter_lines:
            return {"status": "parse_error", "selected_indices": [], "invalid_chapters": []}

        # Validate that each chapter exists in the TOC
        selected_indices = []
        invalid_chapters = []

        for chapter_name in chapter_lines:
            if chapter_name in chapter_name_to_index:
                selected_indices.append(chapter_name_to_index[chapter_name])
            else:
                invalid_chapters.append(chapter_name)

        # Decide the outcome
        if invalid_chapters:
            # Hallucinated chapters present
            return {
                "status": "hallucination",
                "selected_indices": selected_indices,
                "invalid_chapters": invalid_chapters
            }
        elif selected_indices:
            # Success
            return {
                "status": "success",
                "selected_indices": selected_indices,
                "invalid_chapters": []
            }
        else:
            # No chapters selected (possibly a format error)
            return {
                "status": "parse_error",
                "selected_indices": [],
                "invalid_chapters": []
            }
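    # e.g. (illustrative): with chapter_name_to_index = {"Intro": 0, "Setup": 1},
    #   _parse_chapter_selection("====章节选择====\nSetup\n====章节选择结束====", ...)
    # returns {"status": "success", "selected_indices": [1], "invalid_chapters": []};
    # an unknown name such as "Appendix" would yield status "hallucination" instead.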

    def _split_by_paragraph_boundaries(
        self,
        text: str,
        threshold: int
    ) -> List[str]:
        """
        Split text at paragraph boundaries so that each chunk is ≤ threshold.

        Strategy:
        1. Split into paragraphs on \\n\\n
        2. Accumulate paragraphs until close to the threshold
        3. Break at the nearest double newline
        4. Split over-long paragraphs by sentence (。)
        """
        if len(text) <= threshold:
            return [text]

        # Split into paragraphs on double newlines
        paragraphs = text.split('\n\n')
        chunks = []
        current_chunk = ""

        for para in paragraphs:
            test_chunk = current_chunk + ("\n\n" if current_chunk else "") + para

            if len(test_chunk) <= threshold:
                current_chunk = test_chunk
            else:
                # This paragraph would exceed the threshold
                if current_chunk:
                    chunks.append(current_chunk)

                # If a single paragraph alone exceeds the threshold, force a split
                if len(para) > threshold:
                    # Split by sentence
                    sentences = para.split('。')
                    temp_chunk = ""
                    for sent in sentences:
                        test_sent = temp_chunk + ('。' if temp_chunk else '') + sent
                        if len(test_sent) <= threshold:
                            temp_chunk = test_sent
                        else:
                            if temp_chunk:
                                chunks.append(temp_chunk)
                            temp_chunk = sent
                    current_chunk = temp_chunk
                else:
                    current_chunk = para

        if current_chunk:
            chunks.append(current_chunk)

        return chunks
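    # e.g. (illustrative): _split_by_paragraph_boundaries("aaa\n\nbbb\n\nccc", 8)
    # -> ["aaa\n\nbbb", "ccc"]  (appending "ccc" would make 13 chars > 8)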

    # ==========================================
    # 4. Core streaming processing
    # ==========================================

    async def _process_batch(
        self,
        batch_text: str,
        ctx: WebSearcherContext,
        doc_title: str,
        current_batch: int,
        total_batches: int,
        url: str
    ) -> Dict[str, Any]:
        """
        Unified batch-processing function (with retries).

        Args:
            batch_text: text of the current batch
            ctx: search context
            doc_title: document title
            current_batch: current batch (page number, 1-based)
            total_batches: total number of batches (pages)

        Returns:
        {
            "heading_type": "answer" | "note" | "continue" | "skip_doc",
            "content": str
        }
        """
        # Compute the progress percentage
        progress_pct = int((current_batch / total_batches) * 100)

        # Initialize the message list
        messages = [
            {
                "role": "user",
                "content": WebSearcherPrompts.BATCH_PROCESSING.format(
                    question=ctx.purpose,
                    doc_title=doc_title,
                    current_batch=current_batch,
                    total_batches=total_batches,
                    progress_pct=progress_pct,
                    notebook=ctx.notebook,
                    batch_text=batch_text,
                    url=url
                )
            }
        ]

        # Maximum number of retries
        MAX_RETRIES = 5

        for attempt in range(MAX_RETRIES):
            try:
                # Call the LLM
                response = await self.cerebellum.backend.think(messages=messages)
                reply = response.get('reply', '').strip()

                self.logger.debug(f"Batch processing attempt {attempt + 1}:\n{reply}")

                # Append the LLM reply to the history as an assistant message
                messages.append({"role": "assistant", "content": reply})

                # Parse the output
                result = self._parse_batch_output(reply)

                if result["status"] == "success":
                    heading_type = result["heading_type"]
                    content = result["content"]

                    # Log according to the heading type
                    if heading_type == "answer":
                        self.logger.info("✅ Found answer in batch")
                    elif heading_type == "note":
                        self.logger.info("📝 Found useful info in batch")
                    elif heading_type == "continue":
                        self.logger.debug("👀 No new info, continuing")
                    else:  # skip_doc
                        self.logger.warning("🚫 Document irrelevant, skipping")

                    return {
                        "heading_type": heading_type,
                        "content": content
                    }

                else:  # result["status"] == "parse_error"
                    # Wrong format
                    self.logger.warning("⚠️ LLM output format incorrect")

                    error_msg = WebSearcherPrompts.BATCH_ERROR_FORMAT
                    messages.append({"role": "user", "content": error_msg})
                    continue

            except Exception as e:
                self.logger.error(f"Batch processing failed: {e}")

                if attempt < MAX_RETRIES - 1:
                    messages.append({
                        "role": "user",
                        "content": "An error occurred. Please try again. Make sure to start with one of the four headings."
                    })
                    continue
                else:
                    # All retries failed; return the default
                    return {"heading_type": "continue", "content": ""}

        # Max retries exceeded; default to continuing
        self.logger.error(f"❌ Max retries ({MAX_RETRIES}) exceeded. Defaulting to 'continue'.")
        return {"heading_type": "continue", "content": ""}

    def _parse_batch_output(self, llm_output: str) -> Dict[str, Any]:
        """
        Parse the LLM's batch-processing output.

        Returns:
        {
            "status": "success" | "parse_error",
            "heading_type": "answer" | "note" | "continue" | "skip_doc",
            "content": str
        }
        """
        # The four recognized headings
        HEADINGS = {
            "##对问题的回答": "answer",
            "##值得记录的笔记": "note",
            "##没有值得记录的笔记继续阅读": "continue",
            "##完全不相关的文档应该放弃": "skip_doc"
        }

        # Find which heading the output starts with
        heading_type = None
        heading_used = None

        for heading, htype in HEADINGS.items():
            if llm_output.startswith(heading):
                heading_type = htype
                heading_used = heading
                break

        if heading_type is None:
            # No heading found
            return {"status": "parse_error", "heading_type": None, "content": ""}

        # Extract the content below the heading
        content_start = len(heading_used)
        content = llm_output[content_start:].strip()

        # Empty content also counts as a parse error
        if not content:
            return {"status": "parse_error", "heading_type": None, "content": ""}

        # Success
        return {
            "status": "success",
            "heading_type": heading_type,
            "content": content
        }
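    # e.g. (illustrative):
    #   _parse_batch_output("##值得记录的笔记\n装饰器使用@符号语法。")
    # -> {"status": "success", "heading_type": "note", "content": "装饰器使用@符号语法。"}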

    def _extract_document_title(self, markdown: str) -> str:
        """
        Extract the document title from Markdown.
        Priority: first "#" heading > first 50 characters > "未命名文档" (untitled).
        """
        # 1. Try to find the first "#" heading
        lines = markdown.split('\n')
        for line in lines:
            if line.startswith('# '):
                return line[2:].strip()

        # 2. No heading: use the first 50 characters as the title
        if len(markdown) > 50:
            return markdown[:50].strip()

        # 3. Default title
        return "未命名文档"

    async def _stream_process_markdown(
        self,
        markdown: str,
        ctx: WebSearcherContext,
        url: str
    ) -> Optional[str]:
        """
        Stream-process a Markdown document (unified entry point).

        Flow:
        1. Check length → decide whether chapter selection is needed
        2. Prepare the content to process (full text OR selected chapters)
        3. Split into batches at paragraph boundaries
        4. Process the batches one by one
        """
        # 1. Check length
        is_long = len(markdown) > ctx.chunk_threshold

        # 2. Prepare the content to process
        if not is_long:
            # Short document: process the full text
            self.logger.info(f"📄 Short document ({len(markdown)} chars). Processing full text.")
            content_to_process = markdown
        else:
            # Long document: build a TOC → select chapters
            self.logger.info(f"📚 Long document ({len(markdown)} chars). Generating TOC...")
            toc = self._generate_document_toc(markdown)

            if not toc or len(toc) < 2:
                # No heading structure, or only one heading: process the full text
                self.logger.info("📋 No headers found. Processing full text.")
                content_to_process = markdown
            else:
                # Let the LLM select chapters
                self.logger.info(f"📑 Found {len(toc)} chapters. Asking LLM to select...")
                selected_indices = await self._let_llm_select_chapters(toc, ctx)

                if not selected_indices:
                    self.logger.warning("⚠️ No chapters selected. Processing full text.")
                    content_to_process = markdown
                else:
                    self.logger.info(f"✅ Selected {len(selected_indices)} chapters")
                    # Extract the selected chapters
                    selected_parts = []
                    for idx in selected_indices:
                        chapter = toc[idx]
                        content = markdown[chapter["start"]:chapter["end"]]
                        selected_parts.append(f"# {chapter['title']}\n\n{content}")
                    content_to_process = "\n\n".join(selected_parts)

        # 3. Split into batches at paragraph boundaries
        self.logger.info(f"🔪 Splitting content into batches (max {ctx.chunk_threshold} chars each)...")
        batches = self._split_by_paragraph_boundaries(content_to_process, ctx.chunk_threshold)
        total_batches = len(batches)
        self.logger.info(f"📊 Split into {total_batches} batches")

        # 4. Get the document title (for LLM context)
        doc_title = self._extract_document_title(content_to_process)

        # 5. Process the batches one by one
        for i, batch in enumerate(batches, start=1):  # 1-based count
            current_batch = i
            progress_pct = int((current_batch / total_batches) * 100)
            self.logger.info(
                f"🔄 Processing batch {current_batch}/{total_batches} "
                f"({progress_pct}%, {len(batch)} chars)..."
            )

            # Unified batch processing (with progress info)
            result = await self._process_batch(
                batch,
                ctx,
                doc_title=doc_title,
                current_batch=current_batch,
                total_batches=total_batches,
                url=url
            )

            # Handle the result
            if result["heading_type"] == "answer":
                # Answer found: return immediately
                self.logger.info(f"✅ Answer found in batch {current_batch}!")
                return result["content"]

            elif result["heading_type"] == "note":
                # Useful info: add it to the notebook
                ctx.add_to_notebook(f"[Batch {current_batch}] {result['content']}")
                self.logger.info(f"📝 Added useful info from batch {current_batch}")

            elif result["heading_type"] == "skip_doc":
                # Document irrelevant: abandon the rest of it
                self.logger.warning("🚫 Document irrelevant. Skipping rest of document.")
                break

            # heading_type == "continue": do nothing and move to the next batch

        # No answer found
        return None
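    # Per-batch outcomes map to actions as follows (recap of the branch above):
    #   "answer"   -> return the answer string immediately
    #   "note"     -> append to ctx.notebook and keep reading
    #   "continue" -> keep reading
    #   "skip_doc" -> abandon the rest of this document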

    async def _run_search_lifecycle(self, session: TabSession, ctx: WebSearcherContext) -> Optional[str]:
        """
        [The Core Loop] The search lifecycle.
        Core logic: visit a page → try to answer the question → record useful
        information if it cannot be answered yet → keep exploring.
        """
        while not ctx.is_time_up():
            # --- Phase 1: Navigation ---
            if not session.pending_link_queue:
                self.logger.info("Queue empty. Ending search.")
                break

            next_url = session.pending_link_queue.popleft()
            self.logger.info(f"🔗 Navigating to: {next_url}")

            # 1.1 Gate checks
            if ctx.has_visited(next_url) or any(bl in next_url for bl in ctx.blacklist):
                continue

            # 1.2 Navigate to the page
            nav_report = await self.browser.navigate(session.handle, next_url)
            final_url = self.browser.get_tab_url(session.handle)
            session.current_url = final_url

            ctx.mark_visited(next_url)
            ctx.mark_visited(final_url)

            # 1.3 Second blacklist check (after redirects)
            if any(bl in final_url for bl in ctx.blacklist):
                self.logger.warning(f"🚫 Redirected to blacklisted URL: {final_url}")
                continue

            # === Phase 2: Identify Page Type ===
            page_type = await self.browser.analyze_page_type(session.handle)

            if page_type == PageType.ERRO_PAGE:
                self.logger.warning(f"🚫 Error Page: {final_url}")
                continue

            # === Branch A: static asset ===
            if page_type == PageType.STATIC_ASSET:
                self.logger.info(f"📄 Static Asset: {final_url}")

                # Get the full Markdown and stream-process it
                markdown = await self._get_full_page_markdown(session.handle, ctx)
                answer = await self._stream_process_markdown(markdown, ctx, final_url)

                if answer:
                    return answer  # answer found; return it directly

                continue  # move on to the next URL

            # === Branch B: interactive web page ===
            elif page_type == PageType.NAVIGABLE:
                self.logger.debug("🌐 Navigable Page. Entering processing loop.")
                page_active = True
                page_changed = True

                while page_active and not ctx.is_time_up():
                    # 1. Stabilize (scroll-triggered loading)
                    if page_changed:
                        await self.browser.stabilize(session.handle)

                    # 2. Get the full Markdown and stream-process it
                    markdown = await self._get_full_page_markdown(session.handle, ctx)
                    answer = await self._stream_process_markdown(markdown, ctx, final_url)

                    # 3. If an answer was found, return it directly
                    if answer:
                        return answer

                    # 4. Build a page summary (for later link filtering)
                    page_summary = markdown[:500] if markdown else ""

                    # === Phase 4: Scouting ===
                    links, buttons = await self.browser.scan_elements(session.handle)
                    self.logger.debug(f"🔍 Found {len(links)} links and {len(buttons)} buttons")

                    # 4.1 Handle links
                    if page_changed:
                        filtered_links = {}
                        for link in links:
                            if ctx.has_link_assessed(link):
                                continue
                            if ctx.has_visited(link):
                                continue
                            if link in session.pending_link_queue:
                                continue
                            if any(bl in link for bl in ctx.blacklist):
                                continue
                            filtered_links[link] = links[link]

                        # Assess link relevance
                        selected_links = await self._filter_relevant_links(filtered_links, page_summary, ctx)

                        # Mark as assessed
                        for link in filtered_links:
                            ctx.mark_link_assessed(link)

                        # Add to the queue
                        new_links_count = 0
                        for link in selected_links:
                            session.pending_link_queue.append(link)
                            new_links_count += 1
                        self.logger.info(f"👀 Added {new_links_count} links to queue")

                    # 4.2 Handle buttons
                    candidate_buttons = []
                    for button_text in buttons:
                        if not ctx.has_button_assessed(session.current_url, button_text):
                            candidate_buttons.append({button_text: buttons[button_text]})

                    # === Phase 5: Execution ===
                    if not candidate_buttons:
                        self.logger.info("🤔 No worthy buttons. Moving to next page.")
                        page_active = False
                        continue

                    chosen_button = await self._choose_best_interaction(candidate_buttons, page_summary, ctx)

                    # Mark as assessed
                    assessed_button_texts = [list(btn.keys())[0] for btn in candidate_buttons]
                    ctx.mark_buttons_assessed(session.current_url, assessed_button_texts)

                    if not chosen_button:
                        self.logger.info("🤔 No worthy buttons. Moving to next page.")
                        page_active = False
                        continue

                    # Perform the click
                    self.logger.info(f"🖱️ Clicking: [{chosen_button.get_text()}]")
                    ctx.mark_interacted(session.current_url, chosen_button.get_text())

                    report = await self.browser.click_and_observe(session.handle, chosen_button)

                    # 5.1 Handle new tabs
                    if report.new_tabs:
                        self.logger.info(f"✨ New Tab(s): {len(report.new_tabs)}")
                        for new_tab_handle in report.new_tabs:
                            new_session = TabSession(handle=new_tab_handle, current_url="", depth=session.depth + 1)
                            answer = await self._run_search_lifecycle(new_session, ctx)
                            if answer:  # answer found in recursion; propagate it up
                                return answer
                            await self.browser.close_tab(new_tab_handle)

                    # 5.2 Handle page changes
                    if report.is_dom_changed or report.is_url_changed:
                        self.logger.info("🔄 Page changed. Re-assessing.")
                        page_changed = True
                        if report.is_url_changed:
                            session.current_url = self.browser.get_tab_url(session.handle)
                        continue

                    # 5.3 No change
                    page_changed = False
                    continue

        # No answer found
        return None

    # ==========================================
    # 5. Cerebellum decision helpers
    # ==========================================