matrix-for-agents 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentmatrix/__init__.py +20 -0
- agentmatrix/agents/__init__.py +1 -0
- agentmatrix/agents/base.py +572 -0
- agentmatrix/agents/claude_coder.py +10 -0
- agentmatrix/agents/data_crawler.py +14 -0
- agentmatrix/agents/post_office.py +212 -0
- agentmatrix/agents/report_writer.py +14 -0
- agentmatrix/agents/secretary.py +10 -0
- agentmatrix/agents/stateful.py +10 -0
- agentmatrix/agents/user_proxy.py +82 -0
- agentmatrix/agents/worker.py +30 -0
- agentmatrix/backends/__init__.py +1 -0
- agentmatrix/backends/llm_client.py +414 -0
- agentmatrix/backends/mock_llm.py +35 -0
- agentmatrix/cli_runner.py +94 -0
- agentmatrix/core/__init__.py +0 -0
- agentmatrix/core/action.py +50 -0
- agentmatrix/core/browser/bing.py +208 -0
- agentmatrix/core/browser/browser_adapter.py +298 -0
- agentmatrix/core/browser/browser_common.py +85 -0
- agentmatrix/core/browser/drission_page_adapter.py +1296 -0
- agentmatrix/core/browser/google.py +230 -0
- agentmatrix/core/cerebellum.py +121 -0
- agentmatrix/core/events.py +22 -0
- agentmatrix/core/loader.py +185 -0
- agentmatrix/core/loader_v1.py +146 -0
- agentmatrix/core/log_util.py +158 -0
- agentmatrix/core/message.py +32 -0
- agentmatrix/core/prompt_engine.py +30 -0
- agentmatrix/core/runtime.py +211 -0
- agentmatrix/core/session.py +20 -0
- agentmatrix/db/__init__.py +1 -0
- agentmatrix/db/database.py +79 -0
- agentmatrix/db/vector_db.py +213 -0
- agentmatrix/docs/Design.md +109 -0
- agentmatrix/docs/Framework Capbilities.md +105 -0
- agentmatrix/docs/Planner Design.md +148 -0
- agentmatrix/docs/crawler_flow.md +110 -0
- agentmatrix/docs/report_writer.md +83 -0
- agentmatrix/docs/review.md +99 -0
- agentmatrix/docs/skill_design.md +23 -0
- agentmatrix/profiles/claude_coder.yml +40 -0
- agentmatrix/profiles/mark.yml +26 -0
- agentmatrix/profiles/planner.yml +21 -0
- agentmatrix/profiles/prompts/base.txt +88 -0
- agentmatrix/profiles/prompts/base_v1.txt +101 -0
- agentmatrix/profiles/prompts/base_v2.txt +94 -0
- agentmatrix/profiles/tom_the_data_crawler.yml +38 -0
- agentmatrix/profiles/user_proxy.yml +17 -0
- agentmatrix/skills/__init__.py +1 -0
- agentmatrix/skills/crawler_helpers.py +315 -0
- agentmatrix/skills/data_crawler.py +777 -0
- agentmatrix/skills/filesystem.py +204 -0
- agentmatrix/skills/notebook.py +158 -0
- agentmatrix/skills/project_management.py +114 -0
- agentmatrix/skills/report_writer.py +194 -0
- agentmatrix/skills/report_writer_utils.py +379 -0
- agentmatrix/skills/search_tool.py +383 -0
- agentmatrix/skills/terminal_ctrl.py +122 -0
- agentmatrix/skills/utils.py +33 -0
- agentmatrix/skills/web_searcher.py +1107 -0
- matrix_for_agents-0.1.2.dist-info/METADATA +44 -0
- matrix_for_agents-0.1.2.dist-info/RECORD +66 -0
- matrix_for_agents-0.1.2.dist-info/WHEEL +5 -0
- matrix_for_agents-0.1.2.dist-info/licenses/LICENSE +190 -0
- matrix_for_agents-0.1.2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,777 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import time
|
|
3
|
+
import os,json,textwrap
|
|
4
|
+
import re
|
|
5
|
+
import sqlite3
|
|
6
|
+
from typing import List, Set, Dict, Optional, Any, Deque
|
|
7
|
+
from collections import deque
|
|
8
|
+
from enum import Enum, auto
|
|
9
|
+
from dataclasses import dataclass, field
|
|
10
|
+
from ..core.browser.google import search_google
|
|
11
|
+
from ..core.browser.bing import search_bing
|
|
12
|
+
from ..skills.utils import sanitize_filename
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Import the previously defined adapter types (assumed to live in drission_page_adapter or browser_adapter)
|
|
17
|
+
from ..core.browser.browser_adapter import (
|
|
18
|
+
BrowserAdapter, TabHandle, PageElement, InteractionReport, PageSnapshot, PageType
|
|
19
|
+
)
|
|
20
|
+
# 引入公共数据结构
|
|
21
|
+
from ..core.browser.browser_common import TabSession, BaseCrawlerContext
|
|
22
|
+
# 引入爬虫辅助方法
|
|
23
|
+
from ..skills.crawler_helpers import CrawlerHelperMixin
|
|
24
|
+
# 引入具体的 Adapter 实现
|
|
25
|
+
from ..core.browser.drission_page_adapter import DrissionPageAdapter
|
|
26
|
+
from ..core.action import register_action
|
|
27
|
+
from slugify import slugify
|
|
28
|
+
|
|
29
|
+
# Search backend called by research_crawler; search_google is imported above as the alternative.
search_func = search_bing
|
|
30
|
+
|
|
31
|
+
# ==========================================
|
|
32
|
+
# 1. State & Context definitions
|
|
33
|
+
# ==========================================
|
|
34
|
+
|
|
35
|
+
class ContentVerdict(Enum):
    """Phase 3 outcome: how valuable a visited page was judged to be."""

    # Junk / irrelevant / login wall -> close or skip the page.
    TRASH = 1
    # Index or listing page -> not worth summarizing, but worth mining for links.
    RELEVANT_INDEX = 2
    # High-value content -> summarize and save.
    HIGH_VALUE = 3
|
|
40
|
+
|
|
41
|
+
class MissionContext(BaseCrawlerContext):
    """
    Global mission context (shared memory).

    Holds the data shared across every recursion level of one crawl:
    the research purpose, the output directory, the in-memory knowledge
    base, and a SQLite-backed record of the links/buttons that were
    already assessed so they are not assessed again on a later run.
    """

    def __init__(self, purpose: str, save_dir: str, deadline: float):
        """
        Args:
            purpose: Research goal of this mission.
            save_dir: Directory where results and the assessment DB are stored.
            deadline: Passed through to BaseCrawlerContext.
        """
        super().__init__(deadline)
        self.purpose = purpose
        self.save_dir = save_dir
        self.knowledge_base: List[Dict] = []
        self._db_conn: Optional[sqlite3.Connection] = None
        self._init_database()
        self._load_assessed_history()

    def mark_link_assessed(self, url: str):
        """Mark a link as assessed (in memory + database)."""
        # The "is it new?" test must happen BEFORE delegating to the base
        # class: if the base implementation records the URL in
        # self.assessed_links, testing afterwards is always False and the
        # row would never be persisted.
        is_new = url not in self.assessed_links
        super().mark_link_assessed(url)
        if is_new and self._db_conn is not None:
            self._db_conn.execute(
                "INSERT OR IGNORE INTO assessed_links (url) VALUES (?)",
                (url,)
            )
            self._db_conn.commit()

    def mark_buttons_assessed(self, url: str, button_texts: List[str]):
        """Mark a batch of buttons as assessed (in memory + database)."""
        # Same ordering concern as mark_link_assessed: collect the keys that
        # are not yet known before the base class gets a chance to add them.
        new_keys = [
            key
            for key in (f"{url}|{button_text}" for button_text in button_texts)
            if key not in self.assessed_buttons
        ]
        super().mark_buttons_assessed(url, button_texts)
        if new_keys and self._db_conn is not None:
            # One batched insert and a single commit instead of a commit per button.
            self._db_conn.executemany(
                "INSERT OR IGNORE INTO assessed_buttons (button_key) VALUES (?)",
                [(key,) for key in new_keys]
            )
            self._db_conn.commit()

    def _init_database(self):
        """Open (and create if needed) the SQLite assessment database."""
        # sqlite3.connect cannot create missing parent directories.
        os.makedirs(self.save_dir, exist_ok=True)
        db_path = os.path.join(self.save_dir, ".crawler_assessment.db")
        self._db_conn = sqlite3.connect(db_path)
        self._db_conn.execute("PRAGMA journal_mode=WAL")  # better concurrent access

        # Create tables
        self._db_conn.execute("""
            CREATE TABLE IF NOT EXISTS assessed_links (
                url TEXT PRIMARY KEY,
                assessed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)

        self._db_conn.execute("""
            CREATE TABLE IF NOT EXISTS assessed_buttons (
                button_key TEXT PRIMARY KEY,
                assessed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)

        self._db_conn.commit()

    def _load_assessed_history(self):
        """Load the previously assessed links and buttons from the database into memory."""
        # Assessed links
        cursor = self._db_conn.execute("SELECT url FROM assessed_links")
        self.assessed_links = {row[0] for row in cursor}

        # Assessed buttons (keys have the form "<url>|<button text>")
        cursor = self._db_conn.execute("SELECT button_key FROM assessed_buttons")
        self.assessed_buttons = {row[0] for row in cursor}

    def cleanup(self):
        """Release resources: close the database connection (safe to call twice)."""
        if self._db_conn:
            self._db_conn.close()
            self._db_conn = None
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
# ==========================================
|
|
119
|
+
# 2. Core logic (Logic Mixin)
|
|
120
|
+
# ==========================================
|
|
121
|
+
|
|
122
|
+
class DigitalInternCrawlerMixin(CrawlerHelperMixin):
    """
    Core logic of the "digital intern" crawler.
    Implements a recursive "Observation -> Thought -> Action" loop.
    """

    # Dependency injection: self.cerebellum and self.logger are assumed to be provided by the host class.
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
#def start_browser(self, ctx: MissionContext):
|
|
134
|
+
# profile_path = os.path.join(self.workspace_root ,".matrix", "browser_profile", self.name)
|
|
135
|
+
# download_path = os.path.join(self.workspace_root ,"download")
|
|
136
|
+
# self.browser_adapter = DrissionPageAdapter(
|
|
137
|
+
# profile_path=profile_path,
|
|
138
|
+
# download_path=download_path
|
|
139
|
+
# )
|
|
140
|
+
|
|
141
|
+
def _resolve_download_folder(self, folder: str) -> Optional[str]:
    """
    Resolve a download directory and confirm it stays inside the workspace.

    Relative paths are resolved against ``self.current_workspace``; absolute
    paths are used as given. The resolved path must lie under
    ``self.workspace_root``; if it does, the directory is created when missing.

    Args:
        folder: Directory supplied by the caller (relative or absolute).

    Returns:
        The resolved absolute path as a string, or None when the path falls
        outside the workspace root or any error occurs while resolving it.
    """
    log = self.logger
    try:
        allowed_root = Path(self.workspace_root).resolve()
        requested = Path(folder)

        log.debug(f"DEBUG _resolve_download_folder: workspace_root={allowed_root}")
        log.debug(f"DEBUG _resolve_download_folder: folder_path={requested}")
        log.debug(f"DEBUG _resolve_download_folder: is_absolute={requested.is_absolute()}")
        base_dir = Path(self.current_workspace)
        log.debug(f"DEBUG _resolve_download_folder: current_workspace={base_dir}")

        if requested.is_absolute():
            # Absolute path: take it as-is.
            resolved = requested.resolve()
            log.debug(f"DEBUG _resolve_download_folder: Using absolute path logic, target_path={resolved}")
        else:
            # Relative path: anchor it at the current workspace.
            resolved = (base_dir / requested).resolve()
            log.debug(f"DEBUG _resolve_download_folder: Using relative path logic, target_path={resolved}")

        # Containment check: relative_to raises ValueError when the resolved
        # path is not located under the workspace root.
        try:
            inside = resolved.relative_to(allowed_root)
        except ValueError:
            log.error(
                f"Security Alert: Download folder is outside workspace. "
                f"Requested: {folder}, Resolved: {resolved}, "
                f"Allowed root: {allowed_root}"
            )
            return None
        log.debug(f"DEBUG _resolve_download_folder: relative_to workspace_root={inside}")

        # Make sure the directory exists before handing it out.
        resolved.mkdir(parents=True, exist_ok=True)
        return str(resolved)

    except Exception as e:
        log.exception(f"Failed to resolve download folder: {e}")
        return None
|
|
189
|
+
|
|
190
|
+
# NOTE(review): param_infos advertises a "filename" parameter that the
# signature below does not accept — confirm how register_action uses
# param_infos and either add the parameter or drop the entry.
@register_action(
    "下载一个指定的文件,提供文件的 URL,可选指定的保存目录",
    param_infos={
        "url": "文件的下载链接",
        "filename": "保存文件的名称",
        "folder": "(可选)保存目录的名称",
    }
)
async def download_file(self, url: str, folder: str=None):
    """
    Download a single file through the browser adapter.

    Args:
        url: Download link of the file.
        folder: Optional target directory; defaults to
            ``<current_workspace>/downloads``. It must resolve to a location
            inside the workspace root.

    Returns:
        The absolute path of the downloaded file on success, otherwise a
        human-readable error string (security error or download failure).
    """
    if folder is None:
        folder = os.path.join(self.current_workspace, "downloads")

    self.logger.debug(f"DEBUG: Initial folder (type {type(folder).__name__}): {folder}")

    # Security check: the folder must stay inside the allowed workspace.
    folder_resolved = self._resolve_download_folder(folder)
    if not folder_resolved:
        return "Security Error: Download folder path is outside allowed workspace"

    self.logger.debug(f"DEBUG: Resolved folder: {folder_resolved}")
    self.logger.debug(f"DEBUG: workspace_root: {self.workspace_root}")

    profile_path = os.path.join(self.workspace_root, ".matrix", "browser_profile", self.name)

    self.browser_adapter = DrissionPageAdapter(
        profile_path=profile_path,
        download_path=folder_resolved
    )

    # No MissionContext is created here: nothing in this action used it, and
    # constructing one opened a SQLite connection that was never closed and
    # dropped an assessment DB file into the user's download folder.

    self.logger.info(f"🚀 Mission Start: Download file {url}")

    # Start the browser (headed for now, debugging mode).
    await self.browser_adapter.start(headless=False)

    try:
        tab = await self.browser_adapter.get_tab()
        # The download path is already configured on DrissionPageAdapter, so
        # no second argument is passed here (it would be joined twice).
        res = await asyncio.to_thread(tab.download, url)
        status, file_path = res
        if status == 'success':
            self.logger.info(f"File downloaded successfully: {file_path}")
            # Normalize to an absolute path for the caller.
            return os.path.abspath(file_path)

        self.logger.error(f"Failed to download file: {file_path}")
        return 'Failed to Download'
    except Exception as e:
        self.logger.exception(e)
        return f'Failed to Download {e}'
    finally:
        # Always release the browser, mirroring research_crawler; previously
        # it was left running after every download.
        self.logger.info("🛑 Closing browser...")
        await self.browser_adapter.close()
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
@register_action(
    "为研究做准备,上网搜索并下载相关资料,要提供研究的目标和搜索关键词",
    param_infos={
        "purpose": "研究的具体目标",
        "search_phrase": "在搜索引擎输入的初始关键词",
        "topic": "保存资料的文件夹名称",
        "max_time": "最大运行时间(分钟)"
    }
)
async def research_crawler(self, purpose: str, search_phrase: str, topic: str, max_time: int = 30):
    """
    [Entry Point] Run one research crawl: search, explore, save, report.

    Args:
        purpose: The concrete research goal.
        search_phrase: Initial query typed into the search engine.
        topic: Name of the folder (under ``<workspace_root>/downloads``) that
            receives the collected material.
        max_time: Time budget in minutes.

    Returns:
        The final report string, or an error message if the crawl crashed.
    """
    # 1. Prepare the environment: output folder and browser adapter.
    root = self.workspace_root
    save_dir = os.path.join(root, "downloads", sanitize_filename(topic))
    os.makedirs(save_dir, exist_ok=True)

    self.browser_adapter = DrissionPageAdapter(
        profile_path=os.path.join(root, ".matrix", "browser_profile", self.name),
        download_path=save_dir
    )

    budget_seconds = int(max_time) * 60
    ctx = MissionContext(
        purpose=purpose,
        save_dir=save_dir,
        deadline=time.time() + budget_seconds
    )

    self.logger.info(f"🚀 Mission Start: {purpose}")

    # 2. Start the browser (headed for now, debugging mode).
    await self.browser_adapter.start(headless=False)

    try:
        # 3. Phase 0: run the search in the first tab.
        first_tab = await self.browser_adapter.get_tab()
        hits = await search_func(self.browser_adapter, first_tab, search_phrase)

        # Seed the initial session's queue with the result URLs; navigation
        # itself is handled by the tab lifecycle.
        initial_session = TabSession(handle=first_tab, current_url="")
        initial_session.pending_link_queue.extend(hit['url'] for hit in hits)

        # 4. Enter the recursive crawl loop.
        await self._run_tab_lifecycle(initial_session, ctx)

        # 5. Produce the report.
        return self._generate_final_report(ctx)

    except Exception as e:
        self.logger.exception("Crawler crashed")
        return f"Mission failed with error: {e}"
    finally:
        self.logger.info("🛑 Closing browser...")
        await self.browser_adapter.close()
        ctx.cleanup()  # close the database connection
|
|
320
|
+
|
|
321
|
+
async def _run_tab_lifecycle(self, session: TabSession, ctx: MissionContext):
    """
    [The Core Loop] Lifecycle of one physical browser tab.

    Repeatedly takes a URL from ``session.pending_link_queue``, navigates to
    it, classifies the page and then either saves it (static asset) or runs
    the Stabilize -> Assess -> Scout -> Act loop (navigable page). The loop
    ends when the queue is empty or the mission deadline is reached.

    Args:
        session: The tab session (handle, current URL, pending link queue, depth).
        ctx: Shared mission context (visited/assessed bookkeeping, blacklist,
            knowledge base, deadline).

    Closing the tab is left to the caller.
    """

    while not ctx.is_time_up():

        # --- Phase 1: Navigation (take the next task from the queue) ---
        if not session.pending_link_queue:
            self.logger.info(f"Tab {session.handle} queue empty. Closing tab.")
            break  # queue exhausted, this tab is done

        next_url = session.pending_link_queue.popleft()
        # Was a bare print(); route it through the logger like everything else.
        self.logger.debug(f"Dequeued URL: {next_url}")

        # 1.1 Gate check: skip visited or blacklisted URLs.
        if ctx.has_visited(next_url) or any(bl in next_url for bl in ctx.blacklist):
            continue

        self.logger.info(f"🔗 Navigating to: {next_url}")
        await self.browser_adapter.navigate(session.handle, next_url)
        final_url = self.browser_adapter.get_tab_url(session.handle)
        session.current_url = final_url  # track where we actually landed

        # 1.2 Mark both the requested and the landing URL as visited.
        ctx.mark_visited(next_url)
        ctx.mark_visited(final_url)
        self.logger.info(f"🔗 Landed on: {final_url}")

        # Second blacklist check (guards against redirects to e.g. login pages).
        # A blacklisted landing page is a dead end: move on to the next queued URL.
        if any(bl in final_url for bl in ctx.blacklist):
            self.logger.warning(f"🚫 Redirected to blacklisted URL: {final_url}. Aborting tab.")
            continue

        # === Phase 2: Identify logic branch ===
        # No full stabilize yet; the page type is all that is needed here.
        page_type = await self.browser_adapter.analyze_page_type(session.handle)

        if page_type == PageType.ERRO_PAGE:
            self.logger.warning(f"🚫 Error Page: {final_url}. Skipping.")
            continue

        # === Branch A: static asset (dead end) ===
        if page_type == PageType.STATIC_ASSET:
            self.logger.info(f"📄 Detected Static Asset: {session.current_url}")

            # 1. Take a snapshot; when no text can be extracted the assessment
            #    can only rely on URL/title.
            snapshot = await self.browser_adapter.get_page_snapshot(session.handle)

            # 2. Let the cerebellum decide whether the file is worth keeping.
            verdict_dict = await self._assess_page_value(snapshot, ctx)
            verdict = verdict_dict["verdict"]

            if verdict == ContentVerdict.HIGH_VALUE:
                self.logger.info("💾 Saving Asset...")
                await self.browser_adapter.save_static_asset(session.handle)
                # Record it in the mission knowledge base.
                ctx.knowledge_base.append({"type": "file", "url": session.current_url, "title": snapshot.title})

            # 3. An asset has no interactions and nothing to scout: go straight
            #    to the next queued URL (continue the outer loop).
            continue

        # === Branch B: interactive web page ===
        elif page_type == PageType.NAVIGABLE:

            # Inner loop: Stabilize -> Assess -> Scout -> Act
            self.logger.debug("🌐 Detected Navigable Page. Entering complex loop.")
            page_active = True
            page_changed = True
            one_line_summary = ""
            while page_active and not ctx.is_time_up():

                if page_changed:
                    # page_changed is True on the first pass and after a click
                    # that mutated the page. After a click that changed nothing
                    # it is False, and stabilize/assess are skipped.

                    # 1. Stabilize (scroll / lazy loading)
                    await self.browser_adapter.stabilize(session.handle)
                    # 2. Assess (HTML extract -> brain)
                    snapshot = await self.browser_adapter.get_page_snapshot(session.handle)
                    verdict_dict = await self._assess_page_value(snapshot, ctx)
                    verdict = verdict_dict["verdict"]
                    one_line_summary = verdict_dict["reason"]

                    if verdict == ContentVerdict.HIGH_VALUE:
                        await self._save_content(snapshot, ctx)  # save summary
                    elif verdict == ContentVerdict.TRASH:
                        page_active = False
                        continue

                # === Phase 4: Scouting (Look) ===
                # links:   {url: text}
                # buttons: {text: button_element}
                links, buttons = await self.browser_adapter.scan_elements(session.handle)
                self.logger.debug(f"🔍 Found {len(links)} links and {len(buttons)} buttons.")

                # 4.1 Links -> enqueue (not visited immediately).
                #     Skipped when the page did not change since the last pass.
                if page_changed:
                    filtered_links = {}
                    for link in links:
                        # Already assessed: avoid a repeated LLM call.
                        if ctx.has_link_assessed(link):
                            continue
                        # Already visited, already queued, or blacklisted.
                        if ctx.has_visited(link):
                            continue
                        if link in session.pending_link_queue:
                            continue
                        if any(bl in link for bl in ctx.blacklist):
                            continue

                        filtered_links[link] = links[link]

                    selected_links = await self._filter_relevant_links(filtered_links, one_line_summary, ctx)

                    # Record every assessed link, whether or not it was selected.
                    for link in filtered_links:
                        ctx.mark_link_assessed(link)

                    new_links_count = 0
                    for link in selected_links:
                        session.pending_link_queue.append(link)
                        new_links_count += 1
                    self.logger.info(f"👀 Scouted {new_links_count} relevant links (enqueued).")

                # 4.2 Buttons -> candidate list (only those not assessed yet).
                candidate_buttons = []
                for button_text in buttons:
                    if not ctx.has_button_assessed(session.current_url, button_text):
                        candidate_buttons.append({
                            button_text: buttons[button_text]
                        })

                # === Phase 5: Execution (Act) ===
                # Click the most valuable button; if there is none, or the
                # cerebellum picks none, the inner loop ends.
                if not candidate_buttons:
                    self.logger.info("🤔 No worthy interactions found. Moving to next page in queue.")
                    page_active = False  # done with this page
                    continue

                chosen_button = await self._choose_best_interaction(candidate_buttons, one_line_summary, ctx)

                # Record every assessed button, whether or not it was chosen.
                assessed_button_texts = [list(btn.keys())[0] for btn in candidate_buttons]
                ctx.mark_buttons_assessed(session.current_url, assessed_button_texts)

                if not chosen_button:
                    self.logger.info("🤔 No worthy interactions found. Moving to next page in queue.")
                    page_active = False  # done with this page
                    continue

                # Perform the click.
                self.logger.info(f"point_up: Clicking button: [{chosen_button.get_text()}]")
                ctx.mark_interacted(session.current_url, chosen_button.get_text())

                report = await self.browser_adapter.click_and_observe(session.handle, chosen_button)

                # 5.1 Consequence: new tab(s) -> recurse with a new session.
                if report.new_tabs:
                    self.logger.info(f"✨ New Tab(s) detected: {len(report.new_tabs)}")
                    for new_tab_handle in report.new_tabs:
                        new_session = TabSession(handle=new_tab_handle, current_url="", depth=session.depth + 1)
                        # The lifecycle only works off pending_link_queue, and
                        # nothing was enqueued for the new session before, so
                        # the recursive call returned immediately and the tab
                        # was closed unexplored. Seed the queue with the tab's
                        # own URL (a duplicate is dropped by the visited gate).
                        new_tab_url = self.browser_adapter.get_tab_url(new_tab_handle)
                        if new_tab_url:
                            new_session.pending_link_queue.append(new_tab_url)
                        # Wait for the recursion to return.
                        await self._run_tab_lifecycle(new_session, ctx)
                        # The lifecycle does not close its tab, so do it here.
                        await self.browser_adapter.close_tab(new_tab_handle)

                # 5.2 Consequence: page mutated (soft restart).
                if report.is_dom_changed or report.is_url_changed:
                    self.logger.info("🔄 Page mutated. Triggering Soft Restart (Re-assess).")
                    # Keep page_active True and restart the inner loop, which
                    # re-runs Stabilize -> Assess -> Scout.
                    page_changed = True
                    if report.is_url_changed:
                        session.current_url = self.browser_adapter.get_tab_url(session.handle)  # refresh URL
                    continue

                # 5.3 Consequence: nothing happened (or only a download).
                # The clicked button is now marked assessed, so the next pass
                # re-scans and picks among the remaining candidates without
                # re-running stabilize/assess.
                page_changed = False
                continue

            # End inner loop

    # End outer loop (queue empty or time up)
    self.logger.info(f"🏁 Tab Session ended. visited: {len(ctx.visited_urls)}")
    # Closing the tab is left to the caller (or adapter.close_tab(session.handle)).
|
|
540
|
+
|
|
541
|
+
|
|
542
|
+
# ==========================================
|
|
543
|
+
# 3. Cerebellum decision helpers (Brain Power)
|
|
544
|
+
# ==========================================
|
|
545
|
+
|
|
546
|
+
async def _assess_page_value(self, snapshot: PageSnapshot, ctx: MissionContext):
    """
    [Brain] Assess how valuable a page is for the research goal.

    Args:
        snapshot: Page snapshot; ``url``, ``title``, ``main_text`` and
            ``content_type`` are read.
        ctx: Mission context; ``purpose`` is read.

    Returns:
        A dict ``{"verdict": ContentVerdict, "reason": str}``. For
        RELEVANT_INDEX verdicts the "reason" is the first 800 characters of
        the page text rather than the model's own reason.
    """
    # Tolerate snapshots without a title/text instead of raising on slicing.
    title = snapshot.title or ""
    main_text = snapshot.main_text or ""
    is_static_asset = snapshot.content_type == PageType.STATIC_ASSET

    # 1. Minimal heuristic: very short content alone is handed to the LLM
    #    (the page may be image-only), but no title AND almost no text is
    #    discarded right away.
    if not title and len(main_text) < 10:
        self.logger.warning(f"🗑️ Empty title and content: {snapshot.url}")
        return {"verdict": ContentVerdict.TRASH, "reason": "Empty title and content"}

    # 2. Build the prompt. The preview is truncated to keep the token count
    #    bounded; for files main_text may be empty or metadata only.
    preview_text = main_text[:2500]

    # Static assets and web pages get slightly different guidance.
    if is_static_asset:
        evaluation_guide = textwrap.dedent("""
            Type: STATIC FILE (PDF/Image/Doc).
            Task: Decide if this file is relevant to [Research Goal] and should be DOWNLOADED based on its Title and URL.
            Allowed Verdict Value:
            - TRASH: Completely unrelated.
            - HIGH_VALUE: The file seems relevant to the Research Goal (e.g., specific data, report, paper) or not enough information to judge. (Download anyway.)

            (Note: Use HIGH_VALUE if you are not sure.)
        """)
    else:
        evaluation_guide = textwrap.dedent("""
            Type: WEBPAGE.
            Task: Analyze content relevance.
            Allowed Verdict Value:
            - TRASH:
                * Login/Signup walls, Captchas.
                * 404/Errors, "Site under construction".
                * Pure SEO spam, generic ads, "Buy now" product pages (unless research goal is shopping).
                * Completely off-topic content.
            - RELEVANT_INDEX:
                * Hub pages, Directories, List of links (e.g., "Top 10 resources").
                * Content is relevant but short/shallow (not worth summarizing, but worth exploring links).
                * If unsure or info is sparse but looks relevant -> Choose THIS.
            - HIGH_VALUE:
                * Detailed articles, Reports, Data tables, Technical documentation.
                * Directly answers the Research Goal with substance.
        """)

    # The prompt is assembled line by line: dedenting an f-string after
    # multi-line values were interpolated does not strip anything.
    prompt = "\n".join([
        "You are a Research Assistant.",
        "",
        "[Research Goal]",
        f'"{ctx.purpose}"',
        "",
        "[Target Info]",
        f"URL: {snapshot.url}",
        f"Title: {title}",
        "",
        "[Evaluation Guide]",
        evaluation_guide,
        "",
        "[Content Preview]",
        preview_text,
        "",
        "",
        "[Output Requirement]",
        'Return JSON ONLY. Format: {"verdict": "one of allowed verdict values", "reason": "One line summary about the page"}',
    ])

    # 3. Ask the cerebellum.
    try:
        # The backend is expected to return a dict with a 'reply' entry.
        response = await self.cerebellum.backend.think(
            messages=[{"role": "user", "content": prompt}]
        )
        raw_reply = response.get('reply', '').strip()

        # 4. Parse the result; strip markdown code fences the LLM may add.
        json_str = raw_reply.replace("```json", "").replace("```", "").strip()
        result = json.loads(json_str)

        verdict_str = str(result.get("verdict", "RELEVANT_INDEX")).strip().upper()
        reason = result.get("reason", "No reason provided")

        self.logger.info(f"🧠 Brain Assess [{verdict_str}]: {title[:30]}... | Reason: {reason}")

        if verdict_str == "HIGH_VALUE":
            return {"verdict": ContentVerdict.HIGH_VALUE, "reason": reason}
        elif verdict_str == "TRASH":
            return {"verdict": ContentVerdict.TRASH, "reason": reason}
        else:
            return {"verdict": ContentVerdict.RELEVANT_INDEX, "reason": main_text[:800]}

    except Exception as e:
        self.logger.error(f"🧠 Brain Assessment Failed: {e}. Defaulting to RELEVANT_INDEX.")
        # On failure (JSON parse error, timeout, ...) stay lenient so nothing
        # is lost.
        if is_static_asset:
            # Files that cannot be judged are kept, matching the prompt's
            # "Use HIGH_VALUE if you are not sure".
            return {"verdict": ContentVerdict.HIGH_VALUE, "reason": "Static Asset"}
        # Web pages keep being explored as an index page. This used to return
        # TRASH, contradicting the log message above and discarding the page.
        return {"verdict": ContentVerdict.RELEVANT_INDEX, "reason": "Possible related info"}
|
|
655
|
+
|
|
656
|
+
async def _save_content(self, snapshot: PageSnapshot, ctx: MissionContext):
    """
    [Action] Persist a page to the mission's save directory as Markdown.

    Tiers by length of ``snapshot.main_text``:
    1. Short (< 1k chars): stored verbatim, no summary.
    2. Medium (1k - 15k chars): stored with a deep AI summary.
    3. Long (>= 15k chars): stored with an AI abstract plus the full text.

    On a successful write an index entry is appended to
    ``ctx.knowledge_base``; write errors are logged, not raised.
    """
    # File name: sanitized title (max 60) plus the last 4 digits of the
    # epoch seconds to reduce collisions between pages with equal titles.
    stem = sanitize_filename(snapshot.title,60)
    stamp = str(int(time.time()))[-4:]
    filename = f"{stem}_{stamp}.md"
    save_path = os.path.join(ctx.save_dir, filename)

    text_len = len(snapshot.main_text)

    if text_len < 1000:
        # Tier A: short text, stored as-is.
        self.logger.info(f"💾 Saving Short Content ({text_len} chars): {filename}")
        summary_type = "Raw (Short)"
        summary_part = "No summary generated (Content too short)."
    elif text_len < 15000:
        # Tier B: medium text, deep summary.
        self.logger.info(f"📝 Summarizing Medium Content ({text_len} chars)...")
        summary_type = "AI Summary"
        summary_part = await self._generate_summary(snapshot.main_text, ctx.purpose, mode="deep")
    else:
        # Tier C: long text. Only the first 5000 and last 2000 characters are
        # summarized so the abstract covers intro and conclusion.
        self.logger.info(f"📚 Archiving Long Content ({text_len} chars)...")
        summary_type = "Abstract + Full Text"
        head = snapshot.main_text[:5000]
        tail = snapshot.main_text[-2000:]
        partial_text = head + "\n\n...[Middle section omitted for summarization]...\n\n" + tail
        abstract = await self._generate_summary(partial_text, ctx.purpose, mode="abstract")
        note = f"**Note**: Document is very long ({text_len} chars). Below is an AI generated abstract based on intro/outro, followed by the full raw text."
        summary_part = f"{note}\n\n{abstract}"

    final_content = self._format_markdown(snapshot, summary_part, snapshot.main_text)

    try:
        with open(save_path, "w", encoding="utf-8") as out:
            out.write(final_content)

        # Manifest entry for the final report.
        entry = {
            "type": "page",
            "title": snapshot.title,
            "url": snapshot.url,
            "file_path": filename,  # relative to ctx.save_dir
            "summary_type": summary_type,
            "size_kb": round(text_len / 1024, 1)
        }
        ctx.knowledge_base.append(entry)

    except Exception as e:
        self.logger.error(f"Failed to write file {save_path}: {e}")
|
|
725
|
+
|
|
726
|
+
# --- Helper functions ---
|
|
727
|
+
|
|
728
|
+
def _format_markdown(self, snapshot: "PageSnapshot", summary_part: str, raw_part: str) -> str:
    """
    Render the unified Markdown document for a saved page.

    Args:
        snapshot: Page snapshot; ``title`` and ``url`` are read.
        summary_part: Text placed under the "AI Summary / Notes" heading.
        raw_part: Text placed under the "Original Content" heading.

    Returns:
        The Markdown document, stripped of leading/trailing whitespace.
    """
    # The document is assembled from explicit lines. The previous version
    # dedented an f-string *after* interpolation: as soon as summary_part or
    # raw_part contained a line starting at column 0, textwrap.dedent found
    # no common margin and any indentation of the template leaked into the
    # file, where indented headings render as code blocks.
    captured_at = time.strftime("%Y-%m-%d %H:%M:%S")
    lines = [
        f"# {snapshot.title}",
        "",
        f"> Source: {snapshot.url}",
        f"> Captured: {captured_at}",
        "",
        "## 🤖 AI Summary / Notes",
        f"{summary_part}",
        "",
        "---",
        "",
        "## 📄 Original Content",
        f"{raw_part}",
    ]
    return "\n".join(lines).strip()
|
|
746
|
+
|
|
747
|
+
async def _generate_summary(self, text: str, purpose: str, mode: str = "deep") -> str:
    """
    Ask the cerebellum backend to summarize ``text``.

    Args:
        text: Content to summarize.
        purpose: Research goal, included in the prompt.
        mode: "deep" for a detailed structured summary; any other value
            produces a brief abstract.

    Returns:
        The stripped reply, or "[Error: AI Summary Generation Failed]" when
        the backend call fails (the failure is logged).
    """
    if mode == "deep":
        task_desc = "Create a detailed structured summary (Markdown). Focus on facts, data, and answers relevant to the Research Goal."
    else:
        task_desc = "Create a brief Abstract/Overview (1-2 paragraphs). Explain what this document is about and its potential value."

    prompt = "\n".join([
        "You are a Research Assistant.",
        f'Research Goal: "{purpose}"',
        "",
        f"Task: {task_desc}",
        "",
        "Content:",
        f"{text}",
        "",
        "Output Markdown only.",
    ])

    try:
        resp = await self.cerebellum.backend.think(messages=[{"role": "user", "content": prompt}])
        return resp.get('reply', '').strip()
    except Exception:
        # Best-effort: the caller still gets a placeholder, but the cause is
        # no longer swallowed silently.
        self.logger.exception("AI summary generation failed")
        return "[Error: AI Summary Generation Failed]"
|
|
773
|
+
|
|
774
|
+
import re
|
|
775
|
+
|
|
776
|
+
def _generate_final_report(self, ctx: MissionContext) -> str:
    """Return the completion message with the number of knowledge-base entries collected."""
    item_count = len(ctx.knowledge_base)
    return f"Mission Complete. Found {item_count} items."
|