matrix-for-agents 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. agentmatrix/__init__.py +20 -0
  2. agentmatrix/agents/__init__.py +1 -0
  3. agentmatrix/agents/base.py +572 -0
  4. agentmatrix/agents/claude_coder.py +10 -0
  5. agentmatrix/agents/data_crawler.py +14 -0
  6. agentmatrix/agents/post_office.py +212 -0
  7. agentmatrix/agents/report_writer.py +14 -0
  8. agentmatrix/agents/secretary.py +10 -0
  9. agentmatrix/agents/stateful.py +10 -0
  10. agentmatrix/agents/user_proxy.py +82 -0
  11. agentmatrix/agents/worker.py +30 -0
  12. agentmatrix/backends/__init__.py +1 -0
  13. agentmatrix/backends/llm_client.py +414 -0
  14. agentmatrix/backends/mock_llm.py +35 -0
  15. agentmatrix/cli_runner.py +94 -0
  16. agentmatrix/core/__init__.py +0 -0
  17. agentmatrix/core/action.py +50 -0
  18. agentmatrix/core/browser/bing.py +208 -0
  19. agentmatrix/core/browser/browser_adapter.py +298 -0
  20. agentmatrix/core/browser/browser_common.py +85 -0
  21. agentmatrix/core/browser/drission_page_adapter.py +1296 -0
  22. agentmatrix/core/browser/google.py +230 -0
  23. agentmatrix/core/cerebellum.py +121 -0
  24. agentmatrix/core/events.py +22 -0
  25. agentmatrix/core/loader.py +185 -0
  26. agentmatrix/core/loader_v1.py +146 -0
  27. agentmatrix/core/log_util.py +158 -0
  28. agentmatrix/core/message.py +32 -0
  29. agentmatrix/core/prompt_engine.py +30 -0
  30. agentmatrix/core/runtime.py +211 -0
  31. agentmatrix/core/session.py +20 -0
  32. agentmatrix/db/__init__.py +1 -0
  33. agentmatrix/db/database.py +79 -0
  34. agentmatrix/db/vector_db.py +213 -0
  35. agentmatrix/docs/Design.md +109 -0
  36. agentmatrix/docs/Framework Capbilities.md +105 -0
  37. agentmatrix/docs/Planner Design.md +148 -0
  38. agentmatrix/docs/crawler_flow.md +110 -0
  39. agentmatrix/docs/report_writer.md +83 -0
  40. agentmatrix/docs/review.md +99 -0
  41. agentmatrix/docs/skill_design.md +23 -0
  42. agentmatrix/profiles/claude_coder.yml +40 -0
  43. agentmatrix/profiles/mark.yml +26 -0
  44. agentmatrix/profiles/planner.yml +21 -0
  45. agentmatrix/profiles/prompts/base.txt +88 -0
  46. agentmatrix/profiles/prompts/base_v1.txt +101 -0
  47. agentmatrix/profiles/prompts/base_v2.txt +94 -0
  48. agentmatrix/profiles/tom_the_data_crawler.yml +38 -0
  49. agentmatrix/profiles/user_proxy.yml +17 -0
  50. agentmatrix/skills/__init__.py +1 -0
  51. agentmatrix/skills/crawler_helpers.py +315 -0
  52. agentmatrix/skills/data_crawler.py +777 -0
  53. agentmatrix/skills/filesystem.py +204 -0
  54. agentmatrix/skills/notebook.py +158 -0
  55. agentmatrix/skills/project_management.py +114 -0
  56. agentmatrix/skills/report_writer.py +194 -0
  57. agentmatrix/skills/report_writer_utils.py +379 -0
  58. agentmatrix/skills/search_tool.py +383 -0
  59. agentmatrix/skills/terminal_ctrl.py +122 -0
  60. agentmatrix/skills/utils.py +33 -0
  61. agentmatrix/skills/web_searcher.py +1107 -0
  62. matrix_for_agents-0.1.2.dist-info/METADATA +44 -0
  63. matrix_for_agents-0.1.2.dist-info/RECORD +66 -0
  64. matrix_for_agents-0.1.2.dist-info/WHEEL +5 -0
  65. matrix_for_agents-0.1.2.dist-info/licenses/LICENSE +190 -0
  66. matrix_for_agents-0.1.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,204 @@
1
+ # skills/filesystem.py
2
+ import os
3
+ from pathlib import Path
4
+ from ..core.action import register_action
5
+
6
+
7
class FileSkillMixin:
    """
    File-system skill pack.

    The host class (the Agent) must provide ``private_workspace`` and
    ``current_workspace`` attributes. Each public action resolves the
    caller-supplied relative path against one of those roots through
    ``_resolve_path`` and converts every failure into an error string, so
    the actions never raise.
    """

    # Upper bound (in bytes) on files returned by the read actions: 50 KB.
    _MAX_READ_SIZE = 50 * 1024

    def _resolve_path(self, workspace_root: Path, relative_path: str = ".") -> Path:
        """
        Resolve ``relative_path`` inside ``workspace_root``, rejecting traversal.

        Args:
            workspace_root: Root directory of the workspace. A ``Path`` is
                expected; anything ``Path()`` accepts (e.g. ``str``) also works.
            relative_path: Path relative to the root; ``"."`` means the root.

        Returns:
            The resolved absolute path.

        Raises:
            ValueError: If ``workspace_root`` is ``None`` or the resolved
                target lies outside the workspace root.
        """
        if workspace_root is None:
            raise ValueError("Workspace is not initialized. Please set workspace_root first.")

        # Normalise both sides so that ``..`` segments and symlinks are
        # collapsed before the containment check.
        root = Path(workspace_root).resolve()
        target_path = (root / relative_path).resolve()

        try:
            target_path.relative_to(root)
        except ValueError:
            # ``from None`` hides the internal relative_to() failure, which
            # would otherwise be chained onto the security error.
            raise ValueError(
                f"Security Alert: Path traversal detected. "
                f"Attempted to access {relative_path}, which is outside workspace."
            ) from None

        return target_path

    # ========== Shared implementation helpers ==========

    def _list_directory(self, workspace_root: Path, relative_path: str) -> str:
        """Return a newline-separated listing of a directory inside the workspace."""
        target_dir = self._resolve_path(workspace_root, relative_path)

        if not target_dir.exists():
            return f"Directory not found: {relative_path}"
        if not target_dir.is_dir():
            # Explicit check instead of letting iterdir() raise an OSError
            # whose message would contain the absolute path.
            return f"Error: '{relative_path}' is not a directory."

        # Sort by name so the listing is deterministic across calls.
        entries = sorted(target_dir.iterdir(), key=lambda entry: entry.name)
        lines = [
            f"{'[DIR] ' if entry.is_dir() else '[FILE]'}{entry.name}"
            for entry in entries
        ]
        return "\n".join(lines) if lines else "(Empty Directory)"

    def _read_text_file(self, workspace_root: Path, relative_path: str) -> str:
        """Return the UTF-8 text of a workspace file, or an error string."""
        file_path = self._resolve_path(workspace_root, relative_path)

        if not file_path.exists():
            return f"Error: File '{relative_path}' does not exist."
        if not file_path.is_file():
            return f"Error: '{relative_path}' is not a file."

        # Stat once so the size that is checked is the size that is reported.
        size = file_path.stat().st_size
        if size > self._MAX_READ_SIZE:
            return f"Error: File too large ({size} bytes). Limit: {self._MAX_READ_SIZE} bytes."

        try:
            return file_path.read_text(encoding='utf-8')
        except UnicodeDecodeError:
            return "Error: Binary file detected. Cannot read as text."

    def _write_text_file(self, workspace_root: Path, relative_path: str, content: str) -> None:
        """Write ``content`` as UTF-8 to a workspace file, creating parent directories."""
        file_path = self._resolve_path(workspace_root, relative_path)
        file_path.parent.mkdir(parents=True, exist_ok=True)
        file_path.write_text(content, encoding='utf-8')

    # ========== Private Workspace Methods ==========

    @register_action(
        "列出私有工作区的文件。用于查看有哪些文件可用。",
        param_infos={
            "relative_path": "子目录路径,默认 '.' 表示根目录"
        }
    )
    async def list_private_file(self, relative_path: str = ".") -> str:
        """List the files and directories under a private-workspace directory."""
        try:
            return self._list_directory(self.private_workspace, relative_path)
        except Exception as e:
            # Action boundary: report the failure to the agent as text.
            return f"Error listing private files: {str(e)}"

    @register_action(
        "读取私有工作区的文件内容。仅支持文本文件。",
        param_infos={
            "relative_path": "文件相对路径"
        }
    )
    async def read_private_file(self, relative_path: str) -> str:
        """Read a text file from the private workspace."""
        try:
            return self._read_text_file(self.private_workspace, relative_path)
        except Exception as e:
            return f"Error reading private file: {str(e)}"

    @register_action(
        "写入内容到私有工作区的文件。",
        param_infos={
            "relative_path": "文件相对路径",
            "content": "要写入的文本内容"
        }
    )
    async def write_private_file(self, relative_path: str, content: str) -> str:
        """Write text to a file in the private workspace."""
        try:
            self._write_text_file(self.private_workspace, relative_path, content)
            return f"Success: Written to private workspace '{relative_path}'"
        except Exception as e:
            return f"Error writing private file: {str(e)}"

    # ========== Shared Workspace Methods ==========

    @register_action(
        "列出共享工作区的文件。用于查看有哪些文件可用。",
        param_infos={
            "relative_path": "子目录路径,默认 '.' 表示根目录"
        }
    )
    async def list_shared_file(self, relative_path: str = ".") -> str:
        """List the files and directories under a shared-workspace directory."""
        try:
            return self._list_directory(self.current_workspace, relative_path)
        except Exception as e:
            return f"Error listing shared files: {str(e)}"

    @register_action(
        "读取共享工作区的文件内容。仅支持文本文件。",
        param_infos={
            "relative_path": "文件相对路径"
        }
    )
    async def read_shared_file(self, relative_path: str) -> str:
        """Read a text file from the shared workspace."""
        try:
            return self._read_text_file(self.current_workspace, relative_path)
        except Exception as e:
            return f"Error reading shared file: {str(e)}"

    @register_action(
        "写入内容到共享工作区的文件。",
        param_infos={
            "relative_path": "文件相对路径",
            "content": "要写入的文本内容"
        }
    )
    async def write_shared_file(self, relative_path: str, content: str) -> str:
        """Write text to a file in the shared workspace."""
        try:
            self._write_text_file(self.current_workspace, relative_path, content)
            return f"Success: Written to shared workspace '{relative_path}'"
        except Exception as e:
            return f"Error writing shared file: {str(e)}"
@@ -0,0 +1,158 @@
1
+
2
+ from ..core.action import register_action
3
+ from datetime import datetime,timezone
4
+ import json
5
+ import os
6
+ import uuid
7
+ import asyncio
8
+ import textwrap
9
+ import traceback
10
class NotebookMixin:
    """
    Diary search and notebook actions for an agent.

    The methods read these attributes from the host object:
    ``current_session``, ``post_office``, ``name``, ``cerebellum``,
    ``workspace_root``, ``current_user_session_id`` and ``vector_db``.
    """

    # Maximum number of mails inspected by one diary search.
    _MAX_DIARY_MAILS = 10

    # Prompt template, dedented once here. Values are substituted AFTER
    # dedenting: dedenting an already-interpolated string is a no-op as soon
    # as the mail text contains a line that starts at column 0.
    _DIARY_SEARCH_PROMPT = textwrap.dedent("""
        你是一个专业的历史邮件管理助理。负责从系统邮件历史中查找能够回答用户问题的邮件。
        你只是负责查找,不需要回答问题。如果发现可以完整回答用户问题的邮件,就报告找到了。
        如果具有部分有用信息,就摘录出来。
        如果没有能回答问题的信息,就报告没有找到。

        用户现在想找的信息是:
        {query}

        [现在的查看的邮件是]:
        {mail_text}

        [应答的要求]:
        1. 严格按照下面的专业格式报告查找的结果:
        2. 如果这封邮件包含用户想找的信息,就输出 “FOUND"。单一单词,不要附加任何额外信息。
        3. 如果这封邮件包含用户想找的部分信息,就摘录这部分可能有用的信息的原文,并且直接输出。只输出原文,不要任何其他信息。
        4. 如果这封邮件没有任何有用信息,输出 "NOT_FOUND"。单一单词,不要附加任何额外信息
        5. 总之,你的输出要么是 “FOUND”,要么是 “NOT_FOUND”,要么是邮件中摘录的部分原文。任何其他内容都被认为是不专业的。
        """)

    @register_action(
        "从详细的系统日志记录中查找信息,补充遗忘的记忆",
        param_infos={
            "query": "想找什么?(自然语言描述)"
        }
    )
    async def search_in_diary(self, query):
        """Search the mail history for ``query`` without blocking the event loop.

        Delegates to ``sync_search_in_history`` on a worker thread and returns
        its result string.
        """
        return await asyncio.to_thread(self.sync_search_in_history, query=query)

    def sync_search_in_history(self, query):
        """Inspect up to ``_MAX_DIARY_MAILS`` mails one by one and ask the LLM about each.

        Args:
            query: Natural-language description of what to look for.

        Returns:
            The full text of the first mail the model marks as ``FOUND``;
            otherwise the collected partial excerpts; otherwise a
            "nothing found" message.
        """
        user_session_id = self.current_session.user_session_id
        partial_results = []

        for index in range(self._MAX_DIARY_MAILS):
            mails = self.post_office.get_mails_by_range(
                user_session_id, self.name, start=index, end=index + 1
            )
            if not mails:
                continue

            mail = mails[0]
            mail_text = str(mail)
            prompt = self._DIARY_SEARCH_PROMPT.format(query=query, mail_text=mail_text)
            messages = [{"role": "user", "content": prompt}]

            # NOTE(review): think() is called synchronously here (this method
            # runs in a worker thread) — confirm the backend's think() is not
            # a coroutine function.
            response = self.cerebellum.backend.think(messages=messages)
            reply_str = response['reply'].strip()

            # Models frequently wrap the verdict in quotes, add a trailing
            # newline or full stop, or change case; normalise before comparing
            # so that "NOT_FOUND\n" is not mistaken for a partial excerpt.
            verdict = reply_str.strip(' \t\r\n"\'“”.。').upper()

            if verdict == "FOUND":
                return f"\nFound relevant information in email\n{mail_text}\n"
            if verdict == "NOT_FOUND" or not reply_str:
                continue

            # Anything else is treated as an excerpt with partial information.
            partial_results.append(f"[{mail.timestamp}]:\n{reply_str}\n")

        if not partial_results:
            return "No relevant information found in diary."

        return "Found following partial information in diary:\n" + "".join(
            info + "\n" for info in partial_results
        )

    @register_action(
        "记笔记。记录关键信息",
        param_infos={
            "content": "具体的记忆内容 (自然语言)"
        }
    )
    async def take_note(self, content):
        """Append a note to the session's JSONL notebook and index it in the vector DB.

        Args:
            content: The note text (natural language).

        Returns:
            The string ``"Note saved."``.
        """
        # NOTE(review): the file name uses self.current_user_session_id while
        # the metadata below uses self.current_session.user_session_id —
        # confirm that both always refer to the same session.
        notebook_filepath = os.path.join(
            self.workspace_root, ".matrix", self.name, "notebook",
            f"{self.current_user_session_id}.jsonl"
        )
        # exist_ok=True already makes this idempotent; no exists() pre-check needed.
        os.makedirs(os.path.dirname(notebook_filepath), exist_ok=True)

        current_timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
        record_id = str(uuid.uuid4())

        note_record = {
            "id": record_id,
            "created_at": current_timestamp,  # used to judge how recent a note is
            "content": content,
        }

        # Persist the note; the encoding is explicit so the file does not
        # depend on the platform's locale default.
        with open(notebook_filepath, "a", encoding="utf-8") as f:
            f.write(json.dumps(note_record) + "\n")

        user_session_id = self.current_session.user_session_id

        # NOTE(review): ``metadatas`` is a single dict while documents and ids
        # are lists — confirm this is the shape vector_db.add_documents expects.
        await self.vector_db.add_documents(
            "notebook",
            [content],
            metadatas={
                "created_at": current_timestamp,
                "creator": self.name,
                "user_session_id": user_session_id,
            },
            ids=[record_id],
        )

        return "Note saved."

    @register_action(
        "Vector Search查笔记。查找以前记录的笔记",
        param_infos={
            "query": "想找什么?(自然语言描述)",
        }
    )
    async def search_notebook(self, query):
        """Vector-search the notes this agent wrote in the current user session.

        Args:
            query: Natural-language description of what to look for.

        Returns:
            A formatted list of up to five matching notes, or
            ``"No notes found."``.
        """
        user_session_id = self.current_session.user_session_id
        where = {
            "$and": [
                {"user_session_id": user_session_id},
                {"creator": self.name},
            ]
        }
        results = await self.vector_db.query("notebook", [query], where, n_results=5)

        if not (results['documents'] and results['documents'][0]):
            return "No notes found."

        # Built without textwrap.dedent: the original literal started on the
        # opening-quote line, so dedent never removed the source indentation.
        sections = [
            f"#### On : {metadata['created_at']} (Vector search distance: {distance} )\n\n{doc}\n\n"
            for doc, metadata, distance in zip(
                results['documents'][0], results['metadatas'][0], results['distances'][0]
            )
        ]
        return "Most relevant notes:\n" + "".join(sections)
154
+
155
+
156
+
157
+
158
+
@@ -0,0 +1,114 @@
1
+ # skills/project_management.py
2
+ import os
3
+ import time
4
+ import textwrap
5
+ from datetime import datetime
6
+ from ..core.action import register_action
7
+
8
class ProjectManagementMixin:
    """
    Gives an Agent project-manager abilities:

    1. Maintain a Project Board held on the instance (``project_board``).
    2. Actively compress context (memory compression), keeping only the
       first two history entries and the board.
    3. Archive the discarded history to a log file (audit trail).

    The methods read ``workspace_root``, ``name``, ``current_session`` and,
    when present, ``logger`` from the host object.
    """

    def _get_board(self):
        """Return the current board, initialising it with a placeholder if unset."""
        if getattr(self, 'project_board', None) is None:
            self.project_board = "Project Initialized. Waiting for scope definition."
        return self.project_board

    def _archive_history(self, old_history, reason):
        """[Side effect] Write the messages about to be dropped to an archive file.

        Args:
            old_history: Iterable of message dicts (``role`` / ``content``).
            reason: Free text recorded in the archive header.

        Does nothing when ``workspace_root`` is falsy.
        """
        if not self.workspace_root:
            return

        archive_dir = os.path.join(self.workspace_root, self.name, "logs", "archives")
        os.makedirs(archive_dir, exist_ok=True)

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"{self.name}_archive_{timestamp}.log"
        filepath = os.path.join(archive_dir, filename)

        # Go through _get_board() so archiving cannot fail with AttributeError
        # when the board has never been set on this instance.
        board_snapshot = self._get_board()

        with open(filepath, "w", encoding='utf-8') as f:
            f.write(f"=== ARCHIVE REASON: {reason} ===\n")
            f.write(f"=== BOARD STATE ===\n{board_snapshot}\n")
            f.write("=== CONVERSATION HISTORY ===\n")
            for msg in old_history:
                role = msg.get('role', '')
                content = msg.get('content', '')
                # Entries missing either field are skipped.
                if role and content:
                    f.write(f"[{role}]: {content}\n{'-'*20}\n")

        if hasattr(self, 'logger'):
            self.logger.info(f"Old memory archived to {filename}")

    @register_action(
        "阶段总结,更新项目看板,Markdown格式。项目状态发生重要变化后或者处理了比较多封邮件后需要阶段性的总结。保留所有关键信息",
        param_infos={
            "summary": "最新的全局项目状态总结",
        }
    )
    async def update_board(self, summary: str):
        """Replace the project board and, if the history is long, compress it.

        The board is always updated. Only when the session history holds more
        than 10 messages is everything after the first two entries archived
        and replaced by a single assistant message carrying the board.

        Args:
            summary: The latest global project status summary.

        Returns:
            ``"Project status reviewed."`` when the history was compressed,
            ``"Status is up to date"`` otherwise.
        """
        # 1. Update the in-memory board.
        self.project_board = summary
        session = self.current_session

        # Expected history layout at this point (per the original author):
        #   [0] system prompt
        #   [1] first user message (anchor task / incoming email)
        #   [2] first assistant message (intent + action)
        #   ... N rounds of dialogue ...
        #   [-1] assistant message whose action is this update_board call
        if len(session.history) <= 10:
            return "Status is up to date"

        # 2. Archive everything except the system prompt and the anchor so a
        #    bad summary can still be traced back to the original text.
        msgs_to_archive = session.history[2:]
        self._archive_history(msgs_to_archive, reason=f"Board Update: {summary[:20]}...")

        # 3. Build the compressed middle layer as an assistant message.
        #    The text is assembled directly: the board is multi-line Markdown,
        #    and textwrap.dedent on an interpolated string is a no-op as soon
        #    as one interpolated line starts at column 0.
        board_msg = {
            "role": "assistant",
            "content": (
                "\nLatest project status\n"
                "\n"
                "### 📌 CURRENT PROJECT BOARD\n"
                f"{self.project_board}\n"
                "\n"
            ),
        }

        # 4. Rebuild the history: [system prompt] + [anchor task] + [new board].
        #    The length check above guarantees both leading entries exist.
        session.history = session.history[:2] + [board_msg]

        # 5. Per the original author, this return value is appended to the
        #    history by BaseAgent and acts as the next activation signal.
        return "Project status reviewed."
109
+
110
+
111
+
112
+ # skills/project_management.py
113
+
114
+
@@ -0,0 +1,194 @@
1
+ import os
2
+ from pathlib import Path
3
+ from dataclasses import dataclass, field
4
+ from typing import List, AsyncIterator, Optional
5
+ import fitz # PyMuPDF
6
+ # 正确的 Marker 导入方式
7
+ from marker.models import load_all_models
8
+ from marker.convert import convert_single_pdf
9
+
10
+
11
+ import torch
12
+ import textwrap
13
+ from ..skills.report_writer_utils import *
14
+ from ..core.action import register_action
15
+
16
+
17
@dataclass
class ResearchState:
    """Research state passed along through the report-writing phases."""
    main_subject: str  # research subject
    main_purpose: str  # research purpose
    input_dir : str  # input directory (write_report passes its src_dir here)
    output_dir: str  # output directory
    blueprint: str = ""  # investigation blueprint (outline, core question list, planned chapters)
    concept_notes: str = ""  # concept notes (Markdown: entity definitions, relations, sources)
    draft_content: str = ""  # body draft (Markdown, filled in chapter by chapter)
    scrachpad: str = ""  # temporary scratchpad (Markdown) for interim notes; spelling (sic) is part of the field name
    processed_files: List[str] = field(default_factory=list)  # progress record
29
+
30
+
31
class ReportWriterSkillMixin:
    """
    Report-writing skill: builds a research blueprint and (eventually) a report.

    Only Phase 0 (blueprint generation) is implemented; the later phases are
    stubs. The methods read ``brain`` and ``logger`` from the host object.
    """

    async def ask_ai(self, prompt: str, sys_prompt: Optional[str] = None) -> str:
        """Send one prompt (with an optional system prompt) to the LLM.

        Args:
            prompt: The user message.
            sys_prompt: Optional system message placed before the user message.

        Returns:
            The ``'reply'`` field of the response from ``self.brain.think``.
        """
        messages = []
        if sys_prompt:
            messages.append({"role": "system", "content": sys_prompt})
        messages.append({"role": "user", "content": prompt})

        response = await self.brain.think(messages)
        # .get(): a response without a 'reasoning' entry must not turn a
        # debug log line into a KeyError.
        self.logger.debug(f"AI Think: {response.get('reasoning')}")
        self.logger.debug(f"AI Reply: {response['reply']}")
        return response['reply']

    @register_action(
        "编写报告blueprint,需要提供报告主题和研究目的说明,生成研究方案",
        param_infos={

            "main_subject": "报告主题",
            "main_purpose": "研究目的,要解决或者研究的问题",
            "src_dir": "可选非必须,输入资料目录",
            "output_dir": "可选非必须,报告输出目录",

        }
    )
    async def write_report(self, main_subject: str, main_purpose: str, src_dir: str = ".", output_dir: str = "."):
        """Generate the research blueprint for a report.

        Intended end state (per the original author): read the material in
        ``src_dir``, write a report on ``main_subject`` for ``main_purpose``
        and save it to ``output_dir``. Currently only Phases 0 and 1 run.

        Returns:
            The generated blueprint text.
        """
        # Initialise the state.
        state = ResearchState(main_subject=main_subject, main_purpose=main_purpose,
                              input_dir=src_dir, output_dir=output_dir)

        # Phase 0: prior generation (fills state.blueprint in place).
        await self._phase0_theorist(state)

        # Phase 1: scouting and calibration. The phase is still a stub that
        # returns None, so keep the current state unless it yields a new one.
        scouted = await self._phase1_scout(state, src_dir)
        if scouted is not None:
            state = scouted

        # TODO: enable once the later phases are implemented:
        #   Phase 2: state = await self._phase2_execution_loop(state, src_dir)
        #   Phase 3: final_report = await self._phase3_finalizer(state)
        #   then:    self._save_final_report(final_report, output_dir)

        # Hand the blueprint back to the caller; previously the action
        # produced it and then returned nothing.
        return state.blueprint

    # ========== Phase 0: The Theorist ==========
    async def _phase0_theorist(self, state: ResearchState) -> ResearchState:
        """Generate a persona, then a blueprint, and store the blueprint on ``state``.

        Returns:
            The same ``state`` object with ``blueprint`` filled in, matching
            the declared return type and the other phase methods.
        """
        main_subject = state.main_subject
        main_purpose = state.main_purpose

        # Step 1: generate the persona prompt.
        persona_prompt = (
            textwrap.dedent(PERSONA_DESIGNER)
            .replace("{{main_subject}}", main_subject)
            .replace("{{main_purpose}}", main_purpose)
        )
        persona = await self.ask_ai(persona_prompt)

        # Step 2: generate the blueprint using that persona as system prompt.
        sys_prompt = textwrap.dedent(BLUEPRINT_DESIGNER).replace("{{persona}}", persona)
        user_input = (
            "\n"
            "请基于以下主题和目标,生成一份 **Deep Research Blueprint**。\n"
            "\n"
            "### 用户输入\n"
            f"**研究主题**: {main_subject}\n"
            "**研究动机/目标**:\n"
            f"{main_purpose}\n"
        )
        state.blueprint = await self.ask_ai(user_input, sys_prompt)

        return state

    # ========== Phase 1: The Scout ==========
    async def _phase1_scout(self, state: ResearchState, src_dir: str) -> ResearchState:
        """Sample documents and revise the blueprint. Not implemented yet (returns None)."""
        pass

    async def _sample_and_analyze(self, state: ResearchState, src_dir: str, sample_count: int = 5):
        """Sample and analyse documents in parallel, producing Delta Reports. Not implemented yet."""
        pass

    async def _synthesize_blueprint(self, state: ResearchState, delta_reports: List[str]) -> str:
        """Editor synthesis: merge Delta Reports into a dedicated blueprint. Not implemented yet."""
        pass

    # ========== Phase 2: The Execution Loop ==========
    async def _phase2_execution_loop(self, state: ResearchState, src_dir: str) -> ResearchState:
        """Stream through all documents, iteratively updating notes and draft."""
        async for chunk in self._document_stream_generator(src_dir):
            # Step A: update the knowledge base.
            state.concept_notes = await self._update_concept_notes(state.concept_notes, chunk)

            # Step B: update the draft.
            state.draft_content = await self._update_draft(
                state.blueprint, state.draft_content, state.concept_notes, chunk
            )

        return state

    async def _update_concept_notes(self, current_notes: str, new_text: str) -> str:
        """Identify new entities/definitions or update existing ones. Not implemented yet."""
        pass

    async def _update_draft(self, blueprint: str, current_draft: str, notes: str, new_text: str) -> str:
        """Decide whether new_text answers blueprint questions and merge it into the draft. Not implemented yet."""
        pass

    # ========== Phase 3: The Finalizer ==========
    async def _phase3_finalizer(self, state: ResearchState) -> str:
        """Polish the body text and generate appendices. Not implemented yet."""
        pass

    # ========== Utility Functions ==========
    async def _document_stream_generator(self, src_dir: str) -> AsyncIterator[str]:
        """Document stream generator: wraps file reading behind a uniform chunk stream.

        Not implemented yet; currently yields nothing.
        """
        # The unreachable ``yield`` makes this an async generator function, so
        # ``async for`` over its result works instead of raising TypeError
        # on a plain coroutine.
        return
        yield

    def _save_checkpoint(self, state: ResearchState, output_dir: str, phase_name: str):
        """Save a checkpoint of the state as Markdown at the end of a phase. Not implemented yet."""
        pass

    def _load_checkpoint(self, checkpoint_path: str) -> Optional[ResearchState]:
        """Restore a state from a Markdown checkpoint file. Not implemented yet."""
        pass

    def _save_final_report(self, report: str, output_dir: str):
        """Save the final report. Not implemented yet."""
        pass
161
+
162
+
163
+ ### 如何使用这个函数
164
+
165
+
166
if __name__ == '__main__':
    # ==================== Usage example ====================
    # There is no ``self`` at module scope, so log through a module-level
    # logger; the original ``self.logger`` calls raised NameError.
    import logging

    logging.basicConfig(level=logging.DEBUG)
    _example_logger = logging.getLogger(__name__)

    # Path of the PDF to convert and the output directory.
    # Replace 'path/to/your/document.pdf' with a real file path.
    my_pdf = 'path/to/your/document.pdf'
    output_directory = 'output'

    # (banner, extra keyword arguments) for each example:
    #   1. convert the whole PDF (no start_page / end_page),
    #   2. convert pages 2-3 only,
    #   3. convert page 5 only.
    _examples = [
        ("--- 任务1: 转换整个 PDF ---", {}),
        ("\n--- 任务2: 转换 PDF 的第 2-3 页 ---", {"start_page": 2, "end_page": 3}),
        ("\n--- 任务3: 只转换 PDF 的第 5 页 ---", {"start_page": 5, "end_page": 5}),
    ]

    for _banner, _page_range in _examples:
        _example_logger.debug(_banner)
        if Path(my_pdf).exists():
            # NOTE(review): convert_pdf_to_markdown is presumably provided by
            # the star import from report_writer_utils — confirm.
            convert_pdf_to_markdown(my_pdf, output_directory, **_page_range)
        else:
            _example_logger.debug(f"示例文件 '{my_pdf}' 不存在,请修改路径后重试。")