matrix-for-agents 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentmatrix/__init__.py +20 -0
- agentmatrix/agents/__init__.py +1 -0
- agentmatrix/agents/base.py +572 -0
- agentmatrix/agents/claude_coder.py +10 -0
- agentmatrix/agents/data_crawler.py +14 -0
- agentmatrix/agents/post_office.py +212 -0
- agentmatrix/agents/report_writer.py +14 -0
- agentmatrix/agents/secretary.py +10 -0
- agentmatrix/agents/stateful.py +10 -0
- agentmatrix/agents/user_proxy.py +82 -0
- agentmatrix/agents/worker.py +30 -0
- agentmatrix/backends/__init__.py +1 -0
- agentmatrix/backends/llm_client.py +414 -0
- agentmatrix/backends/mock_llm.py +35 -0
- agentmatrix/cli_runner.py +94 -0
- agentmatrix/core/__init__.py +0 -0
- agentmatrix/core/action.py +50 -0
- agentmatrix/core/browser/bing.py +208 -0
- agentmatrix/core/browser/browser_adapter.py +298 -0
- agentmatrix/core/browser/browser_common.py +85 -0
- agentmatrix/core/browser/drission_page_adapter.py +1296 -0
- agentmatrix/core/browser/google.py +230 -0
- agentmatrix/core/cerebellum.py +121 -0
- agentmatrix/core/events.py +22 -0
- agentmatrix/core/loader.py +185 -0
- agentmatrix/core/loader_v1.py +146 -0
- agentmatrix/core/log_util.py +158 -0
- agentmatrix/core/message.py +32 -0
- agentmatrix/core/prompt_engine.py +30 -0
- agentmatrix/core/runtime.py +211 -0
- agentmatrix/core/session.py +20 -0
- agentmatrix/db/__init__.py +1 -0
- agentmatrix/db/database.py +79 -0
- agentmatrix/db/vector_db.py +213 -0
- agentmatrix/docs/Design.md +109 -0
- agentmatrix/docs/Framework Capbilities.md +105 -0
- agentmatrix/docs/Planner Design.md +148 -0
- agentmatrix/docs/crawler_flow.md +110 -0
- agentmatrix/docs/report_writer.md +83 -0
- agentmatrix/docs/review.md +99 -0
- agentmatrix/docs/skill_design.md +23 -0
- agentmatrix/profiles/claude_coder.yml +40 -0
- agentmatrix/profiles/mark.yml +26 -0
- agentmatrix/profiles/planner.yml +21 -0
- agentmatrix/profiles/prompts/base.txt +88 -0
- agentmatrix/profiles/prompts/base_v1.txt +101 -0
- agentmatrix/profiles/prompts/base_v2.txt +94 -0
- agentmatrix/profiles/tom_the_data_crawler.yml +38 -0
- agentmatrix/profiles/user_proxy.yml +17 -0
- agentmatrix/skills/__init__.py +1 -0
- agentmatrix/skills/crawler_helpers.py +315 -0
- agentmatrix/skills/data_crawler.py +777 -0
- agentmatrix/skills/filesystem.py +204 -0
- agentmatrix/skills/notebook.py +158 -0
- agentmatrix/skills/project_management.py +114 -0
- agentmatrix/skills/report_writer.py +194 -0
- agentmatrix/skills/report_writer_utils.py +379 -0
- agentmatrix/skills/search_tool.py +383 -0
- agentmatrix/skills/terminal_ctrl.py +122 -0
- agentmatrix/skills/utils.py +33 -0
- agentmatrix/skills/web_searcher.py +1107 -0
- matrix_for_agents-0.1.2.dist-info/METADATA +44 -0
- matrix_for_agents-0.1.2.dist-info/RECORD +66 -0
- matrix_for_agents-0.1.2.dist-info/WHEEL +5 -0
- matrix_for_agents-0.1.2.dist-info/licenses/LICENSE +190 -0
- matrix_for_agents-0.1.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,383 @@
import os
import asyncio
import urllib.parse
from bs4 import BeautifulSoup
from curl_cffi.requests import AsyncSession
import xml.etree.ElementTree as ET  # used to parse the ArXiv API
from urllib.parse import parse_qs, urlparse
import base64

class SmartSearcherMixin:
    def _clean_bing_url(self, url: str) -> str:
        """
        [Plan A] Decode Bing's redirect links locally, bypassing the 403s and the extra network request entirely.
        """
        if "bing.com/ck/a" not in url:
            return url

        try:
            parsed = urlparse(url)
            query = parse_qs(parsed.query)
            # Extract the encoded 'u' parameter
            u_param = query.get('u', [''])[0]

            if not u_param:
                return url

            # Bing's encoding scheme is typically: "a1" + Base64
            # 1. Strip the "a1" prefix
            if u_param.startswith('a1'):
                u_param = u_param[2:]

            # 2. Restore the Base64 padding (length must be a multiple of 4)
            padding = len(u_param) % 4
            if padding > 0:
                u_param += '=' * (4 - padding)

            # 3. Base64-decode
            # Note: Bing uses URL-safe Base64
            decoded_bytes = base64.urlsafe_b64decode(u_param)
            real_url = decoded_bytes.decode('utf-8')

            self.logger.info(f"🔓 Decoded Bing URL: {real_url}")
            return real_url

        except Exception as e:
            self.logger.warning(f"Bing URL decode failed: {e}. Fallback to direct request.")
            # On decode failure, return the original URL and let the later Plan B try it
            return url

    def _is_chinese_query(self, text: str) -> bool:
        """
        [Helper] Detect whether the query contains Chinese characters.
        Even a single CJK character suggests the user wants Chinese-language results.
        """
        for char in text:
            if '\u4e00' <= char <= '\u9fff':
                return True
        return False

    # === 1. Google Search (VIP, requires a proxy) ===
    async def _search_google(self, session, query, limit=10):
        """
        [Hard Mode] Scrape Google under browser camouflage.
        Google's anti-bot defenses are severe; high-quality IPs and fingerprints are required.
        """
        # Google's result markup changes often, but the h3 -> a structure is relatively stable
        base_url = "https://www.google.com/search"
        params = {"q": query, "num": str(limit + 5)}

        # Language adaptation
        if self._is_chinese_query(query):
            params["hl"] = "zh-CN"  # interface language
            params["gl"] = "sg"     # geolocation (optional, if you want to force purely Chinese results)
            # Setting only hl=zh-CN is usually enough; Google is smart about mixing results on its own
        else:
            params["hl"] = "en"
            params["gl"] = "us"  # geolocation preference

        try:
            # Headers are mandatory; without them Google assumes the request is a script
            headers = {
                #"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
                "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8" if self._is_chinese_query(query) else "en-US,en;q=0.9",
                "Referer": "https://www.google.com/"
            }

            resp = await session.get(base_url, params=params, headers=headers, timeout=15)

            if resp.status_code == 429:
                self.logger.warning("Google Search: CAPTCHA triggered (429). Proxy might be dirty.")
                return []
            if resp.status_code != 200:
                self.logger.warning(f"Google Search failed: {resp.status_code}")
                return []

            soup = BeautifulSoup(resp.content, "lxml")
            results = []

            # Standard Google results usually live inside div.g
            for g in soup.select("div.g"):
                if len(results) >= limit:
                    break
                try:
                    # Extract the title and link
                    h3 = g.select_one("h3")
                    link = g.select_one("a")

                    if h3 and link and link.has_attr("href"):
                        title = h3.get_text()
                        href = link["href"]

                        # Skip Google-internal links such as related-search suggestions
                        if href.startswith("/search") or "google.com" in href:
                            continue

                        # Try to extract the snippet (Google's snippet classes are a mess;
                        # crude but effective: take the first sizeable text block after the h3)
                        snippet = "No snippet"
                        text_divs = g.select("div[style*='-webkit-line-clamp']")  # snippets often carry this style
                        if not text_divs:
                            # Fallback: scan every span inside the card
                            text_divs = g.select("div span")

                        for t in text_divs:
                            txt = t.get_text()
                            if len(txt) > 20:  # long enough to be the snippet
                                snippet = txt
                                break

                        results.append({"title": title, "href": href, "body": snippet, "source": "Google"})
                except Exception:
                    continue

            return results
        except Exception as e:
            self.logger.warning(f"Google Scrape Exception: {e}")
            return []

    # === 2. Bing Search (fallback, directly reachable from mainland China) ===
    async def _search_bing(self, session, query, limit=20):
        """
        [Easy Mode] Tuned Bing search.
        Forces international-index parameters to avoid being redirected to the domestic edition.
        """
        base_url = "https://cn.bing.com/search"
        # 1. Base configuration: hit the US servers for the full Global Index.
        #    'cc=US' physically tells Bing "I want the international index",
        #    preventing a redirect to the stripped-down regional edition.

        # Add any content farms you dislike to this blacklist
        blacklist = [
            "zhihu.com",            # Zhihu main site
            "zhuanlan.zhihu.com",   # Zhihu columns
            "csdn.net",             # CSDN (reposts / paywalled answers, uneven quality)
            "baidu.com",            # blocks Baidu Zhidao, Tieba, Wenku, etc.
            "bilibili.com",         # video site (the crawler cannot process video)
            "sohu.com",             # Sohu accounts (mostly marketing content)
            "baijiahao.baidu.com",  # Baijiahao
            "zhidao.baidu.com"
        ]

        # Build the exclusion string: " -site:zhihu.com -site:csdn.net ..."
        exclusion_str = " " + " ".join([f"-site:{domain}" for domain in blacklist])

        # === 2. Inject it into the query ===
        tuned_query = query + exclusion_str

        params = {
            "q": tuned_query,
        }

        '''
        # 2. Language adaptation: derive the market preference from the query
        if self._is_chinese_query(query):
            self.logger.info(f"🇨🇳 Detected Chinese Query: '{query}'. Tuning for Chinese results.")
            # Explicitly tell Bing: even though we hit the US servers, prefer Chinese content
            params["setlang"] = "zh-CN"
            # setmkt can be left unset for natural matching, or set to zh-CN.
            # Caveat: setmkt=zh-CN sometimes gets force-redirected back to cn.bing.com,
            # so the conservative strategy is to set setlang only and leave setmkt unset.
        else:
            self.logger.info(f"🇺🇸 Detected Non-Chinese Query: '{query}'. Tuning for Global/English results.")
            params["setlang"] = "en"
            params["setmkt"] = "en-US"  # for English queries the US market yields the best results
        '''
        try:
            # Headers matter; without them Bing may hard-lock the language based on the IP
            headers = {
                #"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
                "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8" if self._is_chinese_query(query) else "en-US,en;q=0.9"
            }

            #resp = await session.get(base_url, params=params, headers=headers, timeout=10)
            resp = await session.get(base_url, params=params, timeout=10)

            if resp.status_code != 200:
                self.logger.warning(f"Bing Search Error: {resp.status_code}")
                self.logger.warning(f"Response Headers: {resp.headers}")
                return []

            soup = BeautifulSoup(resp.content, "lxml")
            results = []

            for item in soup.select("li.b_algo"):
                if len(results) >= limit:
                    break
                try:
                    h2 = item.select_one("h2 a")
                    if not h2:
                        continue
                    raw_href = h2['href']
                    clean_href = self._clean_bing_url(raw_href)

                    results.append({
                        "title": h2.get_text(),
                        "href": clean_href,  # store the cleaned URL
                        "body": item.select_one(".b_caption p").get_text() if item.select_one(".b_caption p") else "",
                        "source": "Bing"
                    })
                except Exception:
                    continue
            return results
        except Exception as e:
            self.logger.error(f"Bing Search Error: {e}")
            return []

    # === 3. ArXiv API (STEM boost) ===
    async def _search_arxiv(self, session, query, limit=5):
        """
        [Bonus] For STEM topics, query the ArXiv API directly.
        An extremely high-quality source with zero content-farm noise.
        """
        # The ArXiv API needs no proxy and no cookies; it is very stable
        api_url = "http://export.arxiv.org/api/query"
        # Minimal preprocessing: URL-encode the query
        safe_query = urllib.parse.quote(query)
        params = f"search_query=all:{safe_query}&start=0&max_results={limit}"

        try:
            # ArXiv is fast and needs no camouflage; a plain GET is enough
            resp = await session.get(f"{api_url}?{params}", timeout=10)
            if resp.status_code != 200:
                return []

            # Parse the XML
            root = ET.fromstring(resp.content)
            results = []
            # XML namespace
            ns = {'atom': 'http://www.w3.org/2005/Atom'}

            for entry in root.findall('atom:entry', ns):
                title = entry.find('atom:title', ns).text.replace('\n', ' ').strip()
                summary = entry.find('atom:summary', ns).text.replace('\n', ' ').strip()[:200]
                link = entry.find('atom:id', ns).text  # the ArXiv ID URL

                # Prefer the PDF link when one is present
                pdf_link = link
                for l in entry.findall('atom:link', ns):
                    if l.attrib.get('title') == 'pdf':
                        pdf_link = l.attrib['href']

                results.append({
                    "title": f"[ArXiv] {title}",
                    "href": pdf_link,
                    "body": summary,
                    "source": "ArXiv API"
                })
            return results
        except Exception as e:
            self.logger.warning(f"ArXiv Search Error: {e}")
            return []

    async def _search_baidu_xueshu(self, session, query, limit=5):
        """
        [CN Scholar] Baidu Xueshu (Baidu Scholar) search.
        Aggregates metadata from CNKI, Wanfang, and others, and surfaces some free PDF links.
        """
        base_url = "https://xueshu.baidu.com/s"
        params = {"wd": query, "tn": "SE_baiduxueshu_c1gjeupa", "ie": "utf-8"}

        try:
            # Baidu Xueshu is sensitive to headers; emulate a full browser header set
            headers = {
                #"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/110.0.0.0 Safari/537.36",
                "Referer": "https://xueshu.baidu.com/"
            }

            resp = await session.get(base_url, params=params, headers=headers, timeout=10)
            if resp.status_code != 200:
                return []

            soup = BeautifulSoup(resp.content, "lxml")
            results = []

            # Baidu Xueshu result cards are usually div.sc_content
            for item in soup.select("div.sc_content"):
                if len(results) >= limit:
                    break
                try:
                    # 1. Title and detail-page link
                    h3 = item.select_one("h3.t a")
                    if not h3:
                        continue

                    title = h3.get_text().strip()
                    # This is the Baidu Xueshu detail-page link
                    detail_url = h3['href']

                    # 2. Abstract
                    abstract_div = item.select_one("div.c_abstract")
                    snippet = abstract_div.get_text().strip() if abstract_div else "No abstract"

                    # 3. Source info (CNKI, Wanfang, or a direct PDF link) occasionally appears
                    #    on the card, but usually lives on the detail page. For an Agent, the
                    #    safer strategy is to hand over the detail page and let the recursive
                    #    crawler dig out the download link.

                    results.append({
                        "title": f"[Baidu Xueshu] {title}",
                        "href": detail_url,  # note: an intermediate page; the Hunter must deep-hunt into it
                        "body": snippet,
                        "source": "Baidu Xueshu"
                    })
                except Exception:
                    continue
            return results
        except Exception as e:
            self.logger.warning(f"Baidu Xueshu Error: {e}")
            return []

    # === 4. Smart routing entry point ===
    async def _smart_search_entry(self, query: str, limit: int = 20, domain: str = "STEM"):
        """
        [Master Controller] Decide which search path to take.
        """
        # Detect a proxy
        proxy = (os.environ.get("HTTP_PROXY") or os.environ.get("HTTPS_PROXY")
                 or os.environ.get("ALL_PROXY") or os.environ.get("http_proxy")
                 or os.environ.get("https_proxy") or os.environ.get("all_proxy"))
        has_proxy = proxy is not None and len(proxy) > 0
        is_chinese = self._is_chinese_query(query)
        should_search_arxiv = (domain == "STEM") and (not is_chinese)
        all_results = []

        # impersonate="chrome" (a chrome110-class fingerprint) is the most reliable choice.
        # With a proxy configured, curl_cffi picks up the environment variables
        # automatically; proxies=... can also be passed explicitly.
        async with AsyncSession(impersonate="chrome") as session:

            # --- Strategy A: proxy first (Google) ---
            if has_proxy:
                self.logger.info(f"🌍 Proxy detected. Attempting Google Search for: {query}")
                google_results = await self._search_google(session, query, limit)
                if google_results:
                    all_results.extend(google_results)
                else:
                    self.logger.warning("Google failed despite proxy. Falling back to Bing.")

            # --- Strategy B: fallback / direct connection (Bing) ---
            # If Google returned nothing, or there is no proxy, fill in with Bing
            if not all_results:
                self.logger.info(f"🌏 Using Bing Search (Global Mode) for: {query}")
                bing_results = await self._search_bing(session, query, limit)
                all_results.extend(bing_results)

            # --- Strategy C: domain boost (ArXiv) ---
            # For STEM queries, inject ArXiv results (the API is usually reachable
            # directly from mainland China; if not, it goes through the proxy)
            if should_search_arxiv:
                self.logger.info("🧪 STEM domain detected. Injecting ArXiv results...")
                # ArXiv results are few but precise; 3-5 are enough
                arxiv_results = await self._search_arxiv(session, query, limit=5)
                # Prepend the ArXiv results: they are the highest quality
                all_results = arxiv_results + all_results

            # === Strategy D: Chinese academic boost (Baidu Xueshu) ===
            # For Chinese queries in STEM or the humanities
            # (history/literature need scholarly search even more)
            if is_chinese and domain in ["STEM", "HUMANITIES"]:
                self.logger.info("📚 Chinese Academic query detected. Injecting Baidu Xueshu results...")

                # Baidu Xueshu results are usually precise; 3-5 are enough
                baidu_results = await self._search_baidu_xueshu(session, query, limit=5)

                # Prepend them so they get high priority
                all_results = baidu_results + all_results

        # Simple deduplication keyed on href
        seen_urls = set()
        unique_results = []
        for r in all_results:
            if r['href'] not in seen_urls:
                seen_urls.add(r['href'])
                unique_results.append(r)

        self.logger.info(f"✅ Smart Search finished. Found {len(unique_results)} URLs from {[r['source'] for r in unique_results[:3]]}...")
        return unique_results[:limit]  # return the requested number
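# --- Editor's sketch (illustrative only, not part of the published wheel): a
# --- minimal host for SmartSearcherMixin. The mixin assumes the host class
# --- supplies `self.logger`; `DemoSearcher` and the sample query are hypothetical.
import asyncio
import logging

class DemoSearcher(SmartSearcherMixin):
    logger = logging.getLogger("demo")

async def _demo():
    searcher = DemoSearcher()
    # Offline decode: strip "a1", re-pad, then
    # urlsafe_b64decode("aHR0cHM6Ly9leGFtcGxlLmNvbS8=") -> "https://example.com/"
    print(searcher._clean_bing_url(
        "https://www.bing.com/ck/a?u=a1aHR0cHM6Ly9leGFtcGxlLmNvbS8"))
    # Routes to Bing without a proxy, to Google with one, and injects ArXiv
    # results because the query is non-Chinese STEM
    results = await searcher._smart_search_entry("graph neural networks", limit=10, domain="STEM")
    for r in results[:3]:
        print(r["source"], "-", r["title"])

if __name__ == "__main__":
    asyncio.run(_demo())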
@@ -0,0 +1,122 @@
# skills/terminal_ctrl.py
import libtmux
import os
import time
import platform
import subprocess
import random

from ..core.log_util import AutoLoggerMixin
from ..core.action import register_action


class TerminalSkillMixin:
    """
    Gives an Agent the ability to drive a tmux terminal.
    Supports: creating sessions, typing commands, reading the screen, popping up a window.
    """

    def _get_server(self):
        # Get (or start) the tmux server
        return libtmux.Server()

    def _get_session(self, session_name="matrix_coder"):
        server = self._get_server()
        try:
            session = server.sessions.get(session_name=session_name)
        except Exception:
            session = None

        if not session:
            # Create a new session with the default shell
            session = server.new_session(session_name=session_name)
        return session

    def _get_pane(self, session_name="matrix_coder"):
        session = self._get_session(session_name)
        return session.windows[0].panes[0]

    @register_action("Initialize and pop up a visible terminal window. Call this only once.", param_infos={
        "session_name": "Session name; defaults to matrix_coder"
    })
    async def launch_terminal_window(self, session_name="matrix_coder"):
        """
        Pop up a terminal window at the host-OS level and attach it to the given
        tmux session, so the user can watch what the Agent is doing.
        """
        # 1. Make sure the session exists
        self._get_session(session_name)

        # Resolve to an absolute path
        workspace_root = os.path.abspath(self.workspace_root)

        system = platform.system()

        try:
            if system == "Darwin":  # macOS
                # Use AppleScript to open Terminal.app and run tmux attach.
                # Hacky, but effective.
                script = f'''
                tell application "Terminal"
                    do script "cd {workspace_root} && tmux attach -t {session_name}"
                    activate
                end tell
                '''
                subprocess.run(["osascript", "-e", script])
                return f"Success: MacOS Terminal launched attached to '{session_name}'."

            elif system == "Linux":
                # Try gnome-terminal (Ubuntu/Debian);
                # other distributions may need a different terminal emulator
                subprocess.Popen(["gnome-terminal", "--", "bash", "-c", f"cd {workspace_root} && exec tmux attach -t {session_name}"])
                return f"Success: Linux Terminal launched attached to '{session_name}'."

            else:
                return "Error: Unsupported OS for auto-launching window. Please manually run 'tmux attach -t matrix_coder'."

        except Exception as e:
            return f"Error launching window: {str(e)}"

    @register_action("Send a command or text to the terminal (e.g. start claude, or answer y/n).", param_infos={
        "command": "The text to type",
        "press_enter": "Whether to append Enter at the end; defaults to True"
    })
    async def send_terminal_command(self, command: str, press_enter: bool = True):
        pane = self._get_pane()
        for char in command:
            pane.send_keys(char, enter=False)
            # triangular(low, high, mode): the delay is most likely to fall near 0.05s
            time.sleep(random.triangular(0.05, 0.5, 0.05))

        if press_enter:
            # tmux interprets "Enter" as the Enter key; enter=False avoids sending it twice
            pane.send_keys("Enter", enter=False)

        # Give the terminal a moment to react
        time.sleep(1)
        return f"Command '{command}' sent."

    @register_action("Read the current contents of the terminal screen (a snapshot).", param_infos={
        "lines": "How many recent lines to read; defaults to 20"
    })
    async def read_terminal_screen(self, lines: int = 20):
        pane = self._get_pane()
        # capture_pane returns a list of lines
        output_lines = pane.capture_pane()

        # Take the last N lines
        recent_output = output_lines[-int(lines):]
        content = "\n".join(recent_output)

        return f"=== TERMINAL SCREEN ===\n{content}\n======================="
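# --- Editor's sketch (illustrative only, not part of the published wheel):
# --- driving TerminalSkillMixin outside the agent runtime. `DemoTerminal` and
# --- its `workspace_root` are hypothetical; a local tmux install is assumed.
import asyncio

class DemoTerminal(TerminalSkillMixin):
    workspace_root = "."  # launch_terminal_window resolves this to an absolute path

async def _demo():
    term = DemoTerminal()
    print(await term.launch_terminal_window())            # window attached to 'matrix_coder'
    print(await term.send_terminal_command("echo hello"))
    print(await term.read_terminal_screen(lines=10))

if __name__ == "__main__":
    asyncio.run(_demo())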
@@ -0,0 +1,33 @@
import re

def sanitize_filename(name: str, max_length: int = 200) -> str:
    """
    Sanitize a string into a legal file/directory name while preserving Chinese characters.

    Rules:
    1. Remove characters that are illegal on Windows/Linux
    2. Remove invisible characters (newlines, tabs, etc.)
    3. Strip leading/trailing spaces and dots (Windows rejects names ending in a dot or space)
    4. Truncate the length to avoid overly long paths
    """
    if not name:
        return "untitled"

    # 1. Replace filesystem-illegal characters with underscores
    # Illegal on Windows: < > : " / \ | ? *
    name = re.sub(r'[<>:"/\\|?*]', '_', name)

    # 2. Replace invisible control characters (\n, \r, \t, ...) with spaces
    name = "".join(ch if ch.isprintable() else " " for ch in name)

    # 3. Collapse runs of whitespace or underscores into a single underscore (cosmetic)
    name = re.sub(r'[\s_]+', '_', name)

    # 4. Strip leading/trailing spaces and dots (Windows names cannot end with a dot)
    name = name.strip(' .')

    # 5. If nothing survives (e.g. the name was all illegal characters), use a default
    if not name:
        name = "untitled_file"

    # 6. Truncate (filesystems usually cap names at 255 bytes; 200 characters leaves headroom for the rest of the path)
    return name[:max_length]
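# --- Editor's sketch (illustrative only, not part of the published wheel):
# --- expected behaviour of sanitize_filename on representative inputs.
if __name__ == "__main__":
    print(sanitize_filename('Attention <Is> All: "You" Need?'))  # Attention_Is_All_You_Need_
    print(sanitize_filename("深度学习: 综述?"))                  # 深度学习_综述_  (Chinese preserved)
    print(sanitize_filename(""))                                 # untitled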