matrix_for_agents-0.1.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentmatrix/__init__.py +20 -0
- agentmatrix/agents/__init__.py +1 -0
- agentmatrix/agents/base.py +572 -0
- agentmatrix/agents/claude_coder.py +10 -0
- agentmatrix/agents/data_crawler.py +14 -0
- agentmatrix/agents/post_office.py +212 -0
- agentmatrix/agents/report_writer.py +14 -0
- agentmatrix/agents/secretary.py +10 -0
- agentmatrix/agents/stateful.py +10 -0
- agentmatrix/agents/user_proxy.py +82 -0
- agentmatrix/agents/worker.py +30 -0
- agentmatrix/backends/__init__.py +1 -0
- agentmatrix/backends/llm_client.py +414 -0
- agentmatrix/backends/mock_llm.py +35 -0
- agentmatrix/cli_runner.py +94 -0
- agentmatrix/core/__init__.py +0 -0
- agentmatrix/core/action.py +50 -0
- agentmatrix/core/browser/bing.py +208 -0
- agentmatrix/core/browser/browser_adapter.py +298 -0
- agentmatrix/core/browser/browser_common.py +85 -0
- agentmatrix/core/browser/drission_page_adapter.py +1296 -0
- agentmatrix/core/browser/google.py +230 -0
- agentmatrix/core/cerebellum.py +121 -0
- agentmatrix/core/events.py +22 -0
- agentmatrix/core/loader.py +185 -0
- agentmatrix/core/loader_v1.py +146 -0
- agentmatrix/core/log_util.py +158 -0
- agentmatrix/core/message.py +32 -0
- agentmatrix/core/prompt_engine.py +30 -0
- agentmatrix/core/runtime.py +211 -0
- agentmatrix/core/session.py +20 -0
- agentmatrix/db/__init__.py +1 -0
- agentmatrix/db/database.py +79 -0
- agentmatrix/db/vector_db.py +213 -0
- agentmatrix/docs/Design.md +109 -0
- agentmatrix/docs/Framework Capbilities.md +105 -0
- agentmatrix/docs/Planner Design.md +148 -0
- agentmatrix/docs/crawler_flow.md +110 -0
- agentmatrix/docs/report_writer.md +83 -0
- agentmatrix/docs/review.md +99 -0
- agentmatrix/docs/skill_design.md +23 -0
- agentmatrix/profiles/claude_coder.yml +40 -0
- agentmatrix/profiles/mark.yml +26 -0
- agentmatrix/profiles/planner.yml +21 -0
- agentmatrix/profiles/prompts/base.txt +88 -0
- agentmatrix/profiles/prompts/base_v1.txt +101 -0
- agentmatrix/profiles/prompts/base_v2.txt +94 -0
- agentmatrix/profiles/tom_the_data_crawler.yml +38 -0
- agentmatrix/profiles/user_proxy.yml +17 -0
- agentmatrix/skills/__init__.py +1 -0
- agentmatrix/skills/crawler_helpers.py +315 -0
- agentmatrix/skills/data_crawler.py +777 -0
- agentmatrix/skills/filesystem.py +204 -0
- agentmatrix/skills/notebook.py +158 -0
- agentmatrix/skills/project_management.py +114 -0
- agentmatrix/skills/report_writer.py +194 -0
- agentmatrix/skills/report_writer_utils.py +379 -0
- agentmatrix/skills/search_tool.py +383 -0
- agentmatrix/skills/terminal_ctrl.py +122 -0
- agentmatrix/skills/utils.py +33 -0
- agentmatrix/skills/web_searcher.py +1107 -0
- matrix_for_agents-0.1.2.dist-info/METADATA +44 -0
- matrix_for_agents-0.1.2.dist-info/RECORD +66 -0
- matrix_for_agents-0.1.2.dist-info/WHEEL +5 -0
- matrix_for_agents-0.1.2.dist-info/licenses/LICENSE +190 -0
- matrix_for_agents-0.1.2.dist-info/top_level.txt +1 -0
agentmatrix/core/browser/bing.py
@@ -0,0 +1,208 @@
import time
import traceback


async def extract_search_results(adapter, tab):
    """
    Extract search results from a Bing search results page.

    Args:
        adapter: DrissionPageAdapter instance
        tab: Current browser tab handle

    Returns:
        List of dictionaries containing title, url, and snippet for each search result
    """
    print("\n4. Extracting search results...")
    results = []

    try:
        # Wait for search results to load
        print("   Waiting for search results to load...")
        time.sleep(3)  # Give the results time to appear

        # Use DrissionPage's ele/eles syntax to find search result elements.
        # Bing search results are typically in li.b_algo elements.
        search_result_elements = tab.eles('@@tag()=li@@class=b_algo')

        print(f"   Found {len(search_result_elements)} search result elements")

        for idx, element in enumerate(search_result_elements):
            try:
                # Extract title and URL from the h2 element containing a link
                title_element = element.ele('@tag()=h2')
                if not title_element:
                    print(f"   No h2 found in element {idx + 1}")
                    continue

                # Find the link within the h2
                link_element = title_element.ele('@tag()=a')
                if not link_element:
                    print(f"   No link found in h2 of element {idx + 1}")
                    continue

                title = link_element.text
                url = link_element.attr('href')

                print(f"   Found title: {title[:50]}...")
                print(f"   Found URL: {url}")

                # Extract the snippet/description, trying several possible selectors
                snippet = "No description available"
                possible_selectors = [
                    'css:.b_caption p',
                    'tag:p',
                    'css:.b_caption',
                    'tag:div',
                ]

                for selector in possible_selectors:
                    try:
                        snippet_element = element.ele(selector)
                        if snippet_element and snippet_element.text.strip():
                            snippet = snippet_element.text.strip()
                            break
                    except Exception:
                        continue

                if title and url:
                    results.append({
                        'title': title,
                        'url': url,
                        'snippet': snippet,
                    })
                    print(f"   ✓ Successfully extracted result {idx + 1}")

            except Exception as e:
                print(f"   Error extracting result {idx + 1}: {e}")
                continue

        print(f"✓ Successfully extracted {len(results)} search results")

    except Exception as e:
        traceback.print_exc()
        print(f"❌ Error extracting search results: {e}")

    return results


async def search_bing(adapter, tab, query, max_pages=5, page=None):
    """
    Perform a Bing search and extract results from multiple pages.

    Args:
        adapter: DrissionPageAdapter instance
        tab: Current browser tab handle
        query: Search query string
        max_pages: Maximum number of pages to extract (default: 5)
        page: Specific page to extract (default: None). If specified, only
            results from that page are returned.

    Returns:
        List of dictionaries containing title, url, and snippet for each search result
    """
    print(f"\n=== Bing Search: {query} (max pages: {max_pages}) ===")

    # Navigate to Bing
    print("1. Navigating to Bing...")
    interaction_report = await adapter.navigate(tab, "https://www.bing.com")
    print(f"✓ Navigation completed. URL changed: {interaction_report.is_url_changed}")

    # Give the page a moment to load
    time.sleep(2)

    # Click the International button if present
    intl_btn = tab.ele("@id=est_en")
    if intl_btn:
        print("   Found International button. Clicking...")
        intl_btn.click()
        time.sleep(1)  # Wait for the page to update after the click

    # Type the search query and submit
    print("2. Typing search query...")
    await adapter.type_text(tab, "@@tag()=input@@name=q", f"{query}\n", True)
    print("✓ Search query submitted")

    # Stabilize the search results page
    print("\n3. Stabilizing search results page...")
    stabilization_success = await adapter.stabilize(tab)
    print(f"✓ Stabilization completed: {stabilization_success}")

    # If a specific page is requested, extract only that page
    if page is not None:
        print(f"\n=== Extracting page {page} only ===")

        # Step forward through the pagination links (aria-label='Page N')
        # until the requested page is reached.
        current = 1
        while current < page:
            try:
                next_page_selector = f"css:a[aria-label='Page {current + 1}']"
                print(f"Looking for Page {current + 1}...")
                next_page_link = tab.ele(next_page_selector, timeout=2)

                if next_page_link:
                    print(f"✓ Found Page {current + 1}, clicking...")
                    next_page_link.click()
                    time.sleep(2)
                    await adapter.stabilize(tab)
                    current += 1
                else:
                    print(f"✗ Page {page} not found")
                    return []
            except Exception as e:
                print(f"✗ Error navigating to page {page}: {e}")
                return []

        # Extract results from the requested page
        print(f"\n=== Processing page {page} ===")
        page_results = await extract_search_results(adapter, tab)
        print(f"\n=== Total results collected: {len(page_results)} ===")
        return page_results

    # Otherwise extract search results from up to max_pages pages
    all_results = []
    current_page = 1

    while current_page <= max_pages:
        print(f"\n=== Processing page {current_page} ===")

        # Extract results from the current page
        page_results = await extract_search_results(adapter, tab)
        all_results.extend(page_results)

        # Decide whether to continue to the next page
        if current_page < max_pages:
            # Look for the next page link using aria-label='Page X'
            next_page_num = current_page + 1
            next_page_selector = f"css:a[aria-label='Page {next_page_num}']"

            try:
                print(f"\nLooking for next page (Page {next_page_num})...")
                next_page_link = tab.ele(next_page_selector, timeout=2)

                if next_page_link:
                    print("✓ Found next page link, clicking...")
                    next_page_link.click()
                    time.sleep(2)  # Wait for the page to load

                    # Stabilize after the page change
                    await adapter.stabilize(tab)
                    current_page += 1
                else:
                    print("✓ No more pages available")
                    break

            except Exception as e:
                print(f"✓ No more pages available or error finding next page: {e}")
                break
        else:
            print(f"\n✓ Reached maximum page limit ({max_pages})")
            break

    print(f"\n=== Total results collected: {len(all_results)} ===")
    return all_results
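For orientation, here is a minimal driver sketch for the bing.py hunk above. It assumes that DrissionPageAdapter (added in this release as agentmatrix/core/browser/drission_page_adapter.py) implements the BrowserAdapter lifecycle declared in the next hunk (start, get_tab, close) and that it can be constructed without arguments; the constructor signature is an assumption, since it is not shown in this diff.

import asyncio

from agentmatrix.core.browser.bing import search_bing
# Assumption: DrissionPageAdapter implements the BrowserAdapter interface
# below and takes no required constructor arguments.
from agentmatrix.core.browser.drission_page_adapter import DrissionPageAdapter


async def main():
    adapter = DrissionPageAdapter()
    await adapter.start(headless=True)   # lifecycle method from BrowserAdapter
    try:
        tab = await adapter.get_tab()    # handle of the focused tab
        results = await search_bing(adapter, tab, "agent frameworks", max_pages=2)
        for r in results:
            print(r["title"], "->", r["url"])
    finally:
        await adapter.close()


asyncio.run(main())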
agentmatrix/core/browser/browser_adapter.py
@@ -0,0 +1,298 @@
'''
This is a skeleton for the BrowserAdapter abstraction layer.

It defines the contract between the logic layer and the execution layer, and
supports later implementations backed by libraries such as DrissionPage.

InteractionReport:

    Lets the Adapter tell the logic layer: "I clicked, a new window popped up,
    and the current page also reloaded." On receiving the report, the logic
    layer can first recursively process new_tabs, then decide whether a
    Soft Restart is needed based on is_dom_changed.

scan_elements (scouting):

    "Find links" and "find buttons" are merged into a single scan. This is
    more efficient, avoiding two traversals of the DOM.

PageContentType:

    Explicitly distinguishes HTML from PDF. If the browser opened a PDF
    directly, no "cerebellum summary" is needed; the Adapter's
    save_view_as_file simply stores it.
'''
from abc import ABC, abstractmethod
from enum import Enum, auto
from dataclasses import dataclass, field
from typing import List, Optional, Any, Dict, Union

# ==========================================
# 1. Type Definitions
# ==========================================

# TabHandle: a token representing one concrete browser tab.
# In DrissionPage this may be a ChromiumTab object; in Playwright, a Page object.
# The logic layer does not need to know what it is; it only passes it back to the Adapter.
TabHandle = Any

class KeyAction(Enum):
    """Common keyboard actions"""
    ENTER = "enter"
    ESC = "esc"
    TAB = "tab"
    PAGE_DOWN = "page_down"
    SPACE = "space"

class PageType(Enum):
    """Content type of the current page"""
    NAVIGABLE = auto()     # HTML page (worth scouting and clicking)
    STATIC_ASSET = auto()  # PDF, JSON, TXT, image (only worth saving or reading)
    ERRO_PAGE = auto()     # Error page

class ElementType(Enum):
    """Types of interactive elements"""
    LINK = auto()    # <a href="..."> navigation link
    BUTTON = auto()  # <button>, <div role="button"> interactive button
    INPUT = auto()   # Input field (the current flow mostly reads/clicks; kept for extensibility)

# ==========================================
# 2. Data Carriers
# ==========================================


class PageElement(ABC):
    """
    [Input] The "clickable object" as seen by the logic layer.
    The Adapter produces these while scanning the page in Phase 4 (Scout).
    """
    @abstractmethod
    def get_text(self) -> str:
        """Visible text of the element (for the cerebellum to judge)"""
        pass

    @abstractmethod
    def get_tag_name(self) -> str:
        """Tag name of the element (a, button, div)"""
        pass

    @abstractmethod
    def get_element(self) -> Any:
        """Underlying element object (e.g. a DrissionPage element)"""
        pass

    @abstractmethod
    def is_visible(self) -> bool:
        """Whether the element is visible"""
        pass


@dataclass
class InteractionReport:
    """
    [Output] The "consequence report" of a click or other action.
    Matches the serial handling logic of Phase 5. Fields are not mutually
    exclusive; the report records every observed effect.
    """
    # 1. External consequences
    new_tabs: List[TabHandle] = field(default_factory=list)    # Handles of newly opened tabs
    downloaded_files: List[str] = field(default_factory=list)  # Local paths of triggered downloads

    # 2. Internal consequences
    is_url_changed: bool = False  # Whether the URL changed
    is_dom_changed: bool = False  # Whether the DOM changed significantly (for Soft Restart decisions)

    # 3. Error information
    error: Optional[str] = None  # Set if the action failed (e.g. element obscured / timeout)

@dataclass
class PageSnapshot:
    """
    [Output] A static snapshot of the page.
    Used in Phase 2/3 (Assess) for the cerebellum to read.
    """
    url: str
    title: str
    content_type: PageType
    main_text: str  # Cleaned main text (Markdown or plain text)
    raw_html: str   # Raw HTML (fallback)

# ==========================================
# 3. The Browser Adapter Interface
# ==========================================

class BrowserAdapter(ABC):
    """
    Unified interface for the browser automation layer.
    Hides the implementation details of the underlying library (DrissionPage/Selenium).
    """

    # --- Lifecycle ---

    @abstractmethod
    async def start(self, headless: bool = False):
        """Start the browser process"""
        pass

    @abstractmethod
    async def close(self):
        """Close the browser process and clean up resources"""
        pass

    # --- Tab Management ---

    @abstractmethod
    async def create_tab(self, url: Optional[str] = None) -> TabHandle:
        """Open a new tab and return its handle"""
        pass

    @abstractmethod
    async def close_tab(self, tab: TabHandle):
        """Close the given tab"""
        pass

    @abstractmethod
    async def get_tab(self) -> TabHandle:
        """Return the handle of the currently focused tab"""
        pass

    @abstractmethod
    def get_tab_url(self, tab: TabHandle) -> str:
        """Return the URL of the given tab"""
        pass

    @abstractmethod
    async def switch_to_tab(self, tab: TabHandle):
        """Move browser focus to the given tab (simulating human gaze)"""
        pass

    # --- Navigation & Content ---

    @abstractmethod
    async def navigate(self, tab: TabHandle, url: str) -> InteractionReport:
        """
        Visit a URL in the given tab.
        Note: navigation can also trigger a download (e.g. opening a PDF link
        directly), hence the InteractionReport return value.
        """
        pass

    @abstractmethod
    async def stabilize(self, tab: TabHandle) -> bool:
        """
        [Phase 2] Stabilize the page.
        Wait for DOM ready, handle popups (alerts / cookie consent), scroll to load.
        """
        pass

    @abstractmethod
    async def analyze_page_type(self, tab: TabHandle) -> PageType:
        """
        Determine what kind of page this is (HTML, PDF viewer, etc.).
        """
        pass

    @abstractmethod
    async def get_page_snapshot(self, tab: TabHandle) -> PageSnapshot:
        """
        [Phase 3] Fetch page content for the cerebellum to read.
        Should include the extracted main text.
        """
        pass

    @abstractmethod
    async def save_view_as_file(self, tab: TabHandle, save_dir: str) -> Optional[str]:
        """
        If the current page is a PDF preview or plain text, save it as a local file.
        """
        pass

    # --- Scouting & Interaction ---

    @abstractmethod
    async def scan_elements(self, tab: TabHandle) -> List[PageElement]:
        """
        [Phase 4] Scan the page.
        Return all visible, meaningful interactive elements (links + buttons).
        Invisible elements, empty links, and the like must be filtered out.
        """
        pass

    @abstractmethod
    async def click_and_observe(self, tab: TabHandle, element: Union[str, PageElement]) -> InteractionReport:
        """
        [Phase 5] The core interaction primitive.
        Click an element, wait intelligently, and capture every possible
        consequence (new tab, download, page change).
        Must handle DOM-change detection for SPAs (single-page applications).
        """
        pass

    # ==========================================
    # Input & Control (precise input)
    # Used for Phase 0 (search) or specific form interactions
    # ==========================================

    @abstractmethod
    async def type_text(self, tab: TabHandle, selector: str, text: str, clear_existing: bool = True) -> bool:
        """
        Type text into the given element.

        Args:
            selector: Locator (CSS/XPath/DrissionPage syntax), e.g. 'input[name="q"]'
            text: The text to type.
            clear_existing: Whether to clear existing content before typing.

        Returns:
            bool: Whether the action succeeded (element found and text entered).
        """
        pass

    @abstractmethod
    async def press_key(self, tab: TabHandle, key: Union[KeyAction, str]) -> InteractionReport:
        """
        Simulate a key press on the current page.
        Typically used to press Enter after typing a search query.

        Returns:
            InteractionReport: A key press may refresh or navigate the page
            (e.g. Enter submitting a form), so a consequence report is
            returned for the logic layer to decide whether a Soft Restart is needed.
        """
        pass

    @abstractmethod
    async def click_by_selector(self, tab: TabHandle, selector: str) -> InteractionReport:
        """
        [Precise click] Click a specific element by selector.
        Unlike click_and_observe (which takes a scouted PageElement), this
        method is for pages whose structure is already known (e.g. clicking
        a search button).
        """
        pass

    @abstractmethod
    async def scroll(self, tab: TabHandle, direction: str = "bottom", distance: int = 0):
        """
        Scroll manually.

        Args:
            direction: 'bottom', 'top', 'down', 'up'
            distance: Pixels to scroll (when direction is down/up)
        """
        pass

    @abstractmethod
    async def find_element(self, tab: TabHandle, selector: str) -> Optional[PageElement]:
        """
        Check whether a specific element exists.
        Used to verify that a page loaded correctly (e.g. checking for
        'input[name="q"]' to confirm we are on the Google home page).
        Returns the element if it exists, otherwise None.
        """
        pass

    @abstractmethod
    async def save_static_asset(self, tab: TabHandle) -> Optional[str]:
        """
        [For STATIC_ASSET]
        Save the content currently shown in the tab as a file.
        DrissionPage/Chrome's download machinery is usually click-triggered;
        for a resource already open in a tab, we need CDP or requests to
        fetch it.
        """
        # Simple implementation strategies:
        # 1. For PDF/images, use DrissionPage's download method, or re-request the URL with wget/requests
        # 2. For JSON/TXT, simply f.write(tab.ele("tag:body").text)
        pass
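To make the docstring's contract concrete, here is a sketch of how a logic layer might consume an InteractionReport: external consequences (new tabs, downloads) are handled first, then internal consequences decide whether to Soft Restart. Only BrowserAdapter, TabHandle, PageElement, and InteractionReport come from the hunk above; click_with_consequences and the cerebellum hand-off are hypothetical, not part of the package.

async def click_with_consequences(adapter: BrowserAdapter, tab: TabHandle,
                                  element: PageElement) -> bool:
    """Click one scouted element and process the consequence report.

    Returns True if the caller should Soft Restart (re-scan the page).
    """
    report: InteractionReport = await adapter.click_and_observe(tab, element)

    if report.error:
        # Element obscured / timeout: nothing changed, nothing to process.
        return False

    # 1. External consequences first: visit each popped-up tab, read it,
    #    then close it and return focus to the original tab.
    for new_tab in report.new_tabs:
        await adapter.switch_to_tab(new_tab)
        snapshot = await adapter.get_page_snapshot(new_tab)
        print(f"new tab: {snapshot.title} ({snapshot.url})")  # hand snapshot.main_text to the cerebellum here
        await adapter.close_tab(new_tab)
    await adapter.switch_to_tab(tab)

    for path in report.downloaded_files:
        print(f"downloaded: {path}")

    # 2. Internal consequences: a changed URL or DOM invalidates every
    #    previously scouted PageElement, so the caller must re-scan.
    return report.is_url_changed or report.is_dom_changed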
agentmatrix/core/browser/browser_common.py
@@ -0,0 +1,85 @@
"""
Shared browser data structures and utilities.

Provides data structures shared by skills such as data_crawler and web_searcher.
"""

import time
from abc import ABC
from dataclasses import dataclass, field
from typing import Deque, Set, List
from collections import deque

from ...core.browser.browser_adapter import TabHandle


@dataclass
class TabSession:
    """
    Physical Tab Context.

    Manages the state and pending-link queue of a browser tab.
    Shared by several crawler skills.
    """
    handle: TabHandle
    current_url: str = ""
    depth: int = 0
    pending_link_queue: Deque[str] = field(default_factory=deque)


class BaseCrawlerContext(ABC):
    """
    Base class for crawler contexts.
    Provides shared state management and history tracking.
    """

    def __init__(self, deadline: float):
        self.deadline = deadline
        self.visited_urls: Set[str] = set()
        self.interaction_history: Set[str] = set()
        self.assessed_links: Set[str] = set()
        self.assessed_buttons: Set[str] = set()
        # Domains and URL keywords to skip
        self.blacklist: Set[str] = {
            "facebook.com", "twitter.com", "instagram.com",
            "taobao.com", "jd.com", "amazon.com",
            "signin", "login", "signup"
        }

    def __repr__(self):
        return f"{self.__class__.__name__}(deadline={self.deadline}, visited={len(self.visited_urls)})"

    def is_time_up(self) -> bool:
        return time.time() > self.deadline

    def mark_visited(self, url: str):
        self.visited_urls.add(url)

    def has_visited(self, url: str) -> bool:
        return url in self.visited_urls

    def mark_interacted(self, url: str, button_text: str):
        key = f"{url}|{button_text}"
        self.interaction_history.add(key)

    def has_interacted(self, url: str, button_text: str) -> bool:
        key = f"{url}|{button_text}"
        return key in self.interaction_history

    def mark_link_assessed(self, url: str):
        """Mark a link as assessed (default: in-memory version)"""
        self.assessed_links.add(url)

    def has_link_assessed(self, url: str) -> bool:
        """Check whether a link has been assessed"""
        return url in self.assessed_links

    def mark_buttons_assessed(self, url: str, button_texts: List[str]):
        """Mark a batch of buttons as assessed (default: in-memory version)"""
        for button_text in button_texts:
            key = f"{url}|{button_text}"
            self.assessed_buttons.add(key)

    def has_button_assessed(self, url: str, button_text: str) -> bool:
        """Check whether a button has been assessed"""
        key = f"{url}|{button_text}"
        return key in self.assessed_buttons
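As a usage note, a short sketch of the deduplication bookkeeping that BaseCrawlerContext provides. The DemoContext subclass and the sample URL are hypothetical; everything else comes straight from the hunk above.

import time


class DemoContext(BaseCrawlerContext):
    # Concrete crawlers (e.g. in the data_crawler skill) would add their own
    # fields; the base class alone already provides dedup and the deadline.
    pass


ctx = DemoContext(deadline=time.time() + 60)  # one-minute crawl budget

url = "https://example.com/docs"  # hypothetical target
if not ctx.is_time_up() and not ctx.has_visited(url):
    ctx.mark_visited(url)
    ctx.mark_buttons_assessed(url, ["Download", "Accept cookies"])

assert ctx.has_visited(url)
assert ctx.has_button_assessed(url, "Download")
print(ctx)  # DemoContext(deadline=..., visited=1)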