Jarvis-Brain 0.1.5.5__tar.gz → 0.1.10.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {jarvis_brain-0.1.5.5 → jarvis_brain-0.1.10.0}/PKG-INFO +1 -1
- {jarvis_brain-0.1.5.5 → jarvis_brain-0.1.10.0}/README.md +2 -2
- jarvis_brain-0.1.10.0/mcp_tools/__init__.py +5 -0
- jarvis_brain-0.1.10.0/mcp_tools/chrome_devtools_tools.py +11 -0
- jarvis_brain-0.1.10.0/mcp_tools/dp_tools.py +306 -0
- {jarvis_brain-0.1.5.5 → jarvis_brain-0.1.10.0}/mcp_tools/main.py +9 -4
- {jarvis_brain-0.1.5.5 → jarvis_brain-0.1.10.0}/pyproject.toml +1 -1
- {jarvis_brain-0.1.5.5 → jarvis_brain-0.1.10.0}/tools/browser_manager.py +9 -1
- jarvis_brain-0.1.10.0/tools/browser_proxy.py +184 -0
- {jarvis_brain-0.1.5.5 → jarvis_brain-0.1.10.0}/tools/tools.py +90 -0
- jarvis_brain-0.1.5.5/mcp_tools/dp_tools.py +0 -166
- jarvis_brain-0.1.5.5/tools/__init__.py +0 -0
- {jarvis_brain-0.1.5.5 → jarvis_brain-0.1.10.0}/.gitignore +0 -0
- {jarvis_brain-0.1.5.5/mcp_tools → jarvis_brain-0.1.10.0/tools}/__init__.py +0 -0
- {jarvis_brain-0.1.5.5 → jarvis_brain-0.1.10.0}/uv.lock +0 -0
|
@@ -4,8 +4,8 @@
|
|
|
4
4
|
|
|
5
5
|
一个基于 FastMCP 和 DrissionPage 的浏览器自动化 MCP 服务器
|
|
6
6
|
|
|
7
|
-
[](https://www.python.org/downloads/)
|
|
8
|
+
[](https://github.com/yourusername/jarvis-mcp)
|
|
9
9
|
|
|
10
10
|
## 📖 简介
|
|
11
11
|
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
test_str="""
|
|
2
|
+
\\"X-XSS-Protection\\": \\"1; mode=block\\", \\"Server\\": \\"-\\"}, \\"response_body_segment\\": \\"pename\\\\\\": \\\\\\"SpuItemAttribute\\\\\\"}, {\\\\\\"code\\\\\\": \\\\\\"imageVersion\\\\\\", \\\\\\"name\\\\\\": \\\\\\"imageVersion\\\\\\", \\\\\\"subAttrList\\\\\\": [{\\\\\\"attrName\\\\\\": \\\\\\"1\\\\\\", \\\\\\"attrValue\\\\\\": \\\\\\"1\\\\\\", \\\\\\"__typename\\\\\\": \\\\\\"SubAttribute\\\\\\"}], \\\\\\"__typename\\\\\\": \\\\\\"SpuItemAttribute\\\\\\"}, {\\\\\\"code\\\\\\": \\\\\\"PLPhover\\\\\\", \\\\\\"name\\\\\\": \\\\\\"PLPhover\\\\\\", \\\\\\"subAttrList\\\\\\": [{\\\\\\"attrName\\\\\\": \\\\\\"2\\\\\\", \\\\\\"attrValue\\\\\\": \\\\\\"2\\\\\\", \\\\\\"__typename\\\\\\": \\\\\\"SubAttribute\\\\\\"}], \\\\\\"__typename\\\\\\": \\\\\\"SpuItemAttribute\\\\\\"}, {\\\\\\"code\\\\\\": \\\\\\"官网\\\\\\", \\\\\\"name\\\\\\": \\\\\\"官网\\\\\\", \\\\\\"subAttrList\\\\\\": [{\\\\\\"attrName\\\\\\": \\\\\\"是\\\\\\", \\\\\\"attrValue\\\\\\": \\\\\\"true\\\\\\", \\\\\\"__typename\\\\\\": \\\\\\"SubAttribute\\\\\\"}], \\\\\\"__typename\\\\\\": \\\\\\"SpuItemAttribute\\\\\\"}, {\\\\\\"code\\\\\\": \\\\\\"小程序\\\\\\", \\\\\\"name\\\\\\": \\\\\\"小程序\\\\\\", \\\\\\"subAttrList\\\\\\": [{\\\\\\"attrName\\\\\\": \\\\\\"是\\\\\\", \\\\\\"attrValue\\\\\\": \\\\\\"true\\\\\\", \\\\\\"__typename\\\\\\": \\\\\\"SubAttribute\\\\\\"}], \\\\\\"__typename\\\\\\": \\\\\\"SpuItemAttribute\\\\\\"}], \\\\\\"__typename\\\\\\": \\\\\\"SpuAttribute\\\\\\"}, \\\\\\"labelList\\\\\\": [{\\\\\\"name\\\\\\": \\\\\\"new\\\\\\", \\\\\\"value\\\\\\": \\\\\\"新品\\\\\\", \\\\\\"excludeValue\\\\\\": null, \\\\\\"__typename\\\\\\": \\\\\\"BaseLabel\\\\\\"}], \\\\\\"__typename\\\\\\": \\\\\\"Product2\\\\\\"}, {\\\\\\"baseInfo\\\\\\": {\\\\\\"spuCode\\\\\\": \\\\\\"864428CVE0G1000\\\\\\", \\\\\\"title\\\\\\": \\\\\\"互扣式双G带扣窄版腰带\\\\\\", \\\\\\"salePrice\\\\\\": 4300.0, \\\\\\"style\\\\\\": \\\\\\"864428\\\\\\", \\\\\\"categorys\\\\\\": [{\\\\\\"navFrontName\\\\\\": \\\\\\"女士风尚-手工plp\\\\\\", 
\\\\\\"frontName\\\\\\": null, \\\\\\"code\\\\\\": \\\\\\"67d7d4c5eb9d2100016b4fe5\\\\\\", \\\\\\"parentCode\\\\\\": null, \\\\\\"businessCode\\\\\\": \\\\\\"women-fashion手工plp\\\\\\", \\\\\\"__typename\\\\\\": \\\\\\"Category\\\\\\"}, {\\\\\\"navFrontName\\\\\\": \\\\\\"女士-women>女士配饰-women-accessories>女士腰带-women-accessories-belts\\\\\\", \\\\\\"frontName\\\\\\": null, \\\\\\"code\\\\\\": \\\\\\"642670bf0ae2090001133cbe\\\\\\", \\\\\\"parentCode\\\\\\": null, \\\\\\"businessCode\\\\\\": \\\\\\"women-accessories-belts\\\\\\", \\\\\\"__typename\\\\\\": \\\\\\"Category\\\\\\"}, {\\\\\\"navFrontName\\\\\\": \\\\\\"女士-women>女士配饰-women-accessories\\\\\\", \\\\\\"frontName\\\\\\": null, \\\\\\"code\\\\\\": \\\\\\"642670bf0ae2090001133cbd\\\\\\", \\\\\\"parentCode\\\\\\": null, \\\\\\"businessCode\\\\\\": \\\\\\"women-accessories\\\\\\",
|
|
3
|
+
"""
|
|
4
|
+
# Expand the backslash escapes contained in the captured sample string.
# Encoding with latin-1 + backslashreplace first keeps non-ASCII characters
# intact: a plain UTF-8 encode followed by unicode_escape would reinterpret
# every UTF-8 byte as a separate Latin-1 character and garble the Chinese text.
decoded = test_str.encode('latin-1', 'backslashreplace').decode('unicode_escape')
if __name__ == "__main__":
    # Print only when executed as a script; a bare top-level print would also
    # write to stdout every time this module is imported.
    print(decoded)
|
|
@@ -0,0 +1,306 @@
|
|
|
1
|
+
"""
|
|
2
|
+
这个文件中提供的工具作为独立的Drissionpage mcp工具
|
|
3
|
+
"""
|
|
4
|
+
import hashlib
|
|
5
|
+
import json
|
|
6
|
+
import os
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from fastmcp import FastMCP
|
|
10
|
+
|
|
11
|
+
from tools.browser_manager import BrowserManager
|
|
12
|
+
from tools.tools import compress_html, requests_html, dp_headless_html, assert_waf_cookie, dp_mcp_message_pack, \
|
|
13
|
+
compress_html_js
|
|
14
|
+
from tools.browser_proxy import DPProxyClient, DPProxyClientManager
|
|
15
|
+
|
|
16
|
+
# Directory (under the current working directory) where the get_html tool writes saved HTML files.
html_source_code_local_save_path = os.path.join(os.getcwd(), "html-source-code")
|
|
17
|
+
waf_status_code_dict = {
|
|
18
|
+
412: "瑞数",
|
|
19
|
+
521: "加速乐"
|
|
20
|
+
}
|
|
21
|
+
# 一轮最大输入,以免单个html最大长度超过ai最大输入
|
|
22
|
+
one_turn_max_token = 8000
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def register_visit_url(mcp: FastMCP, browser_manager: BrowserManager, client_manager: DPProxyClientManager):
    """Register the ``visit_url`` MCP tool on ``mcp``.

    The tool creates a browser through ``browser_manager``, attaches a packet
    listener client to its tab through ``client_manager``, then loads the URL.

    :param mcp: FastMCP server the tool is registered on.
    :param browser_manager: used to create the browser instance.
    :param client_manager: used to create the per-tab packet listener client.
    """

    @mcp.tool(name="visit_url",
              description="使用Drissionpage打开url访问某个网站,并开始监听初始tab页的所有的XHR请求"
                          "当需要使用手机版浏览器Ua时use_mobile_user_agent为True"
                          "如果想要以域名对packet进行过滤,可以传入想要过滤的域名列表。默认是:None。"
                          "如果想要以method对packet进行过滤,可以传入想要过滤的method列表,默认是:['GET', 'POST']")
    async def visit_url(url: str, domain_filter: list = None, method_filter: list = ["GET", "POST"],
                        use_mobile_user_agent: bool = False) -> dict[str, Any]:
        mobile_user_agent = None
        if use_mobile_user_agent:
            mobile_user_agent = "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Mobile Safari/537.36"
        port, _browser = browser_manager.create_browser(mobile_user_agent)
        tab = _browser.get_tab()
        # The filter dict is kept by the listener client for the lifetime of
        # the tab, so hand it private copies: the default list in the signature
        # is one shared object and must never be reachable from client state.
        # An explicit None for method_filter falls back to the documented default.
        packet_filter = {
            "domain_filter": list(domain_filter) if domain_filter is not None else None,
            "method_filter": list(method_filter) if method_filter is not None else ["GET", "POST"],
        }
        # Attach the listener before navigating so requests fired during the
        # initial page load are captured as well.
        client_manager.create_client(tab, packet_filter)
        tab.get(url)
        return dp_mcp_message_pack(
            f"已在[{port}]端口创建浏览器对象,并已打开链接:{url},打开的模式是:{'手机版' if use_mobile_user_agent else '电脑版'}",
            tab_id=tab.tab_id,
            browser_port=port
        )
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def register_get_new_tab(mcp: FastMCP, browser_manager, client_manager: DPProxyClientManager):
    """Register the ``get_new_tab`` MCP tool on ``mcp``.

    The tool opens a new tab in an existing browser (looked up by port),
    attaches a packet listener client to it, loads the URL and activates
    the tab.

    :param mcp: FastMCP server the tool is registered on.
    :param browser_manager: used to look up the browser by its port.
    :param client_manager: used to create the per-tab packet listener client.
    """

    @mcp.tool(name="get_new_tab",
              description="使用Drissionpage创建一个新的tab页,在新的tab页中打开url,并开始监听新的tab页的所有XHR请求"
                          "如果想要以域名对packet进行过滤,可以传入想要过滤的域名列表。默认是:None。"
                          "如果想要以method对packet进行过滤,可以传入想要过滤的method列表,默认是:['GET', 'POST']")
    async def get_new_tab(browser_port: int, url: str, domain_filter: list = None,
                          method_filter: list = ["GET", "POST"]) -> dict[str, Any]:
        _browser = browser_manager.get_browser(browser_port)
        tab = _browser.new_tab()
        # The filter dict outlives this call inside the listener client, so
        # store private copies rather than the shared default list object.
        # An explicit None for method_filter falls back to the documented default.
        packet_filter = {
            "domain_filter": list(domain_filter) if domain_filter is not None else None,
            "method_filter": list(method_filter) if method_filter is not None else ["GET", "POST"],
        }
        # Attach the listener before navigating so requests fired during the
        # page load are captured as well.
        client_manager.create_client(tab, packet_filter)
        tab.get(url)
        _browser.activate_tab(tab)
        return dp_mcp_message_pack(f"已创建新的tab页,并打开链接:{url}", tab_id=tab.tab_id)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def register_pop_first_packet(mcp: FastMCP, browser_manager, client_manager: DPProxyClientManager):
    """Register the ``pop_first_packet`` MCP tool on ``mcp``.

    Each call pops the oldest queued packet message of the given tab's
    listener client and reports how many messages remain in the queue.
    """

    @mcp.tool(name="pop_first_packet",
              description="每调用一次就会弹出传入的tab页所监听到的数据包中的第一个packet_message,当一个packet_message的response body过长时会被切分成多个包,具体一个请求是否还有下一个包,可以参考body_completed字段")
    async def pop_first_packet(browser_port: int, tab_id: str) -> dict[str, Any]:
        # The browser lookup result is not used further; the call is kept as in
        # the original flow.
        browser_manager.get_browser(browser_port)
        remaining, popped = client_manager.get_client(tab_id).pop_first_packet()

        if popped:
            # A packet message was available.
            text = f"tab页:【{tab_id}】,监听到XHR数据包,当前数据包队列中还剩 {remaining} 条数据,如果还剩数据为0,可以暂时稍后再次调用该方法"
        elif popped is None and remaining:
            # Nothing returned although the queue reports remaining entries.
            text = f"tab页:【{tab_id}】,当前弹出的第一个数据包不符合过滤条件,当前数据包队列中还剩 {remaining} 条数据,请不要改变条件,继续弹出下一个数据包"
        else:
            # Queue is empty for now.
            text = f"tab页:【{tab_id}】,暂时没有监听到XHR数据包"

        return dp_mcp_message_pack(
            text,
            browser_port=browser_port,
            tab_id=tab_id,
            packet_message=popped,
            current_queue_size=remaining,
        )
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def register_get_html(mcp: FastMCP, browser_manager):
    """Register the ``get_html`` MCP tool on ``mcp``.

    The tool runs the ``compress_html_js`` script in the tab, writes the
    result to a file under ``html_source_code_local_save_path`` and returns
    the list of written file paths.
    """

    @mcp.tool(name="get_html", description="使用Drissionpage获取某一个tab页的html")
    async def get_html(browser_port: int, tab_id: str) -> dict[str, Any]:
        page_tab = browser_manager.get_browser(browser_port).get_tab(tab_id)
        # File names start with the MD5 hex digest of the tab title.
        title_digest = hashlib.md5(str(page_tab.title).encode('utf-8')).hexdigest()
        if not os.path.exists(html_source_code_local_save_path):
            os.makedirs(html_source_code_local_save_path)
        # The simplified DOM is produced inside the page by the JS helper.
        # Splitting it into several segments is currently disabled, so the
        # segment list always holds exactly one entry.
        segments = [page_tab.run_js(compress_html_js)]
        saved_paths = []
        for seg_no, seg_text in enumerate(segments):
            target = os.path.join(
                html_source_code_local_save_path,
                title_digest + f"_{tab_id}_segment{seg_no}.html",
            )
            with open(target, "w", encoding="utf-8") as out:
                out.write(seg_text)
            saved_paths.append(target)
        return dp_mcp_message_pack(
            f"已保存tab页:【{tab_id}】的html源码片段共{len(saved_paths)}个",
            tab_id=tab_id,
            htmls_local_path=saved_paths,
        )
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def register_switch_tab(mcp: FastMCP, browser_manager):
    """Register the ``switch_tab`` MCP tool, which activates a tab by its id."""

    @mcp.tool(name="switch_tab", description="根据传入的tab_id切换到对应的tab页", )
    async def switch_tab(browser_port: int, tab_id: str) -> dict[str, Any]:
        # Look up the browser by port and bring the requested tab to the front.
        browser_manager.get_browser(browser_port).activate_tab(tab_id)
        done_message = f"已将tab页:【{tab_id}】切换至最前端"
        return dp_mcp_message_pack(done_message)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def register_close_tab(mcp: FastMCP, browser_manager):
    """Register the ``close_tab`` MCP tool, which closes a tab by its id."""

    @mcp.tool(name="close_tab", description="根据传入的tab_id关闭tab页", )
    async def close_tab(browser_port: int, tab_id: str) -> dict[str, Any]:
        # Look up the browser by port and close the requested tab.
        browser_manager.get_browser(browser_port).close_tabs(tab_id)
        done_message = f"已将tab页:【{tab_id}】关闭"
        return dp_mcp_message_pack(done_message)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def register_check_selector(mcp: FastMCP, browser_manager):
    """Register the ``check_selector`` MCP tool on ``mcp``.

    The tool looks up elements by CSS selector in a tab, returns one page
    (``offset`` / ``page_size``) of their text or of the attribute named by
    ``attr_name``, and truncates over-long values with a trailing ``...``.
    """

    def _clip(value: str, limit: int) -> str:
        # Flag a value as truncated only when characters were really dropped;
        # a value that is exactly `limit` characters long is returned untouched.
        if len(value) > limit:
            return value[:limit] + "..."
        return value

    @mcp.tool(name="check_selector",
              description="查找tab页中是否包含元素,并返回元素attr_name所对应的值。"
                          "当要选择的元素包含过多元素时,需要传入offset和page_size来分批查看元素,一般不建议调整page_size,更推荐你调整offset"
                          "同时如果单个元素属性值太长,函数会进行截断。一般的单个元素的属性值超过300个字符的就会触发截断,截断后会在最后拼接'...'")
    async def check_selector(browser_port: int, tab_id: str, css_selector: str, attr_name: str = "text",
                             offset: int = 0, page_size: int = 10) -> dict[str, Any]:
        _browser = browser_manager.get_browser(browser_port)
        target_tab = _browser.get_tab(tab_id)
        # Only a leading "css:" is a locator prefix; the text may legitimately
        # occur inside the selector itself (e.g. in an attribute value).
        if not css_selector.startswith("css:"):
            css_selector = "css:" + css_selector
        target_eles = target_tab.eles(css_selector)
        # Existence refers to the whole match set, not to the requested page.
        exist_flag = len(target_eles) != 0
        # Always apply the paging window, so an offset beyond a short result
        # set yields an empty page instead of repeating every element.
        page_eles = target_eles[offset:offset + page_size]
        # Per-value character limit: at least 300, otherwise the per-turn budget
        # divided by the page size plus a small allowance (presumably for the
        # ellipsis, quotes and separator of each value).
        slice_seg = max(300, one_turn_max_token // (page_size + 6))
        if attr_name == "text":
            values = [_clip(ele.text.replace("\n", ""), slice_seg) for ele in page_eles]
            attr_output = "\n".join(values)
        else:
            raw_values = (ele.attr(attr_name) for ele in page_eles)
            # Elements without the attribute (or with an empty value) are skipped.
            values = [_clip(value, slice_seg) for value in raw_values if value]
            attr_output = json.dumps(values, ensure_ascii=False)
        return dp_mcp_message_pack(
            f"已完成tab页:【{tab_id}】对:【{css_selector}】的检查",
            tab_id=tab_id,
            selector=css_selector,
            selector_ele_exist=exist_flag,
            page_size=page_size,
            offset=offset,
            attr_output=attr_output
        )
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def register_quit_browser(mcp: FastMCP, browser_manager):
    """Register the ``quit_browser`` MCP tool, which removes a browser from the manager and quits it."""

    @mcp.tool(name="quit_browser", description="退出浏览器会话,关闭浏览器")
    async def quit_browser(browser_port: int) -> dict[str, Any]:
        removed, instance = browser_manager.remove_page(browser_port)
        # Quit only when the manager reported a successful removal.
        if removed:
            instance.quit()
        outcome = '成功' if removed else '失败'
        return dp_mcp_message_pack(
            f"浏览器[{browser_port}],退出会话,关闭浏览器{outcome}",
            browser_port=browser_port,
            quit_flag=removed
        )
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def register_assert_waf(mcp: FastMCP, browser_manager):
    """Register the ``assert_waf`` MCP tool on ``mcp``.

    The tool compares the HTML obtained by plain requests, the headed browser
    tab and a headless browser (via their compression rates), and recommends
    which fetching approach ("requests", "drissionpage_headless" or
    "drissionpage_head") to use for the page.
    """

    @mcp.tool(name="assert_waf",
              description="通过对比requests、有头浏览器、无头浏览器获取到的html,判断网页是否使用了waf以及是否为动态渲染的网页")
    async def assert_waf(browser_port: int, tab_id: str) -> dict[str, Any]:
        _browser = browser_manager.get_browser(browser_port)
        target_tab = _browser.get_tab(tab_id)
        # Default recommendation when no cheaper option qualifies.
        recommend_team = "drissionpage_head"
        # Check the headed tab's cookies for WAF markers.
        waf_flag, _waf_type = assert_waf_cookie(target_tab.cookies())
        _, head_rate = compress_html(target_tab.html, only_text=True)
        raw_html, status_code = requests_html(target_tab.url)
        _, raw_rate = compress_html(raw_html, only_text=True)
        r_h_rate_diff = abs(head_rate - raw_rate)
        # A known WAF was recognised: skip the expensive headless fetch.
        if waf_flag or status_code in waf_status_code_dict:
            return dp_mcp_message_pack(
                f"已完成tab页:【{tab_id}】的分析,该tab页存在waf",
                tab_id=tab_id,
                recommend_team=recommend_team,
                raw_head_rate_difference=r_h_rate_diff,
                raw_headless_rate_difference=0,
                head_headless_rate_difference=0
            )

        headless_html = dp_headless_html(target_tab.url)
        _, headless_rate = compress_html(headless_html, only_text=True)
        r_hless_rate_diff = abs(raw_rate - headless_rate)
        h_hless_rate_diff = abs(head_rate - headless_rate)
        if r_h_rate_diff < 40 and r_hless_rate_diff < 40 and h_hless_rate_diff < 40:
            # Best case: all three sources roughly agree, so the page is treated
            # as an unprotected static page.
            recommend_team = "requests"
        elif h_hless_rate_diff < 30 and r_hless_rate_diff > 40:
            # Headed and headless agree but requests differs: either requests is
            # blocked or the page is rendered dynamically.
            recommend_team = "drissionpage_headless"
        # Otherwise (e.g. headed and headless differ a lot) the headed-browser
        # default recommendation is kept.
        return dp_mcp_message_pack(
            # This path is only reached when neither the cookies nor the status
            # code matched a known WAF, so the message must not claim one exists.
            f"已完成tab页:【{tab_id}】的分析,未命中已知waf特征",
            tab_id=tab_id,
            recommend_team=recommend_team,
            raw_head_rate_difference=r_h_rate_diff,
            # Report the requests-vs-headless difference here, not the
            # head-vs-headless one.
            raw_headless_rate_difference=r_hless_rate_diff,
            head_headless_rate_difference=h_hless_rate_diff
        )
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def register_click_action(mcp: FastMCP, browser_manager):
    """Register the ``click_action`` MCP tool on ``mcp``.

    The tool clicks the element matched by a CSS selector, but only when the
    selector matches exactly one element, and reports whether the element was
    clickable and whether the click succeeded.
    """

    @mcp.tool(name="click_action", description="尝试点击tab页中的元素,返回元素是否可以被点击,以及是否点击成功。")
    async def click_action(browser_port: int, tab_id: str, css_selector: str) -> dict[str, Any]:
        _browser = browser_manager.get_browser(browser_port)
        target_tab = _browser.get_tab(tab_id)
        # Only a leading "css:" is a locator prefix; the text may legitimately
        # occur inside the selector itself (e.g. in an attribute value).
        if not css_selector.startswith("css:"):
            css_selector = "css:" + css_selector
        target_eles = target_tab.eles(css_selector)
        click_success = False
        element_clickable = False
        if len(target_eles) == 1:
            target_element = target_eles[0]
            element_clickable = target_element.states.is_clickable
            try:
                target_element.click()
                click_success = True
            except Exception:
                # Deliberate best effort: any failure raised by the click is
                # reported to the caller as click_success=False.
                click_success = False
            message = f"tab页:【{tab_id}】点击【{css_selector}】 {'成功' if click_success else '失败'} 了"
        else:
            message = f"tab页:【{tab_id}】传入的css_selector找到了{len(target_eles)}个元素,请确保传入的css_selector可以找到唯一的一个元素"
        return dp_mcp_message_pack(
            message=message,
            browser_port=browser_port,
            tab_id=tab_id,
            css_selector=css_selector,
            element_clickable=element_clickable,
            click_success=click_success,
            extra_message="点击成功,页面可能有更新,请重新获取页面html,并重新分析页面Selector" if click_success else ""
        )
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def register_scroll_action(mcp: FastMCP, browser_manager):
    """Register the ``scroll_action`` MCP tool on ``mcp``.

    With ``pixel`` given, the tab is scrolled by that many pixels in the
    ``forward`` direction. With ``pixel`` left as None, the tab jumps to the
    position documented in the tool description (half / top / leftmost /
    rightmost). An unrecognised direction is handled like "down".
    """

    @mcp.tool(name="scroll_action", description="尝试滚动tab页"
                                                "forward参数是滚动的方向:down、up、left、right"
                                                "pixel参数是滚动的像素值,默认为None。"
                                                "当forward为down且pixel为None,则将页面滚动到垂直中间位置,水平位置不变"
                                                "当forward为up且pixel为None,则将页面滚动到顶部,水平位置不变"
                                                "当forward为left且pixel为None,则将页面滚动到最左边,垂直位置不变"
                                                "当forward为right且pixel为None,则将页面滚动到最右边,垂直位置不变")
    async def scroll_action(browser_port: int, tab_id: str, forward: str = "down", pixel: int = None) -> dict[str, Any]:
        _browser = browser_manager.get_browser(browser_port)
        target_tab = _browser.get_tab(tab_id)
        scroll = target_tab.scroll
        # In every direction the two modes are mutually exclusive: jump to the
        # documented position when no pixel count is given, otherwise scroll by
        # the pixel count. Doing both would pass None as the pixel amount.
        if forward == "up":
            if pixel is None:
                scroll.to_top()
            else:
                scroll.up(pixel)
        elif forward == "left":
            if pixel is None:
                scroll.to_leftmost()
            else:
                scroll.left(pixel)
        elif forward == "right":
            if pixel is None:
                scroll.to_rightmost()
            else:
                scroll.right(pixel)
        else:
            # "down" and any unrecognised direction.
            if pixel is None:
                scroll.to_half()
            else:
                scroll.down(pixel)
        message = f"已完成对tab页:【{tab_id}】forward={forward} 的滑动"
        return dp_mcp_message_pack(
            message=message,
            browser_port=browser_port,
            tab_id=tab_id,
        )
|
|
@@ -2,6 +2,7 @@ from fastmcp import FastMCP
|
|
|
2
2
|
|
|
3
3
|
from mcp_tools.dp_tools import *
|
|
4
4
|
from tools.browser_manager import browser_manager
|
|
5
|
+
from tools.browser_proxy import client_manager
|
|
5
6
|
|
|
6
7
|
mcp = FastMCP("Jarvis Brain Mcp Tools")
|
|
7
8
|
|
|
@@ -13,18 +14,22 @@ if "TeamNode-Dp" in enabled_modules:
|
|
|
13
14
|
# 页面管理
|
|
14
15
|
register_close_tab(mcp, browser_manager)
|
|
15
16
|
register_switch_tab(mcp, browser_manager)
|
|
16
|
-
register_get_new_tab(mcp, browser_manager)
|
|
17
|
-
#
|
|
18
|
-
register_visit_url(mcp, browser_manager)
|
|
17
|
+
register_get_new_tab(mcp, browser_manager, client_manager)
|
|
18
|
+
# 基础功能
|
|
19
|
+
register_visit_url(mcp, browser_manager, client_manager)
|
|
19
20
|
register_get_html(mcp, browser_manager)
|
|
20
21
|
register_check_selector(mcp, browser_manager)
|
|
22
|
+
register_pop_first_packet(mcp, browser_manager, client_manager)
|
|
23
|
+
# 页面交互
|
|
24
|
+
register_click_action(mcp, browser_manager)
|
|
25
|
+
register_scroll_action(mcp, browser_manager)
|
|
21
26
|
|
|
22
27
|
if "JarvisNode" in enabled_modules:
|
|
23
28
|
register_assert_waf(mcp, browser_manager)
|
|
24
29
|
|
|
25
30
|
|
|
26
31
|
def main():
|
|
27
|
-
mcp.run(transport="stdio")
|
|
32
|
+
mcp.run(transport="stdio",show_banner=False)
|
|
28
33
|
|
|
29
34
|
|
|
30
35
|
if __name__ == '__main__':
|
|
@@ -3,6 +3,10 @@ import random
|
|
|
3
3
|
from typing import Optional, Tuple
|
|
4
4
|
import os
|
|
5
5
|
from DrissionPage import ChromiumPage, ChromiumOptions
|
|
6
|
+
import platform
|
|
7
|
+
from DrissionPage.common import Settings
|
|
8
|
+
|
|
9
|
+
Settings.set_raise_when_click_failed(True)
|
|
6
10
|
|
|
7
11
|
|
|
8
12
|
class BrowserManager:
|
|
@@ -15,13 +19,17 @@ class BrowserManager:
|
|
|
15
19
|
cls._instance.browser_pool = {}
|
|
16
20
|
return cls._instance
|
|
17
21
|
|
|
18
|
-
def create_browser(self) -> Tuple[int, ChromiumPage]:
|
|
22
|
+
def create_browser(self, user_agent: str = None) -> Tuple[int, ChromiumPage]:
|
|
19
23
|
"""创建新的浏览器实例"""
|
|
20
24
|
random_port = random.randint(9223, 9934)
|
|
21
25
|
while random_port in self.browser_pool:
|
|
22
26
|
random_port = random.randint(9223, 9934)
|
|
23
27
|
|
|
24
28
|
co = ChromiumOptions().set_local_port(random_port)
|
|
29
|
+
if user_agent:
|
|
30
|
+
co.set_user_agent(user_agent)
|
|
31
|
+
if platform.system() != 'Windows':
|
|
32
|
+
co.set_argument('--no-sandbox')
|
|
25
33
|
custom_data_dir = os.path.join(os.path.expanduser('~'), 'DrissionPage', "userData", f"{random_port}")
|
|
26
34
|
co.set_user_data_path(custom_data_dir) # 设置用户数据路径
|
|
27
35
|
# if not os.path.exists(custom_data_dir):
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
import threading
|
|
2
|
+
from collections import deque
|
|
3
|
+
import time
|
|
4
|
+
from DrissionPage import ChromiumPage, ChromiumOptions
|
|
5
|
+
from DrissionPage._pages.chromium_tab import ChromiumTab
|
|
6
|
+
from DrissionPage._units.listener import DataPacket
|
|
7
|
+
from typing import Tuple, Optional
|
|
8
|
+
import json
|
|
9
|
+
from urllib.parse import urlparse, urlunparse
|
|
10
|
+
|
|
11
|
+
# Maximum number of characters per response-body segment queued by check_data_packet.
one_turn_max_token = 16000
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class DPProxyClient:
    """Per-tab client that proxies a tab and collects its listened packets in a queue."""

    def __init__(self, driver: ChromiumTab, packet_filter: dict, self_kill=False):
        self.tab_id = driver.tab_id
        # Wrap the tab so that access to its listener goes through the proxy.
        self.driver = ChromePageProxy(driver, self)
        self.thread = None
        self.self_kill = self_kill
        self.packet_filter = packet_filter
        self.packet_queue = deque()

    def get_driver(self, start_listen, count=None, timeout=10) -> ChromiumTab:
        """
        Return the proxied driver/tab, optionally starting packet listening.

        :param start_listen: pass False when your own code already uses the automation
            framework's packet listening; pass True when nothing is listening yet
        :param count: total number of packets to capture, None means unlimited
        :param timeout: maximum wait in seconds between two packets, None means
            unlimited, default 10 seconds
        :return: the proxied driver
        """
        if not start_listen:
            return self.driver
        listener = self.driver.listen
        listener.set_targets(res_type=('xhr', 'fetch'))
        self.driver.listen.start()
        # Drain the listener on a background thread.
        worker = threading.Thread(target=self.start_listen, args=(count, timeout,))
        self.thread = worker
        worker.start()
        return self.driver

    def start_listen(self, count=None, timeout=10):
        """Iterate the listener's steps; the loop body itself does nothing."""
        step_iter = self.driver.listen.steps(count=count, timeout=timeout, gap=1)
        for _step in step_iter:
            continue

    def pop_first_packet(self):
        """Pop the left-most queued packet.

        :return: ``(remaining_queue_size, packet_json)``, or ``(0, None)`` when
            the queue is empty. All backslashes are removed from the JSON text.
        """
        if not self.packet_queue:
            return 0, None
        oldest = self.packet_queue.popleft()
        remaining = len(self.packet_queue)
        as_json = json.dumps(oldest, ensure_ascii=False, separators=(',', ':'))
        return remaining, as_json.replace("\\", "")
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class DPProxyClientManager:
    """Registry of per-tab proxy clients, keyed by tab id (singleton)."""
    _instance = None

    def __new__(cls):
        # Every construction returns the one shared instance; the pool is
        # created only the first time.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance.tab_pool = {}
        return cls._instance

    def create_client(self, tab: ChromiumTab, packet_filter: dict, self_kill=False) -> Tuple[
        str, DPProxyClient, ChromiumTab]:
        """Create a proxy client for ``tab``, start listening and register it.

        :param tab: the tab to wrap.
        :param packet_filter: filter settings stored on the client.
        :param self_kill: forwarded to the client.
        :return: ``(tab_id, client, proxied_tab)``.
        """
        client = DPProxyClient(tab, packet_filter, self_kill=self_kill)
        # Listening starts immediately, with a 10-minute timeout between packets.
        tab = client.get_driver(True, timeout=60 * 10)
        tab_id = tab.tab_id
        self.tab_pool[tab_id] = {"client": client, "driver": tab}
        return tab_id, client, tab

    def get_client(self, tab_id: str) -> Optional[DPProxyClient]:
        """Return the client registered for ``tab_id``, or None when the tab is unknown."""
        entry = self.tab_pool.get(tab_id)
        if entry is None:
            return None
        return entry.get("client", None)

    def remove_client(self, tab_id: str) -> Tuple[bool, Optional[dict]]:
        """Unregister ``tab_id``.

        :return: ``(removed, entry)`` where ``entry`` is the removed pool entry
            (a dict with "client" and "driver") or None when nothing was registered.
        """
        entry = self.tab_pool.pop(tab_id, None)
        return entry is not None, entry

    def list_clients(self) -> list[str]:
        """Return the tab ids of all registered clients."""
        return list(self.tab_pool.keys())
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
class ChromePageProxy:
    """Transparent proxy around a page/tab that wraps its ``listen`` attribute.

    Every attribute is forwarded to the wrapped page; only ``listen`` is
    returned wrapped in a DrissionPageListenerProxy bound to the client.
    """

    def __init__(self, page, client=None):
        # Written straight into __dict__ so that the two fields are found by
        # normal attribute lookup and never go through __getattr__.
        self.__dict__['page'] = page
        self.__dict__['client'] = client

    def __getattr__(self, item):
        # No debug printing here: this runs on every forwarded attribute
        # access, and stdout output would mix into anything that uses stdout
        # as a data channel (the server is started with transport="stdio").
        attr = getattr(self.page, item)
        if item == 'listen':
            return DrissionPageListenerProxy(attr, self.__dict__['client'])
        return attr
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class DrissionPageListenerProxy:
    """Proxy around a listener that records every packet seen by ``wait`` / ``steps``.

    All attributes are forwarded to the wrapped listener. ``wait`` and
    ``steps`` are returned as wrappers that pass each packet to
    ``check_data_packet`` together with the owning client.
    """

    def __init__(self, listener, client=None):
        self.listener = listener
        self.client = client

    def __getattr__(self, item):
        attr = getattr(self.listener, item)
        client = self.client

        if item == "wait":
            def wait_wrapper(*args, **kwargs):
                result = attr(*args, **kwargs)
                # The result may be a single packet, a list of packets, or a
                # falsy value when nothing arrived; only real packets are recorded.
                if isinstance(result, (list, tuple)):
                    packets = result
                elif result:
                    packets = [result]
                else:
                    packets = []
                for packet in packets:
                    check_data_packet(packet, client)
                return result

            return wait_wrapper

        if item == "steps":
            def steps_wrapper(*args, **kwargs):
                # Validate here, in a plain function, so an unsupported gap
                # fails at call time rather than on the first iteration.
                # Assumes gap is the third positional parameter of steps
                # (count, timeout, gap) — TODO confirm against the library.
                if "gap" in kwargs:
                    gap = kwargs["gap"]
                elif len(args) > 2:
                    gap = args[2]
                else:
                    gap = 1
                if gap > 1:
                    raise Exception("暂不支持多包监控")

                def relay():
                    for step in attr(*args, **kwargs):
                        check_data_packet(step, client)
                        yield step

                return relay()

            return steps_wrapper

        return attr
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def check_data_packet(packet: DataPacket, client: DPProxyClient):
    """
    Wrap a listened data packet and append it to the client's packet_queue.

    Packets whose domain or method is rejected by ``client.packet_filter`` are
    dropped. The JSON-serialised response body is cut into segments of at most
    ``one_turn_max_token`` characters; one queue entry is appended per segment,
    and only the last entry carries ``body_completed=True``.

    :param packet: the listened packet (url, request and response are read).
    :param client: owner of ``packet_filter`` and ``packet_queue``.
    :return: None
    """
    packet_filter = client.packet_filter
    domain_filter = packet_filter.get("domain_filter", None)
    method_filter = packet_filter.get("method_filter", ["GET", "POST"])
    if method_filter is None:
        # The key may be present with an explicit None; use the default then.
        method_filter = ["GET", "POST"]

    url = packet.url
    method = packet.request.method
    domain = urlparse(url).netloc
    # The filters do not depend on the segment, so decide once, up front.
    # A domain filter only applies when one was given.
    if domain_filter and domain not in domain_filter:
        return
    if method not in method_filter:
        return

    data = packet.request.postData if packet.request.hasPostData else None
    # NOTE(review): json.dumps raises TypeError for a bytes body — confirm
    # whether binary response bodies can reach this point.
    body_str = json.dumps(packet.response.body, ensure_ascii=False, separators=(',', ':'))
    segments = [body_str[i:i + one_turn_max_token] for i in range(0, len(body_str), one_turn_max_token)]
    last_index = len(segments) - 1
    for index, segment in enumerate(segments):
        client.packet_queue.append({
            "url": url,
            # True only for the final segment, so a consumer can tell when the
            # whole body has been delivered.
            "body_completed": index == last_index,
            "method": method,
            "request_data": data,
            "request_headers": dict(packet.request.headers),
            "response_headers": dict(packet.response.headers),
            # Backslashes are stripped from the segment text, as before.
            "response_body_segment": segment.replace("\\", ""),
        })
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
# Module-level manager instance; DPProxyClientManager always returns its single shared instance.
client_manager = DPProxyClientManager()
|
|
171
|
+
|
|
172
|
+
# if __name__ == '__main__':
|
|
173
|
+
# co = ChromiumOptions().set_user_agent(
|
|
174
|
+
# "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Mobile Safari/537.36")
|
|
175
|
+
# tab = ChromiumPage(co).latest_tab
|
|
176
|
+
# client = DPProxyClient(tab, self_kill=False)
|
|
177
|
+
# # client = CaptchaClient(tab, self_kill=True)
|
|
178
|
+
# tab = client.get_driver(True)
|
|
179
|
+
# url = "https://api.toutiaoapi.com/feoffline/hotspot_and_local/html/hot_list/index.html?client_extra_params=%7B%22custom_log_pb%22%3A%22%7B%5C%22style_id%5C%22%3A%5C%2240030%5C%22%2C%5C%22entrance_hotspot%5C%22%3A%5C%22search%5C%22%2C%5C%22location%5C%22%3A%5C%22hot_board%5C%22%2C%5C%22category_name%5C%22%3A%5C%22hotboard_light%5C%22%7D%22%7D&count=50&log_pb=%7B%22style_id%22%3A%2240030%22%2C%22entrance_hotspot%22%3A%22search%22%2C%22location%22%3A%22hot_board%22%2C%22category_name%22%3A%22hotboard_light%22%7D&only_hot_list=1&tab_name=stream&enter_keyword=%23%E7%BE%8E%E5%9B%BD%E9%80%80%E5%87%BA66%E4%B8%AA%E5%9B%BD%E9%99%85%E7%BB%84%E7%BB%87%23"
|
|
180
|
+
# tab.get(url)
|
|
181
|
+
# for _ in range(5056):
|
|
182
|
+
# new_packet = client.pop_first_packet()
|
|
183
|
+
# print(new_packet, "23")
|
|
184
|
+
# time.sleep(1)
|
|
@@ -7,6 +7,87 @@ from bs4 import BeautifulSoup
|
|
|
7
7
|
from curl_cffi import requests
|
|
8
8
|
from lxml import html, etree
|
|
9
9
|
|
|
10
|
+
compress_html_js = """
|
|
11
|
+
function getSimplifiedDOM(node) {
|
|
12
|
+
// 1. 处理文本节点
|
|
13
|
+
if (node.nodeType === Node.TEXT_NODE) {
|
|
14
|
+
const text = node.textContent.trim();
|
|
15
|
+
// 限制文本长度,避免大段文章消耗 token,保留前100个字符通常足够定位
|
|
16
|
+
return text ? text.slice(0, 100) + (text.length > 100 ? '...' : '') : null;
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
// 2. 过滤无用标签
|
|
20
|
+
const ignoreTags = ['SCRIPT', 'STYLE', 'NOSCRIPT', 'IFRAME', 'SVG', 'LINK', 'META'];
|
|
21
|
+
if (ignoreTags.includes(node.tagName)) return null;
|
|
22
|
+
if (node.nodeType !== Node.ELEMENT_NODE) return null;
|
|
23
|
+
|
|
24
|
+
// 3. 过滤不可见元素
|
|
25
|
+
const style = window.getComputedStyle(node);
|
|
26
|
+
if (style.display === 'none' || style.visibility === 'hidden' || style.opacity === '0') return null;
|
|
27
|
+
// 过滤宽高太小的元素(往往是埋点空像素)
|
|
28
|
+
const rect = node.getBoundingClientRect();
|
|
29
|
+
if (rect.width === 0 || rect.height === 0) return null;
|
|
30
|
+
|
|
31
|
+
// --- 开始构建标签字符串 ---
|
|
32
|
+
const tagName = node.tagName.toLowerCase();
|
|
33
|
+
let tagStr = tagName;
|
|
34
|
+
|
|
35
|
+
// A. 基础标识符 (ID 和 Class)
|
|
36
|
+
if (node.id) tagStr += `#${node.id}`;
|
|
37
|
+
if (node.className && typeof node.className === 'string') {
|
|
38
|
+
// 过滤掉 Tailwind 等太长且无语义的 class,保留有意义的业务 class
|
|
39
|
+
// 这里简单处理,全部保留,让 LLM 自己判断
|
|
40
|
+
const classes = node.className.trim().split(/\s+/);
|
|
41
|
+
if (classes.length > 0) tagStr += `.${classes.join('.')}`;
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
// B. 关键属性白名单 (这是你指出问题的核心修复)
|
|
45
|
+
const props = [];
|
|
46
|
+
|
|
47
|
+
// 通用重要属性
|
|
48
|
+
if (node.getAttribute('role')) props.push(`role="${node.getAttribute('role')}"`);
|
|
49
|
+
if (node.getAttribute('aria-label')) props.push(`aria-label="${node.getAttribute('aria-label')}"`);
|
|
50
|
+
if (node.getAttribute('title')) props.push(`title="${node.getAttribute('title')}"`);
|
|
51
|
+
|
|
52
|
+
// 特定标签的特定属性
|
|
53
|
+
if (tagName === 'a') {
|
|
54
|
+
const href = node.getAttribute('href');
|
|
55
|
+
// 只保留有意义的链接,忽略 javascript:;
|
|
56
|
+
if (href && !href.startsWith('javascript')) props.push(`href="${href}"`);
|
|
57
|
+
} else if (tagName === 'input' || tagName === 'textarea' || tagName === 'select') {
|
|
58
|
+
if (node.getAttribute('type')) props.push(`type="${node.getAttribute('type')}"`);
|
|
59
|
+
if (node.getAttribute('name')) props.push(`name="${node.getAttribute('name')}"`);
|
|
60
|
+
if (node.getAttribute('placeholder')) props.push(`placeholder="${node.getAttribute('placeholder')}"`);
|
|
61
|
+
if (node.disabled) props.push('disabled');
|
|
62
|
+
if (node.checked) props.push('checked');
|
|
63
|
+
} else if (tagName === 'button') {
|
|
64
|
+
if (node.getAttribute('type')) props.push(`type="${node.getAttribute('type')}"`);
|
|
65
|
+
} else if (tagName === 'img') {
|
|
66
|
+
if (node.getAttribute('alt')) props.push(`alt="${node.getAttribute('alt')}"`);
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
if (props.length > 0) {
|
|
70
|
+
tagStr += ` ${props.join(' ')}`;
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
// 4. 递归子节点
|
|
74
|
+
const children = Array.from(node.childNodes)
|
|
75
|
+
.map(getSimplifiedDOM)
|
|
76
|
+
.filter(n => n !== null);
|
|
77
|
+
|
|
78
|
+
// 5. 组装输出
|
|
79
|
+
// 如果没有子节点,也没有ID/Class,也不是输入框/图片/链接,那这个标签可能只是布局用的 div,可以考虑跳过它直接返回子节点内容
|
|
80
|
+
// 但为了保持结构完整,我们暂时保留它
|
|
81
|
+
if (children.length === 0) {
|
|
82
|
+
// 自闭合标签或空标签
|
|
83
|
+
return `<${tagStr} />`;
|
|
84
|
+
}
|
|
85
|
+
return `<${tagStr}>${children.join('')}</${tagName}>`; // 结束标签只保留 tagName 节省 token
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
return getSimplifiedDOM(document.body);
|
|
89
|
+
"""
|
|
90
|
+
|
|
10
91
|
|
|
11
92
|
# 使用requests获取html,用于测试是否使用了瑞数和jsl
|
|
12
93
|
def requests_html(url):
|
|
@@ -53,6 +134,15 @@ def compress_html(content, only_text=False):
|
|
|
53
134
|
for meta in doc.xpath('//meta'):
|
|
54
135
|
meta.getparent().remove(meta)
|
|
55
136
|
|
|
137
|
+
for svg in doc.xpath('//svg'):
|
|
138
|
+
# 获取 SVG 内的文本内容
|
|
139
|
+
text_content = svg.text_content()
|
|
140
|
+
# 创建一个新的文本节点替换 SVG
|
|
141
|
+
parent = svg.getparent()
|
|
142
|
+
if parent is not None:
|
|
143
|
+
parent.text = (parent.text or '') + text_content
|
|
144
|
+
parent.remove(svg)
|
|
145
|
+
|
|
56
146
|
# 删除 style 属性
|
|
57
147
|
for element in doc.xpath('//*[@style]'):
|
|
58
148
|
element.attrib.pop('style')
|
|
@@ -1,166 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
import os
|
|
3
|
-
from typing import Any
|
|
4
|
-
|
|
5
|
-
from DrissionPage._elements.none_element import NoneElement
|
|
6
|
-
from fastmcp import FastMCP
|
|
7
|
-
|
|
8
|
-
from tools.tools import compress_html, requests_html, dp_headless_html, assert_waf_cookie, dp_mcp_message_pack
|
|
9
|
-
|
|
10
|
-
html_source_code_local_save_path = os.path.join(os.getcwd(), "html-source-code")
|
|
11
|
-
waf_status_code_dict = {
|
|
12
|
-
412: "瑞数",
|
|
13
|
-
521: "加速乐"
|
|
14
|
-
}
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
def register_visit_url(mcp: FastMCP, browser_manager):
|
|
18
|
-
@mcp.tool(name="visit_url", description="使用Drissionpage打开url访问某个网站")
|
|
19
|
-
async def visit_url(url: str) -> dict[str, Any]:
|
|
20
|
-
port, _browser = browser_manager.create_browser()
|
|
21
|
-
tab = _browser.get_tab()
|
|
22
|
-
tab.get(url)
|
|
23
|
-
tab_id = tab.tab_id
|
|
24
|
-
return dp_mcp_message_pack(
|
|
25
|
-
f"已在[{port}]端口创建浏览器对象,并已打开链接:{url}",
|
|
26
|
-
tab_id=tab_id,
|
|
27
|
-
browser_port=port
|
|
28
|
-
)
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
def register_get_html(mcp: FastMCP, browser_manager):
|
|
32
|
-
@mcp.tool(name="get_html", description="使用Drissionpage获取某一个tab页的html")
|
|
33
|
-
async def get_html(browser_port: int, tab_id: str) -> dict[str, Any]:
|
|
34
|
-
_browser = browser_manager.get_browser(browser_port)
|
|
35
|
-
tab = _browser.get_tab(tab_id)
|
|
36
|
-
file_name = tab.title + f"_{tab_id}.html"
|
|
37
|
-
if not os.path.exists(html_source_code_local_save_path):
|
|
38
|
-
os.makedirs(html_source_code_local_save_path)
|
|
39
|
-
abs_path = os.path.join(html_source_code_local_save_path, file_name)
|
|
40
|
-
with open(abs_path, "w", encoding="utf-8") as f:
|
|
41
|
-
min_html, compress_rate = compress_html(tab.html)
|
|
42
|
-
f.write(min_html)
|
|
43
|
-
return dp_mcp_message_pack(f"已保存tab页:【{tab_id}】的html源码", tab_id=tab_id, html_local_path=abs_path)
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
def register_get_new_tab(mcp: FastMCP, browser_manager):
|
|
47
|
-
@mcp.tool(name="get_new_tab", description="使用Drissionpage创建一个新的tab页,在新的tab页中打开url")
|
|
48
|
-
async def get_new_tab(browser_port: int, url: str) -> dict[str, Any]:
|
|
49
|
-
_browser = browser_manager.get_browser(browser_port)
|
|
50
|
-
tab = _browser.new_tab(url)
|
|
51
|
-
_browser.activate_tab(tab)
|
|
52
|
-
tab_id = tab.tab_id
|
|
53
|
-
return dp_mcp_message_pack(f"已创建新的tab页,并打开链接:{url}", tab_id=tab_id)
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
def register_switch_tab(mcp: FastMCP, browser_manager):
|
|
57
|
-
@mcp.tool(name="switch_tab", description="根据传入的tab_id切换到对应的tab页", )
|
|
58
|
-
async def switch_tab(browser_port: int, tab_id: str) -> dict[str, Any]:
|
|
59
|
-
_browser = browser_manager.get_browser(browser_port)
|
|
60
|
-
_browser.activate_tab(tab_id)
|
|
61
|
-
return dp_mcp_message_pack(f"已将tab页:【{tab_id}】切换至最前端")
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
def register_close_tab(mcp: FastMCP, browser_manager):
|
|
65
|
-
@mcp.tool(name="close_tab", description="根据传入的tab_id关闭tab页", )
|
|
66
|
-
async def close_tab(browser_port, tab_id) -> dict[str, Any]:
|
|
67
|
-
_browser = browser_manager.get_browser(browser_port)
|
|
68
|
-
_browser.close_tabs(tab_id)
|
|
69
|
-
return dp_mcp_message_pack(f"已将tab页:【{tab_id}】关闭")
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
def register_check_selector(mcp: FastMCP, browser_manager):
|
|
73
|
-
@mcp.tool(name="check_selector", description="查找tab页中是否包含元素,并返回元素attr_name所对应的值")
|
|
74
|
-
async def check_selector(browser_port: int, tab_id: str, css_selector: str, attr_name: str = "text") -> dict[str, Any]:
|
|
75
|
-
_browser = browser_manager.get_browser(browser_port)
|
|
76
|
-
target_tab = _browser.get_tab(tab_id)
|
|
77
|
-
css_selector = css_selector
|
|
78
|
-
if "css:" not in css_selector:
|
|
79
|
-
css_selector = "css:" + css_selector
|
|
80
|
-
target_eles = target_tab.eles(css_selector)
|
|
81
|
-
exist_flag = False
|
|
82
|
-
if len(target_eles) != 0:
|
|
83
|
-
exist_flag = True
|
|
84
|
-
if attr_name == "text":
|
|
85
|
-
ele_text_list = [i.text.replace("\n", "") for i in target_eles]
|
|
86
|
-
attr_output = "\n".join(ele_text_list)
|
|
87
|
-
else:
|
|
88
|
-
attr_output = json.dumps([i.attr(attr_name) for i in target_eles])
|
|
89
|
-
return dp_mcp_message_pack(
|
|
90
|
-
f"已完成tab页:【{tab_id}】对:【{css_selector}】的检查",
|
|
91
|
-
tab_id=tab_id,
|
|
92
|
-
selector=css_selector,
|
|
93
|
-
selector_ele_exist=exist_flag,
|
|
94
|
-
attr_output=attr_output
|
|
95
|
-
)
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
def register_quit_browser(mcp: FastMCP, browser_manager):
|
|
99
|
-
@mcp.tool(name="quit_browser", description="退出浏览器会话,关闭浏览器")
|
|
100
|
-
async def quit_browser(browser_port: int) -> dict[str, Any]:
|
|
101
|
-
flag, _browser = browser_manager.remove_page(browser_port)
|
|
102
|
-
if flag:
|
|
103
|
-
_browser.quit()
|
|
104
|
-
return dp_mcp_message_pack(
|
|
105
|
-
f"浏览器[{browser_port}],退出会话,关闭浏览器{'成功' if flag else '失败'}",
|
|
106
|
-
browser_port=browser_port,
|
|
107
|
-
quit_flag=flag
|
|
108
|
-
)
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
def register_assert_waf(mcp: FastMCP, browser_manager):
|
|
112
|
-
@mcp.tool(name="assert_waf",
|
|
113
|
-
description="通过对比requests、有头浏览器、无头浏览器获取到的html,判断网页是否使用了waf以及是否为动态渲染的网页")
|
|
114
|
-
async def assert_waf(browser_port: int, tab_id: str) -> dict[str, Any]:
|
|
115
|
-
_browser = browser_manager.get_browser(browser_port)
|
|
116
|
-
target_tab = _browser.get_tab(tab_id)
|
|
117
|
-
recommend_team = "drissionpage_head"
|
|
118
|
-
head_cookies = target_tab.cookies()
|
|
119
|
-
# 通过cookie判断是否有waf
|
|
120
|
-
waf_flag, waf_type = assert_waf_cookie(head_cookies)
|
|
121
|
-
head_html = target_tab.html
|
|
122
|
-
min_head_html, head_rate = compress_html(head_html, only_text=True)
|
|
123
|
-
raw_html, status_code = requests_html(target_tab.url)
|
|
124
|
-
min_raw_html, raw_rate = compress_html(raw_html, only_text=True)
|
|
125
|
-
r_h_rate_diff = abs(head_rate - raw_rate)
|
|
126
|
-
# 如果有已知的防火墙,则不浪费时间使用无头获取html和压缩比了
|
|
127
|
-
if waf_flag or status_code in waf_status_code_dict.keys():
|
|
128
|
-
return dp_mcp_message_pack(
|
|
129
|
-
f"已完成tab页:【{tab_id}】的分析,该tab页存在waf",
|
|
130
|
-
tab_id=tab_id,
|
|
131
|
-
recommend_team=recommend_team,
|
|
132
|
-
raw_head_rate_difference=r_h_rate_diff,
|
|
133
|
-
raw_headless_rate_difference=0,
|
|
134
|
-
head_headless_rate_difference=0
|
|
135
|
-
)
|
|
136
|
-
|
|
137
|
-
headless_html = dp_headless_html(target_tab.url)
|
|
138
|
-
min_headless_html, headless_rate = compress_html(headless_html, only_text=True)
|
|
139
|
-
r_hless_rate_diff = abs(raw_rate - headless_rate)
|
|
140
|
-
h_hless_rate_diff = abs(head_rate - headless_rate)
|
|
141
|
-
# 最优情况:requests,dp有头和无头拿到的结果基本一致,认定为没有防护的静态网页
|
|
142
|
-
if r_h_rate_diff < 40 and r_hless_rate_diff < 40 and h_hless_rate_diff < 40:
|
|
143
|
-
recommend_team = "requests"
|
|
144
|
-
# 最差情况:requests,dp有头和无头拿到的结果差距都很大,认定为有浏览器无头检测+动态网页
|
|
145
|
-
# if r_h_rate_diff < 40 and r_hless_rate_diff < 40 and h_hless_rate_diff < 40:
|
|
146
|
-
# 较差1:dp有头和无头差距很小,但是requests拿不到正确结果,认定为有requests防护 or 动态网页
|
|
147
|
-
elif h_hless_rate_diff < 30 and r_hless_rate_diff > 40:
|
|
148
|
-
recommend_team = "drissionpage_headless"
|
|
149
|
-
# 较差2:有头和无头差距很大,但是requests和有头拿到的结果基本一致,认定为要么有别的没有防护requests的waf,或者间歇性的瑞数【此时应该拿有头的cookie去判断其中是否有瑞数特征,上面已经做了】
|
|
150
|
-
# if r_h_rate_diff < 15 and h_hless_rate_diff > 40:
|
|
151
|
-
return dp_mcp_message_pack(
|
|
152
|
-
f"已完成tab页:【{tab_id}】的分析,该tab页存在waf",
|
|
153
|
-
tab_id=tab_id,
|
|
154
|
-
recommend_team=recommend_team,
|
|
155
|
-
raw_head_rate_difference=r_h_rate_diff,
|
|
156
|
-
raw_headless_rate_difference=h_hless_rate_diff,
|
|
157
|
-
head_headless_rate_difference=h_hless_rate_diff
|
|
158
|
-
)
|
|
159
|
-
|
|
160
|
-
# def register_highlight_element_captcha(mcp: FastMCP, browser_manager):
|
|
161
|
-
# @mcp.tool(name="highlight_element_captcha",
|
|
162
|
-
# description="将传入的Selector在页面上高亮,并截屏")
|
|
163
|
-
# async def highlight_element_captcha(browser_port: int, tab_id: str, selector: str) -> dict[str, Any]:
|
|
164
|
-
# _browser = browser_manager.get_browser(browser_port)
|
|
165
|
-
# tab = _browser.get_tab(tab_id)
|
|
166
|
-
# tab.ele
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|