Jarvis-Brain 0.1.7.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,10 @@
1
+ Metadata-Version: 2.4
2
+ Name: Jarvis_Brain
3
+ Version: 0.1.7.7
4
+ Summary: Jarvis brain mcp
5
+ Requires-Python: >=3.10
6
+ Requires-Dist: beautifulsoup4
7
+ Requires-Dist: curl-cffi
8
+ Requires-Dist: drissionpage
9
+ Requires-Dist: fastmcp
10
+ Requires-Dist: minify-html
@@ -0,0 +1,11 @@
1
+ mcp_tools/__init__.py,sha256=nqhlHRalTYQF8gU5RBfanOn-zSONLYON1mjPIvP-f4w,113
2
+ mcp_tools/dp_tools.py,sha256=O51cz2ectsbJJ6fN9X1YarSC7JereXeUWFtehx71E3Q,10796
3
+ mcp_tools/main.py,sha256=Fdt2N3oKGwvruuno_ywnuWSlm1BexE9ZY669H2LTo9w,1056
4
+ tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ tools/browser_manager.py,sha256=EXM7n-sDOtdQGpWkVTAZHWhepVU-7PAoUTDNgGF9_fQ,1938
6
+ tools/browser_proxy.py,sha256=cdMRxcUYyaOqGU17lldltHOvt9rxXD5Dwh7hBXEBby4,6780
7
+ tools/tools.py,sha256=TaWs-CNXy-py9BFmCnJrQ09ke938xXpImf-N2Qo_Rvc,4708
8
+ jarvis_brain-0.1.7.7.dist-info/METADATA,sha256=u_pB128wsC-0P2-hKsYlgkZFaDZ45dCoM81ZdnZDRdU,241
9
+ jarvis_brain-0.1.7.7.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
10
+ jarvis_brain-0.1.7.7.dist-info/entry_points.txt,sha256=YFQT4xpkUqt5dM5wlKPQQOqcjMuFrT9iuRAzIpAyH7U,51
11
+ jarvis_brain-0.1.7.7.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.28.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ jarvis-mcp = mcp_tools.main:main
mcp_tools/__init__.py ADDED
@@ -0,0 +1,4 @@
1
# Leftover smoke-test snippet in the package __init__: prints the list length
# followed by each (index, value) pair.
a = list("abcdefgh")
print(len(a))
for position, letter in enumerate(a):
    print(position, letter)
mcp_tools/dp_tools.py ADDED
@@ -0,0 +1,197 @@
1
+ import hashlib
2
+ import json
3
+ import os
4
+ from typing import Any
5
+
6
+ from fastmcp import FastMCP
7
+
8
+ from tools.browser_manager import BrowserManager
9
+ from tools.tools import compress_html, requests_html, dp_headless_html, assert_waf_cookie, dp_mcp_message_pack
10
+ from tools.browser_proxy import DPProxyClient, DPProxyClientManager
11
+
12
# Directory where fetched HTML segments are written for later inspection.
html_source_code_local_save_path = os.path.join(os.getcwd(), "html-source-code")
# HTTP status codes that known WAF products answer with
# (412 -> RiverSecurity "瑞数", 521 -> knownsec "加速乐").
waf_status_code_dict = {
    412: "瑞数",
    521: "加速乐"
}
# Per-turn segment cap so a single HTML chunk never exceeds the model's
# maximum input length.
one_turn_max_token = 20000
19
+
20
+
21
def register_visit_url(mcp: FastMCP, browser_manager: BrowserManager, client_manager: DPProxyClientManager):
    """Register the `visit_url` MCP tool: open a URL in a fresh browser and
    start XHR capture on its initial tab."""

    @mcp.tool(name="visit_url",
              description="使用Drissionpage打开url访问某个网站,并开始监听初始tab页的所有的XHR请求,当需要使用手机版浏览器Ua时use_mobile_user_agent为True")
    async def visit_url(url: str, use_mobile_user_agent=False) -> dict[str, Any]:
        # Only override the UA when a mobile browser was explicitly requested.
        ua = (
            "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Mobile Safari/537.36"
            if use_mobile_user_agent else None
        )
        port, browser = browser_manager.create_browser(ua)
        tab = browser.get_tab()
        # Attach the XHR-capturing proxy before navigating.
        client_manager.create_client(tab)
        tab.get(url)
        return dp_mcp_message_pack(
            f"已在[{port}]端口创建浏览器对象,并已打开链接:{url},打开的模式是:{'手机版' if use_mobile_user_agent else '电脑版'}",
            tab_id=tab.tab_id,
            browser_port=port
        )
38
+
39
+
40
def register_get_html(mcp: FastMCP, browser_manager, client_manager: DPProxyClientManager):
    """Register the `get_html` MCP tool: dump a tab's (compressed) HTML to
    disk in fixed-size segments and return the file paths."""

    @mcp.tool(name="get_html", description="使用Drissionpage获取某一个tab页的html")
    async def get_html(browser_port: int, tab_id: str) -> dict[str, Any]:
        browser = browser_manager.get_browser(browser_port)
        tab = browser.get_tab(tab_id)
        # File names are keyed by an MD5 of the page title plus the tab id.
        prefix = hashlib.md5(str(tab.title).encode('utf-8')).hexdigest()
        os.makedirs(html_source_code_local_save_path, exist_ok=True)
        minified, _rate = compress_html(tab.html)
        # Split so no single segment exceeds the per-turn token budget.
        segments = [minified[start:start + one_turn_max_token]
                    for start in range(0, len(minified), one_turn_max_token)]
        saved_paths = []
        for seg_no, segment in enumerate(segments):
            abs_path = os.path.join(html_source_code_local_save_path,
                                    prefix + f"_{tab_id}_segment{seg_no}.html")
            with open(abs_path, "w", encoding="utf-8") as fh:
                fh.write(segment)
            saved_paths.append(abs_path)
        return dp_mcp_message_pack(
            f"已保存tab页:【{tab_id}】的html源码片段共{len(saved_paths)}个",
            tab_id=tab_id,
            htmls_local_path=saved_paths
        )
59
+
60
+
61
def register_get_new_tab(mcp: FastMCP, browser_manager, client_manager: DPProxyClientManager):
    """Register the `get_new_tab` MCP tool: open a URL in a new tab of an
    existing browser and start XHR capture on it."""

    @mcp.tool(name="get_new_tab",
              description="使用Drissionpage创建一个新的tab页,在新的tab页中打开url,并开始监听新的tab页的所有XHR请求")
    async def get_new_tab(browser_port: int, url: str) -> dict[str, Any]:
        browser = browser_manager.get_browser(browser_port)
        fresh_tab = browser.new_tab()
        # Begin capture before navigation so the first requests are seen.
        client_manager.create_client(fresh_tab)
        fresh_tab.get(url)
        browser.activate_tab(fresh_tab)
        return dp_mcp_message_pack(f"已创建新的tab页,并打开链接:{url}", tab_id=fresh_tab.tab_id)
72
+
73
+
74
def register_switch_tab(mcp: FastMCP, browser_manager, client_manager: DPProxyClientManager):
    """Register the `switch_tab` MCP tool: bring a tab to the foreground."""

    @mcp.tool(name="switch_tab", description="根据传入的tab_id切换到对应的tab页")
    async def switch_tab(browser_port: int, tab_id: str) -> dict[str, Any]:
        browser_manager.get_browser(browser_port).activate_tab(tab_id)
        return dp_mcp_message_pack(f"已将tab页:【{tab_id}】切换至最前端")
80
+
81
+
82
def register_close_tab(mcp: FastMCP, browser_manager, client_manager: DPProxyClientManager):
    """Register the `close_tab` MCP tool: close one tab by id."""

    @mcp.tool(name="close_tab", description="根据传入的tab_id关闭tab页")
    async def close_tab(browser_port, tab_id) -> dict[str, Any]:
        browser_manager.get_browser(browser_port).close_tabs(tab_id)
        return dp_mcp_message_pack(f"已将tab页:【{tab_id}】关闭")
88
+
89
+
90
def register_check_selector(mcp: FastMCP, browser_manager, client_manager: DPProxyClientManager):
    """Register the `check_selector` MCP tool: test whether a CSS selector
    matches in a tab and return each match's `attr_name` value (truncated)."""

    @mcp.tool(name="check_selector", description="查找tab页中是否包含元素,并返回元素attr_name所对应的值")
    async def check_selector(browser_port: int, tab_id: str, css_selector: str,
                             attr_name: str = "text") -> dict[str, Any]:
        _browser = browser_manager.get_browser(browser_port)
        target_tab = _browser.get_tab(tab_id)
        # DrissionPage needs the "css:" prefix for raw CSS selectors.
        if "css:" not in css_selector:
            css_selector = "css:" + css_selector
        target_eles = target_tab.eles(css_selector)
        exist_flag = len(target_eles) != 0
        # Bug fixes vs. previous version:
        #  - attr_output was unbound (UnboundLocalError) when nothing matched;
        #  - the truncation comprehension iterated a *string* char-by-char and
        #    divided the budget by the string length instead of the element
        #    count. Each element now gets an equal share of the per-turn
        #    budget, suffixed with "......" to mark truncation.
        attr_output = []
        if exist_flag:
            if attr_name == "text":
                values = [ele.text.replace("\n", "") for ele in target_eles]
            else:
                values = [ele.attr(attr_name) for ele in target_eles]
            per_ele_limit = one_turn_max_token // len(target_eles)
            attr_output = [str(v or "")[:per_ele_limit] + "......" for v in values]
        return dp_mcp_message_pack(
            f"已完成tab页:【{tab_id}】对:【{css_selector}】的检查",
            tab_id=tab_id,
            selector=css_selector,
            selector_ele_exist=exist_flag,
            attr_output=attr_output
        )
117
+
118
+
119
def register_quit_browser(mcp: FastMCP, browser_manager, client_manager: DPProxyClientManager):
    """Register the `quit_browser` MCP tool: remove a browser from the pool
    and shut it down."""

    @mcp.tool(name="quit_browser", description="退出浏览器会话,关闭浏览器")
    async def quit_browser(browser_port: int) -> dict[str, Any]:
        # Bug fix: BrowserManager exposes remove_browser(), not remove_page();
        # the previous call raised AttributeError at runtime.
        flag, _browser = browser_manager.remove_browser(browser_port)
        if flag:
            _browser.quit()
        return dp_mcp_message_pack(
            f"浏览器[{browser_port}],退出会话,关闭浏览器{'成功' if flag else '失败'}",
            browser_port=browser_port,
            quit_flag=flag
        )
130
+
131
+
132
def register_pop_first_packet(mcp: FastMCP, browser_manager, client_manager: DPProxyClientManager):
    """Register the `pop_first_packet` MCP tool: pop one captured XHR packet
    segment from the tab's queue (FIFO)."""

    @mcp.tool(name="pop_first_packet",
              description="每调用一次就会弹出传入的tab页所监听到的数据包中的第一个packet_message,当一个packet_message的response body过长时会被切分成多个包,具体一个请求是否还有下一个包,可以参考body_completed字段")
    async def pop_first_packet(browser_port: int, tab_id: str) -> dict[str, Any]:
        # (removed an unused browser_manager.get_browser() lookup — the queue
        # lives on the per-tab client, not on the browser)
        client = client_manager.get_client(tab_id)
        packet_message = client.pop_first_packet()
        # Bug fix: the success message previously ended with a trailing comma,
        # turning it into a 1-tuple instead of a string.
        if packet_message:
            message = f"tab页:【{tab_id}】,监听到XHR数据包"
        else:
            message = f"tab页:【{tab_id}】,暂时没有监听到XHR数据包"
        return dp_mcp_message_pack(
            message,
            browser_port=browser_port,
            tab_id=tab_id,
            packet_message=packet_message
        )
148
+
149
+
150
def register_assert_waf(mcp: FastMCP, browser_manager, client_manager: DPProxyClientManager):
    """Register the `assert_waf` MCP tool: compare HTML fetched via requests,
    a headed browser and a headless browser to guess at WAF protection and
    dynamic rendering, and recommend a scraping approach."""

    @mcp.tool(name="assert_waf",
              description="通过对比requests、有头浏览器、无头浏览器获取到的html,判断网页是否使用了waf以及是否为动态渲染的网页")
    async def assert_waf(browser_port: int, tab_id: str) -> dict[str, Any]:
        _browser = browser_manager.get_browser(browser_port)
        target_tab = _browser.get_tab(tab_id)
        recommend_team = "drissionpage_head"  # default: headed browser
        head_cookies = target_tab.cookies()
        # Known WAF products leave recognisable cookies in the headed session.
        waf_flag, waf_type = assert_waf_cookie(head_cookies)
        head_html = target_tab.html
        min_head_html, head_rate = compress_html(head_html, only_text=True)
        raw_html, status_code = requests_html(target_tab.url)
        min_raw_html, raw_rate = compress_html(raw_html, only_text=True)
        r_h_rate_diff = abs(head_rate - raw_rate)
        # With a known WAF there is no point spending time on the headless
        # fetch and its compression ratio.
        if waf_flag or status_code in waf_status_code_dict:
            return dp_mcp_message_pack(
                f"已完成tab页:【{tab_id}】的分析,该tab页存在waf",
                tab_id=tab_id,
                recommend_team=recommend_team,
                raw_head_rate_difference=r_h_rate_diff,
                raw_headless_rate_difference=0,
                head_headless_rate_difference=0
            )

        headless_html = dp_headless_html(target_tab.url)
        min_headless_html, headless_rate = compress_html(headless_html, only_text=True)
        r_hless_rate_diff = abs(raw_rate - headless_rate)
        h_hless_rate_diff = abs(head_rate - headless_rate)
        # Best case: requests, headed and headless all roughly agree ->
        # unprotected static page, use plain requests.
        if r_h_rate_diff < 40 and r_hless_rate_diff < 40 and h_hless_rate_diff < 40:
            recommend_team = "requests"
        # Headed and headless agree but requests differs -> requests is being
        # blocked, or the page is dynamically rendered; recommend headless.
        elif h_hless_rate_diff < 30 and r_hless_rate_diff > 40:
            recommend_team = "drissionpage_headless"
        # Remaining case (headed != headless while requests ~= headed) falls
        # through to the headed default set above; the cookie check earlier
        # already covered the intermittent-RiverSecurity variant.
        return dp_mcp_message_pack(
            # Bug fix: this branch previously claimed a WAF was present even
            # when the analysis found none.
            f"已完成tab页:【{tab_id}】的分析",
            tab_id=tab_id,
            recommend_team=recommend_team,
            raw_head_rate_difference=r_h_rate_diff,
            # Bug fix: this field previously duplicated the head/headless diff.
            raw_headless_rate_difference=r_hless_rate_diff,
            head_headless_rate_difference=h_hless_rate_diff
        )
mcp_tools/main.py ADDED
@@ -0,0 +1,33 @@
1
+ from fastmcp import FastMCP
2
+
3
+ from mcp_tools.dp_tools import *
4
+ from tools.browser_manager import browser_manager
5
+ from tools.browser_proxy import client_manager
6
+
7
mcp = FastMCP("Jarvis Brain Mcp Tools")

# Select which tool groups to register from the MCP_MODULES env var
# (comma-separated; defaults to the DrissionPage tool set).
# NOTE(review): `os` is only in scope here via the star-import from
# mcp_tools.dp_tools — an explicit `import os` would be more robust; confirm.
enabled_modules = os.getenv("MCP_MODULES", "TeamNode-Dp").split(",")
# NOTE(review): base_cwd is read but not used anywhere in this module.
base_cwd = os.getenv("BASE_CWD", os.path.expanduser('~'))

if "TeamNode-Dp" in enabled_modules:
    # Tab management
    register_close_tab(mcp, browser_manager, client_manager)
    register_switch_tab(mcp, browser_manager, client_manager)
    register_get_new_tab(mcp, browser_manager, client_manager)
    # Features
    register_visit_url(mcp, browser_manager, client_manager)
    register_get_html(mcp, browser_manager, client_manager)
    register_check_selector(mcp, browser_manager, client_manager)
    register_pop_first_packet(mcp, browser_manager, client_manager)

if "JarvisNode" in enabled_modules:
    register_assert_waf(mcp, browser_manager, client_manager)
26
+
27
+
28
def main():
    """Entry point for the `jarvis-mcp` console script: serve MCP over stdio."""
    mcp.run(transport="stdio")


if __name__ == '__main__':
    main()
tools/__init__.py ADDED
File without changes
@@ -0,0 +1,52 @@
1
+ """浏览器池管理模块 - 单例模式确保状态共享"""
2
+ import random
3
+ from typing import Optional, Tuple
4
+ import os
5
+ from DrissionPage import ChromiumPage, ChromiumOptions
6
+ import platform
7
+
8
+
9
class BrowserManager:
    """Singleton pool of ChromiumPage instances keyed by their debug port."""
    _instance = None

    def __new__(cls):
        # Classic singleton: the first construction creates the shared pool.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance.browser_pool = {}  # port -> ChromiumPage
        return cls._instance

    def create_browser(self, user_agent: str = None) -> "Tuple[int, ChromiumPage]":
        """Launch a browser on a free random port and register it.

        :param user_agent: optional UA override (e.g. a mobile UA)
        :return: (port, ChromiumPage)
        """
        port = random.randint(9223, 9934)
        while port in self.browser_pool:
            port = random.randint(9223, 9934)

        co = ChromiumOptions().set_local_port(port)
        if user_agent:
            co.set_user_agent(user_agent)
        if platform.system() != 'Windows':
            # Required when Chromium runs as root (typical Linux containers).
            co.set_argument('--no-sandbox')
        # Per-port user-data dir keeps concurrent browser profiles isolated.
        custom_data_dir = os.path.join(os.path.expanduser('~'), 'DrissionPage', "userData", f"{port}")
        co.set_user_data_path(custom_data_dir)
        self.browser_pool[port] = ChromiumPage(co)
        return port, self.browser_pool[port]

    def get_browser(self, port: int) -> "Optional[ChromiumPage]":
        """Return the browser registered on *port*, or None."""
        return self.browser_pool.get(port)

    def remove_browser(self, port: int) -> "Tuple[bool, Optional[ChromiumPage]]":
        """Pop the browser on *port*; returns (found, browser-or-None)."""
        browser = self.browser_pool.pop(port, None)
        return browser is not None, browser

    # Backward-compatible alias: mcp_tools.dp_tools calls remove_page(),
    # which previously did not exist and raised AttributeError.
    remove_page = remove_browser

    def list_browsers(self) -> "list[int]":
        """List the ports of all live browsers."""
        return list(self.browser_pool.keys())


# Global singleton shared across modules.
browser_manager = BrowserManager()
tools/browser_proxy.py ADDED
@@ -0,0 +1,171 @@
1
+ import threading
2
+ from collections import deque
3
+ import time
4
+ from DrissionPage import ChromiumPage, ChromiumOptions
5
+ from DrissionPage._pages.chromium_tab import ChromiumTab
6
+ from DrissionPage._units.listener import DataPacket
7
+ from typing import Tuple, Optional
8
+ import json
9
+
10
+ one_turn_max_token = 20000
11
+
12
+
13
class DPProxyClient:
    # Wraps one ChromiumTab: hands callers a proxied driver and accumulates
    # every captured XHR packet (see check_data_packet) in packet_queue.

    def __init__(self, driver: ChromiumTab, self_kill=False):
        self.tab_id = driver.tab_id
        # Proxy the tab so access to `listen` can be intercepted and wrapped.
        self.driver = ChromePageProxy(driver, self)
        self.thread = None
        self.self_kill = self_kill
        # self.packet_list = []
        self.packet_queue = deque()

    def get_driver(self, start_listen, count=None, timeout=10) -> ChromiumTab:
        """
        Return the proxied driver/tab.
        :param start_listen: set False if your own code already uses the
            framework's packet-listening API; otherwise this MUST be True
        :param count: total number of packets to capture; None means unlimited
        :param timeout: max seconds to wait between two packets; None means
            unlimited, default 10
        :return: the proxied ChromiumTab
        """
        if start_listen:
            self.driver.listen.set_targets(res_type="XHR")
            self.driver.listen.start()
            # Drain the listener on a background thread so packets are
            # collected without blocking the caller.
            self.thread = threading.Thread(target=self.start_listen, args=(count, timeout,))
            self.thread.start()
        return self.driver

    def start_listen(self, count=None, timeout=10):
        # Consuming the (wrapped) steps() generator is what records each
        # packet — the listener proxy pushes every yielded step into
        # packet_queue; the steps themselves are discarded here.
        for _ in self.driver.listen.steps(count=count, timeout=timeout, gap=1):
            pass

    # Each call pops exactly one packet from the left end of the queue.
    def pop_first_packet(self):
        if self.packet_queue:
            result = self.packet_queue.popleft()
            return json.dumps(result, ensure_ascii=False)
        else:
            return None
48
+
49
+
50
class DPProxyClientManager:
    """Singleton registry mapping tab_id -> {"client": DPProxyClient,
    "driver": proxied tab}."""
    _instance = None

    def __new__(cls):
        # Classic singleton: the first construction creates the shared pool.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance.tab_pool = {}
        return cls._instance

    def create_client(self, tab: "ChromiumTab", self_kill=False) -> "Tuple[str, DPProxyClient, ChromiumTab]":
        """Create and register a proxy client for *tab*; starts XHR capture."""
        client = DPProxyClient(tab, self_kill=self_kill)
        proxied = client.get_driver(True)
        tab_id = proxied.tab_id
        self.tab_pool[tab_id] = {"client": client, "driver": proxied}
        return tab_id, client, proxied

    def get_client(self, tab_id: str) -> "Optional[DPProxyClient]":
        """Return the client for *tab_id*, or None if the tab is unknown.

        Bug fix: the previous implementation chained .get() on a possibly
        missing entry and raised AttributeError for unknown tab ids.
        """
        entry = self.tab_pool.get(tab_id)
        return entry["client"] if entry else None

    def remove_client(self, tab_id: str) -> "Tuple[bool, Optional[dict]]":
        """Pop the pool entry for *tab_id*; returns (found, entry-or-None)."""
        entry = self.tab_pool.pop(tab_id, None)
        return entry is not None, entry

    def list_clients(self) -> "list[str]":
        """List all registered tab ids."""
        return list(self.tab_pool.keys())
81
+
82
class ChromePageProxy:
    """Transparent attribute proxy around a DrissionPage tab/page.

    Intercepts access to ``listen`` so the listener can be wrapped and
    captured packets routed into the owning DPProxyClient's queue; every
    other attribute is passed straight through to the underlying page.
    """

    def __init__(self, page, client=None):
        # Write through __dict__ so attribute access on the proxy itself
        # doesn't recurse into __getattr__.
        self.__dict__['page'] = page
        self.__dict__['client'] = client

    def __getattr__(self, item):
        attr = getattr(self.page, item)
        # Bug fix: removed a leftover debug print(item, attr) that fired on
        # every attribute access.
        if item == 'listen':
            return DrissionPageListenerProxy(attr, self.__dict__['client'])
        return attr
94
+
95
+
96
class DrissionPageListenerProxy:
    # Proxies a DrissionPage listener so that every packet obtained through
    # wait()/steps() is also recorded into the owning client's queue.

    def __init__(self, listener, client=None):
        self.listener = listener
        self.client = client

    def __getattr__(self, item):
        attr = getattr(self.listener, item)
        # Intercept wait(): record the returned packet, then pass it through.
        if item == "wait":
            def wrapper(*args, **kwargs):
                result = attr(*args, **kwargs)
                check_data_packet(result, self.client)
                return result

            return wrapper
        # Intercept steps(): return a generator that records each yielded
        # step before handing it to the caller.
        if item == "steps":
            def wrapper(*args, **kwargs):
                # gap > 1 would yield packet batches, which the recording
                # logic does not support.
                if kwargs.get("gap", 1) > 1:
                    raise Exception("暂不支持多包监控")
                result = attr(*args, **kwargs)
                # NOTE(review): attr.__name__ should always be "steps" here,
                # making this guard look redundant — confirm before removing.
                if attr.__name__ == "steps":
                    for step in result:
                        check_data_packet(step, self.client)
                        yield step

            return wrapper
        return attr
124
+
125
+
126
def check_data_packet(packet: "DataPacket", client: "DPProxyClient"):
    """
    Normalize a captured DataPacket and push one dict per response-body
    segment onto ``client.packet_queue``.

    The JSON-encoded response body is split into one_turn_max_token-sized
    segments so a single huge response cannot exceed the model's input
    budget; ``body_completed`` is True only on the final segment.

    :param packet: the DataPacket captured by the listener
    :param client: the DPProxyClient whose queue receives the segments
    :return: None
    """
    url = packet.url
    method = packet.request.method
    request_data = packet.request.postData if packet.request.hasPostData else None
    body_str = json.dumps(packet.response.body, ensure_ascii=False)
    segments = [body_str[i:i + one_turn_max_token]
                for i in range(0, len(body_str), one_turn_max_token)]
    request_headers = dict(packet.request.headers)
    response_headers = dict(packet.response.headers)
    for index, segment in enumerate(segments):
        client.packet_queue.append({
            "url": url,
            # Bug fix: previously this flag was set False on any non-final
            # segment and never reset, so the *last* segment of a
            # multi-segment body was also reported as incomplete.
            "body_completed": index + 1 == len(segments),
            "method": method,
            "request_data": request_data,
            "request_headers": request_headers,
            "response_headers": response_headers,
            "response_body_segment": segment,
        })
155
+
156
+
157
# Module-level singleton; star-imported by mcp_tools.main so all tools share
# one tab-to-client registry.
client_manager = DPProxyClientManager()
158
+
159
+ # if __name__ == '__main__':
160
+ # co = ChromiumOptions().set_user_agent(
161
+ # "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Mobile Safari/537.36")
162
+ # tab = ChromiumPage(co).latest_tab
163
+ # client = DPProxyClient(tab, self_kill=False)
164
+ # # client = CaptchaClient(tab, self_kill=True)
165
+ # tab = client.get_driver(True)
166
+ # url = "https://api.toutiaoapi.com/feoffline/hotspot_and_local/html/hot_list/index.html?client_extra_params=%7B%22custom_log_pb%22%3A%22%7B%5C%22style_id%5C%22%3A%5C%2240030%5C%22%2C%5C%22entrance_hotspot%5C%22%3A%5C%22search%5C%22%2C%5C%22location%5C%22%3A%5C%22hot_board%5C%22%2C%5C%22category_name%5C%22%3A%5C%22hotboard_light%5C%22%7D%22%7D&count=50&log_pb=%7B%22style_id%22%3A%2240030%22%2C%22entrance_hotspot%22%3A%22search%22%2C%22location%22%3A%22hot_board%22%2C%22category_name%22%3A%22hotboard_light%22%7D&only_hot_list=1&tab_name=stream&enter_keyword=%23%E7%BE%8E%E5%9B%BD%E9%80%80%E5%87%BA66%E4%B8%AA%E5%9B%BD%E9%99%85%E7%BB%84%E7%BB%87%23"
167
+ # tab.get(url)
168
+ # for _ in range(5056):
169
+ # new_packet = client.pop_first_packet()
170
+ # print(new_packet, "23")
171
+ # time.sleep(1)
tools/tools.py ADDED
@@ -0,0 +1,108 @@
1
+ import time
2
+ import random
3
+ import os
4
+ import minify_html
5
+ from DrissionPage import ChromiumPage, ChromiumOptions
6
+ from bs4 import BeautifulSoup
7
+ from curl_cffi import requests
8
+ from lxml import html, etree
9
+
10
+
11
# Plain curl_cffi fetch of a page; used to probe whether a WAF (e.g.
# RiverSecurity "瑞数" or JiaSuLe) blocks non-browser clients.
def requests_html(url):
    """Fetch *url* once with a desktop Chrome UA; return (text, status_code)."""
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36",
    }
    resp = requests.get(url, headers=request_headers, verify=False)
    # Force UTF-8 regardless of the server-declared charset.
    resp.encoding = "utf-8"
    return resp.text, resp.status_code
19
+
20
+
21
# Render a URL in a throwaway headless browser; used to detect WAFs or
# anti-bot checks that specifically target headless Chromium.
def dp_headless_html(url):
    """Open *url* in a fresh headless Chromium and return the rendered HTML."""
    options = ChromiumOptions().headless(True)
    options.set_argument('--no-sandbox')
    # Fresh browser on a random high port with its own user-data dir.
    port = random.randint(9934, 10034)
    data_dir = os.path.join(os.path.expanduser('~'), 'DrissionPage', "userData", f"{port}")
    options.set_user_data_path(data_dir)
    options.set_local_port(port)
    page = ChromiumPage(options)
    tab = page.latest_tab
    tab.get(url)
    # todo: no better completion signal available yet — wait a fixed 10s for
    # dynamic content to finish rendering (sensitive to network speed).
    time.sleep(10)
    rendered = tab.html
    # Always quit at page level, or headless Chromium processes leak.
    page.quit()
    return rendered
39
+
40
+
41
# Strip styling/script/meta noise from an HTML document and minify it.
def compress_html(content, only_text=False):
    """Compress HTML for LLM consumption.

    Removes <style>/<script>, stylesheet <link>s, <meta> tags, inline style
    attributes and on* event-handler attributes, then minifies the result.

    :param content: raw HTML string
    :param only_text: when True, return only the page's visible text
    :return: (compressed string, compression rate in percent)
    """
    doc = html.fromstring(content)
    # Drop style and script tags entirely.
    for element in doc.xpath('//style | //script'):
        element.getparent().remove(element)
    # Drop stylesheet links.
    for link in doc.xpath('//link[@rel="stylesheet"]'):
        link.getparent().remove(link)
    # Drop meta tags.
    for meta in doc.xpath('//meta'):
        meta.getparent().remove(meta)
    # Drop inline style attributes.
    for element in doc.xpath('//*[@style]'):
        element.attrib.pop('style')
    # Drop every on* event-handler attribute.
    for element in doc.xpath('//*'):
        for attr in list(element.attrib.keys()):
            if attr.startswith('on'):
                element.attrib.pop(attr)

    result = etree.tostring(doc, encoding='unicode')
    result = minify_html.minify(result)
    # Bug fix: guard against an empty minified result (ZeroDivisionError).
    compress_rate = round(len(content) / max(len(result), 1) * 100)
    print(f"html压缩比=> {compress_rate}%")
    if not only_text:
        return result, compress_rate
    soup = BeautifulSoup(result, 'html.parser')
    return soup.get_text(strip=True), compress_rate
75
+
76
+
77
# Heuristic WAF detection from cookie fingerprints; extend the checks as new
# WAF products are encountered in the wild.
def assert_waf_cookie(cookies: list):
    """Return (waf_present, waf_name) based on known cookie fingerprints."""
    for entry in cookies:
        name, value = entry['name'], entry['value']
        # RiverSecurity ("瑞数"): 13-char cookie name with an 88-char value.
        if len(name) == 13 and len(value) == 88:
            return True, "瑞数"
        # JiaSuLe cookie names contain "_jsl".
        if "_jsl" in name:
            return True, "加速乐"
    return False, "没有waf"
87
+
88
+
89
# Package a tool result in the MCP content envelope.
def dp_mcp_message_pack(message: str, **kwargs):
    """Wrap *message* plus arbitrary payload fields into an MCP text block."""
    payload = dict(kwargs)
    payload["message"] = message
    return {"content": [{"type": "text", "text": payload}]}
100
+
101
+ # todo: 大致盘一下各种判定的逻辑【以下的所有压缩比之间的差距均取“绝对值”】
102
+ # 1. 如果requests、无头、有头获取到的压缩比之间从差距都在15%以内,则认定该页面是静态页面,此时优先使用requests请求
103
+ # 2. 如果requests的status_code为特定的412,或者521,则判定是瑞数和jsl。[此时还有一个特点:requests的压缩比会与其他两种方式获取到的压缩比差距非常大(一两千的那种)]
104
+ # 3. 如果requests、无头、有头获取到的压缩比之间差距都在40%以上,则判定该页面只可以用有头采集
105
+ # 4. 如果无头和有头获取到的压缩比之间差距小于15%,但是requests和无头的差距大于40%,则认定该页面可以使用无头浏览器采集
106
+ # 5. 如果requests和有头获取到的压缩比之间差距小于15%,但是无头和有头的差距大于40%,则认定该页面优先使用有头浏览器采集
107
+ # 【此时可能是:1.使用了别的检测无头的waf。2.网站使用瑞数,但是这次请求没有拦截requests(不知道是不是瑞数那边故意设置的),
108
+ # 此时如果想进一步判定是否是瑞数,可以使用有头浏览器取一下cookies,如果cookies里面存在瑞数的cookie,那么就可以断定是瑞数】