Jarvis-Brain 0.1.11.2__tar.gz → 0.1.13.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {jarvis_brain-0.1.11.2 → jarvis_brain-0.1.13.9}/PKG-INFO +1 -1
- {jarvis_brain-0.1.11.2 → jarvis_brain-0.1.13.9}/mcp_tools/dp_tools.py +143 -22
- {jarvis_brain-0.1.11.2 → jarvis_brain-0.1.13.9}/mcp_tools/main.py +2 -0
- {jarvis_brain-0.1.11.2 → jarvis_brain-0.1.13.9}/pyproject.toml +1 -1
- {jarvis_brain-0.1.11.2 → jarvis_brain-0.1.13.9}/tools/browser_proxy.py +6 -58
- jarvis_brain-0.1.13.9/tools/tools.py +365 -0
- jarvis_brain-0.1.11.2/tools/tools.py +0 -267
- {jarvis_brain-0.1.11.2 → jarvis_brain-0.1.13.9}/.gitignore +0 -0
- {jarvis_brain-0.1.11.2 → jarvis_brain-0.1.13.9}/README.md +0 -0
- {jarvis_brain-0.1.11.2 → jarvis_brain-0.1.13.9}/mcp_tools/__init__.py +0 -0
- {jarvis_brain-0.1.11.2 → jarvis_brain-0.1.13.9}/tools/__init__.py +0 -0
- {jarvis_brain-0.1.11.2 → jarvis_brain-0.1.13.9}/tools/browser_manager.py +0 -0
- {jarvis_brain-0.1.11.2 → jarvis_brain-0.1.13.9}/uv.lock +0 -0
|
@@ -7,6 +7,7 @@ import os
|
|
|
7
7
|
import time
|
|
8
8
|
from typing import Any
|
|
9
9
|
|
|
10
|
+
from DrissionPage.errors import NoRectError
|
|
10
11
|
from fastmcp import FastMCP
|
|
11
12
|
|
|
12
13
|
from tools.browser_manager import BrowserManager
|
|
@@ -162,13 +163,13 @@ def register_check_selector(mcp: FastMCP, browser_manager):
|
|
|
162
163
|
attr_output = json.dumps(ele_attr_list, ensure_ascii=False)
|
|
163
164
|
# 对attr_output逐个截断,截断的长度为:一轮最大token除以元素个数+3个点+两个引号和逗号
|
|
164
165
|
return dp_mcp_message_pack(
|
|
165
|
-
f"已完成tab页:【{tab_id}】对:【{css_selector}
|
|
166
|
+
f"已完成tab页:【{tab_id}】对:【{css_selector}】的检查,当前选中了 {len(target_eles)} 个元素",
|
|
166
167
|
tab_id=tab_id,
|
|
167
168
|
selector=css_selector,
|
|
168
169
|
selector_ele_exist=exist_flag,
|
|
169
170
|
page_size=page_size,
|
|
170
171
|
offset=offset,
|
|
171
|
-
attr_output=attr_output
|
|
172
|
+
attr_output=attr_output,
|
|
172
173
|
)
|
|
173
174
|
|
|
174
175
|
|
|
@@ -236,27 +237,28 @@ def register_assert_waf(mcp: FastMCP, browser_manager):
|
|
|
236
237
|
|
|
237
238
|
|
|
238
239
|
def register_click_action(mcp: FastMCP, browser_manager):
|
|
239
|
-
@mcp.tool(name="click_action",
|
|
240
|
-
|
|
240
|
+
@mcp.tool(name="click_action",
|
|
241
|
+
description="尝试点击tab页中的元素,返回元素是否可以被点击,以及是否点击成功。"
|
|
242
|
+
"其中target_element_index默认为0,当传入的Selector可以定位到多个元素时,需要传入target_element_index指定具体点击目标 ")
|
|
243
|
+
async def click_action(browser_port: int, tab_id: str, css_selector: str, target_element_index: int = 0) -> dict[
|
|
244
|
+
str, Any]:
|
|
241
245
|
_browser = browser_manager.get_browser(browser_port)
|
|
242
246
|
target_tab = _browser.get_tab(tab_id)
|
|
243
247
|
css_selector = css_selector
|
|
244
248
|
if "css:" not in css_selector:
|
|
245
249
|
css_selector = "css:" + css_selector
|
|
246
250
|
target_eles = target_tab.eles(css_selector)
|
|
247
|
-
|
|
248
|
-
element_clickable =
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
click_success = False
|
|
257
|
-
message = f"tab页:【{tab_id}】点击【{css_selector}】 {'成功' if click_success else '失败'} 了"
|
|
251
|
+
target_element = target_eles[target_element_index]
|
|
252
|
+
element_clickable = target_element.states.is_clickable
|
|
253
|
+
try:
|
|
254
|
+
click_success = target_element.click()
|
|
255
|
+
click_success = click_success is not False
|
|
256
|
+
except Exception as e:
|
|
257
|
+
click_success = False
|
|
258
|
+
if target_element_index > 0:
|
|
259
|
+
message = f"tab页:【{tab_id}】点击【{css_selector}】【index={target_element_index}】的元素 {'成功' if click_success else '失败'} 了"
|
|
258
260
|
else:
|
|
259
|
-
message = f"tab页:【{tab_id}
|
|
261
|
+
message = f"tab页:【{tab_id}】点击【{css_selector}】 {'成功' if click_success else '失败'} 了"
|
|
260
262
|
return dp_mcp_message_pack(
|
|
261
263
|
message=message,
|
|
262
264
|
browser_port=browser_port,
|
|
@@ -282,23 +284,28 @@ def register_scroll_action(mcp: FastMCP, browser_manager):
|
|
|
282
284
|
if forward == "down":
|
|
283
285
|
if pixel is None:
|
|
284
286
|
target_tab.scroll.to_half()
|
|
285
|
-
|
|
287
|
+
else:
|
|
288
|
+
target_tab.scroll.down(pixel)
|
|
286
289
|
elif forward == "up":
|
|
287
290
|
if pixel is None:
|
|
288
291
|
target_tab.scroll.to_top()
|
|
289
|
-
|
|
292
|
+
else:
|
|
293
|
+
target_tab.scroll.up(pixel)
|
|
290
294
|
elif forward == "left":
|
|
291
295
|
if pixel is None:
|
|
292
296
|
target_tab.scroll.to_leftmost()
|
|
293
|
-
|
|
297
|
+
else:
|
|
298
|
+
target_tab.scroll.left(pixel)
|
|
294
299
|
elif forward == "right":
|
|
295
300
|
if pixel is None:
|
|
296
301
|
target_tab.scroll.to_rightmost()
|
|
297
|
-
|
|
302
|
+
else:
|
|
303
|
+
target_tab.scroll.right(pixel)
|
|
298
304
|
else:
|
|
299
305
|
if pixel is None:
|
|
300
306
|
target_tab.scroll.to_half()
|
|
301
|
-
|
|
307
|
+
else:
|
|
308
|
+
target_tab.scroll.down()
|
|
302
309
|
message = f"已完成对tab页:【{tab_id}】forward={forward} 的滑动"
|
|
303
310
|
return dp_mcp_message_pack(
|
|
304
311
|
message=message,
|
|
@@ -307,14 +314,128 @@ def register_scroll_action(mcp: FastMCP, browser_manager):
|
|
|
307
314
|
)
|
|
308
315
|
|
|
309
316
|
|
|
317
|
+
def register_mouse_hover(mcp: FastMCP, browser_manager):
    """Register the ``mouse_hover`` tool on *mcp*.

    The registered tool resolves a browser via ``browser_manager.get_browser``,
    looks up the tab by id, selects elements with a CSS selector and hovers the
    element at ``target_element_index``.
    """

    @mcp.tool(name="mouse_hover", description="将鼠标悬停在元素上【这个功能使用了Drissionpage的action行为链功能】")
    async def mouse_hover(browser_port: int, tab_id: str, css_selector: str, target_element_index: int = 0) -> dict[
        str, Any]:
        """Hover the selected element and report whether the hover succeeded.

        Args:
            browser_port: Port used to look up the browser in ``browser_manager``.
            tab_id: Id of the tab to operate on.
            css_selector: CSS selector; a ``css:`` prefix is added when absent.
            target_element_index: Index into the list of matched elements.

        Returns:
            The packed message produced by ``dp_mcp_message_pack``; the
            ``hoversuccess`` field is False when the index is out of range or
            the hover itself raised.
        """
        _browser = browser_manager.get_browser(browser_port)
        target_tab = _browser.get_tab(tab_id)
        if "css:" not in css_selector:
            css_selector = "css:" + css_selector
        target_eles = target_tab.eles(css_selector)
        try:
            target_element = target_eles[target_element_index]
        except IndexError:
            # Report a bad index as a structured failure instead of letting the
            # tool call crash, mirroring how get_ele_info handles the same case.
            return dp_mcp_message_pack(
                message=f"报错:IndexError: list index out of range。css_selector共选中 {len(target_eles)} 个元素,"
                        f"target_element_index={target_element_index} 超出范围",
                browser_port=browser_port,
                tab_id=tab_id,
                css_selector=css_selector,
                hoversuccess=False,
                extra_message=""
            )
        try:
            target_tab.actions.move_to(target_element)
            target_element.hover()
            hover_success = True
        except Exception:
            # Deliberate best-effort: any failure is reported as "not hovered".
            hover_success = False
        if target_element_index > 0:
            message = f"tab页:【{tab_id}】hover【{css_selector}】【index={target_element_index}】的元素 {'成功' if hover_success else '失败'} 了"
        else:
            message = f"tab页:【{tab_id}】hover【{css_selector}】 {'成功' if hover_success else '失败'} 了"
        return dp_mcp_message_pack(
            message=message,
            browser_port=browser_port,
            tab_id=tab_id,
            css_selector=css_selector,
            hoversuccess=hover_success,
            extra_message="hover成功,页面可能有更新,请重新获取页面html,并重新分析页面Selector" if hover_success else ""
        )
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
def register_get_ele_info(mcp: FastMCP, browser_manager):
    """Register the ``get_ele_info`` tool on *mcp*.

    The registered tool selects one element by CSS selector and index and
    returns its tag, attribute keys, child count, size and state flags.
    """

    @mcp.tool(
        name="get_ele_info",
        description="返回元素有关一系列的信息。"
                    "参数说明:"
                    "browser_port:浏览器端口号。"
                    "tab_id:tab页的id。"
                    "css_selector:目标选择器"
                    "element_index:如果选择器定位到的是一个列表【或者可以定位到多个元素】,则该值用于定位这个列表中具体元素,默认值为0【列表中的第一个元素】"
                    "返回值说明:"
                    "element_tag:此属性返回元素的标签名。"
                    "element_attrs_key:此属性以list的形式返回元素所有属性的key。"
                    "element_rect_size:此属性以元组形式返回元素的大小【如果元素没有位置及大小,则返回空元组】。"
                    "is_in_viewport:此属性以布尔值方式返回元素是否在视口中,以元素可以接受点击的点为判断。"
                    "is_whole_in_viewport:此属性以布尔值方式返回元素是否整个在视口中。"
                    "is_alive:此属性以布尔值形式返回当前元素是否仍可用。用于判断是否因页面刷新而导致元素失效。"
                    "is_checked:此属性以布尔值返回表单单选或多选元素是否选中。"
                    "is_selected:此属性以布尔值返回<select>元素中的项是否选中。"
                    "is_enabled:此属性以布尔值返回元素是否可用。"
                    "is_displayed:此属性以布尔值返回元素是否可见。"
                    "is_covered:此属性返回元素是否被其它元素覆盖/遮挡。如被覆盖/遮挡,返回覆盖元素的 id【无障碍树的id,无法用于Selector】,否则返回False。"
                    "is_clickable:此属性返回元素是否可被模拟点击,从是否有大小、是否可用、是否显示、是否响应点击判断,不判断是否被遮挡。"
                    "has_rect:此属性返回元素是否拥有大小和位置信息,有则返回四个角在页面上的坐标组成的列表,没有则返回False。")
    async def get_ele_info(browser_port: int, tab_id: str, css_selector: str, element_index: int = 0) -> dict[
        str, Any]:
        """Collect descriptive information about one selected element.

        Args:
            browser_port: Port used to look up the browser in ``browser_manager``.
            tab_id: Id of the tab to operate on.
            css_selector: CSS selector; a ``css:`` prefix is added when absent.
            element_index: Index into the list of matched elements.

        Returns:
            The packed message produced by ``dp_mcp_message_pack``. An
            out-of-range index or a ``NoRectError`` yields an error message
            carrying only the request parameters.
        """
        _browser = browser_manager.get_browser(browser_port)
        target_tab = _browser.get_tab(tab_id)
        if "css:" not in css_selector:
            css_selector = "css:" + css_selector
        target_eles = target_tab.eles(css_selector)
        try:
            target_element = target_eles[element_index]
        except IndexError:
            return dp_mcp_message_pack(
                message="报错:IndexError: list index out of range。请检查Selector中是否包含了如:first-child、nth-child等字段,如果有则去掉。必须使用element_index来控制元素的选择",
                browser_port=browser_port,
                tab_id=tab_id,
                css_selector=css_selector,
                element_index=element_index,
            )
        try:
            # No trailing comma here: the raw value (corner list or False) is
            # what the tool description promises, not a 1-tuple wrapping it.
            has_rect = target_element.states.has_rect
            # Only query the size when the element actually has a rect;
            # otherwise return the documented empty tuple.
            element_rect_size = target_element.rect.size if has_rect else tuple()
            try:
                child_count = target_element.child_count
            except Exception:
                # Best-effort: fall back to 0 when the child count cannot be read.
                child_count = 0
            return dp_mcp_message_pack(
                message="元素可以被正常的选择到,以下是元素相关的一系列信息",
                browser_port=browser_port,
                tab_id=tab_id,
                css_selector=css_selector,
                element_index=element_index,
                element_tag=target_element.tag,
                element_attrs_key=list(target_element.attrs.keys()),
                element_child_count=child_count,
                element_rect_size=element_rect_size,
                is_in_viewport=target_element.states.is_in_viewport,
                is_whole_in_viewport=target_element.states.is_whole_in_viewport,
                is_alive=target_element.states.is_alive,
                is_checked=target_element.states.is_checked,
                is_selected=target_element.states.is_selected,
                is_enabled=target_element.states.is_enabled,
                is_displayed=target_element.states.is_displayed,
                is_covered=target_element.states.is_covered,
                is_clickable=target_element.states.is_clickable,
                has_rect=has_rect
            )
        except NoRectError:
            return dp_mcp_message_pack(
                message="报错:NoRectError: 该元素没有位置及大小。你传入的css_selector和element_index选出的元素没有rect,请确认该元素是否可见",
                browser_port=browser_port,
                tab_id=tab_id,
                css_selector=css_selector,
                element_index=element_index,
            )
|
|
427
|
+
|
|
310
428
|
def register_get_screenshot(mcp: FastMCP, browser_manager):
|
|
311
|
-
@mcp.tool(name="get_tab_screenshot",
|
|
429
|
+
@mcp.tool(name="get_tab_screenshot",
|
|
430
|
+
description="尝试对传入tab页进行截图,并将截图压缩为1M大小png图片,会返回截图保存路径")
|
|
312
431
|
async def get_tab_screenshot(browser_port: int, tab_id: str) -> dict[str, Any]:
|
|
313
432
|
_browser = browser_manager.get_browser(browser_port)
|
|
314
433
|
target_tab = _browser.get_tab(tab_id)
|
|
434
|
+
target_tab.wait.doc_loaded()
|
|
315
435
|
if not os.path.exists(html_source_code_local_save_path):
|
|
316
436
|
os.makedirs(html_source_code_local_save_path)
|
|
317
437
|
timestamp = int(time.time() * 1000)
|
|
438
|
+
time.sleep(3)
|
|
318
439
|
origin_png = target_tab.get_screenshot(as_bytes="png")
|
|
319
440
|
compress_png = compress_image_bytes(origin_png)
|
|
320
441
|
image_path = os.path.join(html_source_code_local_save_path, f"{browser_port}_{tab_id}_{timestamp}.png")
|
|
@@ -24,6 +24,8 @@ if "TeamNode-Dp" in enabled_modules:
|
|
|
24
24
|
# 页面交互
|
|
25
25
|
register_click_action(mcp, browser_manager)
|
|
26
26
|
register_scroll_action(mcp, browser_manager)
|
|
27
|
+
register_mouse_hover(mcp, browser_manager)
|
|
28
|
+
register_get_ele_info(mcp, browser_manager)
|
|
27
29
|
|
|
28
30
|
if "JarvisNode" in enabled_modules:
|
|
29
31
|
register_assert_waf(mcp, browser_manager)
|
|
@@ -140,7 +140,10 @@ def check_data_packet(packet: DataPacket, client: DPProxyClient):
|
|
|
140
140
|
data = packet.request.postData
|
|
141
141
|
domain = urlparse(url).netloc
|
|
142
142
|
body = packet.response.body
|
|
143
|
-
|
|
143
|
+
if isinstance(body, dict):
|
|
144
|
+
body_str = json.dumps(body, ensure_ascii=False, separators=(',', ':'))
|
|
145
|
+
else:
|
|
146
|
+
body_str = str(body)
|
|
144
147
|
body_str_list = [body_str[i:i + one_turn_max_token] for i in range(0, len(body_str), one_turn_max_token)]
|
|
145
148
|
body_completed = True
|
|
146
149
|
packet_filter = client.packet_filter
|
|
@@ -155,9 +158,9 @@ def check_data_packet(packet: DataPacket, client: DPProxyClient):
|
|
|
155
158
|
continue
|
|
156
159
|
if (index + 1) != len(body_str_list):
|
|
157
160
|
body_completed = False
|
|
158
|
-
|
|
161
|
+
try:
|
|
159
162
|
response_headers = packet.response.headers
|
|
160
|
-
|
|
163
|
+
except TypeError:
|
|
161
164
|
response_headers = {}
|
|
162
165
|
temp_dict = {
|
|
163
166
|
"url": url,
|
|
@@ -171,59 +174,4 @@ def check_data_packet(packet: DataPacket, client: DPProxyClient):
|
|
|
171
174
|
client.packet_queue.append(temp_dict)
|
|
172
175
|
|
|
173
176
|
|
|
174
|
-
def check_data_packet(packet: DataPacket, client: DPProxyClient):
|
|
175
|
-
"""
|
|
176
|
-
封装监听到的数据包,并将其存放在client的packet_queue中
|
|
177
|
-
:param packet:
|
|
178
|
-
:param client:
|
|
179
|
-
:return:
|
|
180
|
-
"""
|
|
181
|
-
url = packet.url
|
|
182
|
-
method = packet.request.method
|
|
183
|
-
data = None
|
|
184
|
-
if packet.request.hasPostData:
|
|
185
|
-
data = packet.request.postData
|
|
186
|
-
domain = urlparse(url).netloc
|
|
187
|
-
body = packet.response.body
|
|
188
|
-
body_str = json.dumps(body, ensure_ascii=False, separators=(',', ':'))
|
|
189
|
-
body_str_list = [body_str[i:i + one_turn_max_token] for i in range(0, len(body_str), one_turn_max_token)]
|
|
190
|
-
body_completed = True
|
|
191
|
-
packet_filter = client.packet_filter
|
|
192
|
-
domain_filter = packet_filter.get("domain_filter", None)
|
|
193
|
-
method_filter = packet_filter.get("method_filter", ["GET", "POST"])
|
|
194
|
-
for index, body_str in enumerate(body_str_list):
|
|
195
|
-
# 如果给了domain_filter并且domain没有在domain_filter中时跳过该数据包
|
|
196
|
-
if domain_filter and domain not in domain_filter:
|
|
197
|
-
continue
|
|
198
|
-
# 如果method没有在method_filter中,则跳过该数据包
|
|
199
|
-
if method not in method_filter:
|
|
200
|
-
continue
|
|
201
|
-
if (index + 1) != len(body_str_list):
|
|
202
|
-
body_completed = False
|
|
203
|
-
temp_dict = {
|
|
204
|
-
"url": url,
|
|
205
|
-
"body_completed": body_completed,
|
|
206
|
-
"method": method,
|
|
207
|
-
"request_data": data,
|
|
208
|
-
"request_headers": dict(packet.request.headers),
|
|
209
|
-
"response_headers": dict(packet.response.headers),
|
|
210
|
-
"response_body_segment": body_str.replace("\\", ""),
|
|
211
|
-
}
|
|
212
|
-
client.packet_queue.append(temp_dict)
|
|
213
|
-
|
|
214
|
-
|
|
215
177
|
client_manager = DPProxyClientManager()
|
|
216
|
-
|
|
217
|
-
# if __name__ == '__main__':
|
|
218
|
-
# co = ChromiumOptions().set_user_agent(
|
|
219
|
-
# "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Mobile Safari/537.36")
|
|
220
|
-
# tab = ChromiumPage(co).latest_tab
|
|
221
|
-
# client = DPProxyClient(tab, self_kill=False)
|
|
222
|
-
# # client = CaptchaClient(tab, self_kill=True)
|
|
223
|
-
# tab = client.get_driver(True)
|
|
224
|
-
# url = "https://api.toutiaoapi.com/feoffline/hotspot_and_local/html/hot_list/index.html?client_extra_params=%7B%22custom_log_pb%22%3A%22%7B%5C%22style_id%5C%22%3A%5C%2240030%5C%22%2C%5C%22entrance_hotspot%5C%22%3A%5C%22search%5C%22%2C%5C%22location%5C%22%3A%5C%22hot_board%5C%22%2C%5C%22category_name%5C%22%3A%5C%22hotboard_light%5C%22%7D%22%7D&count=50&log_pb=%7B%22style_id%22%3A%2240030%22%2C%22entrance_hotspot%22%3A%22search%22%2C%22location%22%3A%22hot_board%22%2C%22category_name%22%3A%22hotboard_light%22%7D&only_hot_list=1&tab_name=stream&enter_keyword=%23%E7%BE%8E%E5%9B%BD%E9%80%80%E5%87%BA66%E4%B8%AA%E5%9B%BD%E9%99%85%E7%BB%84%E7%BB%87%23"
|
|
225
|
-
# tab.get(url)
|
|
226
|
-
# for _ in range(5056):
|
|
227
|
-
# new_packet = client.pop_first_packet()
|
|
228
|
-
# print(new_packet, "23")
|
|
229
|
-
# time.sleep(1)
|
|
@@ -0,0 +1,365 @@
|
|
|
1
|
+
import time
|
|
2
|
+
import random
|
|
3
|
+
import os
|
|
4
|
+
import minify_html
|
|
5
|
+
from DrissionPage import ChromiumPage, ChromiumOptions
|
|
6
|
+
from bs4 import BeautifulSoup
|
|
7
|
+
from curl_cffi import requests
|
|
8
|
+
from lxml import html, etree
|
|
9
|
+
import base64
|
|
10
|
+
from PIL import Image
|
|
11
|
+
import io
|
|
12
|
+
|
|
13
|
+
compress_html_js1 = """
|
|
14
|
+
function getSimplifiedDOM(node) {
|
|
15
|
+
// 1. 处理文本节点
|
|
16
|
+
if (node.nodeType === Node.TEXT_NODE) {
|
|
17
|
+
const text = node.textContent.trim();
|
|
18
|
+
return text ? text.slice(0, 100) + (text.length > 100 ? '...' : '') : null;
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
// 2. 过滤无用标签
|
|
22
|
+
const ignoreTags = ['SCRIPT', 'STYLE', 'NOSCRIPT', 'IFRAME', 'SVG', 'LINK', 'META'];
|
|
23
|
+
if (ignoreTags.includes(node.tagName)) return null;
|
|
24
|
+
if (node.nodeType !== Node.ELEMENT_NODE) return null;
|
|
25
|
+
|
|
26
|
+
// 3. 过滤不可见元素
|
|
27
|
+
// 【注意】这里声明了第一次 style
|
|
28
|
+
const style = window.getComputedStyle(node);
|
|
29
|
+
|
|
30
|
+
if (style.display === 'none' || style.visibility === 'hidden' || style.opacity === '0') return null;
|
|
31
|
+
|
|
32
|
+
// 过滤宽高太小的元素(往往是埋点空像素)
|
|
33
|
+
const rect = node.getBoundingClientRect();
|
|
34
|
+
|
|
35
|
+
// 【修复点】删除了这里重复的 const style = ... 代码
|
|
36
|
+
// 直接使用上面已经定义好的 style 变量即可
|
|
37
|
+
|
|
38
|
+
// 如果宽高为0,但溢出可见,说明可能有定位的子元素显示在外面
|
|
39
|
+
if ((rect.width === 0 || rect.height === 0) && style.overflow !== 'visible') return null;
|
|
40
|
+
|
|
41
|
+
// --- 开始构建标签字符串 ---
|
|
42
|
+
const tagName = node.tagName.toLowerCase();
|
|
43
|
+
let tagStr = tagName;
|
|
44
|
+
|
|
45
|
+
// A. 基础标识符 (ID 和 Class)
|
|
46
|
+
if (node.id) tagStr += `#${node.id}`;
|
|
47
|
+
if (node.className && typeof node.className === 'string') {
|
|
48
|
+
const classes = node.className.trim().split(/\s+/);
|
|
49
|
+
if (classes.length > 0) tagStr += `.${classes.join('.')}`;
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
// B. 关键属性白名单
|
|
53
|
+
const props = [];
|
|
54
|
+
|
|
55
|
+
// 通用重要属性
|
|
56
|
+
if (node.getAttribute('role')) props.push(`role="${node.getAttribute('role')}"`);
|
|
57
|
+
if (node.getAttribute('aria-label')) props.push(`aria-label="${node.getAttribute('aria-label')}"`);
|
|
58
|
+
if (node.getAttribute('title')) props.push(`title="${node.getAttribute('title')}"`);
|
|
59
|
+
// 建议增加这个,很多弹窗用这个属性
|
|
60
|
+
if (node.getAttribute('aria-modal')) props.push(`aria-modal="${node.getAttribute('aria-modal')}"`);
|
|
61
|
+
|
|
62
|
+
// 特定标签的特定属性
|
|
63
|
+
if (tagName === 'a') {
|
|
64
|
+
const href = node.getAttribute('href');
|
|
65
|
+
if (href && !href.startsWith('javascript')) props.push(`href="${href}"`);
|
|
66
|
+
} else if (tagName === 'input' || tagName === 'textarea' || tagName === 'select') {
|
|
67
|
+
if (node.getAttribute('type')) props.push(`type="${node.getAttribute('type')}"`);
|
|
68
|
+
if (node.getAttribute('name')) props.push(`name="${node.getAttribute('name')}"`);
|
|
69
|
+
if (node.getAttribute('placeholder')) props.push(`placeholder="${node.getAttribute('placeholder')}"`);
|
|
70
|
+
if (node.disabled) props.push('disabled');
|
|
71
|
+
if (node.checked) props.push('checked');
|
|
72
|
+
} else if (tagName === 'button') {
|
|
73
|
+
if (node.getAttribute('type')) props.push(`type="${node.getAttribute('type')}"`);
|
|
74
|
+
} else if (tagName === 'img') {
|
|
75
|
+
if (node.getAttribute('alt')) props.push(`alt="${node.getAttribute('alt')}"`);
|
|
76
|
+
} else if (tagName === 'dialog') {
|
|
77
|
+
// 保留 open 属性
|
|
78
|
+
if (node.open) props.push('open');
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
if (props.length > 0) {
|
|
82
|
+
tagStr += ` ${props.join(' ')}`;
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
// 4. 递归子节点 (包含 Shadow DOM 处理)
|
|
86
|
+
let childNodes = Array.from(node.childNodes);
|
|
87
|
+
if (node.shadowRoot) {
|
|
88
|
+
childNodes = [...childNodes, ...Array.from(node.shadowRoot.childNodes)];
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
const children = childNodes
|
|
92
|
+
.map(getSimplifiedDOM)
|
|
93
|
+
.filter(n => n !== null);
|
|
94
|
+
|
|
95
|
+
// 5. 组装输出
|
|
96
|
+
if (children.length === 0) {
|
|
97
|
+
return `<${tagStr} />`;
|
|
98
|
+
}
|
|
99
|
+
return `<${tagStr}>${children.join('')}</${tagName}>`;
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
return getSimplifiedDOM(document.body);
|
|
103
|
+
"""
|
|
104
|
+
|
|
105
|
+
# 我自己优化后的版本,逻辑为:删除不可见元素、标签的任何属性value的长度大于20时直接删除这个属性、id和class采用简写方式:id=>#,class=>.
|
|
106
|
+
compress_html_js="""
|
|
107
|
+
function getSimplifiedDOM(node) {
|
|
108
|
+
// 全局配置:最大属性值长度
|
|
109
|
+
const MAX_ATTR_LEN = 40;
|
|
110
|
+
|
|
111
|
+
// 1. 处理文本节点
|
|
112
|
+
if (node.nodeType === Node.TEXT_NODE) {
|
|
113
|
+
const text = node.textContent.trim();
|
|
114
|
+
return text ? text.slice(0, 100) + (text.length > 100 ? '...' : '') : null;
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
// 2. 过滤无用标签
|
|
118
|
+
const ignoreTags = ['SCRIPT', 'STYLE', 'NOSCRIPT', 'IFRAME', 'LINK', 'META', 'AUDIO', 'VIDEO', 'CANVAS'];
|
|
119
|
+
if (ignoreTags.includes(node.tagName)) return null;
|
|
120
|
+
if (node.nodeType !== Node.ELEMENT_NODE) return null;
|
|
121
|
+
|
|
122
|
+
// 3. 过滤不可见元素
|
|
123
|
+
const style = window.getComputedStyle(node);
|
|
124
|
+
if (style.display === 'none' || style.visibility === 'hidden' || style.opacity === '0') return null;
|
|
125
|
+
|
|
126
|
+
const rect = node.getBoundingClientRect();
|
|
127
|
+
if ((rect.width === 0 || rect.height === 0) && style.overflow !== 'visible') return null;
|
|
128
|
+
|
|
129
|
+
// --- 开始构建标签字符串 ---
|
|
130
|
+
const tagName = node.tagName.toLowerCase();
|
|
131
|
+
let tagStr = tagName;
|
|
132
|
+
|
|
133
|
+
const id = node.id;
|
|
134
|
+
const className = node.getAttribute('class');
|
|
135
|
+
|
|
136
|
+
// A. 处理 ID 简写 (#id)
|
|
137
|
+
// 限制提高到 40
|
|
138
|
+
if (id && id.length <= MAX_ATTR_LEN) {
|
|
139
|
+
tagStr += `#${id}`;
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
// B. 处理 Class 简写 (.class)
|
|
143
|
+
// 限制提高到 40
|
|
144
|
+
if (className && typeof className === 'string' && className.length <= MAX_ATTR_LEN) {
|
|
145
|
+
const classes = className.trim().split(/\s+/);
|
|
146
|
+
if (classes.length > 0) {
|
|
147
|
+
tagStr += `.${classes.join('.')}`;
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
let propsStr = '';
|
|
152
|
+
|
|
153
|
+
// C. 处理属性
|
|
154
|
+
if (node.hasAttributes()) {
|
|
155
|
+
for (const attr of node.attributes) {
|
|
156
|
+
const name = attr.name;
|
|
157
|
+
const value = attr.value;
|
|
158
|
+
|
|
159
|
+
// 1. 跳过 ID 和 Class (已在 tagStr 处理,或因过长被丢弃)
|
|
160
|
+
if (name === 'id' || name === 'class') continue;
|
|
161
|
+
|
|
162
|
+
// 2. 黑名单:直接删除 style 和 aria-label
|
|
163
|
+
if (name === 'style' || name === 'aria-label') continue;
|
|
164
|
+
|
|
165
|
+
// 3. 特殊标签:path 标签删除所有属性
|
|
166
|
+
if (tagName === 'path') continue;
|
|
167
|
+
|
|
168
|
+
// 4. 【长度与白名单逻辑】
|
|
169
|
+
// 如果不是 src 且不是 href,同时长度又超过了 40,则删除
|
|
170
|
+
const isLinkAttr = (name === 'src' || name === 'href');
|
|
171
|
+
|
|
172
|
+
if (!isLinkAttr && value.length > MAX_ATTR_LEN) {
|
|
173
|
+
continue;
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
// 5. 拼接保留的属性
|
|
177
|
+
propsStr += ` ${name}="${value.replace(/"/g, '"')}"`;
|
|
178
|
+
}
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
// 4. 递归子节点
|
|
182
|
+
let childNodes = Array.from(node.childNodes);
|
|
183
|
+
if (node.shadowRoot) {
|
|
184
|
+
childNodes = [...childNodes, ...Array.from(node.shadowRoot.childNodes)];
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
const children = childNodes
|
|
188
|
+
.map(getSimplifiedDOM)
|
|
189
|
+
.filter(n => n !== null);
|
|
190
|
+
|
|
191
|
+
// 5. 组装输出
|
|
192
|
+
if (children.length === 0) {
|
|
193
|
+
return `<${tagStr}${propsStr} />`;
|
|
194
|
+
}
|
|
195
|
+
return `<${tagStr}${propsStr}>${children.join('')}</${tagName}>`;
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
return getSimplifiedDOM(document.body);
|
|
199
|
+
"""
|
|
200
|
+
|
|
201
|
+
# 使用requests获取html,用于测试是否使用了瑞数和jsl
|
|
202
|
+
def requests_html(url):
    """Fetch *url* with a plain HTTP GET and return ``(text, status_code)``.

    The request sends a fixed desktop Chrome User-Agent, disables certificate
    verification, and decodes the body as UTF-8.
    """
    user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36"
    resp = requests.get(url, headers={"User-Agent": user_agent}, verify=False)
    resp.encoding = "utf-8"
    return resp.text, resp.status_code
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
# 使用dp无头模式获取html,用于测试是否使用了其他waf,如移动waf
|
|
212
|
+
def dp_headless_html(url):
    """Load *url* in a fresh headless Chromium instance and return the page HTML.

    A random port in 9934-10034 is chosen and used both as the debugging port
    and as the name of a dedicated user-data directory under
    ``~/DrissionPage/userData``. The browser is always quit before returning,
    including when navigation fails.
    """
    opt = ChromiumOptions().headless(True)
    opt.set_argument('--no-sandbox')
    random_port = random.randint(9934, 10034)
    custom_data_dir = os.path.join(os.path.expanduser('~'), 'DrissionPage', "userData", f"{random_port}")
    opt.set_user_data_path(custom_data_dir)  # per-instance user data directory
    opt.set_local_port(random_port)
    page = ChromiumPage(opt)
    try:
        tab = page.latest_tab
        tab.get(url)
        # TODO: no better signal is available yet; a fixed wait gives the page
        # time to finish rendering (sensitive to network speed).
        time.sleep(10)
        return tab.html
    finally:
        # A headless browser must be quit at page level, or the process leaks.
        page.quit()
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
# 压缩html
|
|
232
|
+
def _drop_keeping_tail(node):
    """Detach *node* from its tree while keeping the text that follows it."""
    # The root node has no parent and cannot be dropped.
    if node.getparent() is not None:
        node.drop_tree()


def compress_html(content, only_text=False):
    """Strip noise from an HTML document and minify it.

    Removes ``style``/``script`` elements, stylesheet ``link`` elements,
    ``meta`` elements, ``style`` attributes and every attribute whose name
    starts with ``on``; replaces each ``svg`` element with its text content;
    then minifies the serialized result.

    Args:
        content: HTML source text.
        only_text: When true, return only the text extracted from the
            minified HTML instead of the markup.

    Returns:
        ``(result, compress_rate)`` where ``compress_rate`` is
        ``len(content) / len(result)`` as a rounded percentage (0 when the
        result is empty).
    """
    doc = html.fromstring(content)

    # drop_tree() keeps tail text (text following the removed element), which
    # parent.remove() would silently discard.
    for node in doc.xpath('//style | //script'):
        _drop_keeping_tail(node)

    for link in doc.xpath('//link[@rel="stylesheet"]'):
        _drop_keeping_tail(link)

    for meta in doc.xpath('//meta'):
        _drop_keeping_tail(meta)

    for svg in doc.xpath('//svg'):
        if svg.getparent() is None:
            continue
        # Leave the SVG's text where the SVG was by prepending it to the tail,
        # which drop_tree() then merges into the surrounding text.
        svg.tail = svg.text_content() + (svg.tail or '')
        svg.drop_tree()

    for element in doc.xpath('//*[@style]'):
        element.attrib.pop('style')

    for element in doc.xpath('//*'):
        for attr in list(element.attrib.keys()):
            if attr.startswith('on'):
                element.attrib.pop(attr)

    result = etree.tostring(doc, encoding='unicode')
    result = minify_html.minify(result)
    # Guard against an empty result so the ratio cannot divide by zero.
    compress_rate = round(len(content) / len(result) * 100) if result else 0
    print(f"html压缩比=> {compress_rate}%")
    if not only_text:
        return result, compress_rate
    soup = BeautifulSoup(result, 'html.parser')
    result = soup.get_text(strip=True)
    return result, compress_rate
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
# 通过cookie判断是否有waf,需要通过遇到的例子,不断的完善cookie判别函数
|
|
277
|
+
def assert_waf_cookie(cookies: list):
    """Guess from cookie names/values whether a WAF is in front of the site.

    Each cookie is a mapping with ``name`` and ``value`` keys. Returns a
    ``(detected, label)`` tuple for the first cookie that matches a known
    pattern, or ``(False, "没有waf")`` when none match.
    """
    for entry in cookies:
        name, value = entry['name'], entry['value']
        # Pattern 1: a 13-character name paired with an 88-character value.
        if len(name) == 13 and len(value) == 88:
            return True, "瑞数"
        # Pattern 2: the cookie name contains "_jsl".
        if "_jsl" in name:
            return True, "加速乐"
    return False, "没有waf"
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
# 对dp_mcp的消息打包
|
|
289
|
+
def dp_mcp_message_pack(message: str, **kwargs):
    """Wrap *message* and any extra fields in a single-item content envelope.

    The extra keyword arguments come first in the payload, followed by the
    ``message`` key. The payload is embedded as a dict, not JSON-serialized.
    """
    payload = {**kwargs, "message": message}
    text_entry = {"type": "text", "text": payload}
    return {"content": [text_entry]}
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
def btyes2Base64Img(target_byte):
    """Encode raw bytes as a ``data:image/png;base64,...`` URI string.

    :param target_byte: bytes to encode
    :return: the data URI as ``str``
    """
    encoded = base64.b64encode(target_byte).decode()
    return f"data:image/png;base64,{encoded}"
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
def compress_image_bytes(input_bytes, target_size_mb=1):
    """Re-encode an image as JPEG, shrinking it towards a target byte size.

    The image is first encoded at quality 95, and quality is lowered in steps
    of 5 (down to 10) while the output is still too large. If that is not
    enough, the image is repeatedly downscaled by 10% at the final quality
    until it fits or cannot shrink further.

    Args:
        input_bytes: Encoded bytes of the source image.
        target_size_mb: Target size in megabytes (default 1).

    Returns:
        The JPEG-encoded bytes of the compressed image.
    """
    target_size = target_size_mb * 1024 * 1024  # convert to bytes

    img = Image.open(io.BytesIO(input_bytes))

    # These modes are converted to RGB before JPEG encoding.
    if img.mode in ('RGBA', 'LA', 'P'):
        img = img.convert('RGB')

    def _encode(image, jpeg_quality):
        """Return *image* encoded as JPEG bytes at the given quality."""
        buffer = io.BytesIO()
        image.save(buffer, 'JPEG', quality=jpeg_quality, optimize=True)
        return buffer.getvalue()

    quality = 95
    output_bytes = _encode(img, quality)

    # Stage 1: lower the quality step by step.
    while len(output_bytes) > target_size and quality > 10:
        quality -= 5
        output_bytes = _encode(img, quality)

    # Stage 2: shrink the dimensions. This loop must not be gated on quality:
    # stage 1 always ends at quality 10 when the image is still too large, so a
    # `quality > 10` guard would make the resize unreachable.
    width, height = img.size
    while len(output_bytes) > target_size and (width > 1 or height > 1):
        width = max(1, int(width * 0.9))
        height = max(1, int(height * 0.9))
        # Always resize from the full-size image to avoid compounding blur.
        resized = img.resize((width, height), Image.Resampling.LANCZOS)
        output_bytes = _encode(resized, quality)

    return output_bytes
|
|
365
|
+
|
|
@@ -1,267 +0,0 @@
|
|
|
1
|
-
import time
|
|
2
|
-
import random
|
|
3
|
-
import os
|
|
4
|
-
import minify_html
|
|
5
|
-
from DrissionPage import ChromiumPage, ChromiumOptions
|
|
6
|
-
from bs4 import BeautifulSoup
|
|
7
|
-
from curl_cffi import requests
|
|
8
|
-
from lxml import html, etree
|
|
9
|
-
import base64
|
|
10
|
-
from PIL import Image
|
|
11
|
-
import io
|
|
12
|
-
|
|
13
|
-
# JavaScript snippet that serializes document.body into a compact pseudo-HTML
# string: text nodes are capped at 100 characters, script/style/svg-like tags
# and invisible or zero-sized elements are dropped, and only id, class and a
# whitelist of attributes are kept on each tag.
# The literal is a raw string so the regex `/\s+/` reaches the browser
# unchanged and Python does not warn about an invalid `\s` escape.
compress_html_js = r"""
function getSimplifiedDOM(node) {
    // 1. 处理文本节点
    if (node.nodeType === Node.TEXT_NODE) {
        const text = node.textContent.trim();
        // 限制文本长度,避免大段文章消耗 token,保留前100个字符通常足够定位
        return text ? text.slice(0, 100) + (text.length > 100 ? '...' : '') : null;
    }

    // 2. 过滤无用标签
    // Non-element nodes (comments, etc.) have no tagName, so reject them first.
    if (node.nodeType !== Node.ELEMENT_NODE) return null;
    const ignoreTags = ['SCRIPT', 'STYLE', 'NOSCRIPT', 'IFRAME', 'SVG', 'LINK', 'META'];
    // tagName is upper-case only for HTML elements; inline <svg> reports 'svg'.
    if (ignoreTags.includes(node.tagName.toUpperCase())) return null;

    // 3. 过滤不可见元素
    const style = window.getComputedStyle(node);
    if (style.display === 'none' || style.visibility === 'hidden' || style.opacity === '0') return null;
    // 过滤宽高太小的元素(往往是埋点空像素)
    const rect = node.getBoundingClientRect();
    if (rect.width === 0 || rect.height === 0) return null;

    // --- 开始构建标签字符串 ---
    const tagName = node.tagName.toLowerCase();
    let tagStr = tagName;

    // A. 基础标识符 (ID 和 Class)
    if (node.id) tagStr += `#${node.id}`;
    if (node.className && typeof node.className === 'string') {
        // 过滤掉 Tailwind 等太长且无语义的 class,保留有意义的业务 class
        // 这里简单处理,全部保留,让 LLM 自己判断
        // filter(Boolean) drops the empty entry produced by a whitespace-only class attribute.
        const classes = node.className.trim().split(/\s+/).filter(Boolean);
        if (classes.length > 0) tagStr += `.${classes.join('.')}`;
    }

    // B. 关键属性白名单 (这是你指出问题的核心修复)
    const props = [];

    // 通用重要属性
    if (node.getAttribute('role')) props.push(`role="${node.getAttribute('role')}"`);
    if (node.getAttribute('aria-label')) props.push(`aria-label="${node.getAttribute('aria-label')}"`);
    if (node.getAttribute('title')) props.push(`title="${node.getAttribute('title')}"`);

    // 特定标签的特定属性
    if (tagName === 'a') {
        const href = node.getAttribute('href');
        // 只保留有意义的链接,忽略 javascript:;
        if (href && !href.startsWith('javascript')) props.push(`href="${href}"`);
    } else if (tagName === 'input' || tagName === 'textarea' || tagName === 'select') {
        if (node.getAttribute('type')) props.push(`type="${node.getAttribute('type')}"`);
        if (node.getAttribute('name')) props.push(`name="${node.getAttribute('name')}"`);
        if (node.getAttribute('placeholder')) props.push(`placeholder="${node.getAttribute('placeholder')}"`);
        if (node.disabled) props.push('disabled');
        if (node.checked) props.push('checked');
    } else if (tagName === 'button') {
        if (node.getAttribute('type')) props.push(`type="${node.getAttribute('type')}"`);
    } else if (tagName === 'img') {
        if (node.getAttribute('alt')) props.push(`alt="${node.getAttribute('alt')}"`);
    }

    if (props.length > 0) {
        tagStr += ` ${props.join(' ')}`;
    }

    // 4. 递归子节点
    const children = Array.from(node.childNodes)
        .map(getSimplifiedDOM)
        .filter(n => n !== null);

    // 5. 组装输出
    // 如果没有子节点,也没有ID/Class,也不是输入框/图片/链接,那这个标签可能只是布局用的 div,可以考虑跳过它直接返回子节点内容
    // 但为了保持结构完整,我们暂时保留它
    if (children.length === 0) {
        // 自闭合标签或空标签
        return `<${tagStr} />`;
    }
    return `<${tagStr}>${children.join('')}</${tagName}>`; // 结束标签只保留 tagName 节省 token
}

return getSimplifiedDOM(document.body);
"""
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
# 使用requests获取html,用于测试是否使用了瑞数和jsl
|
|
96
|
-
def requests_html(url):
    """Fetch *url* with a plain HTTP GET and return ``(body_text, status_code)``.

    The request sends a fixed desktop Chrome User-Agent, skips TLS certificate
    verification, and forces the response to be decoded as UTF-8.
    """
    user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36"
    resp = requests.get(url, headers={"User-Agent": user_agent}, verify=False)
    # Decode as UTF-8 regardless of what the server declares.
    resp.encoding = "utf-8"
    body_text = resp.text
    return body_text, resp.status_code
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
# 使用dp无头模式获取html,用于测试是否使用了其他waf,如移动waf
|
|
106
|
-
def dp_headless_html(url, wait_seconds=10):
    """Load *url* in a fresh headless Chromium instance and return the page HTML.

    A new browser is started on a random local port in the range 9934-10034,
    with a user-data directory named after that port under
    ``~/DrissionPage/userData``. The browser is always shut down before
    returning, even if navigation or HTML retrieval raises.

    :param url: address to open.
    :param wait_seconds: fixed delay after navigation so the page can finish
        rendering (default 10, the previously hard-coded value).
    :return: HTML of the tab after the wait.
    """
    opt = ChromiumOptions().headless(True)
    opt.set_argument('--no-sandbox')

    # Give this instance its own debugging port and matching profile directory.
    random_port = random.randint(9934, 10034)
    custom_data_dir = os.path.join(os.path.expanduser('~'), 'DrissionPage', "userData", f"{random_port}")
    opt.set_user_data_path(custom_data_dir)
    opt.set_local_port(random_port)

    page = ChromiumPage(opt)
    try:
        tab = page.latest_tab
        tab.get(url)
        # TODO: no better readiness signal yet; a fixed wait is sensitive to network speed.
        time.sleep(wait_seconds)
        return tab.html
    finally:
        # A headless browser must be quit at page level, otherwise the process is left running.
        page.quit()
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
# 压缩html
|
|
126
|
-
def compress_html(content, only_text=False):
    """Strip non-content markup from an HTML string and minify it.

    Removes ``<style>``, ``<script>``, stylesheet ``<link>`` and ``<meta>``
    elements, replaces every ``<svg>`` with its text content, drops ``style``
    and ``on*`` attributes, then minifies the serialized tree.

    :param content: HTML source text.
    :param only_text: when true, return only the visible text of the cleaned
        document instead of the minified HTML.
    :return: ``(result, compress_rate)`` where ``compress_rate`` is
        ``len(content) / len(minified) * 100`` rounded to an integer
        (0 if the minified output is empty).
    """
    doc = html.fromstring(content)

    # drop_tree() is used instead of parent.remove(): remove() also discards the
    # element's tail text, i.e. visible text that follows the removed tag.
    for element in doc.xpath('//style | //script | //link[@rel="stylesheet"] | //meta'):
        # The root element has no parent and cannot be detached.
        if element.getparent() is not None:
            element.drop_tree()

    for svg in doc.xpath('//svg'):
        if svg.getparent() is None:
            continue
        # Move the SVG's text into its tail so that drop_tree() leaves it at the
        # SVG's original position, followed by whatever text came after it.
        svg.tail = svg.text_content() + (svg.tail or '')
        svg.drop_tree()

    # Drop inline styles and all on* event-handler attributes in one pass.
    for element in doc.xpath('//*'):
        for attr in list(element.attrib.keys()):
            if attr == 'style' or attr.startswith('on'):
                element.attrib.pop(attr)

    result = etree.tostring(doc, encoding='unicode')
    result = minify_html.minify(result)
    # Guard against an empty minified document to avoid ZeroDivisionError.
    compress_rate = round(len(content) / len(result) * 100) if result else 0
    print(f"html压缩比=> {compress_rate}%")

    if only_text:
        result = BeautifulSoup(result, 'html.parser').get_text(strip=True)
    return result, compress_rate
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
# 通过cookie判断是否有waf,需要通过遇到的例子,不断的完善cookie判别函数
|
|
171
|
-
def assert_waf_cookie(cookies: list):
    """Guess from cookie names and values whether a WAF is in front of the site.

    Cookies are examined in order and the first match wins:

    * a 13-character name with an 88-character value is reported as "瑞数";
    * a name containing ``_jsl`` is reported as "加速乐".

    :param cookies: list of mappings, each with ``name`` and ``value`` keys.
    :return: ``(matched, label)``; ``(False, "没有waf")`` when nothing matches.
    """
    for entry in cookies:
        name, value = entry['name'], entry['value']
        name_is_13_chars = len(name) == 13
        if name_is_13_chars and len(value) == 88:
            return True, "瑞数"
        if "_jsl" in name:
            return True, "加速乐"
    return False, "没有waf"
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
# 对dp_mcp的消息打包
|
|
183
|
-
def dp_mcp_message_pack(message: str, **kwargs):
    """Wrap a message and extra fields into a single-item content envelope.

    :param message: text stored under the ``"message"`` key.
    :param kwargs: additional fields placed alongside the message.
    :return: ``{"content": [{"type": "text", "text": payload}]}`` where
        ``payload`` is a dict holding the keyword arguments followed by
        ``"message"``. The payload is passed as a dict, not a JSON string.
    """
    payload = {**kwargs, "message": message}
    text_item = {"type": "text", "text": payload}
    return {"content": [text_item]}
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
def btyes2Base64Img(target_byte):
    """Encode raw bytes as a base64 ``data:`` URI for transporting an image.

    :param target_byte: image bytes to encode.
    :return: string of the form ``data:image/png;base64,<payload>``; the PNG
        media type is always used, whatever the bytes actually contain.
    """
    encoded = base64.b64encode(target_byte).decode()
    return f"data:image/png;base64,{encoded}"
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
def compress_image_bytes(input_bytes, target_size_mb=1):
    """Re-encode image bytes as JPEG, shrinking them toward a target size.

    The image is first saved at quality 95. While the output is larger than
    the target, quality is lowered in steps of 5 down to a floor of 10. If
    that is still not enough, the image is scaled down by 10% per step (always
    resampled from the original) until it fits or a dimension would reach
    zero. The result may still exceed the target if neither step can reach it.

    :param input_bytes: encoded source image.
    :param target_size_mb: target size in megabytes (default 1).
    :return: JPEG-encoded bytes.
    """
    target_size = target_size_mb * 1024 * 1024  # megabytes -> bytes

    img = Image.open(io.BytesIO(input_bytes))

    # These modes are converted to RGB before the JPEG save.
    if img.mode in ('RGBA', 'LA', 'P'):
        img = img.convert('RGB')

    quality = 95
    output_bytes = _encode_jpeg(img, quality)

    # Step 1: trade quality for size.
    while len(output_bytes) > target_size and quality > 10:
        quality -= 5
        output_bytes = _encode_jpeg(img, quality)

    # Step 2: quality is at its floor here, so shrink the dimensions instead.
    # The loop must not depend on `quality > 10`, or it would never run.
    width, height = img.size
    while len(output_bytes) > target_size:
        new_width, new_height = int(width * 0.9), int(height * 0.9)
        if new_width < 1 or new_height < 1:
            break  # cannot shrink further; return the best effort so far
        width, height = new_width, new_height
        img_resized = img.resize((width, height), Image.Resampling.LANCZOS)
        output_bytes = _encode_jpeg(img_resized, quality)

    return output_bytes


def _encode_jpeg(image, quality):
    """Return *image* encoded as optimized JPEG bytes at the given quality."""
    buffer = io.BytesIO()
    image.save(buffer, 'JPEG', quality=quality, optimize=True)
    return buffer.getvalue()
|
|
259
|
-
|
|
260
|
-
# todo: 大致盘一下各种判定的逻辑【以下的所有压缩比之间的差距均取“绝对值”】
|
|
261
|
-
# 1. 如果requests、无头、有头获取到的压缩比之间的差距都在15%以内,则认定该页面是静态页面,此时优先使用requests请求
|
|
262
|
-
# 2. 如果requests的status_code为特定的412,或者521,则判定是瑞数和jsl。[此时还有一个特点:requests的压缩比会与其他两种方式获取到的压缩比差距非常大(一两千的那种)]
|
|
263
|
-
# 3. 如果requests、无头、有头获取到的压缩比之间差距都在40%以上,则判定该页面只可以用有头采集
|
|
264
|
-
# 4. 如果无头和有头获取到的压缩比之间差距小于15%,但是requests和无头的差距大于40%,则认定该页面可以使用无头浏览器采集
|
|
265
|
-
# 5. 如果requests和有头获取到的压缩比之间差距小于15%,但是无头和有头的差距大于40%,则认定该页面优先使用有头浏览器采集
|
|
266
|
-
# 【此时可能是:1.使用了别的检测无头的waf。2.网站使用瑞数,但是这次请求没有拦截requests(不知道是不是瑞数那边故意设置的),
|
|
267
|
-
# 此时如果想进一步判定是否是瑞数,可以使用有头浏览器取一下cookies,如果cookies里面存在瑞数的cookie,那么就可以断定是瑞数】
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|