Jarvis-Brain 0.1.3__py3-none-any.whl → 0.1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {jarvis_brain-0.1.3.dist-info → jarvis_brain-0.1.4.1.dist-info}/METADATA +1 -1
- jarvis_brain-0.1.4.1.dist-info/RECORD +10 -0
- mcp_tools/dp_tools.py +121 -183
- mcp_tools/main.py +10 -9
- tools/browser_manager.py +42 -0
- tools/tools.py +43 -87
- jarvis_brain-0.1.3.dist-info/RECORD +0 -11
- mcp_tools/requests_tools.py +0 -40
- tools/simhash_tools.py +0 -228
- {jarvis_brain-0.1.3.dist-info → jarvis_brain-0.1.4.1.dist-info}/WHEEL +0 -0
- {jarvis_brain-0.1.3.dist-info → jarvis_brain-0.1.4.1.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
mcp_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
mcp_tools/dp_tools.py,sha256=PwLd1zTFIQ673-cZjdON4TfU854_h5BgLPOVRbt_kVw,8349
|
|
3
|
+
mcp_tools/main.py,sha256=WO9kNpIORRrIOAWW8jiAd3gNW6rFMExln8y4CquKrM8,837
|
|
4
|
+
tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
|
+
tools/browser_manager.py,sha256=AM9hIQKtgMVxIZsEPxfHj9q41ZtzHrOS69wtgGBE3-Q,1458
|
|
6
|
+
tools/tools.py,sha256=3vTMCT_h0eDPtOMHc2KnhNVpQBUs6jgIChutBZZFMK4,5008
|
|
7
|
+
jarvis_brain-0.1.4.1.dist-info/METADATA,sha256=jJjlYvPAjlzahJ9OtI6EuL1JOoAk5_OUku2XtqXeztE,277
|
|
8
|
+
jarvis_brain-0.1.4.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
9
|
+
jarvis_brain-0.1.4.1.dist-info/entry_points.txt,sha256=YFQT4xpkUqt5dM5wlKPQQOqcjMuFrT9iuRAzIpAyH7U,51
|
|
10
|
+
jarvis_brain-0.1.4.1.dist-info/RECORD,,
|
mcp_tools/dp_tools.py
CHANGED
|
@@ -1,231 +1,169 @@
|
|
|
1
|
-
import json
|
|
2
1
|
import os
|
|
3
|
-
import random
|
|
4
2
|
from typing import Any
|
|
5
3
|
|
|
6
|
-
from urllib.parse import urlparse
|
|
7
|
-
from DrissionPage import ChromiumPage, ChromiumOptions
|
|
8
4
|
from DrissionPage._elements.none_element import NoneElement
|
|
9
5
|
from fastmcp import FastMCP
|
|
10
6
|
|
|
11
|
-
from tools.tools import compress_html, requests_html
|
|
12
|
-
|
|
7
|
+
from tools.tools import compress_html, requests_html, dp_headless_html, assert_waf_cookie, dp_mcp_message_pack
|
|
8
|
+
import psutil
|
|
9
|
+
from pathlib import Path
|
|
13
10
|
|
|
14
11
|
html_source_code_local_save_path = os.path.join(os.getcwd(), "html-source-code")
|
|
15
|
-
|
|
12
|
+
waf_status_code_dict = {
|
|
13
|
+
412: "瑞数",
|
|
14
|
+
521: "加速乐"
|
|
15
|
+
}
|
|
16
16
|
|
|
17
17
|
|
|
18
|
-
|
|
19
|
-
def create_browser():
|
|
20
|
-
global browser_pool
|
|
21
|
-
random_port = random.randint(9222, 9934)
|
|
22
|
-
while random_port in browser_pool:
|
|
23
|
-
random_port = random.randint(9222, 9934)
|
|
24
|
-
co = ChromiumOptions().set_local_port(random_port)
|
|
25
|
-
browser_pool[random_port] = ChromiumPage(co)
|
|
26
|
-
return random_port, browser_pool[random_port]
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
# 根据传入的端口查找对应的浏览器对象
|
|
30
|
-
def get_page(port):
|
|
31
|
-
return browser_pool.get(port, None)
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
# 根据传入的端口查找并弹出一个浏览器对象
|
|
35
|
-
def remove_page(port):
|
|
36
|
-
browser = browser_pool.pop(port, None)
|
|
37
|
-
return browser is not None, browser
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
def register_visit_url(mcp: FastMCP):
|
|
18
|
+
def register_visit_url(mcp: FastMCP, browser_manager):
    """Register the ``visit_url`` tool on ``mcp``.

    The tool asks ``browser_manager`` for a new browser, loads ``url`` in the
    tab returned by ``get_tab()`` and reports the browser port and tab id.
    """

    @mcp.tool(name="visit_url", description="使用Drissionpage打开url访问某个网站")
    async def visit_url(url: str) -> dict[str, Any]:
        browser_port, browser = browser_manager.create_browser()
        current_tab = browser.get_tab()
        current_tab.get(url)
        # Read the tab id only after navigation, as the original does.
        payload = {"tab_id": current_tab.tab_id, "browser_port": browser_port}
        return dp_mcp_message_pack(
            f"已在[{browser_port}]端口创建浏览器对象,并已打开链接:{url}",
            **payload,
        )
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def register_get_html(mcp: FastMCP, browser_manager):
    """Register the ``get_html`` tool on ``mcp``.

    The tool compresses the HTML of one tab with ``compress_html`` and saves
    it under ``html_source_code_local_save_path``; the response carries the
    tab id and the absolute path of the saved file.
    """
    import re  # local import: only needed to sanitise the file name

    @mcp.tool(name="get_html", description="使用Drissionpage获取某一个tab页的html")
    async def get_html(browser_port: int, tab_id: str) -> dict[str, Any]:
        _browser = browser_manager.get_browser(browser_port)
        tab = _browser.get_tab(tab_id)
        # The page title comes from the remote site. Replace path separators,
        # characters that are illegal in file names and control characters so
        # the file always lands inside the save directory.
        safe_title = re.sub(r'[\\/:*?"<>|\x00-\x1f]+', "_", tab.title or "")
        safe_title = safe_title.strip(" .")[:100] or "untitled"
        file_name = f"{safe_title}_{tab_id}.html"
        os.makedirs(html_source_code_local_save_path, exist_ok=True)
        abs_path = os.path.join(html_source_code_local_save_path, file_name)
        # Compress before opening the file so a failure does not leave an
        # empty file behind; the compression rate is not reported here.
        min_html, _ = compress_html(tab.html)
        with open(abs_path, "w", encoding="utf-8") as f:
            f.write(min_html)
        return dp_mcp_message_pack(f"已保存tab页:【{tab_id}】的html源码", tab_id=tab_id, html_local_path=abs_path)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def register_get_new_tab(mcp: FastMCP, browser_manager):
    """Register the ``get_new_tab`` tool on ``mcp``.

    The tool opens ``url`` in a new tab of the browser registered under
    ``browser_port``, activates that tab and returns its id.
    """

    @mcp.tool(name="get_new_tab", description="使用Drissionpage创建一个新的tab页,在新的tab页中打开url")
    async def get_new_tab(browser_port: int, url: str) -> dict[str, Any]:
        browser = browser_manager.get_browser(browser_port)
        opened_tab = browser.new_tab(url)
        browser.activate_tab(opened_tab)
        message = f"已创建新的tab页,并打开链接:{url}"
        return dp_mcp_message_pack(message, tab_id=opened_tab.tab_id)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def register_switch_tab(mcp: FastMCP, browser_manager):
    """Register the ``switch_tab`` tool on ``mcp``.

    The tool activates the tab ``tab_id`` in the browser registered under
    ``browser_port`` and returns a confirmation message.
    """

    @mcp.tool(name="switch_tab", description="根据传入的tab_id切换到对应的tab页", )
    async def switch_tab(browser_port: int, tab_id: str) -> dict[str, Any]:
        browser_manager.get_browser(browser_port).activate_tab(tab_id)
        confirmation = f"已将tab页:【{tab_id}】切换至最前端"
        return dp_mcp_message_pack(confirmation)
|
|
113
63
|
|
|
114
64
|
|
|
115
|
-
def register_close_tab(mcp: FastMCP):
|
|
65
|
+
def register_close_tab(mcp: FastMCP, browser_manager):
    """Register the ``close_tab`` tool on ``mcp``.

    The tool closes the tab ``tab_id`` in the browser registered under
    ``browser_port`` and returns a confirmation message.
    """

    @mcp.tool(name="close_tab", description="根据传入的tab_id关闭tab页", )
    # The parameters are annotated like every sibling tool (int port, str tab
    # id); the browser pool is keyed by the integer port.
    async def close_tab(browser_port: int, tab_id: str) -> dict[str, Any]:
        _browser = browser_manager.get_browser(browser_port)
        _browser.close_tabs(tab_id)
        return dp_mcp_message_pack(f"已将tab页:【{tab_id}】关闭")
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def register_check_selector(mcp: FastMCP, browser_manager):
    """Register the ``check_selector`` tool on ``mcp``.

    The tool looks up ``css_selector`` in the given tab and reports whether
    the lookup produced a real element (anything but ``NoneElement``).
    """

    @mcp.tool(name="check_selector", description="查找tab页中是否包含某个元素")
    async def check_selector(browser_port: int, tab_id: str, css_selector: str) -> dict[str, Any]:
        page = browser_manager.get_browser(browser_port).get_tab(tab_id)
        # Prefix the locator with "css:" unless that marker already occurs in it.
        locator = css_selector if "css:" in css_selector else "css:" + css_selector
        found = page.ele(locator)
        exists = not isinstance(found, NoneElement)
        return dp_mcp_message_pack(
            f"已完成tab页:【{tab_id}】对:【{locator}】的检查",
            tab_id=tab_id,
            selector=locator,
            selector_ele_exist=exists,
        )
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def register_quit_browser(mcp: FastMCP, browser_manager):
    """Register the ``quit_browser`` tool on ``mcp``.

    The tool removes the browser registered under ``browser_port`` from the
    pool and, if one was registered, quits it. The response reports the port
    and whether a browser was actually removed (``quit_flag``).
    """

    @mcp.tool(name="quit_browser", description="退出浏览器会话,关闭浏览器")
    async def quit_browser(browser_port: int) -> dict[str, Any]:
        # BrowserManager exposes remove_browser(); the old module-level helper
        # was called remove_page(), which the manager does not define.
        flag, _browser = browser_manager.remove_browser(browser_port)
        if flag:
            _browser.quit()
        return dp_mcp_message_pack(
            f"浏览器[{browser_port}],退出会话,关闭浏览器{'成功' if flag else '失败'}",
            browser_port=browser_port,
            quit_flag=flag
        )
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def register_assert_waf(mcp: FastMCP, browser_manager):
    """Register the ``assert_waf`` tool on ``mcp``.

    The tool compares the text-compression rates of the HTML obtained through
    three channels (the headed tab, a plain request via ``requests_html`` and
    a headless browser via ``dp_headless_html``) and recommends a collection
    strategy in ``recommend_team``: ``"requests"``, ``"drissionpage_headless"``
    or ``"drissionpage_head"`` (the default).
    """

    @mcp.tool(name="assert_waf",
              description="通过对比requests、有头浏览器、无头浏览器获取到的html,判断网页是否使用了waf以及是否为动态渲染的网页")
    async def assert_waf(browser_port: int, tab_id: str) -> dict[str, Any]:
        # NOTE(review): requests_html / dp_headless_html are synchronous calls
        # made from a coroutine; confirm that blocking here is acceptable.
        _browser = browser_manager.get_browser(browser_port)
        target_tab = _browser.get_tab(tab_id)
        recommend_team = "drissionpage_head"
        head_cookies = target_tab.cookies()
        # Detect a known WAF from the cookies of the headed browser.
        waf_flag, waf_type = assert_waf_cookie(head_cookies)
        head_html = target_tab.html
        _, head_rate = compress_html(head_html, only_text=True)
        raw_html, status_code = requests_html(target_tab.url)
        _, raw_rate = compress_html(raw_html, only_text=True)
        r_h_rate_diff = abs(head_rate - raw_rate)
        # A known WAF was recognised (by cookie or by status code): skip the
        # slow headless fetch and report immediately.
        if waf_flag or status_code in waf_status_code_dict:
            if not waf_flag:
                # Recognised by status code only; name the WAF from the table.
                waf_type = waf_status_code_dict[status_code]
            return dp_mcp_message_pack(
                f"已完成tab页:【{tab_id}】的分析,该tab页存在waf",
                tab_id=tab_id,
                recommend_team=recommend_team,
                waf_type=waf_type,
                raw_head_rate_difference=r_h_rate_diff,
                raw_headless_rate_difference=0,
                head_headless_rate_difference=0
            )

        headless_html = dp_headless_html(target_tab.url)
        _, headless_rate = compress_html(headless_html, only_text=True)
        r_hless_rate_diff = abs(raw_rate - headless_rate)
        h_hless_rate_diff = abs(head_rate - headless_rate)
        # Best case: all three channels roughly agree, so the page is treated
        # as an unprotected static page.
        if r_h_rate_diff < 40 and r_hless_rate_diff < 40 and h_hless_rate_diff < 40:
            recommend_team = "requests"
        # Headed and headless agree but requests does not: request-level
        # protection or a dynamically rendered page.
        elif h_hless_rate_diff < 30 and r_hless_rate_diff > 40:
            recommend_team = "drissionpage_headless"
        # Every other combination (e.g. headless differs from headed) keeps
        # the default "drissionpage_head"; the cookie check above already
        # covers the intermittent-Ruishu case mentioned by the author.
        return dp_mcp_message_pack(
            # No known WAF signature matched on this path, so do not claim one.
            f"已完成tab页:【{tab_id}】的分析,未命中已知waf特征",
            tab_id=tab_id,
            recommend_team=recommend_team,
            raw_head_rate_difference=r_h_rate_diff,
            # requests-vs-headless difference (previously reported the
            # head-vs-headless value twice).
            raw_headless_rate_difference=r_hless_rate_diff,
            head_headless_rate_difference=h_hless_rate_diff
        )
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def register_highlight_element_captcha(mcp: FastMCP, browser_manager):
    """Register the ``highlight_element_captcha`` tool on ``mcp``.

    The tool locates ``selector`` in the given tab, outlines the element and
    takes a screenshot of the tab. The response reports whether the element
    exists and, when it does, where the screenshot was saved.
    """

    @mcp.tool(name="highlight_element_captcha",
              description="将传入的Selector在页面上高亮,并截屏")
    async def highlight_element_captcha(browser_port: int, tab_id: str, selector: str) -> dict[str, Any]:
        _browser = browser_manager.get_browser(browser_port)
        tab = _browser.get_tab(tab_id)
        target_ele = tab.ele(selector)
        if isinstance(target_ele, NoneElement):
            return dp_mcp_message_pack(
                f"tab页:【{tab_id}】中未找到:【{selector}】对应的元素",
                tab_id=tab_id,
                selector=selector,
                selector_ele_exist=False
            )
        # NOTE(review): outline style and screenshot directory are reviewer
        # choices; the original stub stopped at a bare `tab.ele` expression.
        target_ele.run_js('this.style.outline = "3px solid red";')
        screenshot_dir = os.path.join(os.getcwd(), "element-screenshots")
        os.makedirs(screenshot_dir, exist_ok=True)
        screenshot_path = tab.get_screenshot(path=screenshot_dir, name=f"{tab_id}_highlight.png")
        return dp_mcp_message_pack(
            f"已在tab页:【{tab_id}】中高亮:【{selector}】并完成截屏",
            tab_id=tab_id,
            selector=selector,
            selector_ele_exist=True,
            screenshot_local_path=screenshot_path
        )
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def register_test(mcp: FastMCP, cwd: str):
    """Register the ``test`` tool on ``mcp``.

    The tool returns a fixed test-project label together with the ``cwd``
    value that was supplied at registration time.
    """

    @mcp.tool(name="test", description="用户说要进行测试时调用这个函数")
    async def test() -> dict[str, Any]:
        project_label = "获取根目录"
        message = f"当前测试项目是:{project_label}"
        return dp_mcp_message_pack(message, result=cwd)
|
mcp_tools/main.py
CHANGED
|
@@ -1,24 +1,25 @@
|
|
|
1
1
|
# main.py
|
|
2
2
|
from mcp_tools.dp_tools import *
|
|
3
|
-
from mcp_tools.requests_tools import *
|
|
4
3
|
from fastmcp import FastMCP
|
|
4
|
+
from tools.browser_manager import browser_manager
|
|
5
5
|
|
|
6
6
|
mcp = FastMCP("Jarvis Brain Mcp Tools")

# Load tool modules according to environment variables. MCP_MODULES is a
# comma-separated list; entries are stripped so "TeamNode-Dp, JarvisNode"
# matches as well.
enabled_modules = [name.strip() for name in os.getenv("MCP_MODULES", "TeamNode-Dp").split(",")]
# BASE_CWD wins, then the shell-provided PWD. PWD may be absent from the
# environment, so fall back to the process working directory instead of None.
base_cwd = os.getenv("BASE_CWD") or os.environ.get("PWD") or os.getcwd()
|
|
10
11
|
|
|
11
12
|
if "TeamNode-Dp" in enabled_modules:
|
|
12
|
-
register_visit_url(mcp)
|
|
13
|
-
register_close_tab(mcp)
|
|
14
|
-
register_switch_tab(mcp)
|
|
15
|
-
register_get_html(mcp)
|
|
16
|
-
register_get_new_tab(mcp)
|
|
17
|
-
register_check_selector(mcp)
|
|
13
|
+
register_visit_url(mcp, browser_manager)
|
|
14
|
+
register_close_tab(mcp, browser_manager)
|
|
15
|
+
register_switch_tab(mcp, browser_manager)
|
|
16
|
+
register_get_html(mcp, browser_manager)
|
|
17
|
+
register_get_new_tab(mcp, browser_manager)
|
|
18
|
+
register_check_selector(mcp, browser_manager)
|
|
18
19
|
|
|
19
20
|
if "JarvisNode" in enabled_modules:
|
|
20
|
-
register_assert_waf(mcp)
|
|
21
|
-
|
|
21
|
+
register_assert_waf(mcp, browser_manager)
|
|
22
|
+
register_test(mcp, base_cwd)
|
|
22
23
|
|
|
23
24
|
|
|
24
25
|
def main():
|
tools/browser_manager.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""浏览器池管理模块 - 单例模式确保状态共享"""
|
|
2
|
+
import random
|
|
3
|
+
from DrissionPage import ChromiumPage, ChromiumOptions
|
|
4
|
+
from typing import Optional, Tuple
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class BrowserManager:
    """Browser pool manager implemented as a singleton.

    Every ``BrowserManager()`` call returns the same instance, so all tools
    share one ``browser_pool`` dict that maps a local port to its browser.
    """
    _instance = None
    # Inclusive bounds of the local ports handed out by create_browser().
    _PORT_MIN = 9223
    _PORT_MAX = 9934

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance.browser_pool = {}
        return cls._instance

    def create_browser(self) -> Tuple[int, "ChromiumPage"]:
        """Create a browser on a random unused port and add it to the pool.

        Returns:
            ``(port, browser)`` for the newly created browser.

        Raises:
            RuntimeError: if every port in the range is already in the pool
                (the previous retry loop would never terminate in that case).
        """
        free_ports = [
            port for port in range(self._PORT_MIN, self._PORT_MAX + 1)
            if port not in self.browser_pool
        ]
        if not free_ports:
            raise RuntimeError(
                f"no free browser port left in {self._PORT_MIN}-{self._PORT_MAX}"
            )
        random_port = random.choice(free_ports)

        co = ChromiumOptions().set_local_port(random_port)
        self.browser_pool[random_port] = ChromiumPage(co)
        return random_port, self.browser_pool[random_port]

    def get_browser(self, port: int) -> Optional["ChromiumPage"]:
        """Return the browser registered under ``port``, or ``None``."""
        return self.browser_pool.get(port)

    def remove_browser(self, port: int) -> Tuple[bool, Optional["ChromiumPage"]]:
        """Remove the browser registered under ``port`` from the pool.

        Returns:
            ``(True, browser)`` if one was registered, else ``(False, None)``.
            The browser is only removed from the pool; it is not quit here.
        """
        browser = self.browser_pool.pop(port, None)
        return browser is not None, browser

    # Backward-compatible alias: callers written against the former
    # module-level helper use the name remove_page().
    remove_page = remove_browser

    def list_browsers(self) -> list[int]:
        """Return the ports of all browsers currently in the pool."""
        return list(self.browser_pool.keys())
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# 创建全局单例实例
|
|
42
|
+
browser_manager = BrowserManager()
|
tools/tools.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import time
|
|
2
|
+
import json
|
|
2
3
|
|
|
3
4
|
import htmlmin
|
|
4
5
|
from curl_cffi import requests
|
|
@@ -35,42 +36,15 @@ def requests_html(url):
|
|
|
35
36
|
def dp_headless_html(url):
    """Fetch ``url`` in a throw-away headless browser and return its HTML.

    A new headless ``ChromiumPage`` is started on an automatically chosen
    port, the URL is loaded, and after a fixed 10 second wait the HTML of the
    latest tab is returned. The browser is always quit, even when loading or
    reading the page raises.
    """
    opt = ChromiumOptions().headless(True)
    opt.set_argument('--no-sandbox')
    # Random port: per the author's note, a fixed 9222 port may be detected
    # by the Ruishu WAF.
    opt.auto_port()
    page = ChromiumPage(opt)
    try:
        tab = page.latest_tab
        tab.get(url)
        # TODO: no better way yet to wait for full rendering than a fixed
        # sleep (sensitive to network speed).
        time.sleep(10)
        return tab.html
    finally:
        # A headless browser must be quit at page level once it is done with,
        # otherwise the browser process is left running.
        page.quit()
|
|
76
50
|
|
|
@@ -102,62 +76,44 @@ def compress_html(content, only_text=False):
|
|
|
102
76
|
|
|
103
77
|
result = etree.tostring(doc, encoding='unicode')
|
|
104
78
|
result = htmlmin.minify(result)
|
|
105
|
-
|
|
79
|
+
compress_rate = round(len(content) / len(result) * 100)
|
|
80
|
+
print(f"html压缩比=> {compress_rate}%")
|
|
106
81
|
if not only_text:
|
|
107
|
-
return result
|
|
82
|
+
return result, compress_rate
|
|
108
83
|
soup = BeautifulSoup(result, 'html.parser')
|
|
109
84
|
result = soup.get_text(strip=True)
|
|
110
|
-
return result
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
# url = "https://www.gsgh.org.cn/#/moreNews_?position=%E7%9C%81%E6%80%BB%E6%96%B0%E9%97%BB&categoryId=502"
|
|
146
|
-
url = "https://www.acftu.org/xwdt/ghyw/"
|
|
147
|
-
# url = "https://www.nxzgh.org.cn/#/newsCenter/index2/2" # 移动waf,会检测浏览器无头
|
|
148
|
-
# url = "https://www.chengdu.gov.cn/cdsrmzf/zfxx/cjj.shtml" # 超严格瑞数6,怀疑他会检测端口,我使用9222端口无论如何都获取不到结果
|
|
149
|
-
# url = "https://www.jsgh.org/col/col3577/index.html?uid=18462&pageNum=1"
|
|
150
|
-
# render_html = dp_headless_html(url)
|
|
151
|
-
# print("\n")
|
|
152
|
-
# print("render_html=>", render_html)
|
|
153
|
-
# for i in range(20):
|
|
154
|
-
test(url)
|
|
155
|
-
# todo: 大致盘一下各种判定的逻辑【以下的所有压缩比之间的差距均取“绝对值”】
|
|
156
|
-
# 1. 如果requests、无头、有头获取到的压缩比之间从差距都在15%以内,则认定该页面是静态页面,此时优先使用requests请求
|
|
157
|
-
# 2. 如果requests的status_code为特定的412,或者521,则判定是瑞数和jsl。[此时还有一个特点:requests的压缩比会与其他两种方式获取到的压缩比差距非常大(一两千的那种)]
|
|
158
|
-
# 3. 如果requests、无头、有头获取到的压缩比之间差距都在40%以上,则判定该页面只可以用有头采集
|
|
159
|
-
# 4. 如果无头和有头获取到的压缩比之间差距小于15%,但是requests和无头的差距大于40%,则认定该页面可以使用无头浏览器采集
|
|
160
|
-
# 5. 如果requests和有头获取到的压缩比之间差距小于15%,但是无头和有头的差距大于40%,则认定该页面优先使用有头浏览器采集
|
|
161
|
-
# 【此时可能是:1.使用了别的检测无头的waf。2.网站使用瑞数,但是这次请求没有拦截requests(不知道是不是瑞数那边故意设置的),
|
|
162
|
-
# 此时如果想进一步判定是否是瑞数,可以使用有头浏览器取一下cookies,如果cookies里面存在瑞数的cookie,那么就可以断定是瑞数】
|
|
163
|
-
# 6.
|
|
85
|
+
return result, compress_rate
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
# 通过cookie判断是否有waf,需要通过遇到的例子,不断的完善cookie判别函数
|
|
89
|
+
def assert_waf_cookie(cookies: list):
    """Guess from browser cookies whether a known WAF protects the page.

    Args:
        cookies: cookie dicts with ``name`` and ``value`` entries; ``None``,
            an empty list and cookies missing either entry are tolerated.

    Returns:
        ``(True, "瑞数")`` when a cookie has a 13-character name and an
        88-character value, ``(True, "加速乐")`` when a cookie name contains
        ``"_jsl"``, otherwise ``(False, "没有waf")``.
    """
    for cookie in cookies or ():
        # Missing or None entries count as empty strings instead of raising.
        cookie_name = cookie.get('name') or ""
        cookie_value = cookie.get('value') or ""
        if len(cookie_name) == 13 and len(cookie_value) == 88:
            return True, "瑞数"
        if "_jsl" in cookie_name:
            return True, "加速乐"
    return False, "没有waf"
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
# 对dp_mcp的消息打包
|
|
101
|
+
def dp_mcp_message_pack(message: str, **kwargs):
    """Wrap a message and extra fields in the tool response envelope.

    Args:
        message: human-readable summary, stored under the ``"message"`` key.
        **kwargs: additional fields copied into the same payload dict.

    Returns:
        ``{"content": [{"type": "text", "text": payload}]}`` where ``payload``
        holds the keyword arguments followed by ``"message"``.
    """
    # The payload is passed as a dict, not as a JSON string; the json.dumps
    # variant was disabled by the author in this release.
    text_obj = {**kwargs, "message": message}
    return {
        "content": [{
            "type": "text",
            "text": text_obj
        }]
    }
|
|
111
|
+
|
|
112
|
+
# todo: 大致盘一下各种判定的逻辑【以下的所有压缩比之间的差距均取“绝对值”】
|
|
113
|
+
# 1. 如果requests、无头、有头获取到的压缩比之间的差距都在15%以内,则认定该页面是静态页面,此时优先使用requests请求
|
|
114
|
+
# 2. 如果requests的status_code为特定的412,或者521,则判定是瑞数和jsl。[此时还有一个特点:requests的压缩比会与其他两种方式获取到的压缩比差距非常大(一两千的那种)]
|
|
115
|
+
# 3. 如果requests、无头、有头获取到的压缩比之间差距都在40%以上,则判定该页面只可以用有头采集
|
|
116
|
+
# 4. 如果无头和有头获取到的压缩比之间差距小于15%,但是requests和无头的差距大于40%,则认定该页面可以使用无头浏览器采集
|
|
117
|
+
# 5. 如果requests和有头获取到的压缩比之间差距小于15%,但是无头和有头的差距大于40%,则认定该页面优先使用有头浏览器采集
|
|
118
|
+
# 【此时可能是:1.使用了别的检测无头的waf。2.网站使用瑞数,但是这次请求没有拦截requests(不知道是不是瑞数那边故意设置的),
|
|
119
|
+
# 此时如果想进一步判定是否是瑞数,可以使用有头浏览器取一下cookies,如果cookies里面存在瑞数的cookie,那么就可以断定是瑞数】
|
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
mcp_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
mcp_tools/dp_tools.py,sha256=jDilSFnOdwGC2453983k6VdGHNu3lDaDcioa-XXhUc0,9003
|
|
3
|
-
mcp_tools/main.py,sha256=lxH8PafR4KVM0-OURgHD0jNaC68kLmPYaNJ1wSKfNdY,654
|
|
4
|
-
mcp_tools/requests_tools.py,sha256=2rP4NBYaGTJI9-Txn2vaoPH5A2fWM05RNNjiSCHvbtk,1373
|
|
5
|
-
tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
-
tools/simhash_tools.py,sha256=eRxUqcWJiNMNZPpVRBk1kI7OYSK_vThoVg3jctFfmhY,7368
|
|
7
|
-
tools/tools.py,sha256=AP5lw65o2VYQ0Q6hEvCRA7m9WD6e7G8j4KBkdheLNmU,6913
|
|
8
|
-
jarvis_brain-0.1.3.dist-info/METADATA,sha256=wVy9AR3LB40xokpRF5U9MrsZxCJJ0DuFSEW4CUJ3ox8,275
|
|
9
|
-
jarvis_brain-0.1.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
10
|
-
jarvis_brain-0.1.3.dist-info/entry_points.txt,sha256=YFQT4xpkUqt5dM5wlKPQQOqcjMuFrT9iuRAzIpAyH7U,51
|
|
11
|
-
jarvis_brain-0.1.3.dist-info/RECORD,,
|
mcp_tools/requests_tools.py
DELETED
|
@@ -1,40 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
|
|
3
|
-
from fastmcp import FastMCP
|
|
4
|
-
from tools.tools import requests_html, dp_headless_html
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
def register_assert_waf(mcp: FastMCP):
|
|
8
|
-
@mcp.tool(name="assert_waf", description="判断传入的url对应的网页是否存在瑞数、jsl等风控防火墙")
|
|
9
|
-
def assert_waf(url: str):
|
|
10
|
-
# 先试用requests判断瑞数和jsl
|
|
11
|
-
text, code = requests_html(url)
|
|
12
|
-
waf_text_type = {
|
|
13
|
-
521: "jsl",
|
|
14
|
-
412: "瑞数",
|
|
15
|
-
"触发WAF": "其他waf"
|
|
16
|
-
}
|
|
17
|
-
has_waf = code in waf_text_type.keys()
|
|
18
|
-
if not has_waf:
|
|
19
|
-
# 不是瑞数和jsl,使用无头浏览器判断是否为其他waf
|
|
20
|
-
headless_html = dp_headless_html(url)
|
|
21
|
-
if "触发WAF" in headless_html:
|
|
22
|
-
has_waf = True
|
|
23
|
-
code = "触发WAF"
|
|
24
|
-
if not has_waf:
|
|
25
|
-
waf_type = "不存在waf"
|
|
26
|
-
else:
|
|
27
|
-
waf_type = waf_text_type[code]
|
|
28
|
-
return {
|
|
29
|
-
"content": [{
|
|
30
|
-
"type": "text",
|
|
31
|
-
"text": json.dumps(
|
|
32
|
-
{
|
|
33
|
-
"message": f"链接{url} [{'存在' if has_waf else '不存在'}] waf",
|
|
34
|
-
"url": url,
|
|
35
|
-
"waf_type": waf_type,
|
|
36
|
-
"has_waf": has_waf
|
|
37
|
-
}, ensure_ascii=False
|
|
38
|
-
)
|
|
39
|
-
}]
|
|
40
|
-
}
|
tools/simhash_tools.py
DELETED
|
@@ -1,228 +0,0 @@
|
|
|
1
|
-
import re
|
|
2
|
-
from bs4 import BeautifulSoup
|
|
3
|
-
from simhash import Simhash, SimhashIndex
|
|
4
|
-
import jieba
|
|
5
|
-
import hashlib
|
|
6
|
-
from tools.tools import compress_html
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
class HTMLSimHashComparator:
|
|
10
|
-
def __init__(self, html1, html2, is_file=False):
|
|
11
|
-
"""
|
|
12
|
-
初始化比较器
|
|
13
|
-
|
|
14
|
-
Args:
|
|
15
|
-
html1: 第一个HTML内容或文件路径
|
|
16
|
-
html2: 第二个HTML内容或文件路径
|
|
17
|
-
is_file: 是否为文件路径(True则为文件路径,False则为HTML字符串)
|
|
18
|
-
"""
|
|
19
|
-
if is_file:
|
|
20
|
-
with open(html1, 'r', encoding='utf-8') as f1:
|
|
21
|
-
self.html1 = f1.read()
|
|
22
|
-
with open(html2, 'r', encoding='utf-8') as f2:
|
|
23
|
-
self.html2 = f2.read()
|
|
24
|
-
else:
|
|
25
|
-
self.html1 = html1
|
|
26
|
-
self.html2 = html2
|
|
27
|
-
|
|
28
|
-
def clean_html(self, html_content):
|
|
29
|
-
text = compress_html(html_content, only_text=True)
|
|
30
|
-
return text
|
|
31
|
-
|
|
32
|
-
def extract_features(self, text):
|
|
33
|
-
"""从文本中提取特征"""
|
|
34
|
-
# 使用jieba进行中文分词(如果是中文内容)
|
|
35
|
-
# 如果是英文,可以使用空格分词或其他方式
|
|
36
|
-
words = jieba.lcut(text)
|
|
37
|
-
|
|
38
|
-
# 过滤停用词和短词
|
|
39
|
-
stop_words = set(['的', '了', '在', '是', '我', '有', '和', '就',
|
|
40
|
-
'不', '人', '都', '一', '一个', '上', '也', '很',
|
|
41
|
-
'到', '说', '要', '去', '你', '会', '着', '没有',
|
|
42
|
-
'看', '好', '自己', '这'])
|
|
43
|
-
|
|
44
|
-
features = []
|
|
45
|
-
for word in words:
|
|
46
|
-
# 过滤停用词和长度小于2的词
|
|
47
|
-
if word not in stop_words and len(word) >= 2:
|
|
48
|
-
features.append(word)
|
|
49
|
-
|
|
50
|
-
return features
|
|
51
|
-
|
|
52
|
-
def calculate_simhash(self, html_content):
|
|
53
|
-
"""计算HTML内容的SimHash值"""
|
|
54
|
-
# 清洗HTML
|
|
55
|
-
text = self.clean_html(html_content)
|
|
56
|
-
|
|
57
|
-
# 提取特征
|
|
58
|
-
features = self.extract_features(text)
|
|
59
|
-
|
|
60
|
-
# 计算SimHash(默认使用64位)
|
|
61
|
-
simhash = Simhash(features, f=64)
|
|
62
|
-
|
|
63
|
-
return simhash
|
|
64
|
-
|
|
65
|
-
def compare_simhash(self):
|
|
66
|
-
"""比较两个HTML的SimHash值"""
|
|
67
|
-
# 计算SimHash值
|
|
68
|
-
simhash1 = self.calculate_simhash(self.html1)
|
|
69
|
-
simhash2 = self.calculate_simhash(self.html2)
|
|
70
|
-
|
|
71
|
-
# 计算汉明距离
|
|
72
|
-
hamming_distance = simhash1.distance(simhash2)
|
|
73
|
-
|
|
74
|
-
# 计算相似度(0-1之间)
|
|
75
|
-
# 64位SimHash的最大汉明距离是64
|
|
76
|
-
similarity = 1 - (hamming_distance / 64)
|
|
77
|
-
|
|
78
|
-
return {
|
|
79
|
-
'simhash1': bin(simhash1.value),
|
|
80
|
-
'simhash2': bin(simhash2.value),
|
|
81
|
-
'hamming_distance': hamming_distance,
|
|
82
|
-
'similarity': similarity,
|
|
83
|
-
'similarity_percentage': f"{similarity * 100:.2f}%"
|
|
84
|
-
}
|
|
85
|
-
|
|
86
|
-
def compare_with_threshold(self, threshold=0.8):
|
|
87
|
-
"""基于阈值判断是否相似"""
|
|
88
|
-
result = self.compare_simhash()
|
|
89
|
-
is_similar = result['similarity'] >= threshold
|
|
90
|
-
|
|
91
|
-
return {
|
|
92
|
-
**result,
|
|
93
|
-
'threshold': threshold,
|
|
94
|
-
'is_similar': is_similar
|
|
95
|
-
}
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
# 3. 使用示例
|
|
99
|
-
def main():
|
|
100
|
-
# 示例1:直接传入HTML字符串
|
|
101
|
-
html1 = """
|
|
102
|
-
<html>
|
|
103
|
-
<head><title>测试页面1</title></head>
|
|
104
|
-
<body>
|
|
105
|
-
<h1>欢迎来到我的网站</h1>
|
|
106
|
-
<p>这是一个测试页面,用于演示SimHash比较。</p>
|
|
107
|
-
<p>Python是一种流行的编程语言。</p>
|
|
108
|
-
</body>
|
|
109
|
-
</html>
|
|
110
|
-
"""
|
|
111
|
-
|
|
112
|
-
html2 = """
|
|
113
|
-
<html>
|
|
114
|
-
<head><title>测试页面2</title></head>
|
|
115
|
-
<body>
|
|
116
|
-
<h1>欢迎访问我的网站</h1>
|
|
117
|
-
<p>这是一个测试页面,用于展示SimHash比较功能。</p>
|
|
118
|
-
<p>Python编程语言非常流行。</p>
|
|
119
|
-
</body>
|
|
120
|
-
</html>
|
|
121
|
-
"""
|
|
122
|
-
|
|
123
|
-
# 创建比较器并比较
|
|
124
|
-
comparator = HTMLSimHashComparator(html1, html2)
|
|
125
|
-
result = comparator.compare_simhash()
|
|
126
|
-
|
|
127
|
-
print("SimHash比较结果:")
|
|
128
|
-
print(f"页面1 SimHash: {result['simhash1']}")
|
|
129
|
-
print(f"页面2 SimHash: {result['simhash2']}")
|
|
130
|
-
print(f"汉明距离: {result['hamming_distance']}")
|
|
131
|
-
print(f"相似度: {result['similarity_percentage']}")
|
|
132
|
-
|
|
133
|
-
# 基于阈值判断
|
|
134
|
-
threshold_result = comparator.compare_with_threshold(threshold=0.7)
|
|
135
|
-
print(f"\n基于阈值{threshold_result['threshold']}的判断:")
|
|
136
|
-
print(f"是否相似: {threshold_result['is_similar']}")
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
# 4. 高级功能:批量比较和聚类
|
|
140
|
-
class BatchHTMLComparator:
|
|
141
|
-
def __init__(self):
|
|
142
|
-
self.documents = []
|
|
143
|
-
self.simhashes = []
|
|
144
|
-
|
|
145
|
-
def add_document(self, doc_id, html_content):
|
|
146
|
-
"""添加文档到比较器"""
|
|
147
|
-
comparator = HTMLSimHashComparator(html_content, html_content)
|
|
148
|
-
simhash = comparator.calculate_simhash(html_content)
|
|
149
|
-
|
|
150
|
-
self.documents.append({
|
|
151
|
-
'id': doc_id,
|
|
152
|
-
'content': html_content,
|
|
153
|
-
'simhash': simhash
|
|
154
|
-
})
|
|
155
|
-
self.simhashes.append((doc_id, simhash))
|
|
156
|
-
|
|
157
|
-
def find_duplicates(self, k=3):
|
|
158
|
-
"""查找相似的文档(k为汉明距离阈值)"""
|
|
159
|
-
index = SimhashIndex(self.simhashes, k=k)
|
|
160
|
-
|
|
161
|
-
duplicates = []
|
|
162
|
-
for doc_id, simhash in self.simhashes:
|
|
163
|
-
# 查找相似文档
|
|
164
|
-
similar_ids = index.get_near_dups(simhash)
|
|
165
|
-
if len(similar_ids) > 1:
|
|
166
|
-
duplicates.append({
|
|
167
|
-
'doc_id': doc_id,
|
|
168
|
-
'similar_docs': similar_ids
|
|
169
|
-
})
|
|
170
|
-
|
|
171
|
-
return duplicates
|
|
172
|
-
|
|
173
|
-
def compare_all_pairs(self):
|
|
174
|
-
"""比较所有文档对"""
|
|
175
|
-
comparisons = []
|
|
176
|
-
n = len(self.documents)
|
|
177
|
-
|
|
178
|
-
for i in range(n):
|
|
179
|
-
for j in range(i + 1, n):
|
|
180
|
-
comparator = HTMLSimHashComparator(
|
|
181
|
-
self.documents[i]['content'],
|
|
182
|
-
self.documents[j]['content']
|
|
183
|
-
)
|
|
184
|
-
result = comparator.compare_simhash()
|
|
185
|
-
|
|
186
|
-
comparisons.append({
|
|
187
|
-
'doc1': self.documents[i]['id'],
|
|
188
|
-
'doc2': self.documents[j]['id'],
|
|
189
|
-
'hamming_distance': result['hamming_distance'],
|
|
190
|
-
'similarity': result['similarity']
|
|
191
|
-
})
|
|
192
|
-
|
|
193
|
-
return sorted(comparisons, key=lambda x: x['similarity'], reverse=True)
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
if __name__ == "__main__":
|
|
197
|
-
# 运行示例
|
|
198
|
-
main()
|
|
199
|
-
|
|
200
|
-
# 批量比较示例
|
|
201
|
-
print("\n=== 批量比较示例 ===")
|
|
202
|
-
batch_comparator = BatchHTMLComparator()
|
|
203
|
-
|
|
204
|
-
# 添加多个文档
|
|
205
|
-
batch_comparator.add_document("doc1", """
|
|
206
|
-
<html><body><h1>Python编程</h1><p>学习Python很有趣。</p></body></html>
|
|
207
|
-
""")
|
|
208
|
-
|
|
209
|
-
batch_comparator.add_document("doc2", """
|
|
210
|
-
<html><body><h1>Python代码</h1><p>编写Python代码很有趣。</p></body></html>
|
|
211
|
-
""")
|
|
212
|
-
|
|
213
|
-
batch_comparator.add_document("doc3", """
|
|
214
|
-
<html><body><h1>Java编程</h1><p>Java是一种编程语言。</p></body></html>
|
|
215
|
-
""")
|
|
216
|
-
|
|
217
|
-
# 查找重复文档
|
|
218
|
-
duplicates = batch_comparator.find_duplicates(k=3)
|
|
219
|
-
print("相似的文档:")
|
|
220
|
-
for dup in duplicates:
|
|
221
|
-
print(f"文档 {dup['doc_id']} 与 {dup['similar_docs']} 相似")
|
|
222
|
-
|
|
223
|
-
# 比较所有文档对
|
|
224
|
-
all_comparisons = batch_comparator.compare_all_pairs()
|
|
225
|
-
print("\n所有文档对比较:")
|
|
226
|
-
for comp in all_comparisons:
|
|
227
|
-
print(f"{comp['doc1']} vs {comp['doc2']}: "
|
|
228
|
-
f"相似度 {comp['similarity']:.2%}")
|
|
File without changes
|
|
File without changes
|