Jarvis-Brain 0.1.0-py3-none-any.whl → 0.1.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {jarvis_brain-0.1.0.dist-info → jarvis_brain-0.1.2.dist-info}/METADATA +4 -1
- jarvis_brain-0.1.2.dist-info/RECORD +11 -0
- mcp_tools/dp_tools.py +73 -35
- mcp_tools/main.py +13 -4
- mcp_tools/requests_tools.py +36 -0
- tools/__init__.py +0 -0
- tools/simhash_tools.py +228 -0
- tools/tools.py +92 -0
- jarvis_brain-0.1.0.dist-info/RECORD +0 -7
- {jarvis_brain-0.1.0.dist-info → jarvis_brain-0.1.2.dist-info}/WHEEL +0 -0
- {jarvis_brain-0.1.0.dist-info → jarvis_brain-0.1.2.dist-info}/entry_points.txt +0 -0
{jarvis_brain-0.1.0.dist-info → jarvis_brain-0.1.2.dist-info}/METADATA
CHANGED
@@ -1,8 +1,11 @@
 Metadata-Version: 2.4
 Name: Jarvis_Brain
-Version: 0.1.0
+Version: 0.1.2
 Summary: Jarvis brain
 Requires-Python: >=3.10
+Requires-Dist: beautifulsoup4
 Requires-Dist: drissionpage
 Requires-Dist: fastmcp
 Requires-Dist: htmlmin
+Requires-Dist: jieba
+Requires-Dist: simhash
jarvis_brain-0.1.2.dist-info/RECORD
ADDED
@@ -0,0 +1,11 @@
+mcp_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+mcp_tools/dp_tools.py,sha256=xtpJoRmpqZQRjfVUEIjo3KAd5_SqK01J1VAt8U79xbA,8448
+mcp_tools/main.py,sha256=lxH8PafR4KVM0-OURgHD0jNaC68kLmPYaNJ1wSKfNdY,654
+mcp_tools/requests_tools.py,sha256=K1eHbvhzWhLzn9AZhjDGTmDu_yipvkEM9IOhveDMsPM,1063
+tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tools/simhash_tools.py,sha256=eRxUqcWJiNMNZPpVRBk1kI7OYSK_vThoVg3jctFfmhY,7368
+tools/tools.py,sha256=lPlXdVA4Dx3irP6alMOgmE-HxxuN_y4XQ3Q-fZyV-OU,3482
+jarvis_brain-0.1.2.dist-info/METADATA,sha256=Otgmq6QJe4XoWiIhC1i6jBpQsgWhraoPpi8v8Su7SYw,250
+jarvis_brain-0.1.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+jarvis_brain-0.1.2.dist-info/entry_points.txt,sha256=YFQT4xpkUqt5dM5wlKPQQOqcjMuFrT9iuRAzIpAyH7U,51
+jarvis_brain-0.1.2.dist-info/RECORD,,
mcp_tools/dp_tools.py
CHANGED
@@ -1,45 +1,20 @@
 import json
 import os
 import random
+from pyexpat.errors import messages
 from typing import Any
 
-import htmlmin
 from DrissionPage import ChromiumPage, ChromiumOptions
 from DrissionPage._elements.none_element import NoneElement
 from fastmcp import FastMCP
-
+
+from tools.tools import compress_html, requests_html
+from tools.simhash_tools import HTMLSimHashComparator
 
 html_source_code_local_save_path = os.path.join(os.getcwd(), "html-source-code")
 browser_pool = {}
 
 
-# Compress html
-def compress_html(content):
-    doc = html.fromstring(content)
-    # Remove style and script tags
-    for element in doc.xpath('//style | //script'):
-        element.getparent().remove(element)
-
-    # Remove stylesheet link tags
-    for link in doc.xpath('//link[@rel="stylesheet"]'):
-        link.getparent().remove(link)
-
-    # Remove style attributes
-    for element in doc.xpath('//*[@style]'):
-        element.attrib.pop('style')
-
-    # Remove all on* event attributes
-    for element in doc.xpath('//*'):
-        for attr in list(element.attrib.keys()):
-            if attr.startswith('on'):
-                element.attrib.pop(attr)
-
-    result = etree.tostring(doc, encoding='unicode')
-    result = htmlmin.minify((result))
-    print(f"html compression ratio=> {len(content) / len(result) * 100:.2f}%")
-    return result
-
-
 # Pick a random port not already in the browser pool, create a browser, and return the random port and the browser object.
 def create_browser():
     global browser_pool
@@ -56,6 +31,12 @@ def get_page(port):
     return browser_pool.get(port, None)
 
 
+# Look up and pop a browser object for the given port
+def remove_page(port):
+    browser = browser_pool.pop(port, None)
+    return browser is not None, browser
+
+
 def register_visit_url(mcp: FastMCP):
     @mcp.tool(name="visit_url", description="Open a url with DrissionPage to visit a website")
     async def visit_url(url: str) -> dict[str, Any]:
@@ -81,6 +62,8 @@ def register_get_html(mcp: FastMCP):
         _browser = get_page(browser_port)
         tab = _browser.get_tab(tab_id)
         file_name = tab.title + f"_{tab_id}.html"
+        if not os.path.exists(html_source_code_local_save_path):
+            os.makedirs(html_source_code_local_save_path)
         abs_path = os.path.join(html_source_code_local_save_path, file_name)
         with open(abs_path, "w", encoding="utf-8") as f:
             f.write(compress_html(tab.html))
@@ -165,9 +148,64 @@ def register_check_selector(mcp: FastMCP):
         }]
     }
 
-
-
-
-
-
-
+
+def register_quit_browser(mcp: FastMCP):
+    @mcp.tool(name="quit_browser", description="Quit the browser session and close the browser")
+    async def quit_browser(browser_port: int) -> dict[str, Any]:
+        flag, _browser = remove_page(browser_port)
+        if flag:
+            _browser.quit()
+        return {
+            "content": [{
+                "type": "text",
+                "text": json.dumps({
+                    "message": f"Browser [{browser_port}]: quitting the session and closing the browser {'succeeded' if flag else 'failed'}",
+                    "browser_port": browser_port,
+                    "quit_flag": flag,
+                }, ensure_ascii=False)
+            }]
+        }
+
+
+def register_assert_Static_Web(mcp: FastMCP):
+    @mcp.tool(name="assert_Static_Web", description="Determine whether the page in the given tab is a static web page")
+    def assert_Static_Web(browser_port: int, tab_id: str) -> dict[str, Any]:
+        _browser = get_page(browser_port)
+        target_tab = _browser.get_tab(tab_id)
+        target_url = target_tab.url
+        raw_html, stat_code = requests_html(target_url)
+        if stat_code != 200:
+            return {
+                "content": [{
+                    "type": "text",
+                    "text": json.dumps({
+                        "messages": f"Finished analyzing tab [{tab_id}]: fetching its html with requests returned status code [{stat_code}], not 200; use assert_waf to check for a WAF",
+                        "tab_id": tab_id,
+                        "url": target_url,
+                        "stat_code": stat_code,
+                    }, ensure_ascii=False)
+                }]
+            }
+        render_html = target_tab.html
+        comparator = HTMLSimHashComparator(raw_html, render_html, False)
+        result = comparator.compare_simhash()
+        print("SimHash comparison result:")
+        print(f"Page 1 SimHash: {result['simhash1']}")
+        print(f"Page 2 SimHash: {result['simhash2']}")
+        print(f"Hamming distance: {result['hamming_distance']}")
+        print(f"Similarity: {result['similarity_percentage']}")
+        threshold_result = comparator.compare_with_threshold(threshold=0.7)
+        print(f"\nJudgment at threshold {threshold_result['threshold']}:")
+        print(f"Similar: {threshold_result['is_similar']}")
+        static_html_flag = threshold_result['is_similar']
+        return {
+            "content": [{
+                "type": "text",
+                "text": json.dumps({
+                    "message": f"Finished analyzing tab [{tab_id}]: similarity between the raw and rendered page is [{result['similarity_percentage']}]; judged to be {'a static page' if static_html_flag else 'dynamically rendered'}",
+                    "tab_id": tab_id,
+                    "is_static_web": static_html_flag,
+                    "static_web_possibility": result['similarity_percentage'],
+                }, ensure_ascii=False)
+            }]
+        }
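Both additions are ordinary fastmcp tools, so they can be exercised end to end with fastmcp's in-memory client. The sketch below is illustrative only: it assumes a fastmcp version that ships Client, that MCP_MODULES enables both tool groups (see the main.py diff below), and the browser_port/tab_id arguments are placeholders, since visit_url's response payload is not shown in this diff.

import asyncio

from fastmcp import Client

from mcp_tools.main import mcp  # the FastMCP("Jarvis Brain Mcp Tools") instance


async def demo():
    # A fastmcp Client can connect to a FastMCP server object in memory,
    # without spawning a stdio subprocess.
    async with Client(mcp) as client:
        tools = await client.list_tools()
        print([t.name for t in tools])  # expect visit_url, assert_Static_Web, ...

        # Placeholder arguments: in a real session browser_port and tab_id
        # come from visit_url's response.
        verdict = await client.call_tool(
            "assert_Static_Web", {"browser_port": 9333, "tab_id": "some-tab-id"}
        )
        print(verdict)

        await client.call_tool("quit_browser", {"browser_port": 9333})


if __name__ == "__main__":
    asyncio.run(demo())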
mcp_tools/main.py
CHANGED
@@ -1,13 +1,14 @@
 # main.py
 from mcp_tools.dp_tools import *
+from mcp_tools.requests_tools import *
 from fastmcp import FastMCP
 
-mcp = FastMCP("Jarvis Brain")
+mcp = FastMCP("Jarvis Brain Mcp Tools")
 
 # Load tool modules based on an environment variable
-enabled_modules = os.getenv("MCP_MODULES", "DrissionPage").split(",")
+enabled_modules = os.getenv("MCP_MODULES", "TeamNode-Dp").split(",")
 
-if "DrissionPage" in enabled_modules:
+if "TeamNode-Dp" in enabled_modules:
     register_visit_url(mcp)
     register_close_tab(mcp)
     register_switch_tab(mcp)
@@ -15,6 +16,14 @@ if "DrissionPage" in enabled_modules:
     register_get_new_tab(mcp)
     register_check_selector(mcp)
 
+if "JarvisNode" in enabled_modules:
+    register_assert_waf(mcp)
+    register_assert_Static_Web(mcp)
+
 
 def main():
-    mcp.run()
+    mcp.run(transport="stdio")
+
+
+if __name__ == '__main__':
+    main()
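Note that registration is gated on MCP_MODULES at import time, so both tool groups have to be enabled in the environment before mcp_tools.main is imported. A minimal sketch:

import os

# mcp_tools.main reads MCP_MODULES when it is imported, so set it first.
os.environ["MCP_MODULES"] = "TeamNode-Dp,JarvisNode"

from mcp_tools.main import main

main()  # registers both tool groups and serves them over stdio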
mcp_tools/requests_tools.py
ADDED
@@ -0,0 +1,36 @@
+import json
+
+from fastmcp import FastMCP
+from tools.tools import requests_html
+from DrissionPage import ChromiumPage, ChromiumOptions
+
+
+def register_assert_waf(mcp: FastMCP):
+    @mcp.tool(name="assert_waf", description="Determine whether the page at the given url is protected by a risk-control WAF such as Ruishu or jsl")
+    def assert_waf(url: str):
+        text, code = requests_html(url)
+        waf_text_type = {
+            521: "jsl",
+            412: "Ruishu"
+        }
+        has_waf = code in waf_text_type.keys()
+        if not has_waf:
+            waf_type = "no waf"
+        else:
+            waf_type = waf_text_type[code]
+        return {
+            "content": [{
+                "type": "text",
+                "text": json.dumps(
+                    {
+                        "message": f"Link {url} [{'has' if has_waf else 'does not have'}] a waf",
+                        "url": url,
+                        "waf_type": waf_type,
+                        "has_waf": has_waf
+                    }, ensure_ascii=False
+                )
+            }]
+        }
+
+
+
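The detection rests on two status codes that these WAFs commonly return to clients that do not execute their JavaScript challenge: 521 for jsl and 412 for Ruishu. A standalone sketch of the same heuristic, with the MCP plumbing stripped away (sniff_waf is an illustrative name, not part of the package):

import requests

# Mirrors the mapping in assert_waf above: a challenge status code from a
# plain, JS-less request is taken as evidence of the corresponding WAF.
WAF_STATUS = {521: "jsl", 412: "Ruishu"}


def sniff_waf(url: str) -> str:
    response = requests.get(url, timeout=10)
    return WAF_STATUS.get(response.status_code, "no waf")


print(sniff_waf("https://example.com"))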
tools/__init__.py
ADDED
File without changes
tools/simhash_tools.py
ADDED
@@ -0,0 +1,228 @@
+import re
+from bs4 import BeautifulSoup
+from simhash import Simhash, SimhashIndex
+import jieba
+import hashlib
+from tools.tools import compress_html
+
+
+class HTMLSimHashComparator:
+    def __init__(self, html1, html2, is_file=False):
+        """
+        Initialize the comparator
+
+        Args:
+            html1: first HTML content or file path
+            html2: second HTML content or file path
+            is_file: whether the arguments are file paths (True for file paths, False for HTML strings)
+        """
+        if is_file:
+            with open(html1, 'r', encoding='utf-8') as f1:
+                self.html1 = f1.read()
+            with open(html2, 'r', encoding='utf-8') as f2:
+                self.html2 = f2.read()
+        else:
+            self.html1 = html1
+            self.html2 = html2
+
+    def clean_html(self, html_content):
+        text = compress_html(html_content, only_text=True)
+        return text
+
+    def extract_features(self, text):
+        """Extract features from the text"""
+        # Use jieba for Chinese word segmentation (for Chinese content)
+        # For English content, whitespace tokenization or similar would do
+        words = jieba.lcut(text)
+
+        # Filter stop words and short words
+        stop_words = set(['的', '了', '在', '是', '我', '有', '和', '就',
+                          '不', '人', '都', '一', '一个', '上', '也', '很',
+                          '到', '说', '要', '去', '你', '会', '着', '没有',
+                          '看', '好', '自己', '这'])
+
+        features = []
+        for word in words:
+            # Skip stop words and words shorter than two characters
+            if word not in stop_words and len(word) >= 2:
+                features.append(word)
+
+        return features
+
+    def calculate_simhash(self, html_content):
+        """Compute the SimHash of the HTML content"""
+        # Clean the HTML
+        text = self.clean_html(html_content)
+
+        # Extract features
+        features = self.extract_features(text)
+
+        # Compute the SimHash (64-bit by default)
+        simhash = Simhash(features, f=64)
+
+        return simhash
+
+    def compare_simhash(self):
+        """Compare the SimHash values of the two HTML documents"""
+        # Compute SimHash values
+        simhash1 = self.calculate_simhash(self.html1)
+        simhash2 = self.calculate_simhash(self.html2)
+
+        # Compute the Hamming distance
+        hamming_distance = simhash1.distance(simhash2)
+
+        # Compute similarity (between 0 and 1);
+        # the maximum Hamming distance for a 64-bit SimHash is 64
+        similarity = 1 - (hamming_distance / 64)
+
+        return {
+            'simhash1': bin(simhash1.value),
+            'simhash2': bin(simhash2.value),
+            'hamming_distance': hamming_distance,
+            'similarity': similarity,
+            'similarity_percentage': f"{similarity * 100:.2f}%"
+        }
+
+    def compare_with_threshold(self, threshold=0.8):
+        """Judge similarity against a threshold"""
+        result = self.compare_simhash()
+        is_similar = result['similarity'] >= threshold
+
+        return {
+            **result,
+            'threshold': threshold,
+            'is_similar': is_similar
+        }
+
+
+# 3. Usage example
+def main():
+    # Example 1: pass HTML strings directly
+    html1 = """
+    <html>
+    <head><title>测试页面1</title></head>
+    <body>
+        <h1>欢迎来到我的网站</h1>
+        <p>这是一个测试页面,用于演示SimHash比较。</p>
+        <p>Python是一种流行的编程语言。</p>
+    </body>
+    </html>
+    """
+
+    html2 = """
+    <html>
+    <head><title>测试页面2</title></head>
+    <body>
+        <h1>欢迎访问我的网站</h1>
+        <p>这是一个测试页面,用于展示SimHash比较功能。</p>
+        <p>Python编程语言非常流行。</p>
+    </body>
+    </html>
+    """
+
+    # Create the comparator and compare
+    comparator = HTMLSimHashComparator(html1, html2)
+    result = comparator.compare_simhash()
+
+    print("SimHash comparison result:")
+    print(f"Page 1 SimHash: {result['simhash1']}")
+    print(f"Page 2 SimHash: {result['simhash2']}")
+    print(f"Hamming distance: {result['hamming_distance']}")
+    print(f"Similarity: {result['similarity_percentage']}")
+
+    # Threshold-based judgment
+    threshold_result = comparator.compare_with_threshold(threshold=0.7)
+    print(f"\nJudgment at threshold {threshold_result['threshold']}:")
+    print(f"Similar: {threshold_result['is_similar']}")
+
+
+# 4. Advanced: batch comparison and clustering
+class BatchHTMLComparator:
+    def __init__(self):
+        self.documents = []
+        self.simhashes = []
+
+    def add_document(self, doc_id, html_content):
+        """Add a document to the comparator"""
+        comparator = HTMLSimHashComparator(html_content, html_content)
+        simhash = comparator.calculate_simhash(html_content)
+
+        self.documents.append({
+            'id': doc_id,
+            'content': html_content,
+            'simhash': simhash
+        })
+        self.simhashes.append((doc_id, simhash))
+
+    def find_duplicates(self, k=3):
+        """Find similar documents (k is the Hamming-distance threshold)"""
+        index = SimhashIndex(self.simhashes, k=k)
+
+        duplicates = []
+        for doc_id, simhash in self.simhashes:
+            # Look up similar documents
+            similar_ids = index.get_near_dups(simhash)
+            if len(similar_ids) > 1:
+                duplicates.append({
+                    'doc_id': doc_id,
+                    'similar_docs': similar_ids
+                })
+
+        return duplicates
+
+    def compare_all_pairs(self):
+        """Compare all document pairs"""
+        comparisons = []
+        n = len(self.documents)
+
+        for i in range(n):
+            for j in range(i + 1, n):
+                comparator = HTMLSimHashComparator(
+                    self.documents[i]['content'],
+                    self.documents[j]['content']
+                )
+                result = comparator.compare_simhash()
+
+                comparisons.append({
+                    'doc1': self.documents[i]['id'],
+                    'doc2': self.documents[j]['id'],
+                    'hamming_distance': result['hamming_distance'],
+                    'similarity': result['similarity']
+                })
+
+        return sorted(comparisons, key=lambda x: x['similarity'], reverse=True)
+
+
+if __name__ == "__main__":
+    # Run the example
+    main()
+
+    # Batch comparison example
+    print("\n=== Batch comparison example ===")
+    batch_comparator = BatchHTMLComparator()
+
+    # Add several documents
+    batch_comparator.add_document("doc1", """
+    <html><body><h1>Python编程</h1><p>学习Python很有趣。</p></body></html>
+    """)
+
+    batch_comparator.add_document("doc2", """
+    <html><body><h1>Python代码</h1><p>编写Python代码很有趣。</p></body></html>
+    """)
+
+    batch_comparator.add_document("doc3", """
+    <html><body><h1>Java编程</h1><p>Java是一种编程语言。</p></body></html>
+    """)
+
+    # Find duplicate documents
+    duplicates = batch_comparator.find_duplicates(k=3)
+    print("Similar documents:")
+    for dup in duplicates:
+        print(f"Document {dup['doc_id']} is similar to {dup['similar_docs']}")
+
+    # Compare all document pairs
+    all_comparisons = batch_comparator.compare_all_pairs()
+    print("\nAll pairwise comparisons:")
+    for comp in all_comparisons:
+        print(f"{comp['doc1']} vs {comp['doc2']}: "
+              f"similarity {comp['similarity']:.2%}")
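The similarity score is just the Hamming distance rescaled: with 64-bit fingerprints a distance d maps to 1 - d/64, so the 0.8 default threshold tolerates at most 12 differing bits (and the 0.7 threshold used by assert_Static_Web at most 19). A quick sanity check against the simhash library:

from simhash import Simhash

# Two small feature sets differing in a single token.
a = Simhash(["python", "编程", "有趣"], f=64)
b = Simhash(["python", "代码", "有趣"], f=64)

d = a.distance(b)      # Hamming distance, 0..64
print(d, 1 - d / 64)   # the similarity that compare_simhash would report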
tools/tools.py
ADDED
@@ -0,0 +1,92 @@
+import htmlmin
+import requests
+from lxml import html, etree
+from bs4 import BeautifulSoup
+from DrissionPage import ChromiumPage, ChromiumOptions
+
+
+# Fetch html with requests; used to test whether Ruishu or jsl is in use
+def requests_html(url):
+    headers = {
+        # "sec-ch-ua": "\"Chromium\";v=\"142\", \"Google Chrome\";v=\"142\", \"Not_A Brand\";v=\"99\"",
+        # "sec-ch-ua-mobile": "?0",
+        # "Upgrade-Insecure-Requests": "1",
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36",
+        # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+        # "Sec-Fetch-Site": "none",
+        # "Sec-Fetch-Mode": "navigate",
+        # "Sec-Fetch-User": "?1",
+        # "Sec-Fetch-Dest": "document",
+        # "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8"
+    }
+    response = requests.get(url, headers=headers, verify=False)
+    print("response headers=> ", response.headers)
+    return response.text, response.status_code
+
+
+# Fetch html with DrissionPage in headless mode; used to test for other WAFs, e.g. the "移动" WAF
+def dp_headless_html(url):
+    opt = ChromiumOptions().headless(True)
+    opt.set_argument('--no-sandbox')
+    page = ChromiumPage(opt)
+    page.get(url)
+    page_html = page.html
+    page.quit()
+    return page_html
+
+
+# Compress html
+def compress_html(content, only_text=False):
+    doc = html.fromstring(content)
+    # Remove style and script tags
+    for element in doc.xpath('//style | //script'):
+        element.getparent().remove(element)
+
+    # Remove stylesheet link tags
+    for link in doc.xpath('//link[@rel="stylesheet"]'):
+        link.getparent().remove(link)
+
+    # Remove meta tags (new in this version)
+    for meta in doc.xpath('//meta'):
+        meta.getparent().remove(meta)
+
+    # Remove style attributes
+    for element in doc.xpath('//*[@style]'):
+        element.attrib.pop('style')
+
+    # Remove all on* event attributes
+    for element in doc.xpath('//*'):
+        for attr in list(element.attrib.keys()):
+            if attr.startswith('on'):
+                element.attrib.pop(attr)
+
+    result = etree.tostring(doc, encoding='unicode')
+    result = htmlmin.minify(result)
+    print(f"html compression ratio=> {len(content) / len(result) * 100:.2f}%")
+    if not only_text:
+        return result
+    soup = BeautifulSoup(result, 'html.parser')
+    result = soup.get_text(strip=True)
+    return result
+
+
+def test(target_url):
+    page_html = dp_headless_html(target_url)
+    print("render_text=>", compress_html(page_html, only_text=True))
+    print("\n")
+    raw_html, status_code = requests_html(target_url)
+    print("raw_html=>", status_code, compress_html(raw_html, only_text=True))
+    print("\n")
+
+
+if __name__ == '__main__':
+    # raw_html, status_code = requests_html("https://www.nxzgh.org.cn/#/newsCenter/index2/2")
+    # raw_html, status_code = requests_html("http://www.ncha.gov.cn/col/col722/index.html")
+    # raw_html, status_code = requests_html("https://scgh.org/page/news/tmore36403294e28a11ea8ba48cec4b967595.html")
+    # raw_html, status_code = requests_html("https://scgh.org/page/news/tmore36403294e28a11ea8ba48cec4b967595.html")
+    # url = "https://www.nmpa.gov.cn/yaowen/ypjgyw/index.html"
+    url = "http://www.customs.gov.cn/customs/xwfb34/302425/index.html"
+    # url = "https://www.acftu.org/xwdt/ghyw/"
+
+    for i in range(20):
+        test(url)
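compress_html does double duty: with only_text=False it produces the stripped, minified markup that get_html writes to disk, and with only_text=True it yields the bare text that feeds the SimHash features. A minimal sketch of the two modes, assuming the function behaves as written above:

from tools.tools import compress_html

sample = (
    '<html><head><style>h1 {color: red}</style></head>'
    '<body onload="init()"><h1 style="margin: 0">Title</h1>'
    '<p>Body text.</p><script>track()</script></body></html>'
)

# Markup with style/script/on* handlers stripped, then minified.
print(compress_html(sample))
# Only the visible text, roughly "TitleBody text."
print(compress_html(sample, only_text=True))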
jarvis_brain-0.1.0.dist-info/RECORD
DELETED
@@ -1,7 +0,0 @@
-mcp_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-mcp_tools/dp_tools.py,sha256=f5jl6BHfuvqVjFplnUEvlzLbK0k0bJnMpR7cVHxWsy8,6017
-mcp_tools/main.py,sha256=mKuVyhAKAAKgbP3YSDVm5qehVNb5dtK8ybDZGQS8Y7U,465
-jarvis_brain-0.1.0.dist-info/METADATA,sha256=jBepatGeCD2ByZojTKYPyrH9Yg_Brtzj8mmbZqreu-c,176
-jarvis_brain-0.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-jarvis_brain-0.1.0.dist-info/entry_points.txt,sha256=YFQT4xpkUqt5dM5wlKPQQOqcjMuFrT9iuRAzIpAyH7U,51
-jarvis_brain-0.1.0.dist-info/RECORD,,
{jarvis_brain-0.1.0.dist-info → jarvis_brain-0.1.2.dist-info}/WHEEL
File without changes

{jarvis_brain-0.1.0.dist-info → jarvis_brain-0.1.2.dist-info}/entry_points.txt
File without changes