Jarvis-Brain 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {jarvis_brain-0.1.2.dist-info → jarvis_brain-0.1.3.dist-info}/METADATA +2 -1
- jarvis_brain-0.1.3.dist-info/RECORD +11 -0
- mcp_tools/dp_tools.py +21 -1
- mcp_tools/requests_tools.py +10 -6
- tools/tools.py +94 -23
- jarvis_brain-0.1.2.dist-info/RECORD +0 -11
- {jarvis_brain-0.1.2.dist-info → jarvis_brain-0.1.3.dist-info}/WHEEL +0 -0
- {jarvis_brain-0.1.2.dist-info → jarvis_brain-0.1.3.dist-info}/entry_points.txt +0 -0
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: Jarvis_Brain
|
|
3
|
-
Version: 0.1.2
|
|
3
|
+
Version: 0.1.3
|
|
4
4
|
Summary: Jarvis brain
|
|
5
5
|
Requires-Python: >=3.10
|
|
6
6
|
Requires-Dist: beautifulsoup4
|
|
7
|
+
Requires-Dist: curl-cffi
|
|
7
8
|
Requires-Dist: drissionpage
|
|
8
9
|
Requires-Dist: fastmcp
|
|
9
10
|
Requires-Dist: htmlmin
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
mcp_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
mcp_tools/dp_tools.py,sha256=jDilSFnOdwGC2453983k6VdGHNu3lDaDcioa-XXhUc0,9003
|
|
3
|
+
mcp_tools/main.py,sha256=lxH8PafR4KVM0-OURgHD0jNaC68kLmPYaNJ1wSKfNdY,654
|
|
4
|
+
mcp_tools/requests_tools.py,sha256=2rP4NBYaGTJI9-Txn2vaoPH5A2fWM05RNNjiSCHvbtk,1373
|
|
5
|
+
tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
+
tools/simhash_tools.py,sha256=eRxUqcWJiNMNZPpVRBk1kI7OYSK_vThoVg3jctFfmhY,7368
|
|
7
|
+
tools/tools.py,sha256=AP5lw65o2VYQ0Q6hEvCRA7m9WD6e7G8j4KBkdheLNmU,6913
|
|
8
|
+
jarvis_brain-0.1.3.dist-info/METADATA,sha256=wVy9AR3LB40xokpRF5U9MrsZxCJJ0DuFSEW4CUJ3ox8,275
|
|
9
|
+
jarvis_brain-0.1.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
10
|
+
jarvis_brain-0.1.3.dist-info/entry_points.txt,sha256=YFQT4xpkUqt5dM5wlKPQQOqcjMuFrT9iuRAzIpAyH7U,51
|
|
11
|
+
jarvis_brain-0.1.3.dist-info/RECORD,,
|
mcp_tools/dp_tools.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import os
|
|
3
3
|
import random
|
|
4
|
-
from pyexpat.errors import messages
|
|
5
4
|
from typing import Any
|
|
6
5
|
|
|
6
|
+
from urllib.parse import urlparse
|
|
7
7
|
from DrissionPage import ChromiumPage, ChromiumOptions
|
|
8
8
|
from DrissionPage._elements.none_element import NoneElement
|
|
9
9
|
from fastmcp import FastMCP
|
|
@@ -209,3 +209,23 @@ def register_assert_Static_Web(mcp: FastMCP):
|
|
|
209
209
|
}, ensure_ascii=False)
|
|
210
210
|
}]
|
|
211
211
|
}
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def register_get_cookies(mcp: FastMCP):
    """Register the ``get_cookies`` tool on the given FastMCP server.

    Args:
        mcp: The FastMCP server instance the tool is attached to.
    """

    @mcp.tool(name="get_cookies")
    def get_cookies(browser_port: int, tab_id: str) -> dict[str, Any]:
        """Return the cookies of a browser tab together with the tab's domain.

        Args:
            browser_port: Port passed to ``get_page`` to obtain the browser.
            tab_id: Identifier of the tab whose cookies are read.

        Returns:
            An MCP-style payload whose single text item is a JSON document
            with the keys ``domain`` and ``cookies``.
        """
        _browser = get_page(browser_port)
        tab = _browser.get_tab(tab_id)
        # The network location of the tab's current URL tells the caller
        # which site the cookies belong to.
        domain = urlparse(tab.url).netloc
        # NOTE(review): tab.cookies() is assumed to yield JSON-serialisable
        # cookie dicts (DrissionPage CookiesList) — confirm against the
        # installed DrissionPage version.
        cookies = list(tab.cookies())
        # Previously the JSON body was empty, so the values computed above
        # never reached the caller.
        return {
            "content": [{
                "type": "text",
                "text": json.dumps({
                    "domain": domain,
                    "cookies": cookies,
                }, ensure_ascii=False)
            }]
        }
|
mcp_tools/requests_tools.py
CHANGED
|
@@ -1,19 +1,26 @@
|
|
|
1
1
|
import json
|
|
2
2
|
|
|
3
3
|
from fastmcp import FastMCP
|
|
4
|
-
from tools.tools import requests_html
|
|
5
|
-
from DrissionPage import ChromiumPage, ChromiumOptions
|
|
4
|
+
from tools.tools import requests_html, dp_headless_html
|
|
6
5
|
|
|
7
6
|
|
|
8
7
|
def register_assert_waf(mcp: FastMCP):
|
|
9
8
|
@mcp.tool(name="assert_waf", description="判断传入的url对应的网页是否存在瑞数、jsl等风控防火墙")
|
|
10
9
|
def assert_waf(url: str):
|
|
10
|
+
# 先试用requests判断瑞数和jsl
|
|
11
11
|
text, code = requests_html(url)
|
|
12
12
|
waf_text_type = {
|
|
13
13
|
521: "jsl",
|
|
14
|
-
412: "瑞数"
|
|
14
|
+
412: "瑞数",
|
|
15
|
+
"触发WAF": "其他waf"
|
|
15
16
|
}
|
|
16
17
|
has_waf = code in waf_text_type.keys()
|
|
18
|
+
if not has_waf:
|
|
19
|
+
# 不是瑞数和jsl,使用无头浏览器判断是否为其他waf
|
|
20
|
+
headless_html = dp_headless_html(url)
|
|
21
|
+
if "触发WAF" in headless_html:
|
|
22
|
+
has_waf = True
|
|
23
|
+
code = "触发WAF"
|
|
17
24
|
if not has_waf:
|
|
18
25
|
waf_type = "不存在waf"
|
|
19
26
|
else:
|
|
@@ -31,6 +38,3 @@ def register_assert_waf(mcp: FastMCP):
|
|
|
31
38
|
)
|
|
32
39
|
}]
|
|
33
40
|
}
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
tools/tools.py
CHANGED
|
@@ -1,26 +1,33 @@
|
|
|
1
|
+
import time
|
|
2
|
+
|
|
1
3
|
import htmlmin
|
|
2
|
-
import requests
|
|
4
|
+
from curl_cffi import requests
|
|
3
5
|
from lxml import html, etree
|
|
4
6
|
from bs4 import BeautifulSoup
|
|
5
7
|
from DrissionPage import ChromiumPage, ChromiumOptions
|
|
6
8
|
|
|
7
9
|
|
|
10
|
+
# 传入requests的set-cookie的str,返回一个cookie dict
|
|
11
|
+
def cookie_str2dict(cookie_str: str):
    """Convert a combined ``Set-Cookie`` header string into a name/value dict.

    Only the leading ``name=value`` pair of every cookie is kept; attributes
    such as ``Path``, ``Expires`` or ``HttpOnly`` are discarded.

    Args:
        cookie_str: The ``Set-Cookie`` header value, with several cookies
            joined by commas. May be empty.

    Returns:
        A dict mapping cookie names to cookie values; empty when
        ``cookie_str`` is empty.
    """
    import re  # local import keeps this function self-contained

    if not cookie_str:
        return {}
    cookie_dict = {}
    # Split only on commas that are followed by a new "name=" token, so the
    # comma inside an Expires date ("Wed, 21 Oct 2015 ...") is left alone and
    # cookies without the HttpOnly flag are still separated correctly.
    for cookie in re.split(r",\s*(?=[^;,=\s]+=)", cookie_str):
        # partition() keeps any "=" inside the value (e.g. base64 padding),
        # where a plain split("=") would raise on unpacking.
        name, sep, value = cookie.split(";")[0].partition("=")
        name = name.strip()
        if not sep or not name:
            # Not a name=value pair (e.g. a stray fragment); skip it.
            continue
        cookie_dict[name] = value.strip()
    return cookie_dict
|
|
20
|
+
|
|
21
|
+
|
|
8
22
|
# 使用requests获取html,用于测试是否使用了瑞数和jsl
|
|
9
23
|
def requests_html(url):
    """Fetch ``url`` with a plain HTTP GET and return ``(text, status_code)``.

    The request sends only a desktop Chrome User-Agent header, skips TLS
    certificate verification, and decodes the body as UTF-8.
    """
    user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36"
    resp = requests.get(url, headers={"User-Agent": user_agent}, verify=False)
    # Force UTF-8 decoding instead of relying on the detected encoding.
    resp.encoding = "utf-8"
    return resp.text, resp.status_code
|
|
25
32
|
|
|
26
33
|
|
|
@@ -28,9 +35,42 @@ def requests_html(url):
|
|
|
28
35
|
def dp_headless_html(url, wait_seconds=10, port=9976):
    """Render ``url`` in a headless Chromium instance and return its HTML.

    Args:
        url: Page to load.
        wait_seconds: Fixed delay, in seconds, between navigation and reading
            the HTML. Defaults to the previously hard-coded 10 seconds.
        port: Local debugging port for the browser. Defaults to the
            previously hard-coded 9976.

    Returns:
        The HTML of the tab after the wait.
    """
    opt = ChromiumOptions().headless(True)
    opt.set_argument('--no-sandbox')
    opt.set_local_port(port)

    page = ChromiumPage(opt)
    try:
        tab = page.latest_tab
        tab.get(url)
        # A fixed sleep is used rather than a load-event wait; the original
        # code had the wait-based alternatives commented out.
        time.sleep(wait_seconds)
        return tab.html
    finally:
        # Always shut the browser down, even when navigation fails, so no
        # Chromium process is left running on the port.
        page.quit()
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
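# Fetch HTML with DrissionPage in headed (non-headless) mode; used to test for other WAFs, such as the "mobile" (移动) WAF, that detect headless browsers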
|
|
57
|
+
def dp_html(url, port=9223):
    """Render ``url`` in a headed Chromium instance and return its HTML.

    Args:
        url: Page to load.
        port: Local debugging port for the browser. Defaults to the
            previously hard-coded 9223.

    Returns:
        The HTML of the tab once the document has finished loading.
    """
    opt = ChromiumOptions()
    opt.set_local_port(port)

    page = ChromiumPage(opt)
    try:
        tab = page.latest_tab
        tab.get(url)
        # Wait for the document load to complete before reading the HTML.
        tab.wait.doc_loaded()
        return tab.html
    finally:
        # Always shut the browser down, even when navigation fails, so no
        # Chromium process is left running on the port.
        page.quit()
|
|
36
76
|
|
|
@@ -71,12 +111,26 @@ def compress_html(content, only_text=False):
|
|
|
71
111
|
|
|
72
112
|
|
|
73
113
|
def test(target_url):
    """Print the text-only compressed HTML of ``target_url`` from three fetchers.

    The page is fetched, in order, with plain HTTP, a headed browser and a
    headless browser, and each result is printed for manual comparison.
    """
    # 1. Plain HTTP request.
    body, status_code = requests_html(target_url)
    print("raw_html=>", status_code, compress_html(body, only_text=True))

    # 2. Headed browser.
    headed_text = compress_html(dp_html(target_url), only_text=True)
    print("render_text_head=>", headed_text)

    # 3. Headless browser.
    headless_text = compress_html(dp_headless_html(target_url), only_text=True)
    print("render_text=>", headless_text)
|
|
127
|
+
|
|
128
|
+
# print("\n")
|
|
129
|
+
# raw_html, status_code = requests_html(target_url)
|
|
130
|
+
# raw_html = compress_html(raw_html, only_text=True)
|
|
131
|
+
# # print("raw_html=>", status_code, compress_html(raw_html))
|
|
132
|
+
# print("raw_html=>", status_code, raw_html)
|
|
133
|
+
# print("\n")
|
|
80
134
|
|
|
81
135
|
|
|
82
136
|
if __name__ == '__main__':
|
|
@@ -85,8 +139,25 @@ if __name__ == '__main__':
|
|
|
85
139
|
# raw_html, status_code = requests_html("https://scgh.org/page/news/tmore36403294e28a11ea8ba48cec4b967595.html")
|
|
86
140
|
# raw_html, status_code = requests_html("https://scgh.org/page/news/tmore36403294e28a11ea8ba48cec4b967595.html")
|
|
87
141
|
# url = "https://www.nmpa.gov.cn/yaowen/ypjgyw/index.html"
|
|
88
|
-
url = "http://www.customs.gov.cn/customs/xwfb34/302425/index.html"
|
|
89
|
-
# url = "https://
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
142
|
+
# url = "http://www.customs.gov.cn/customs/xwfb34/302425/index.html"
|
|
143
|
+
# url = "https://scgh.org/page/news/tmore36403294e28a11ea8ba48cec4b967595.html"
|
|
144
|
+
# url = "https://www.gjxfj.gov.cn/gjxfj/news/ttxw.htm"
|
|
145
|
+
# url = "https://www.gsgh.org.cn/#/moreNews_?position=%E7%9C%81%E6%80%BB%E6%96%B0%E9%97%BB&categoryId=502"
|
|
146
|
+
url = "https://www.acftu.org/xwdt/ghyw/"
|
|
147
|
+
# url = "https://www.nxzgh.org.cn/#/newsCenter/index2/2" # 移动waf,会检测浏览器无头
|
|
148
|
+
# url = "https://www.chengdu.gov.cn/cdsrmzf/zfxx/cjj.shtml" # 超严格瑞数6,怀疑他会检测端口,我使用9222端口无论如何都获取不到结果
|
|
149
|
+
# url = "https://www.jsgh.org/col/col3577/index.html?uid=18462&pageNum=1"
|
|
150
|
+
# render_html = dp_headless_html(url)
|
|
151
|
+
# print("\n")
|
|
152
|
+
# print("render_html=>", render_html)
|
|
153
|
+
# for i in range(20):
|
|
154
|
+
test(url)
|
|
155
|
+
# todo: 大致盘一下各种判定的逻辑【以下的所有压缩比之间的差距均取“绝对值”】
|
|
156
|
+
# 1. 如果requests、无头、有头获取到的压缩比之间从差距都在15%以内,则认定该页面是静态页面,此时优先使用requests请求
|
|
157
|
+
# 2. 如果requests的status_code为特定的412,或者521,则判定是瑞数和jsl。[此时还有一个特点:requests的压缩比会与其他两种方式获取到的压缩比差距非常大(一两千的那种)]
|
|
158
|
+
# 3. 如果requests、无头、有头获取到的压缩比之间差距都在40%以上,则判定该页面只可以用有头采集
|
|
159
|
+
# 4. 如果无头和有头获取到的压缩比之间差距小于15%,但是requests和无头的差距大于40%,则认定该页面可以使用无头浏览器采集
|
|
160
|
+
# 5. 如果requests和有头获取到的压缩比之间差距小于15%,但是无头和有头的差距大于40%,则认定该页面优先使用有头浏览器采集
|
|
161
|
+
# 【此时可能是:1.使用了别的检测无头的waf。2.网站使用瑞数,但是这次请求没有拦截requests(不知道是不是瑞数那边故意设置的),
|
|
162
|
+
# 此时如果想进一步判定是否是瑞数,可以使用有头浏览器取一下cookies,如果cookies里面存在瑞数的cookie,那么就可以断定是瑞数】
|
|
163
|
+
# 6.
|
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
mcp_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
mcp_tools/dp_tools.py,sha256=xtpJoRmpqZQRjfVUEIjo3KAd5_SqK01J1VAt8U79xbA,8448
|
|
3
|
-
mcp_tools/main.py,sha256=lxH8PafR4KVM0-OURgHD0jNaC68kLmPYaNJ1wSKfNdY,654
|
|
4
|
-
mcp_tools/requests_tools.py,sha256=K1eHbvhzWhLzn9AZhjDGTmDu_yipvkEM9IOhveDMsPM,1063
|
|
5
|
-
tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
-
tools/simhash_tools.py,sha256=eRxUqcWJiNMNZPpVRBk1kI7OYSK_vThoVg3jctFfmhY,7368
|
|
7
|
-
tools/tools.py,sha256=lPlXdVA4Dx3irP6alMOgmE-HxxuN_y4XQ3Q-fZyV-OU,3482
|
|
8
|
-
jarvis_brain-0.1.2.dist-info/METADATA,sha256=Otgmq6QJe4XoWiIhC1i6jBpQsgWhraoPpi8v8Su7SYw,250
|
|
9
|
-
jarvis_brain-0.1.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
10
|
-
jarvis_brain-0.1.2.dist-info/entry_points.txt,sha256=YFQT4xpkUqt5dM5wlKPQQOqcjMuFrT9iuRAzIpAyH7U,51
|
|
11
|
-
jarvis_brain-0.1.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|