Jarvis-Brain 0.1.2__tar.gz → 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,8 +13,8 @@
13
13
  <component name="FileTemplateManagerImpl">
14
14
  <option name="RECENT_TEMPLATES">
15
15
  <list>
16
- <option value="HTML File" />
17
16
  <option value="Python Script" />
17
+ <option value="HTML File" />
18
18
  </list>
19
19
  </option>
20
20
  </component>
@@ -26,24 +26,25 @@
26
26
  <option name="hideEmptyMiddlePackages" value="true" />
27
27
  <option name="showLibraryContents" value="true" />
28
28
  </component>
29
- <component name="PropertiesComponent">{
30
- &quot;keyToString&quot;: {
31
- &quot;DefaultHtmlFileTemplate&quot;: &quot;HTML File&quot;,
32
- &quot;ModuleVcsDetector.initialDetectionPerformed&quot;: &quot;true&quot;,
33
- &quot;Python.1.executor&quot;: &quot;Run&quot;,
34
- &quot;Python.2.executor&quot;: &quot;Run&quot;,
35
- &quot;Python.dp_tools.executor&quot;: &quot;Run&quot;,
36
- &quot;Python.main.executor&quot;: &quot;Run&quot;,
37
- &quot;Python.requests_tools.executor&quot;: &quot;Run&quot;,
38
- &quot;Python.simhash.executor&quot;: &quot;Run&quot;,
39
- &quot;Python.simhash_tools.executor&quot;: &quot;Run&quot;,
40
- &quot;Python.tools.executor&quot;: &quot;Run&quot;,
41
- &quot;RunOnceActivity.ShowReadmeOnStart&quot;: &quot;true&quot;,
42
- &quot;RunOnceActivity.TerminalTabsStorage.copyFrom.TerminalArrangementManager.252&quot;: &quot;true&quot;,
43
- &quot;last_opened_file_path&quot;: &quot;/Users/user/PycharmProjects/JARVIS&quot;,
44
- &quot;settings.editor.selected.configurable&quot;: &quot;com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable&quot;
29
+ <component name="PropertiesComponent"><![CDATA[{
30
+ "keyToString": {
31
+ "DefaultHtmlFileTemplate": "HTML File",
32
+ "ModuleVcsDetector.initialDetectionPerformed": "true",
33
+ "Python.1.executor": "Run",
34
+ "Python.2.executor": "Run",
35
+ "Python.3.executor": "Run",
36
+ "Python.dp_tools.executor": "Run",
37
+ "Python.main.executor": "Run",
38
+ "Python.requests_tools.executor": "Run",
39
+ "Python.simhash.executor": "Run",
40
+ "Python.simhash_tools.executor": "Run",
41
+ "Python.tools.executor": "Run",
42
+ "RunOnceActivity.ShowReadmeOnStart": "true",
43
+ "RunOnceActivity.TerminalTabsStorage.copyFrom.TerminalArrangementManager.252": "true",
44
+ "last_opened_file_path": "/Users/user/PycharmProjects/JARVIS",
45
+ "settings.editor.selected.configurable": "com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable"
45
46
  }
46
- }</component>
47
+ }]]></component>
47
48
  <component name="RecentsManager">
48
49
  <key name="CopyFile.RECENT_KEYS">
49
50
  <recent name="$PROJECT_DIR$" />
@@ -99,7 +100,7 @@
99
100
  <option name="INPUT_FILE" value="" />
100
101
  <method v="2" />
101
102
  </configuration>
102
- <configuration name="dp_tools" type="PythonConfigurationType" factoryName="Python" nameIsGenerated="true">
103
+ <configuration name="3" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
103
104
  <module name="JARVIS" />
104
105
  <option name="ENV_FILES" value="" />
105
106
  <option name="INTERPRETER_OPTIONS" value="" />
@@ -108,11 +109,11 @@
108
109
  <env name="PYTHONUNBUFFERED" value="1" />
109
110
  </envs>
110
111
  <option name="SDK_HOME" value="" />
111
- <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
112
+ <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/task_router" />
112
113
  <option name="IS_MODULE_SDK" value="true" />
113
114
  <option name="ADD_CONTENT_ROOTS" value="true" />
114
115
  <option name="ADD_SOURCE_ROOTS" value="true" />
115
- <option name="SCRIPT_NAME" value="$PROJECT_DIR$/src/mcp_tools/dp_tools.py" />
116
+ <option name="SCRIPT_NAME" value="$PROJECT_DIR$/task_router/3.py" />
116
117
  <option name="PARAMETERS" value="" />
117
118
  <option name="SHOW_COMMAND_LINE" value="false" />
118
119
  <option name="EMULATE_TERMINAL" value="false" />
@@ -121,7 +122,7 @@
121
122
  <option name="INPUT_FILE" value="" />
122
123
  <method v="2" />
123
124
  </configuration>
124
- <configuration name="requests_tools" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
125
+ <configuration name="dp_tools" type="PythonConfigurationType" factoryName="Python" nameIsGenerated="true">
125
126
  <module name="JARVIS" />
126
127
  <option name="ENV_FILES" value="" />
127
128
  <option name="INTERPRETER_OPTIONS" value="" />
@@ -130,11 +131,11 @@
130
131
  <env name="PYTHONUNBUFFERED" value="1" />
131
132
  </envs>
132
133
  <option name="SDK_HOME" value="" />
133
- <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/mcp_tools" />
134
+ <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
134
135
  <option name="IS_MODULE_SDK" value="true" />
135
136
  <option name="ADD_CONTENT_ROOTS" value="true" />
136
137
  <option name="ADD_SOURCE_ROOTS" value="true" />
137
- <option name="SCRIPT_NAME" value="$PROJECT_DIR$/mcp_tools/requests_tools.py" />
138
+ <option name="SCRIPT_NAME" value="$PROJECT_DIR$/src/mcp_tools/dp_tools.py" />
138
139
  <option name="PARAMETERS" value="" />
139
140
  <option name="SHOW_COMMAND_LINE" value="false" />
140
141
  <option name="EMULATE_TERMINAL" value="false" />
@@ -143,7 +144,7 @@
143
144
  <option name="INPUT_FILE" value="" />
144
145
  <method v="2" />
145
146
  </configuration>
146
- <configuration name="simhash_tools" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
147
+ <configuration name="requests_tools" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
147
148
  <module name="JARVIS" />
148
149
  <option name="ENV_FILES" value="" />
149
150
  <option name="INTERPRETER_OPTIONS" value="" />
@@ -152,11 +153,11 @@
152
153
  <env name="PYTHONUNBUFFERED" value="1" />
153
154
  </envs>
154
155
  <option name="SDK_HOME" value="" />
155
- <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/tools" />
156
+ <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/mcp_tools" />
156
157
  <option name="IS_MODULE_SDK" value="true" />
157
158
  <option name="ADD_CONTENT_ROOTS" value="true" />
158
159
  <option name="ADD_SOURCE_ROOTS" value="true" />
159
- <option name="SCRIPT_NAME" value="$PROJECT_DIR$/tools/simhash_tools.py" />
160
+ <option name="SCRIPT_NAME" value="$PROJECT_DIR$/mcp_tools/requests_tools.py" />
160
161
  <option name="PARAMETERS" value="" />
161
162
  <option name="SHOW_COMMAND_LINE" value="false" />
162
163
  <option name="EMULATE_TERMINAL" value="false" />
@@ -191,9 +192,9 @@
191
192
  <list>
192
193
  <item itemvalue="Python.1" />
193
194
  <item itemvalue="Python.tools" />
195
+ <item itemvalue="Python.3" />
194
196
  <item itemvalue="Python.2" />
195
197
  <item itemvalue="Python.requests_tools" />
196
- <item itemvalue="Python.simhash_tools" />
197
198
  </list>
198
199
  </recent_temporary>
199
200
  </component>
@@ -214,4 +215,15 @@
214
215
  </task>
215
216
  <servers />
216
217
  </component>
218
+ <component name="XDebuggerManager">
219
+ <breakpoint-manager>
220
+ <breakpoints>
221
+ <line-breakpoint enabled="true" suspend="THREAD" type="python-line">
222
+ <url>file://$PROJECT_DIR$/tools/tools.py</url>
223
+ <line>16</line>
224
+ <option name="timeStamp" value="1" />
225
+ </line-breakpoint>
226
+ </breakpoints>
227
+ </breakpoint-manager>
228
+ </component>
217
229
  </project>
@@ -1,9 +1,10 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: Jarvis_Brain
3
- Version: 0.1.2
3
+ Version: 0.1.3
4
4
  Summary: Jarvis brain
5
5
  Requires-Python: >=3.10
6
6
  Requires-Dist: beautifulsoup4
7
+ Requires-Dist: curl-cffi
7
8
  Requires-Dist: drissionpage
8
9
  Requires-Dist: fastmcp
9
10
  Requires-Dist: htmlmin
@@ -1,9 +1,9 @@
1
1
  import json
2
2
  import os
3
3
  import random
4
- from pyexpat.errors import messages
5
4
  from typing import Any
6
5
 
6
+ from urllib.parse import urlparse
7
7
  from DrissionPage import ChromiumPage, ChromiumOptions
8
8
  from DrissionPage._elements.none_element import NoneElement
9
9
  from fastmcp import FastMCP
@@ -209,3 +209,23 @@ def register_assert_Static_Web(mcp: FastMCP):
209
209
  }, ensure_ascii=False)
210
210
  }]
211
211
  }
212
+
213
+
214
def register_get_cookies(mcp: FastMCP):
    """Register the ``get_cookies`` tool on the given FastMCP server.

    The tool looks up the browser attached to ``browser_port``, selects the
    tab identified by ``tab_id`` and reports that tab's cookies together with
    the host (netloc) of the tab's current URL.
    """

    @mcp.tool(name="get_cookies")
    def get_cookies(browser_port: int, tab_id: str) -> dict[str, Any]:
        """Return the cookies of the given tab as an MCP text payload.

        Args:
            browser_port: Port passed to ``get_page`` to obtain the browser.
            tab_id: Identifier of the tab whose cookies are read.

        Returns:
            A dict with a single ``content`` entry of type ``text`` whose
            ``text`` is a JSON document with the keys ``domain`` and
            ``cookies``.
        """
        _browser = get_page(browser_port)
        tab = _browser.get_tab(tab_id)
        domain = urlparse(tab.url).netloc
        # NOTE(review): assumes tab.cookies() yields an iterable of plain
        # cookie dicts (JSON-serializable) -- confirm against the installed
        # DrissionPage version.
        cookies = list(tab.cookies())
        # The previous version serialized an empty object, so the cookies
        # that were just read never reached the caller.
        return {
            "content": [{
                "type": "text",
                "text": json.dumps({
                    "domain": domain,
                    "cookies": cookies,
                }, ensure_ascii=False)
            }]
        }
@@ -1,19 +1,26 @@
1
1
  import json
2
2
 
3
3
  from fastmcp import FastMCP
4
- from tools.tools import requests_html
5
- from DrissionPage import ChromiumPage, ChromiumOptions
4
+ from tools.tools import requests_html, dp_headless_html
6
5
 
7
6
 
8
7
  def register_assert_waf(mcp: FastMCP):
9
8
  @mcp.tool(name="assert_waf", description="判断传入的url对应的网页是否存在瑞数、jsl等风控防火墙")
10
9
  def assert_waf(url: str):
10
+ # 先试用requests判断瑞数和jsl
11
11
  text, code = requests_html(url)
12
12
  waf_text_type = {
13
13
  521: "jsl",
14
- 412: "瑞数"
14
+ 412: "瑞数",
15
+ "触发WAF": "其他waf"
15
16
  }
16
17
  has_waf = code in waf_text_type.keys()
18
+ if not has_waf:
19
+ # 不是瑞数和jsl,使用无头浏览器判断是否为其他waf
20
+ headless_html = dp_headless_html(url)
21
+ if "触发WAF" in headless_html:
22
+ has_waf = True
23
+ code = "触发WAF"
17
24
  if not has_waf:
18
25
  waf_type = "不存在waf"
19
26
  else:
@@ -31,6 +38,3 @@ def register_assert_waf(mcp: FastMCP):
31
38
  )
32
39
  }]
33
40
  }
34
-
35
-
36
-
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "Jarvis_Brain" # 别人下载时用的名字,必须在 PyPI 上唯一
3
- version = "0.1.2"
3
+ version = "0.1.3"
4
4
  description = "Jarvis brain"
5
5
  dependencies = [
6
6
  "fastmcp",
@@ -9,7 +9,7 @@ dependencies = [
9
9
  "jieba",
10
10
  "simhash",
11
11
  "beautifulsoup4",
12
- # 在这里列出 dp_tools.py 需要的其他库,例如 "pandas", "requests"
12
+ "curl_cffi"
13
13
  ]
14
14
  requires-python = ">=3.10"
15
15
 
@@ -51,41 +51,64 @@ options = ClaudeAgentOptions(
51
51
  permission_mode='bypassPermissions',
52
52
  cwd=base_cwd,
53
53
  mcp_servers={
54
- "team-node-dp": {
55
- "command": "uv",
56
- "args": [
57
- "run",
58
- "--directory",
59
- "/Users/user/PycharmProjects/JARVIS/mcp_tools",
60
- "main.py"
61
- ],
54
+ "jarvis": {
55
+ "command": "uvx",
56
+ "args": ["--from", "jarvis_brain", "jarvis-mcp"],
62
57
  "env": {
63
58
  "MCP_MODULES": "TeamNode-Dp"
64
59
  }
65
60
  },
66
- "jarvis-node": {
67
- "command": "uv",
68
- "args": [
69
- "run",
70
- "--directory",
71
- "/Users/user/PycharmProjects/JARVIS/mcp_tools",
72
- "main.py"
73
- ],
74
- "env": {
75
- "MCP_MODULES": "JarvisNode"
76
- }
77
- },
61
+ # "team-node-dp": {
62
+ # "command": "uv",
63
+ # "args": [
64
+ # "run",
65
+ # "--directory",
66
+ # "/Users/user/PycharmProjects/JARVIS/mcp_tools",
67
+ # "main.py"
68
+ # ],
69
+ # "env": {
70
+ # "MCP_MODULES": "TeamNode-Dp"
71
+ # }
72
+ # },
73
+ # "jarvis-node": {
74
+ # "command": "uv",
75
+ # "args": [
76
+ # "run",
77
+ # "--directory",
78
+ # "/Users/user/PycharmProjects/JARVIS/mcp_tools",
79
+ # "main.py"
80
+ # ],
81
+ # "env": {
82
+ # "MCP_MODULES": "JarvisNode"
83
+ # }
84
+ # },
78
85
  },
79
86
  # setting_sources=["project"],
87
+ # allowed_tools=[
88
+ # "mcp__team-node-dp__get_html",
89
+ # "mcp__team-node-dp__visit_url",
90
+ # "mcp__team-node-dp__get_new_tab",
91
+ # "mcp__team-node-dp__switch_tab",
92
+ # "mcp__team-node-dp__close_tab",
93
+ # "mcp__team-node-dp__check_selector",
94
+ # "mcp__jarvis-node__assert_Static_Web", # 判断当前页面是否为静态页面
95
+ # "mcp__jarvis-node__assert_waf", # 判断传入的url是否使用了瑞数,jsl等防火墙
96
+ # 'Read',
97
+ # 'Write',
98
+ # 'Edit',
99
+ # 'MultiEdit',
100
+ # 'Grep',
101
+ # 'Glob',
102
+ # 'TodoWrite'
103
+ # ]
80
104
  allowed_tools=[
81
- "mcp__team-node-dp__get_html",
82
- "mcp__team-node-dp__visit_url",
83
- "mcp__team-node-dp__get_new_tab",
84
- "mcp__team-node-dp__switch_tab",
85
- "mcp__team-node-dp__close_tab",
86
- "mcp__team-node-dp__check_selector",
87
- "mcp__jarvis-node__assert_Static_Web", # 判断当前页面是否为静态页面
88
- "mcp__jarvis-node__assert_waf", # 判断传入的url是否使用了瑞数,jsl等防火墙
105
+ "mcp__jarvis__get_html",
106
+ "mcp__jarvis__visit_url",
107
+ "mcp__jarvis__get_new_tab",
108
+ "mcp__jarvis__switch_tab",
109
+ "mcp__jarvis__close_tab",
110
+ "mcp__jarvis__check_selector",
111
+ "mcp__jarvis__check_selector",
89
112
  'Read',
90
113
  'Write',
91
114
  'Edit',
@@ -136,7 +159,7 @@ async def main():
136
159
  tasks.append((run, (), { # 关键字参数字典
137
160
  "url": value,
138
161
  }))
139
- manager = SimpleTaskManager(max_concurrent=1)
162
+ manager = SimpleTaskManager(max_concurrent=5)
140
163
  results = await manager.process_all(tasks)
141
164
  # 输出结果
142
165
  print("\n任务结果摘要:")
@@ -0,0 +1,163 @@
1
+ import time
2
+
3
+ import htmlmin
4
+ from curl_cffi import requests
5
+ from lxml import html, etree
6
+ from bs4 import BeautifulSoup
7
+ from DrissionPage import ChromiumPage, ChromiumOptions
8
+
9
+
10
# Convert the combined Set-Cookie header string returned by requests into a
# {name: value} cookie dict.
def cookie_str2dict(cookie_str: str) -> dict[str, str]:
    """Parse a (possibly comma-joined) ``Set-Cookie`` header into a dict.

    Only the leading ``name=value`` pair of each cookie is kept; attributes
    such as ``Path``, ``Expires`` or ``HttpOnly`` are ignored.

    Args:
        cookie_str: Raw header text. Several cookies may be joined by commas.
            An empty string or ``None`` yields an empty dict.

    Returns:
        Mapping of cookie name to cookie value, with surrounding whitespace
        stripped from both.
    """
    import re  # local import: this module does not import ``re`` at file level

    if not cookie_str:
        return {}

    cookie_dict = {}
    # Split only on commas that are directly followed by a new ``name=``
    # token. This keeps commas inside ``Expires=Wed, 21 Oct ...`` intact and
    # no longer requires every cookie to end with ``HttpOnly``.
    for chunk in re.split(r",\s*(?=[^;,=\s]+=)", cookie_str):
        pair = chunk.split(";", 1)[0].strip()
        # partition() splits on the first "=" only, so values that contain
        # "=" themselves (e.g. base64 padding) stay intact.
        key, sep, value = pair.partition("=")
        key = key.strip()
        if not sep or not key:
            # Empty or attribute-only segment: nothing to record.
            continue
        cookie_dict[key] = value.strip()
    return cookie_dict
20
+
21
+
22
# Fetch the page over plain HTTP (no browser); used to probe for the Ruishu
# and jsl anti-bot layers.
def requests_html(url):
    """Download ``url`` with a desktop Chrome User-Agent.

    TLS verification is disabled and the body is decoded as UTF-8.

    Returns:
        A ``(text, status_code)`` tuple.
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36",
    }
    resp = requests.get(url, headers=request_headers, verify=False)
    # Force UTF-8 before reading ``text``.
    resp.encoding = "utf-8"
    body, status = resp.text, resp.status_code
    return body, status
32
+
33
+
34
# Fetch the rendered HTML with a headless DrissionPage browser; used to probe
# for WAFs other than Ruishu/jsl (e.g. the "mobile" WAF).
def dp_headless_html(url, port=9976, wait_seconds=10):
    """Load ``url`` in a headless Chromium instance and return its HTML.

    Args:
        url: Page to open.
        port: Local debugging port for the browser. Defaults to the
            previously hard-coded 9976; pass a different port per call when
            running several probes concurrently.
        wait_seconds: Fixed delay after navigation before the HTML is read.
            Defaults to the previous 10 seconds.

    Returns:
        The HTML of the latest tab after the wait.
    """
    opt = ChromiumOptions().headless(True)
    opt.set_argument('--no-sandbox')
    opt.set_local_port(port)

    page = ChromiumPage(opt)
    try:
        tab = page.latest_tab
        tab.get(url)
        time.sleep(wait_seconds)
        return tab.html
    finally:
        # Always shut the browser down, even when navigation fails, so no
        # Chromium process is left holding the port.
        page.quit()
54
+
55
+
56
# Fetch the rendered HTML with a regular (headed) DrissionPage browser; the
# counterpart of dp_headless_html for sites that detect headless mode.
def dp_html(url, port=9223):
    """Load ``url`` in a Chromium instance without the headless flag.

    Args:
        url: Page to open.
        port: Local debugging port for the browser. Defaults to the
            previously hard-coded 9223.

    Returns:
        The HTML of the latest tab once the document has finished loading.
    """
    opt = ChromiumOptions()
    opt.set_local_port(port)

    page = ChromiumPage(opt)
    try:
        tab = page.latest_tab
        tab.get(url)
        tab.wait.doc_loaded()
        return tab.html
    finally:
        # Always shut the browser down, even when navigation fails, so no
        # Chromium process is left holding the port.
        page.quit()
76
+
77
+
78
# Compress HTML: strip non-content markup and minify what is left.
def compress_html(content, only_text=False):
    """Strip styling/scripting noise from ``content`` and minify it.

    Removes ``<style>``, ``<script>``, stylesheet ``<link>`` and ``<meta>``
    elements, plus every ``style`` attribute and ``on*`` event attribute,
    then minifies the serialized document with ``htmlmin``. The ratio
    ``len(content) / len(result)`` is printed as a percentage.

    Args:
        content: Raw HTML. Empty or whitespace-only input returns ``""``
            instead of raising a parser error.
        only_text: When true, return only the visible text of the minified
            document instead of its markup.

    Returns:
        The minified HTML, or its text content when ``only_text`` is true.
    """
    if not content or not content.strip():
        return ""

    doc = html.fromstring(content)

    # drop_tree() keeps the element's tail text (the text that follows the
    # closing tag), whereas parent.remove(element) silently discards it.
    for node in doc.xpath('//style | //script | //link[@rel="stylesheet"] | //meta'):
        if node.getparent() is not None:  # the root itself cannot be dropped
            node.drop_tree()

    # Remove inline styles and all on* event handlers in a single pass.
    for element in doc.xpath('//*'):
        for attr in list(element.attrib.keys()):
            if attr == 'style' or attr.startswith('on'):
                del element.attrib[attr]

    result = etree.tostring(doc, encoding='unicode')
    result = htmlmin.minify(result)
    if result:  # avoid dividing by zero when nothing is left
        print(f"html压缩比=> {len(content) / len(result) * 100:.2f}%")
    if not only_text:
        return result
    soup = BeautifulSoup(result, 'html.parser')
    return soup.get_text(strip=True)
111
+
112
+
113
def test(target_url):
    """Fetch ``target_url`` three ways and print the text of each result.

    The order is: plain HTTP request, headed browser, headless browser. Each
    response is reduced to text via ``compress_html(..., only_text=True)``.
    """
    body, status_code = requests_html(target_url)
    plain_text = compress_html(body, only_text=True)
    print("raw_html=>", status_code, plain_text)

    headed_text = compress_html(dp_html(target_url), only_text=True)
    print("render_text_head=>", headed_text)

    headless_text = compress_html(dp_headless_html(target_url), only_text=True)
    print("render_text=>", headless_text)
127
+
128
+ # print("\n")
129
+ # raw_html, status_code = requests_html(target_url)
130
+ # raw_html = compress_html(raw_html, only_text=True)
131
+ # # print("raw_html=>", status_code, compress_html(raw_html))
132
+ # print("raw_html=>", status_code, raw_html)
133
+ # print("\n")
134
+
135
+
136
if __name__ == '__main__':
    # Alternative probe targets kept for manual experiments:
    #   https://www.nxzgh.org.cn/#/newsCenter/index2/2
    #       (mobile WAF, detects headless browsers)
    #   http://www.ncha.gov.cn/col/col722/index.html
    #   https://scgh.org/page/news/tmore36403294e28a11ea8ba48cec4b967595.html
    #   https://www.nmpa.gov.cn/yaowen/ypjgyw/index.html
    #   http://www.customs.gov.cn/customs/xwfb34/302425/index.html
    #   https://www.gjxfj.gov.cn/gjxfj/news/ttxw.htm
    #   https://www.gsgh.org.cn/#/moreNews_?position=%E7%9C%81%E6%80%BB%E6%96%B0%E9%97%BB&categoryId=502
    #   https://www.chengdu.gov.cn/cdsrmzf/zfxx/cjj.shtml
    #       (very strict Ruishu 6; suspected to check the port -- nothing
    #       could be fetched through port 9222)
    #   https://www.jsgh.org/col/col3577/index.html?uid=18462&pageNum=1
    url = "https://www.acftu.org/xwdt/ghyw/"
    test(url)
    # TODO: rough outline of the decision rules. Every "gap" between
    # compression ratios below means the absolute difference.
    # 1. If the ratios from requests, headless and headed are all within 15%
    #    of each other, treat the page as static and prefer requests.
    # 2. If the requests status_code is 412 or 521, the site uses Ruishu or
    #    jsl. (Another sign: the requests ratio differs enormously from the
    #    other two, on the order of one or two thousand.)
    # 3. If the ratios from requests, headless and headed all differ by more
    #    than 40%, the page can only be collected with a headed browser.
    # 4. If headless and headed differ by less than 15% but requests and
    #    headless differ by more than 40%, a headless browser is sufficient.
    # 5. If requests and headed differ by less than 15% but headless and
    #    headed differ by more than 40%, prefer a headed browser.
    #    Possible causes: (a) another WAF that detects headless browsers;
    #    (b) the site uses Ruishu but did not block this particular requests
    #    call (possibly on purpose). To confirm Ruishu, read the cookies
    #    with a headed browser and look for a Ruishu cookie.
    # 6.
@@ -585,7 +585,7 @@ wheels = [
585
585
 
586
586
  [[package]]
587
587
  name = "jarvis-brain"
588
- version = "0.1.1"
588
+ version = "0.1.2"
589
589
  source = { editable = "." }
590
590
  dependencies = [
591
591
  { name = "beautifulsoup4" },
@@ -1,14 +0,0 @@
1
- from tools.simhash_tools import HTMLSimHashComparator
2
-
3
- text1 = "四川省总工会官网欢迎访问四川省总工会领导信箱微 信微 博首    页工会概况新闻中心部门动态信息公开媒体矩阵工作平台首页>新闻中心>工会新闻 >主办单位:四川省总工会 Copyright 2021 www.scgh.org All Rights Reserved.电话:(028)86255956  (028)86122339邮编:610000川公网安备 51010502010464号    技术单位 :四川银利华应用科技有限责任公司建议浏览器:360、谷歌、火狐、IE10蜀ICP备06003706号-1总访问量:21392491 今年:14426832 本月:7210766 今日:6061"
4
- text2 = "四川省总工会官网欢迎访问四川省总工会领导信箱微 信微 博首    页工会概况新闻中心部门动态信息公开媒体矩阵工作平台首页>新闻中心>工会新闻 >粽叶裹童心 四川工会解锁“1+1>2“的快乐公式——我省各级工会组织开展2025年“六一”儿童节关爱活动[2025-06-01]四川省工会资产监督管理工作现场会在宜宾召开[2025-05-30]“六一”关爱来袭!全国爱心托育用人单位经验在这里开花[2025-05-29]“引智助企”示范活动暨“劳模工匠助企行”数智化应用专项行动在乐山举行[2025-05-29]四川省交通运输行业开展职工安全生产“大培训、大测试、大比武”[2025-05-28]省总女职委巾帼劳模工匠学习贯彻习近平总书记重要讲话精神座谈会召开[2025-05-28]四川省工会党支部书记培训班开班[2025-05-20]中国职工保险互助会来川开展座谈交流[2025-05-19]四川省劳模和先进工作者座谈会在蓉召开[2025-05-16]川渝滇黔浙五省市茶界高手首聚蒙顶山“论剑”[2025-05-15]共 5207 条上一页12345…521下一页10 条/页20 条/页30 条/页40 条/页50 条/页到第页确定主办单位:四川省总工会 Copyright 2021 www.scgh.org All Rights Reserved.电话:(028)86255956  (028)86122339邮编:610000川公网安备 51010502010464号    技术单位 :四川银利华应用科技有限责任公司建议浏览器:360、谷歌、火狐、IE10蜀ICP备06003706号-1总访问量:21392491 今年:14426832 本月:7210766 今日:6061"
5
- comparator = HTMLSimHashComparator(text1, text2)
6
- result = comparator.compare_simhash()
7
- print("SimHash比较结果:")
8
- print(f"页面1 SimHash: {result['simhash1']}")
9
- print(f"页面2 SimHash: {result['simhash2']}")
10
- print(f"汉明距离: {result['hamming_distance']}")
11
- print(f"相似度: {result['similarity_percentage']}")
12
- threshold_result = comparator.compare_with_threshold(threshold=0.7)
13
- print(f"\n基于阈值{threshold_result['threshold']}的判断:")
14
- print(f"是否相似: {threshold_result['is_similar']}")
@@ -1,92 +0,0 @@
1
- import htmlmin
2
- import requests
3
- from lxml import html, etree
4
- from bs4 import BeautifulSoup
5
- from DrissionPage import ChromiumPage, ChromiumOptions
6
-
7
-
8
- # 使用requests获取html,用于测试是否使用了瑞数和jsl
9
- def requests_html(url):
10
- headers = {
11
- # "sec-ch-ua": "\"Chromium\";v=\"142\", \"Google Chrome\";v=\"142\", \"Not_A Brand\";v=\"99\"",
12
- # "sec-ch-ua-mobile": "?0",
13
- # "Upgrade-Insecure-Requests": "1",
14
- "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36",
15
- # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
16
- # "Sec-Fetch-Site": "none",
17
- # "Sec-Fetch-Mode": "navigate",
18
- # "Sec-Fetch-User": "?1",
19
- # "Sec-Fetch-Dest": "document",
20
- # "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8"
21
- }
22
- response = requests.get(url, headers=headers, verify=False)
23
- print("response headers=> ", response.headers)
24
- return response.text, response.status_code
25
-
26
-
27
- # 使用dp无头模式获取html,用于测试是否使用了其他waf,如移动waf
28
- def dp_headless_html(url):
29
- opt = ChromiumOptions().headless(True)
30
- opt.set_argument('--no-sandbox')
31
- page = ChromiumPage(opt)
32
- page.get(url)
33
- page_html = page.html
34
- page.quit()
35
- return page_html
36
-
37
-
38
- # 压缩html
39
- def compress_html(content, only_text=False):
40
- doc = html.fromstring(content)
41
- # 删除 style 和 script 标签
42
- for element in doc.xpath('//style | //script'):
43
- element.getparent().remove(element)
44
-
45
- # 删除 link 标签
46
- for link in doc.xpath('//link[@rel="stylesheet"]'):
47
- link.getparent().remove(link)
48
-
49
- # 删除 meta 标签(新增功能)
50
- for meta in doc.xpath('//meta'):
51
- meta.getparent().remove(meta)
52
-
53
- # 删除 style 属性
54
- for element in doc.xpath('//*[@style]'):
55
- element.attrib.pop('style')
56
-
57
- # 删除所有 on* 事件属性
58
- for element in doc.xpath('//*'):
59
- for attr in list(element.attrib.keys()):
60
- if attr.startswith('on'):
61
- element.attrib.pop(attr)
62
-
63
- result = etree.tostring(doc, encoding='unicode')
64
- result = htmlmin.minify(result)
65
- print(f"html压缩比=> {len(content) / len(result) * 100:.2f}%")
66
- if not only_text:
67
- return result
68
- soup = BeautifulSoup(result, 'html.parser')
69
- result = soup.get_text(strip=True)
70
- return result
71
-
72
-
73
- def test(target_url):
74
- page_html = dp_headless_html(target_url)
75
- print("render_text=>", compress_html(page_html, only_text=True))
76
- print("\n")
77
- raw_html, status_code = requests_html(target_url)
78
- print("raw_html=>", status_code, compress_html(raw_html, only_text=True))
79
- print("\n")
80
-
81
-
82
- if __name__ == '__main__':
83
- # raw_html, status_code = requests_html("https://www.nxzgh.org.cn/#/newsCenter/index2/2")
84
- # raw_html, status_code = requests_html("http://www.ncha.gov.cn/col/col722/index.html")
85
- # raw_html, status_code = requests_html("https://scgh.org/page/news/tmore36403294e28a11ea8ba48cec4b967595.html")
86
- # raw_html, status_code = requests_html("https://scgh.org/page/news/tmore36403294e28a11ea8ba48cec4b967595.html")
87
- # url = "https://www.nmpa.gov.cn/yaowen/ypjgyw/index.html"
88
- url = "http://www.customs.gov.cn/customs/xwfb34/302425/index.html"
89
- # url = "https://www.acftu.org/xwdt/ghyw/"
90
-
91
- for i in range(20):
92
- test(url)