pycoze 0.1.488__py3-none-any.whl → 0.1.489__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pycoze/api/lib/web.py +0 -58
- pycoze/utils/__init__.py +3 -1
- pycoze/utils/web.py +56 -0
- {pycoze-0.1.488.dist-info → pycoze-0.1.489.dist-info}/METADATA +1 -1
- {pycoze-0.1.488.dist-info → pycoze-0.1.489.dist-info}/RECORD +8 -7
- {pycoze-0.1.488.dist-info → pycoze-0.1.489.dist-info}/LICENSE +0 -0
- {pycoze-0.1.488.dist-info → pycoze-0.1.489.dist-info}/WHEEL +0 -0
- {pycoze-0.1.488.dist-info → pycoze-0.1.489.dist-info}/top_level.txt +0 -0
pycoze/api/lib/web.py
CHANGED
@@ -6,62 +6,4 @@ socket = utils.socket
|
|
6
6
|
|
7
7
|
|
8
8
|
class WebCls:
|
9
|
-
def get_simplified_webpage(self, url: str) -> str:
|
10
|
-
return socket.post_and_recv_result(
|
11
|
-
"getSimplifiedWebpage", {"url": url}
|
12
|
-
)
|
13
9
|
|
14
|
-
def get_simplified_html(self, html: str, selector=None) -> str:
|
15
|
-
soup = BeautifulSoup(html, 'html.parser')
|
16
|
-
|
17
|
-
# 如果指定了selector,则只提取该元素的内容
|
18
|
-
if selector:
|
19
|
-
element = soup.select_one(selector)
|
20
|
-
if element:
|
21
|
-
soup = BeautifulSoup(str(element), 'html.parser')
|
22
|
-
else:
|
23
|
-
return f"element not found: {selector}"
|
24
|
-
|
25
|
-
# 定义需要移除的标签
|
26
|
-
tags_to_remove = ['script', 'style', 'noscript', 'meta', 'link']
|
27
|
-
for tag in tags_to_remove:
|
28
|
-
for element in soup(tag):
|
29
|
-
element.decompose()
|
30
|
-
|
31
|
-
# 移除注释
|
32
|
-
for element in soup.find_all(string=lambda text: isinstance(text, Comment)):
|
33
|
-
element.extract()
|
34
|
-
|
35
|
-
# 定义需要保留的交互属性
|
36
|
-
INTERACTIVE_ATTRIBUTES = {
|
37
|
-
'a': ['href', 'onclick'],
|
38
|
-
'button': ['onclick'],
|
39
|
-
'img': ['src', 'onload'],
|
40
|
-
'form': ['action', 'onsubmit'],
|
41
|
-
'input': ['type', 'onclick', 'onchange'],
|
42
|
-
'*': ['onclick', 'onload', 'onchange', 'onsubmit', 'onmouseover']
|
43
|
-
}
|
44
|
-
|
45
|
-
# 遍历所有标签,保留交互属性并移除其他属性
|
46
|
-
for element in soup.find_all(True):
|
47
|
-
tag_name = element.name
|
48
|
-
allowed_attrs = INTERACTIVE_ATTRIBUTES.get(tag_name, []) + INTERACTIVE_ATTRIBUTES['*']
|
49
|
-
attrs = list(element.attrs.keys())
|
50
|
-
for attr in attrs:
|
51
|
-
if attr not in allowed_attrs:
|
52
|
-
del element[attr]
|
53
|
-
|
54
|
-
# 如果是<img>标签,检查src是否为Base64
|
55
|
-
if tag_name == 'img' and 'src' in element.attrs and element['src'].startswith('data:'):
|
56
|
-
del element['src']
|
57
|
-
|
58
|
-
# 处理文本内容,超过1000字符则截取
|
59
|
-
if element.string and len(element.string) > 1000:
|
60
|
-
element.string = element.string[:1000] + '...'
|
61
|
-
|
62
|
-
# 移除标签之间的多余空白
|
63
|
-
for element in soup.find_all(True):
|
64
|
-
if not element.get_text(strip=True):
|
65
|
-
element.string = ''
|
66
|
-
|
67
|
-
return str(soup)
|
pycoze/utils/__init__.py
CHANGED
@@ -3,11 +3,13 @@ from .env import read_params_file, params, read_json_file
|
|
3
3
|
from .socket import TcpSocket, socket, socket_subscribe
|
4
4
|
from .text_or_file import to_text
|
5
5
|
from .process import better_kill, execute_script, execute_script_and_block, execute_script_no_block
|
6
|
+
from .web import get_simplified_html
|
6
7
|
|
7
8
|
__all__ = [
|
8
9
|
read_arg,
|
9
10
|
read_params_file, params, read_json_file,
|
10
11
|
TcpSocket, socket, socket_subscribe,
|
11
12
|
to_text,
|
12
|
-
better_kill, execute_script, execute_script_and_block, execute_script_no_block
|
13
|
+
better_kill, execute_script, execute_script_and_block, execute_script_no_block,
|
14
|
+
get_simplified_html
|
13
15
|
]
|
pycoze/utils/web.py
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
from bs4 import BeautifulSoup, Comment
|
2
|
+
|
3
|
+
def get_simplified_html(html: str, selector=None) -> str:
|
4
|
+
soup = BeautifulSoup(html, 'html.parser')
|
5
|
+
|
6
|
+
# 如果指定了selector,则只提取该元素的内容
|
7
|
+
if selector:
|
8
|
+
element = soup.select_one(selector)
|
9
|
+
if element:
|
10
|
+
soup = BeautifulSoup(str(element), 'html.parser')
|
11
|
+
else:
|
12
|
+
return f"element not found: {selector}"
|
13
|
+
|
14
|
+
# 定义需要移除的标签
|
15
|
+
tags_to_remove = ['script', 'style', 'noscript', 'meta', 'link']
|
16
|
+
for tag in tags_to_remove:
|
17
|
+
for element in soup(tag):
|
18
|
+
element.decompose()
|
19
|
+
|
20
|
+
# 移除注释
|
21
|
+
for element in soup.find_all(string=lambda text: isinstance(text, Comment)):
|
22
|
+
element.extract()
|
23
|
+
|
24
|
+
# 定义需要保留的交互属性
|
25
|
+
INTERACTIVE_ATTRIBUTES = {
|
26
|
+
'a': ['href', 'onclick'],
|
27
|
+
'button': ['onclick'],
|
28
|
+
'img': ['src', 'onload'],
|
29
|
+
'form': ['action', 'onsubmit'],
|
30
|
+
'input': ['type', 'onclick', 'onchange'],
|
31
|
+
'*': ['onclick', 'onload', 'onchange', 'onsubmit', 'onmouseover']
|
32
|
+
}
|
33
|
+
|
34
|
+
# 遍历所有标签,保留交互属性并移除其他属性
|
35
|
+
for element in soup.find_all(True):
|
36
|
+
tag_name = element.name
|
37
|
+
allowed_attrs = INTERACTIVE_ATTRIBUTES.get(tag_name, []) + INTERACTIVE_ATTRIBUTES['*']
|
38
|
+
attrs = list(element.attrs.keys())
|
39
|
+
for attr in attrs:
|
40
|
+
if attr not in allowed_attrs:
|
41
|
+
del element[attr]
|
42
|
+
|
43
|
+
# 如果是<img>标签,检查src是否为Base64
|
44
|
+
if tag_name == 'img' and 'src' in element.attrs and element['src'].startswith('data:'):
|
45
|
+
del element['src']
|
46
|
+
|
47
|
+
# 处理文本内容,超过1000字符则截取
|
48
|
+
if element.string and len(element.string) > 1000:
|
49
|
+
element.string = element.string[:1000] + '...'
|
50
|
+
|
51
|
+
# 移除标签之间的多余空白
|
52
|
+
for element in soup.find_all(True):
|
53
|
+
if not element.get_text(strip=True):
|
54
|
+
element.string = ''
|
55
|
+
|
56
|
+
return str(soup)
|
@@ -7,7 +7,7 @@ pycoze/api/__init__.py,sha256=TLKvaZlRzTTt0KiXijLjj9b_iCr7fU1siwsXqyd74b8,375
|
|
7
7
|
pycoze/api/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
8
8
|
pycoze/api/lib/tab.py,sha256=UnvMGxawET0J2Gp0mMDqiBH-I21sUN88T7GtU4ZD2bE,3161
|
9
9
|
pycoze/api/lib/view.py,sha256=_PIpTfeuTPPlMDKshMGsqFQYMq7ZiO4Hg5XwHwDoU60,7357
|
10
|
-
pycoze/api/lib/web.py,sha256=
|
10
|
+
pycoze/api/lib/web.py,sha256=g8pK_heoQOORb8UPAIpEJgr1IKNLlfFkIYeN0qUBvYI,109
|
11
11
|
pycoze/api/lib/window.py,sha256=ZZacqnX0fvZUnLUhNVjbbjhBcKkYdi_6E0LswLp1MqM,2071
|
12
12
|
pycoze/bot/__init__.py,sha256=rL3Q-ycczRpSFfKn84fg3QBl5k22WpyeIU5qOEjEby8,79
|
13
13
|
pycoze/bot/chat.py,sha256=UjiQeK-7rGmBY1w5EhQCx_-Y9ccHXHY_2F5LQX5NIa8,6582
|
@@ -27,14 +27,15 @@ pycoze/ui/base.py,sha256=7drlRZ40zF1nwGIRwLTC3EuZOSENz2qhQEWUM5yd9cg,1081
|
|
27
27
|
pycoze/ui/color.py,sha256=cT9Ib8uNzkOKxyW0IwVj46o4LwdB1xgNCj1_Rou9d_4,854
|
28
28
|
pycoze/ui/typ.py,sha256=NpT0FrbHvByOszBZMFtroRp7I7pN-38tYz_zPOPejF4,1723
|
29
29
|
pycoze/ui/ui_def.py,sha256=lGWZGpzRoegP34D562PvK0EJHrmVZrlHW1JjsIG9A9Q,4521
|
30
|
-
pycoze/utils/__init__.py,sha256=
|
30
|
+
pycoze/utils/__init__.py,sha256=qspSyKzJneb2-esPG_RB7jF40m-VxllHO_NUFV1AaSs,550
|
31
31
|
pycoze/utils/arg.py,sha256=jop1tBfe5hYkHW1NSpCeaZBEznkgguBscj_7M2dWfrs,503
|
32
32
|
pycoze/utils/env.py,sha256=5pWlXfM1F5ZU9hhv1rHlDEanjEW5wf0nbyez9bNRqqA,559
|
33
33
|
pycoze/utils/process.py,sha256=U2MURGmxfyWBqdbKfy5UvyV17M40B6HHlNELgWfgrTE,3824
|
34
34
|
pycoze/utils/socket.py,sha256=4Wm4LlwdWXC_kAV0NnZbUc0Y3Kc6KRMyFRqSw79u-9w,2468
|
35
35
|
pycoze/utils/text_or_file.py,sha256=gpxZVWt2DW6YiEg_MnMuwg36VNf3TX383QD_1oZNB0Y,551
|
36
|
-
pycoze
|
37
|
-
pycoze-0.1.
|
38
|
-
pycoze-0.1.
|
39
|
-
pycoze-0.1.
|
40
|
-
pycoze-0.1.
|
36
|
+
pycoze/utils/web.py,sha256=Fkre-ZtLCInbbylOMgXWF_WHkLwIY_THReflhnP3CM4,2007
|
37
|
+
pycoze-0.1.489.dist-info/LICENSE,sha256=QStd_Qsd0-kAam_-sOesCIp_uKrGWeoKwt9M49NVkNU,1090
|
38
|
+
pycoze-0.1.489.dist-info/METADATA,sha256=r5DB4_ltj9pNXK22rNik-ANKhMYl_qlvNQzrcVqYTkk,854
|
39
|
+
pycoze-0.1.489.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
40
|
+
pycoze-0.1.489.dist-info/top_level.txt,sha256=76dPeDhKvOCleL3ZC5gl1-y4vdS1tT_U1hxWVAn7sFo,7
|
41
|
+
pycoze-0.1.489.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|