pycoze 0.1.488__py3-none-any.whl → 0.1.490__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pycoze/api/__init__.py CHANGED
@@ -1,16 +1,13 @@
1
1
  from .lib.window import WindowCls
2
2
  from .lib.tab import TabCls
3
- from .lib.web import WebCls
4
3
 
5
4
class Api:
    """Container that exposes one ``WindowCls`` and one ``TabCls`` instance as attributes."""

    def __init__(self) -> None:
        # Right-hand side is evaluated left to right, so the window helper is
        # still constructed before the tab helper.
        self.window, self.tab = WindowCls(), TabCls()
9
- self.web = WebCls()
10
8
 
11
9
# Module-level instance plus shortcuts to its two helpers, so callers can
# import ``window`` / ``tab`` directly from this module.
api = Api()
window, tab = api.window, api.tab
14
- web = api.web
15
12
 
16
13
  # from ps_view import ViewCls, WebsiteViewCls, FileViewCls, DirectoryViewCls, WorkflowCls
pycoze/utils/__init__.py CHANGED
@@ -3,11 +3,13 @@ from .env import read_params_file, params, read_json_file
3
3
  from .socket import TcpSocket, socket, socket_subscribe
4
4
  from .text_or_file import to_text
5
5
  from .process import better_kill, execute_script, execute_script_and_block, execute_script_no_block
6
+ from .web import get_simplified_html
6
7
 
7
8
# Public names re-exported by this package.
# ``__all__`` must contain the *names* as strings: when it holds the objects
# themselves, ``from pycoze.utils import *`` fails with
# "TypeError: Item in __all__ must be str".
__all__ = [
    "read_arg",
    "read_params_file", "params", "read_json_file",
    "TcpSocket", "socket", "socket_subscribe",
    "to_text",
    "better_kill", "execute_script", "execute_script_and_block", "execute_script_no_block",
    "get_simplified_html",
]
pycoze/utils/web.py ADDED
@@ -0,0 +1,56 @@
1
+ from bs4 import BeautifulSoup, Comment
2
+
3
def get_simplified_html(html: str, selector=None) -> str:
    """Return a reduced version of *html* that keeps structure, text and interactive attributes.

    Processing steps:

    1. Optionally narrow the document to the first element matching *selector*.
    2. Remove ``script``, ``style``, ``noscript``, ``meta`` and ``link`` tags
       and all HTML comments.
    3. Strip every attribute except a small whitelist of interactive ones
       (``href``, ``src``, ``action``, ``type`` and common ``on*`` handlers).
    4. Drop ``src`` from ``<img>`` tags that embed their data (``data:`` URIs).
    5. Truncate a tag's single text string to 1000 characters plus ``'...'``.
    6. Remove whitespace-only text from elements that contain no visible text.

    Args:
        html: The HTML markup to simplify.
        selector: Optional CSS selector; when given, only the first matching
            element is processed.

    Returns:
        The simplified markup, or the string ``"element not found: <selector>"``
        when *selector* matches nothing.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # If a selector is given, continue with only the first matching element.
    if selector:
        element = soup.select_one(selector)
        if element is None:
            return f"element not found: {selector}"
        soup = BeautifulSoup(str(element), 'html.parser')

    # Tags that are removed together with their content.
    tags_to_remove = ['script', 'style', 'noscript', 'meta', 'link']
    for tag in tags_to_remove:
        for element in soup(tag):
            element.decompose()

    # Remove comments.
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Interactive attributes to keep, per tag name.
    interactive_attributes = {
        'a': frozenset(['href', 'onclick']),
        'button': frozenset(['onclick']),
        'img': frozenset(['src', 'onload']),
        'form': frozenset(['action', 'onsubmit']),
        'input': frozenset(['type', 'onclick', 'onchange']),
    }
    # Interactive attributes kept on every tag.
    global_attributes = frozenset(
        ['onclick', 'onload', 'onchange', 'onsubmit', 'onmouseover']
    )

    # Walk all tags: keep whitelisted attributes, drop everything else.
    for element in soup.find_all(True):
        tag_name = element.name
        allowed_attrs = global_attributes | interactive_attributes.get(tag_name, frozenset())
        for attr in list(element.attrs):
            if attr not in allowed_attrs:
                del element[attr]

        # Inline (data: URI, e.g. Base64) image payloads are dropped.
        # URI schemes are case-insensitive, so compare in lower case.
        if tag_name == 'img':
            src = element.get('src')
            if isinstance(src, str) and src.strip().lower().startswith('data:'):
                del element['src']

        # Truncate text longer than 1000 characters. The text node is replaced
        # in place: assigning to ``element.string`` would clear *all* children
        # and thereby drop a wrapping child tag such as <b> in <p><b>...</b></p>.
        text = element.string
        if text and len(text) > 1000:
            text.replace_with(text[:1000] + '...')

    # Remove superfluous whitespace between tags. Only whitespace-only text
    # nodes are removed; child elements (e.g. <img>, <input>) are kept, and
    # void elements stay empty so they still serialize as <br/>, <img/>.
    for element in soup.find_all(True):
        if element.get_text(strip=True):
            continue
        for node in list(element.contents):
            # Text nodes are ``str`` subclasses; tags are not.
            if isinstance(node, str) and not node.strip():
                node.extract()

    return str(soup)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: pycoze
3
- Version: 0.1.488
3
+ Version: 0.1.490
4
4
  Summary: Package for pycoze only!
5
5
  Author: Yuan Jie Xiong
6
6
  Author-email: aiqqqqqqq@qq.com
@@ -3,11 +3,10 @@ pycoze/ai/__init__.py,sha256=e8cRzp4bLXILIUVtOPqwpiV-szD2eKtaWIodYIuw-7s,312
3
3
  pycoze/ai/llm/__init__.py,sha256=7qmligvCSneLx5AFCjKYfGURIiI4KlB4hE19SxIr-Xk,342
4
4
  pycoze/ai/llm/chat.py,sha256=sQZT0ImvRW81fXdlKG0ZrHdDB8g5M4iudaWdG4Kpd6Q,6373
5
5
  pycoze/ai/llm/text_to_image_prompt.py,sha256=0bx2C_YRvjAo7iphHGp1-pmGKsKqwur7dM0t3SiA8kA,3398
6
- pycoze/api/__init__.py,sha256=TLKvaZlRzTTt0KiXijLjj9b_iCr7fU1siwsXqyd74b8,375
6
+ pycoze/api/__init__.py,sha256=UQo7g4AIyNNktcxUbcb2gRqb4yKtTAoDOCAFZ_dvji8,305
7
7
  pycoze/api/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
8
  pycoze/api/lib/tab.py,sha256=UnvMGxawET0J2Gp0mMDqiBH-I21sUN88T7GtU4ZD2bE,3161
9
9
  pycoze/api/lib/view.py,sha256=_PIpTfeuTPPlMDKshMGsqFQYMq7ZiO4Hg5XwHwDoU60,7357
10
- pycoze/api/lib/web.py,sha256=4HaOV_zYHq6-SSQQIc_O7REnC1asIlXIELxee-pU6zc,2421
11
10
  pycoze/api/lib/window.py,sha256=ZZacqnX0fvZUnLUhNVjbbjhBcKkYdi_6E0LswLp1MqM,2071
12
11
  pycoze/bot/__init__.py,sha256=rL3Q-ycczRpSFfKn84fg3QBl5k22WpyeIU5qOEjEby8,79
13
12
  pycoze/bot/chat.py,sha256=UjiQeK-7rGmBY1w5EhQCx_-Y9ccHXHY_2F5LQX5NIa8,6582
@@ -27,14 +26,15 @@ pycoze/ui/base.py,sha256=7drlRZ40zF1nwGIRwLTC3EuZOSENz2qhQEWUM5yd9cg,1081
27
26
  pycoze/ui/color.py,sha256=cT9Ib8uNzkOKxyW0IwVj46o4LwdB1xgNCj1_Rou9d_4,854
28
27
  pycoze/ui/typ.py,sha256=NpT0FrbHvByOszBZMFtroRp7I7pN-38tYz_zPOPejF4,1723
29
28
  pycoze/ui/ui_def.py,sha256=lGWZGpzRoegP34D562PvK0EJHrmVZrlHW1JjsIG9A9Q,4521
30
- pycoze/utils/__init__.py,sha256=yj1LLPIRL7EhYuMzO1-NghW_6OMQgef3ofOeyLMksiA,488
29
+ pycoze/utils/__init__.py,sha256=qspSyKzJneb2-esPG_RB7jF40m-VxllHO_NUFV1AaSs,550
31
30
  pycoze/utils/arg.py,sha256=jop1tBfe5hYkHW1NSpCeaZBEznkgguBscj_7M2dWfrs,503
32
31
  pycoze/utils/env.py,sha256=5pWlXfM1F5ZU9hhv1rHlDEanjEW5wf0nbyez9bNRqqA,559
33
32
  pycoze/utils/process.py,sha256=U2MURGmxfyWBqdbKfy5UvyV17M40B6HHlNELgWfgrTE,3824
34
33
  pycoze/utils/socket.py,sha256=4Wm4LlwdWXC_kAV0NnZbUc0Y3Kc6KRMyFRqSw79u-9w,2468
35
34
  pycoze/utils/text_or_file.py,sha256=gpxZVWt2DW6YiEg_MnMuwg36VNf3TX383QD_1oZNB0Y,551
36
- pycoze-0.1.488.dist-info/LICENSE,sha256=QStd_Qsd0-kAam_-sOesCIp_uKrGWeoKwt9M49NVkNU,1090
37
- pycoze-0.1.488.dist-info/METADATA,sha256=DTz7Cgp0sYAt2i2yfM-JFZwIZo7oeblaA-BrMbQYtWA,854
38
- pycoze-0.1.488.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
39
- pycoze-0.1.488.dist-info/top_level.txt,sha256=76dPeDhKvOCleL3ZC5gl1-y4vdS1tT_U1hxWVAn7sFo,7
40
- pycoze-0.1.488.dist-info/RECORD,,
35
+ pycoze/utils/web.py,sha256=Fkre-ZtLCInbbylOMgXWF_WHkLwIY_THReflhnP3CM4,2007
36
+ pycoze-0.1.490.dist-info/LICENSE,sha256=QStd_Qsd0-kAam_-sOesCIp_uKrGWeoKwt9M49NVkNU,1090
37
+ pycoze-0.1.490.dist-info/METADATA,sha256=T9LmX3sLLZHmHYdvPnVzn6iV4phT17MofolBDKp1dbo,854
38
+ pycoze-0.1.490.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
39
+ pycoze-0.1.490.dist-info/top_level.txt,sha256=76dPeDhKvOCleL3ZC5gl1-y4vdS1tT_U1hxWVAn7sFo,7
40
+ pycoze-0.1.490.dist-info/RECORD,,
pycoze/api/lib/web.py DELETED
@@ -1,67 +0,0 @@
1
- from pycoze import utils
2
- from bs4 import BeautifulSoup, Comment
3
-
4
-
5
- socket = utils.socket
6
-
7
-
8
class WebCls:
    """Web helpers: request a simplified page via the module ``socket`` or simplify HTML locally."""

    def get_simplified_webpage(self, url: str) -> str:
        """Post a ``getSimplifiedWebpage`` request for *url* and return the received result."""
        return socket.post_and_recv_result(
            "getSimplifiedWebpage", {"url": url}
        )

    def get_simplified_html(self, html: str, selector=None) -> str:
        """Return a reduced version of *html* that keeps structure, text and interactive attributes.

        Removes ``script``/``style``/``noscript``/``meta``/``link`` tags and
        comments, strips all attributes except a whitelist of interactive
        ones, drops ``data:`` image sources, truncates a tag's single text
        string to 1000 characters plus ``'...'`` and removes whitespace-only
        text from elements without visible text.

        Args:
            html: The HTML markup to simplify.
            selector: Optional CSS selector; when given, only the first
                matching element is processed.

        Returns:
            The simplified markup, or the string
            ``"element not found: <selector>"`` when *selector* matches nothing.
        """
        soup = BeautifulSoup(html, 'html.parser')

        # If a selector is given, continue with only the first matching element.
        if selector:
            element = soup.select_one(selector)
            if element is None:
                return f"element not found: {selector}"
            soup = BeautifulSoup(str(element), 'html.parser')

        # Tags that are removed together with their content.
        tags_to_remove = ['script', 'style', 'noscript', 'meta', 'link']
        for tag in tags_to_remove:
            for element in soup(tag):
                element.decompose()

        # Remove comments.
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        # Interactive attributes to keep, per tag name.
        interactive_attributes = {
            'a': frozenset(['href', 'onclick']),
            'button': frozenset(['onclick']),
            'img': frozenset(['src', 'onload']),
            'form': frozenset(['action', 'onsubmit']),
            'input': frozenset(['type', 'onclick', 'onchange']),
        }
        # Interactive attributes kept on every tag.
        global_attributes = frozenset(
            ['onclick', 'onload', 'onchange', 'onsubmit', 'onmouseover']
        )

        # Walk all tags: keep whitelisted attributes, drop everything else.
        for element in soup.find_all(True):
            tag_name = element.name
            allowed_attrs = global_attributes | interactive_attributes.get(tag_name, frozenset())
            for attr in list(element.attrs):
                if attr not in allowed_attrs:
                    del element[attr]

            # Inline (data: URI, e.g. Base64) image payloads are dropped.
            # URI schemes are case-insensitive, so compare in lower case.
            if tag_name == 'img':
                src = element.get('src')
                if isinstance(src, str) and src.strip().lower().startswith('data:'):
                    del element['src']

            # Truncate text longer than 1000 characters. The text node is
            # replaced in place: assigning to ``element.string`` would clear
            # all children and drop a wrapping child tag such as <b>.
            text = element.string
            if text and len(text) > 1000:
                text.replace_with(text[:1000] + '...')

        # Remove superfluous whitespace between tags. Only whitespace-only
        # text nodes are removed; child elements (e.g. <img>, <input>) are
        # kept, and void elements stay empty so they serialize as <br/>.
        for element in soup.find_all(True):
            if element.get_text(strip=True):
                continue
            for node in list(element.contents):
                # Text nodes are ``str`` subclasses; tags are not.
                if isinstance(node, str) and not node.strip():
                    node.extract()

        return str(soup)