beswarm 0.1.51__py3-none-any.whl → 0.1.53__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
beswarm/aient/setup.py CHANGED
@@ -4,7 +4,7 @@ from setuptools import setup, find_packages
 
 setup(
     name="aient",
-    version="1.1.7",
+    version="1.1.9",
     description="Aient: The Awakening of Agent.",
     long_description=Path.open(Path("README.md"), encoding="utf-8").read(),
     long_description_content_type="text/markdown",
beswarm/aient/src/aient/plugins/websearch.py CHANGED
@@ -124,10 +124,10 @@ def jina_ai_Web_crawler(url: str, isSearch=False) -> str:
 @register_tool()
 def get_url_content(url: str) -> str:
     """
-    Fetch the web page content at the given URL and return it to the user as Markdown.
+    Fetch the web page content at the given URL and return it to the user as Markdown.
 
-    :param url: URL of the page to crawl
-    :return: the page content
+    :param url: URL of the page to crawl
+    :return: the page content
     """
     markdown_content = url_to_markdown(url)
     # print(markdown_content)
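The hunk above only reflows the docstring of get_url_content, but the function matters later: the new search_web tool fans out over it in worker threads. A minimal usage sketch, assuming the vendored import path shown in this wheel and a reachable target URL:

```python
# Hedged sketch: the import path mirrors the vendored layout in this wheel
# (beswarm/aient/src/aient/plugins); adjust it if the package is laid out differently.
from beswarm.aient.src.aient.plugins import get_url_content

markdown = get_url_content("https://example.com")  # page content rendered as Markdown
print(markdown[:500])
```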
beswarm/aient/src/aient/plugins/write_file.py CHANGED
@@ -1,6 +1,19 @@
 from .registry import register_tool
 
 import os
+import html
+
+def unescape_html(input_string: str) -> str:
+    """
+    Convert HTML entities in a string (e.g. &amp;) back to their original characters (e.g. &).
+
+    Args:
+        input_string: The input string containing HTML entities.
+
+    Returns:
+        The converted string.
+    """
+    return html.unescape(input_string)
 
 @register_tool()
 def write_to_file(path, content, mode='w'):
@@ -49,6 +62,30 @@ Example: Requesting to write to frontend-config.json
 
     # Write the file
     with open(path, mode, encoding='utf-8') as file:
-        file.write(content)
+        file.write(unescape_html(content))
 
-    return f"已成功写入文件:{path}"
+    return f"已成功写入文件:{path}"
+
+
+if __name__ == "__main__":
+    text = """
+<!DOCTYPE html>
+<html lang="zh-CN">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Continuous Thought Machines (CTM) 原理解读</title>
+    <script>MathJax={chtml:{fontURL:'https://cdn.jsdelivr.net/npm/mathjax@3/es5/output/chtml/fonts/woff-v2'}}</script>
+    <script src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-chtml.js" id="MathJax-script" async></script>
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/viz.js/2.1.2/viz.js" defer></script>
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/viz.js/2.1.2/full.render.js" defer></script>
+    <script src="https://unpkg.com/@panzoom/panzoom@4.5.1/dist/panzoom.min.js" defer></script>
+    <link href="https://cdnjs.cloudflare.com/ajax/libs/prism/1.29.0/themes/prism-okaidia.min.css" rel="stylesheet"/>
+    <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&family=Fira+Code:wght@400;500&display=swap" rel="stylesheet">
+    <link href="https://fonts.googleapis.com/icon?family=Material+Icons+Outlined" rel="stylesheet">
+    <style>
+"""
+    with open("test.txt", "r", encoding="utf-8") as file:
+        content = file.read()
+    print(write_to_file("test.txt", content))
+    # python -m beswarm.aient.src.aient.plugins.write_file
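The functional change in write_to_file is the new unescape_html call: content is passed through html.unescape before it is written, so entity-escaped text such as &amp;, &lt; or &quot; is restored to literal characters. A minimal standalone sketch of that step (plain stdlib html, no plugin registry involved):

```python
import html

# Entity-escaped text such as a model might emit when quoting code or HTML.
content = "if a &lt; b &amp;&amp; flag: print(&quot;ok&quot;)"

print(html.unescape(content))
# -> if a < b && flag: print("ok")
```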
beswarm/tools/__init__.py CHANGED
@@ -6,6 +6,7 @@ from .UIworker import UIworker
 from .search_arxiv import search_arxiv
 from .repomap import get_code_repo_map
 from .click import find_and_click_element, scroll_screen
+from .search_web import search_web
 # Explicitly import the needed items from aient.plugins
 from ..aient.src.aient.plugins import (
     excute_command,
@@ -42,4 +43,5 @@ __all__ = [
     "scroll_screen",
     "register_tool",
     "UIworker",
+    "search_web",
 ]
beswarm/tools/search_web.py ADDED
@@ -0,0 +1,296 @@
+import re
+import os
+import json
+import httpx
+import threading
+
+from ..aient.src.aient.plugins import register_tool, get_url_content # Assuming a similar plugin structure
+
+class ThreadWithReturnValue(threading.Thread):
+    def run(self):
+        if self._target is not None:
+            self._return = self._target(*self._args, **self._kwargs)
+
+    def join(self):
+        super().join()
+        return self._return
+
+@register_tool()
+async def search_web(query: str):
+    """
+    Fetch Google search results.
+
+    Args:
+        query (str): The query string to search for on Google.
+
+    Returns:
+        dict: A dictionary with the search results, or error information if something went wrong.
+    """
+    api_key = os.environ.get('THORDATA_KEY')
+    if not api_key:
+        raise ValueError("THORDATA_KEY is not set in environment variables")
+
+    api_url = "https://scraperapi.thordata.com/request"
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {api_key}" # API key taken from the THORDATA_KEY environment variable
+    }
+    payload = {
+        "url": f"https://www.google.com/search?q={query}"
+    }
+    results = []
+
+    try:
+        async with httpx.AsyncClient() as client:
+            response = await client.post(api_url, headers=headers, json=payload)
+            response.raise_for_status() # Raise HTTPStatusError on 4xx or 5xx status codes
+            results = response.json()
+    except httpx.HTTPStatusError as e:
+        return {
+            "error": f"HTTP error occurred: {e.response.status_code} - {e.response.text}",
+            "status_code": e.response.status_code
+        }
+    except httpx.RequestError as e:
+        return {
+            "error": f"An error occurred while requesting {e.request.url!r}: {e}",
+            "request_url": str(e.request.url)
+        }
+    except json.JSONDecodeError:
+        return {
+            "error": "Failed to decode JSON response from the API.",
+            "response_text": response.text if 'response' in locals() else "No response text available"
+        }
+    except Exception as e:
+        return {
+            "error": f"An unexpected error occurred: {str(e)}"
+        }
+
+    unique_urls = []
+    if "error" in results:
+        print(f"Error fetching search results for '{query}':")
+        print(json.dumps(results, indent=2, ensure_ascii=False))
+    else:
+        # print(f"Search results for '{query}':")
+        html_content = results.get("data", {}).get("result", {}).get("html", "")
+        if html_content:
+            # Find all URLs with regular expressions
+            # Import the html and urllib.parse modules
+            import html
+            import urllib.parse
+
+            # 1. Initial extraction of candidate URL strings
+            # Use a permissive regex that may keep trailing non-URL characters; clean them up afterwards
+            candidate_urls = re.findall(r'https?://[^\s"]+|www\.[^\s"]+', html_content)
+
+            processed_urls = []
+            for url_str in candidate_urls:
+                # 2. Decode hex escapes (e.g. \x26 -> &)
+                try:
+                    def replace_hex(match):
+                        return chr(int(match.group(1), 16))
+                    url_str = re.sub(r'\\x([0-9a-fA-F]{2})', replace_hex, url_str)
+                except ValueError:
+                    pass
+
+                # 3. Decode HTML entities (e.g. &amp; -> &)
+                url_str = html.unescape(url_str)
+
+                # 4. Decode URL percent-encoding (e.g. %3F -> ?, %3D -> =)
+                url_str = urllib.parse.unquote(url_str)
+
+                # 5. Truncate known non-URL parameters or patterns
+                # Cut off the ved= parameter
+                if 'ved=' in url_str:
+                    url_str = url_str.split('ved=', 1)[0]
+                    url_str = url_str.rstrip('&?') # Remove any trailing & or ? left behind
+
+                # 6. Iteratively strip trailing HTML tags
+                # e.g. </cite>, <div...>, </span></span>
+                old_url_len = -1
+                while old_url_len != len(url_str): # Loop until the string stops shrinking
+                    old_url_len = len(url_str)
+                    # Strip a complete closing tag at the end, e.g. </div>
+                    url_str = re.sub(r'</[^>]+>$', '', url_str)
+                    # Strip a trailing opening or incomplete tag, e.g. <cite or <div
+                    # (everything from the '<' to the end of the string)
+                    url_str = re.sub(r'<[^>]*$', '', url_str)
+                    # Strip a lone trailing '>' left over after tag removal
+                    url_str = url_str.rstrip('>')
+
+
+                # 7. Strip common trailing non-URL characters (quotes, punctuation)
+                # Order matters: this should run after the HTML tag removal
+                url_str = url_str.rstrip('\'";,.?!<>()[]{}') # '<' is checked again here
+
+                # 8. Strip a lone trailing '&' (after all other cleanup)
+                url_str = url_str.rstrip('&')
+                url_str = url_str.split("#:~:")[0]
+
+                if url_str: # Make sure the URL is not empty
+                    processed_urls.append(url_str)
+
+            # Domains to filter out
+            excluded_domains = [
+                "www.w3.org",
+                "www.google.com",
+                "ssl.gstatic.com",
+                "translate.google.com",
+                "www.googleadservices.com",
+                "gstatic.com",
+                "lens.google.com",
+                "schema.org",
+                "id.google.com",
+                "maps.google.com",
+                "clients6.google.com",
+                "ogs.google.com",
+                "policies.google.com",
+                "support.google.com",
+                "tpc.googlesyndication.com",
+                "adssettings.google.com"
+            ]
+
+            final_urls_before_dedup = []
+            for url in processed_urls:
+                if not url:
+                    continue
+                if not any(excluded_domain in url for excluded_domain in excluded_domains):
+                    # 9. Further normalization
+                    # 9a. Strip a trailing /
+                    normalized_url = url.rstrip('/')
+
+                    # 9b. Add a default protocol (https) if missing
+                    if normalized_url and not normalized_url.startswith(('http://', 'https://')):
+                        normalized_url = 'https://' + normalized_url
+
+                    if normalized_url:
+                        final_urls_before_dedup.append(normalized_url)
+
+            # 10. Deduplicate
+            temp_unique_urls_set = set(final_urls_before_dedup)
+            temp_unique_urls_set.discard("https://baike.baidu.com")
+            temp_unique_urls_set.discard("https://zhuanlan.zhihu.com")
+            unique_urls = sorted(list(temp_unique_urls_set))
+
+    results = unique_urls
+    if not results:
+        return "No search results returned or results list is empty."
+
+    web_contents_raw = []
+    if results and isinstance(results, list) and len(results) > 0:
+        # print(f"Fetching content for {len(results)} URLs...")
+        # threads = []
+        # for url in url_set_list:
+        #     # url_search_thread = ThreadWithReturnValue(target=jina_ai_Web_crawler, args=(url,True,))
+        #     url_search_thread = ThreadWithReturnValue(target=get_url_content, args=(url,))
+        #     # url_search_thread = ThreadWithReturnValue(target=Web_crawler, args=(url,True,))
+        #     url_search_thread.start()
+        #     threads.append(url_search_thread)
+        threads = []
+        for i, link in enumerate(results):
+            print(f"Processing URL {i + 1}/{len(results)}: {link}")
+            # Assuming get_url_content is synchronous and returns a string or None
+            # content_text = get_url_content(link)
+            url_search_thread = ThreadWithReturnValue(target=get_url_content, args=(link,))
+            url_search_thread.start()
+            threads.append(url_search_thread)
+
+        for thread in threads:
+            content_text = thread.join()
+            # content_text = thread.get_result()
+            if content_text and len(content_text.split("\n\n")) > 10: # Ensure content_text is not None or empty before adding
+                web_contents_raw.append({"url": link, "content": str(content_text)}) # Ensure content is string
+            else:
+                print(f"Warning: Failed to get content or content was empty for URL: {link}")
+    elif not results or (isinstance(results, list) and len(results) == 0) :
+        print("No search results returned or results list is empty.")
+    else:
+        print(f"Search results in unexpected format: {type(results)}")
+
+    # print(f"Fetched {len(web_contents_raw)} web contents with text.")
+
+    if not web_contents_raw:
+        return "No web content"
+    # if not web_contents_raw:
+    #     print("No web content with text to process for similarity.")
+    #     output_filename = "web_content_filtered.json"
+    #     with open(output_filename, "w", encoding="utf-8") as f:
+    #         json.dump([], f, indent=2, ensure_ascii=False)
+    #     print(f"Empty list saved to {output_filename}")
+    #     return
+
+    # output_filename = "web_content.json"
+    # with open(output_filename, "w", encoding="utf-8") as f:
+    #     json.dump(web_contents_raw, f, indent=2, ensure_ascii=False)
+
+    n = len(web_contents_raw)
+    to_keep_flags = [True] * n # Flags to mark which items to keep
+
+    # print("Starting similarity comparison...")
+    for i in range(n):
+        if not to_keep_flags[i]: # Skip if item i is already marked for discard
+            continue
+
+        content_i = web_contents_raw[i].get('content', "")
+        if not isinstance(content_i, str):
+            content_i = str(content_i) # Fallback, though str(content_text) above should handle it
+
+        for j in range(i + 1, n):
+            if not to_keep_flags[j]: # Skip if item j is already marked for discard
+                continue
+
+            content_j = web_contents_raw[j].get('content', "")
+            if not isinstance(content_j, str):
+                content_j = str(content_j) # Fallback
+
+            similarity = calculate_similarity(content_i, content_j)
+
+            if similarity > 0.9:
+                # print(f"Similarity > 0.9 ({similarity:.4f}) between content from '{web_contents_raw[i]['url']}' and '{web_contents_raw[j]['url']}'. Discarding the latter.")
+                to_keep_flags[j] = False # Discard the second item (item j)
+
+    final_web_content = [web_contents_raw[i] for i in range(n) if to_keep_flags[i]]
+    # print(f"Number of items after filtering: {len(final_web_content)}")
+
+    # output_filename = "web_content_filtered.json"
+    # with open(output_filename, "w", encoding="utf-8") as f:
+    #     json.dump(final_web_content, f, indent=2, ensure_ascii=False)
+    # print(f"Filtered web content saved to {output_filename}")
+    final_result = ""
+    for item in final_web_content:
+        final_result += item["content"]
+        final_result += "\n\n"
+    if not final_result:
+        return "No web content"
+    return final_result
+
+import difflib
+
+
+def calculate_similarity(string1: str, string2: str) -> float:
+    """Calculates the similarity ratio between two strings.
+
+    Args:
+        string1: The first string.
+        string2: The second string.
+
+    Returns:
+        A float between 0 and 1, where 1 means the strings are identical
+        and 0 means they are completely different.
+    """
+    return difflib.SequenceMatcher(None, string1, string2).ratio()
+
+if __name__ == '__main__':
+    import asyncio
+    import re
+
+    async def main():
+        # Example usage
+        search_query = "美国"
+        print(f"Performing web search for: '{search_query}'")
+        results = await search_web(search_query) # results is the aggregated page content
+
+        print(results)
+
+    asyncio.run(main())
+
+    # python -m beswarm.tools.search_web
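A detail that is easy to miss in the listing above is the near-duplicate filter: after fetching pages, search_web discards any page whose content has a difflib.SequenceMatcher ratio above 0.9 with an earlier page. A standalone sketch of that filter with made-up records (the URLs and strings are illustrative only):

```python
import difflib

def calculate_similarity(string1: str, string2: str) -> float:
    """Similarity ratio in [0, 1]; 1.0 means identical strings."""
    return difflib.SequenceMatcher(None, string1, string2).ratio()

# Toy stand-ins for the {"url": ..., "content": ...} records built by search_web.
pages = [
    {"url": "https://a.example", "content": "Continuous Thought Machines explained, step by step."},
    {"url": "https://b.example", "content": "Continuous Thought Machines explained, step by step!"},
    {"url": "https://c.example", "content": "An unrelated article about packaging Python wheels."},
]

keep = [True] * len(pages)
for i in range(len(pages)):
    if not keep[i]:
        continue
    for j in range(i + 1, len(pages)):
        if keep[j] and calculate_similarity(pages[i]["content"], pages[j]["content"]) > 0.9:
            keep[j] = False  # the later near-duplicate is discarded

print([p["url"] for p, kept in zip(pages, keep) if kept])
# -> ['https://a.example', 'https://c.example']
```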
beswarm-{0.1.51 → 0.1.53}.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: beswarm
-Version: 0.1.51
+Version: 0.1.53
 Summary: MAS
 Requires-Python: >=3.11
 Description-Content-Type: text/markdown
beswarm-{0.1.51 → 0.1.53}.dist-info/RECORD RENAMED
@@ -1,7 +1,7 @@
 beswarm/__init__.py,sha256=HZjUOJtZR5QhMuDbq-wukQQn1VrBusNWai_ysGo-VVI,20
 beswarm/utils.py,sha256=AdDCcqAIIKQEMl7PfryVgeT9G5sHe7QNsZnrvmTGA8E,283
 beswarm/aient/main.py,sha256=SiYAIgQlLJqYusnTVEJOx1WNkSJKMImhgn5aWjfroxg,3814
-beswarm/aient/setup.py,sha256=VK80NYZ8fNHRLYh2gQe-kBWCY489rUmZsqy7VHpAZAA,486
+beswarm/aient/setup.py,sha256=7hYwy55_Ncx42ns6TQjOyhMBMIFkFdbSWupWV7K00vQ,486
 beswarm/aient/src/aient/__init__.py,sha256=SRfF7oDVlOOAi6nGKiJIUK6B_arqYLO9iSMp-2IZZps,21
 beswarm/aient/src/aient/core/__init__.py,sha256=NxjebTlku35S4Dzr16rdSqSTWUvvwEeACe8KvHJnjPg,34
 beswarm/aient/src/aient/core/log_config.py,sha256=kz2_yJv1p-o3lUQOwA3qh-LSc3wMHv13iCQclw44W9c,274
@@ -32,8 +32,8 @@ beswarm/aient/src/aient/plugins/list_directory.py,sha256=5ubm-mfrj-tanGSDp4M_Tmb
 beswarm/aient/src/aient/plugins/read_file.py,sha256=-RRmaj-rSl8y--5VKnxCsZ1YQHe75OhnqvsDRLJyujM,8412
 beswarm/aient/src/aient/plugins/registry.py,sha256=YknzhieU_8nQ3oKlUSSWDB4X7t2Jx0JnqT2Jd9Xsvfk,3574
 beswarm/aient/src/aient/plugins/run_python.py,sha256=dgcUwBunMuDkaSKR5bToudVzSdrXVewktDDFUz_iIOQ,4589
-beswarm/aient/src/aient/plugins/websearch.py,sha256=I4tYU7CGLdyG6Hd3yK19V-PoG5IbFI9FEEVggyrshRg,15227
-beswarm/aient/src/aient/plugins/write_file.py,sha256=qmT6iQ3mDyVAa9Sld1jfJq0KPZj0w2kRIHq0JyjpGeA,1853
+beswarm/aient/src/aient/plugins/websearch.py,sha256=llxy1U0vJiNMiKvamMr4p7IruLb3nnDR4YErz8TYimc,15215
+beswarm/aient/src/aient/plugins/write_file.py,sha256=YRvQKMvV-5lwohxlvwt9hjfxz2dRJP85AJWAMUIqbBY,3804
 beswarm/aient/src/aient/prompt/__init__.py,sha256=GBtn6-JDT8KHFCcuPpfSNE_aGddg5p4FEyMCy4BfwGs,20
 beswarm/aient/src/aient/prompt/agent.py,sha256=6f5ZB66Rb8y0iQScHMRhvXZ1qMM3YsKpCBPCTAAw2rg,24917
 beswarm/aient/src/aient/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -119,15 +119,16 @@ beswarm/queries/tree-sitter-languages/rust-tags.scm,sha256=9ljM1nzhfPs_ZTRw7cr2P
 beswarm/queries/tree-sitter-languages/scala-tags.scm,sha256=UxQjz80JIrrJ7Pm56uUnQyThfmQNvwk7aQzPNypB-Ao,1761
 beswarm/queries/tree-sitter-languages/typescript-tags.scm,sha256=OMdCeedPiA24ky82DpgTMKXK_l2ySTuF2zrQ2fJAi9E,1253
 beswarm/tools/UIworker.py,sha256=YRrzW5GxWqA-tcmmm2c6mMbkVI0kHIqosIUz-GcoQOQ,6339
-beswarm/tools/__init__.py,sha256=-h_zoMEjnLCg9iVgSoor9BI3yK64LdVOQkVB0DgGFmo,1001
+beswarm/tools/__init__.py,sha256=EKOiLDGDrJ5GPM31SYtsYzDGSri_EINnO8M9ud0BifU,1054
 beswarm/tools/click.py,sha256=TygaekCXTmU3fIu6Uom7ZcyzEgYMlCC_GX-5SmWHuLI,20762
 beswarm/tools/edit_file.py,sha256=hfpLaE4ekDiAya0Le0fJuYa-xUefWHLTxc3F6zGZd7M,6912
 beswarm/tools/planner.py,sha256=lguBCS6kpwNPoXQvqH-WySabVubT82iyWOkJnjt6dXw,1265
 beswarm/tools/repomap.py,sha256=CwvwoN5Swr42EzrORTTeV8MMb7mPviy4a4b0fxBu50k,40828
 beswarm/tools/search_arxiv.py,sha256=9slwBemXjEqrd7-YgVmyMijPXlkhZCybEDRVhWVQ9B0,7937
+beswarm/tools/search_web.py,sha256=B24amOnGHnmdV_6S8bw8O2PdhZRRIDtJjg-wXcfP7dQ,11859
 beswarm/tools/think.py,sha256=WLw-7jNIsnS6n8MMSYUin_f-BGLENFmnKM2LISEp0co,1760
 beswarm/tools/worker.py,sha256=FfKCx7KFNbMRoAXtjU1_nJQjx9WHny7KBq8OXSYICJs,5334
-beswarm-0.1.51.dist-info/METADATA,sha256=MRqRv-NZv77ggRrhlNPiCoD8yFQES8iUyJdb1mNi_vA,3537
-beswarm-0.1.51.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
-beswarm-0.1.51.dist-info/top_level.txt,sha256=pJw4O87wvt5882smuSO6DfByJz7FJ8SxxT8h9fHCmpo,8
-beswarm-0.1.51.dist-info/RECORD,,
+beswarm-0.1.53.dist-info/METADATA,sha256=N0kOlCXH6zGdg6F7a44dI1Ao9hUmi5Gt6fDJ6Kz699Q,3537
+beswarm-0.1.53.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
+beswarm-0.1.53.dist-info/top_level.txt,sha256=pJw4O87wvt5882smuSO6DfByJz7FJ8SxxT8h9fHCmpo,8
+beswarm-0.1.53.dist-info/RECORD,,