beswarm 0.2.24__py3-none-any.whl → 0.2.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- beswarm/tools/search_web.py
+++ beswarm/tools/search_web.py
@@ -4,6 +4,7 @@ import json
 import httpx
 from urllib.parse import quote_plus
 import threading
+import time
 
 from ..aient.src.aient.plugins import register_tool, get_url_content  # Assuming a similar plugin structure
 
@@ -101,6 +102,17 @@ async def search_web(query: str):
         except ValueError:
             pass
 
+        # 2.5. Decode Unicode escape sequences (e.g. \u003d -> =)
+        try:
+            def replace_unicode(match):
+                return chr(int(match.group(1), 16))
+            # Only match and replace sequences of the form \uXXXX
+            url_str = re.sub(r'\\u([0-9a-fA-F]{4})', replace_unicode, url_str)
+        except (ValueError, TypeError):
+            # Ignore conversion failures (e.g. a malformed sequence)
+            print(f"Error decoding Unicode escape sequence in URL: {url_str}")
+            pass
+
         # 3. Decode HTML entities (e.g. &amp; -> &)
         url_str = html.unescape(url_str)
 
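The new step 2.5 handles URLs whose query strings still carry literal JavaScript-style \uXXXX escapes, as happens when links are scraped out of JSON embedded in the results page. A minimal standalone sketch of the same transformation, using a made-up url_str:

    import re

    def replace_unicode(match):
        # Parse the four hex digits and map them to the character they encode.
        return chr(int(match.group(1), 16))

    url_str = "https://example.com/search?q\\u003dquery"  # literal \u003d, i.e. '='
    url_str = re.sub(r'\\u([0-9a-fA-F]{4})', replace_unicode, url_str)
    print(url_str)  # https://example.com/search?q=query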
@@ -142,20 +154,35 @@ async def search_web(query: str):
     excluded_domains = [
         "www.w3.org",
         "www.google.com",
-        "ssl.gstatic.com",
         "translate.google.com",
+        "id.google.com",
+        "lens.google.com",
+        "ssl.gstatic.com",
         "www.googleadservices.com",
         "gstatic.com",
-        "lens.google.com",
         "schema.org",
-        "id.google.com",
         "maps.google.com",
         "clients6.google.com",
         "ogs.google.com",
         "policies.google.com",
         "support.google.com",
         "tpc.googlesyndication.com",
-        "adssettings.google.com"
+        "adssettings.google.com",
+    ]
+
+    full_excluded_urls = [
+        "https://google.com",
+        "https://patents.google.com",
+        "https://patentpc.com",
+        "https://www.mdpi.com",
+        "https://trackobit.com",
+        "https://www.researchgate.net",
+        "https://www.sciencedirect.com",
+        "https://rosap.ntl.bts.gov",
+        "https://portal.unifiedpatents.com",
+        "https://ieeexplore.ieee.org",
+        "https://files-backend.assets.thrillshare.com",
+        "https://patentimages.storage.googleapis.com",
     ]
 
     final_urls_before_dedup = []
@@ -171,7 +198,7 @@ async def search_web(query: str):
         if normalized_url and not normalized_url.startswith(('http://', 'https://')):
             normalized_url = 'https://' + normalized_url
 
-        if normalized_url:
+        if normalized_url and normalized_url not in full_excluded_urls:
             final_urls_before_dedup.append(normalized_url)
 
     # 10. Deduplicate
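Worth noting: the new guard is exact string equality, not a prefix or domain match, so only the bare root URLs listed in full_excluded_urls are dropped; deeper paths on the same hosts still get through. A quick illustration with hypothetical inputs:

    full_excluded_urls = ["https://patents.google.com"]

    for normalized_url in ("https://patents.google.com",
                           "https://patents.google.com/patent/US9999999"):
        kept = normalized_url not in full_excluded_urls
        print(normalized_url, "->", "kept" if kept else "dropped")
    # https://patents.google.com -> dropped
    # https://patents.google.com/patent/US9999999 -> kept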
@@ -188,16 +215,16 @@ async def search_web(query: str):
     if results and isinstance(results, list) and len(results) > 0:
         # print(f"Fetching content for {len(results)} URLs...")
 
-        threads = []
+        threads_with_links = []
         for i, link in enumerate(results):
             print(f"Processing URL {i + 1}/{len(results)}: {link}")
             # Assuming get_url_content is synchronous and returns a string or None
             # content_text = get_url_content(link)
             url_search_thread = ThreadWithReturnValue(target=get_url_content, args=(link,))
             url_search_thread.start()
-            threads.append(url_search_thread)
+            threads_with_links.append((url_search_thread, link))
 
-        for thread in threads:
+        for thread, link in threads_with_links:
             content_text = thread.join()
             # content_text = thread.get_result()
             if content_text and len(content_text.split("\n\n")) > 10:  # Ensure content_text is not None or empty before adding
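ThreadWithReturnValue itself is not shown in this diff; the loop only relies on join() handing back the target's return value, and each thread is now paired with the link it fetched. A common implementation of that pattern looks like the sketch below (an assumption about the class, not the package's actual code):

    import threading

    class ThreadWithReturnValue(threading.Thread):
        # A Thread whose join() returns whatever the target callable returned.
        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self._return = None

        def run(self):
            if self._target is not None:
                self._return = self._target(*self._args, **self._kwargs)

        def join(self, timeout=None):
            super().join(timeout)
            return self._return

Keeping the (thread, link) pairs means a result or failure can be traced back to the URL that produced it, which the old bare threads list could not do.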
@@ -229,6 +256,7 @@ async def search_web(query: str):
     to_keep_flags = [True] * n  # Flags to mark which items to keep
 
     # print("Starting similarity comparison...")
+    # start_time = time.time()
     for i in range(n):
         if not to_keep_flags[i]:  # Skip if item i is already marked for discard
             continue
@@ -246,14 +274,16 @@ async def search_web(query: str):
                 content_j = str(content_j)  # Fallback
 
             similarity = calculate_similarity(content_i, content_j)
+            # print(f"Similarity between {web_contents_raw[i]['url']} and {web_contents_raw[j]['url']}: {similarity:.4f}")
 
-            if similarity > 0.9:
+            if similarity > 0.5:
                 # print(f"Similarity > 0.5 ({similarity:.4f}) between content from '{web_contents_raw[i]['url']}' and '{web_contents_raw[j]['url']}'. Discarding the latter.")
                 to_keep_flags[j] = False  # Discard the second item (item j)
 
     final_web_content = [web_contents_raw[i] for i in range(n) if to_keep_flags[i]]
     # print(f"Number of items after filtering: {len(final_web_content)}")
-
+    # end_time = time.time()
+    # print(f"Time taken: {end_time - start_time:.2f} seconds")
     # output_filename = "web_content_filtered.json"
     # with open(output_filename, "w", encoding="utf-8") as f:
     #     json.dump(final_web_content, f, indent=2, ensure_ascii=False)
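Dropping the similarity threshold from 0.9 to 0.5 makes the dedupe far more aggressive: pages that merely share half their content are now collapsed. For a feel of the scale, difflib's ratio for two strings that differ only in their last word is already well above 0.5:

    import difflib

    a = "the quick brown fox"
    b = "the quick brown cat"
    print(f"{difflib.SequenceMatcher(None, a, b).ratio():.3f}")
    # 0.842 -> kept as distinct at the old 0.9 threshold, merged at the new 0.5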
@@ -270,16 +300,56 @@ import difflib
 
 
 def calculate_similarity(string1: str, string2: str) -> float:
-    """Calculates the similarity ratio between two strings.
-
-    Args:
-        string1: The first string.
-        string2: The second string.
-
-    Returns:
-        A float between 0 and 1, where 1 means the strings are identical
-        and 0 means they are completely different.
     """
+    Multi-stage filtering strategy to keep similarity checks fast; all
+    changes are contained within this function.
+
+    Optimization approach:
+    1. Length filter: a quick min/max-ratio check. If the mathematical upper
+       bound of the difflib ratio is already below the threshold used in the
+       main loop (0.5), bail out immediately.
+    2. Chunk filter: split the shorter string into a fixed number of chunks
+       and count how many of them also occur in the longer string. This
+       content pre-filter is far cheaper than difflib.
+       - If the overlap is high (>80%), declare the strings similar early.
+       - If the overlap is low (<20%), declare them dissimilar early.
+    3. Final exact computation: only when the similarity lands in the middle
+       zone, where the first two filters cannot decide, do we fall back to
+       the precise but expensive difflib comparison.
+    """
+    len1, len2 = len(string1), len(string2)
+
+    # Stage 1: length check (very cheap).
+    # 2.0 * min(len1, len2) / (len1 + len2) is the mathematical upper bound
+    # of difflib.ratio(). The 0.5 threshold here must stay in sync with the
+    # `if similarity > 0.5:` check in the main loop.
+    # print(len1, len2, (2.0 * min(len1, len2) / (len1 + len2)))
+    if not len1 or not len2 or (2.0 * min(len1, len2) / (len1 + len2)) < 0.5:
+        return 0.0
+
+    # Chunking is pointless for short strings; compare them directly.
+    if len1 < 40 or len2 < 40:
+        return difflib.SequenceMatcher(None, string1, string2).ratio()
+
+    # Stage 2: chunk check (moderate cost).
+    shorter_str, longer_str = (string1, string2) if len1 < len2 else (string2, string1)
+
+    # Cap num_chunks at the shorter length so chunk_size is never 0: a plain
+    # len // 1000 would be 0 for strings shorter than 1000 characters, and an
+    # empty chunk trivially matches, which would inflate the ratio.
+    num_chunks = min(1000, len(shorter_str))
+    chunk_size = len(shorter_str) // num_chunks
+
+    matching_chunks = 0
+    for i in range(num_chunks):
+        start = i * chunk_size
+        chunk = shorter_str[start:start+chunk_size]
+        if chunk in longer_str:
+            matching_chunks += 1
+
+    match_ratio = matching_chunks / num_chunks
+    # print(matching_chunks, match_ratio)
+
+    # Decide from the chunk match ratio; these thresholds are heuristics.
+    if match_ratio > 0.8:  # Over 80% of chunks match: almost certainly highly similar.
+        return match_ratio  # High enough to pass the main loop's threshold.
+    if match_ratio < 0.2:  # Under 20% of chunks match: similarity is very unlikely.
+        return match_ratio
+
+    # Stage 3: final exact computation (expensive).
     return difflib.SequenceMatcher(None, string1, string2).ratio()
 
 if __name__ == '__main__':
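The stage-1 bound is the same quantity difflib exposes as SequenceMatcher.real_quick_ratio(): ratio() equals 2*M/(len1+len2) where M is the number of matched characters, and M can never exceed min(len1, len2). A quick check that ratio() really is capped by the length bound:

    import difflib

    s1, s2 = "a" * 100, "a" * 20
    bound = 2.0 * min(len(s1), len(s2)) / (len(s1) + len(s2))  # 0.333, below 0.5
    sm = difflib.SequenceMatcher(None, s1, s2)
    print(f"{bound:.3f} {sm.real_quick_ratio():.3f} {sm.ratio():.3f}")
    # 0.333 0.333 0.333 -> such a pair can be rejected without running ratio()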
@@ -289,7 +359,9 @@ if __name__ == '__main__':
     async def main():
         # Example usage
         # search_query = "美国"
-        search_query = "machine learning models for higher heating value prediction using proximate vs ultimate analysis"
+        # search_query = "machine learning models for higher heating value prediction using proximate vs ultimate analysis"
+        # search_query = "patent driver cognitive load monitoring micro-expression thermal imaging fusion"
+        search_query = "patent predictive driver fatigue warning V2X data fusion driving behavior sequence"
         print(f"Performing web search for: '{search_query}'")
         results = await search_web(search_query)  # results is a list of URLs
 
--- beswarm-0.2.24.dist-info/METADATA
+++ beswarm-0.2.25.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: beswarm
-Version: 0.2.24
+Version: 0.2.25
 Summary: MAS
 Requires-Python: >=3.11
 Description-Content-Type: text/markdown
--- beswarm-0.2.24.dist-info/RECORD
+++ beswarm-0.2.25.dist-info/RECORD
@@ -133,10 +133,10 @@ beswarm/tools/repomap.py,sha256=YsTPq5MXfn_Ds5begcvHDnY_Xp2d4jH-xmWqNMHnNHY,4523
 beswarm/tools/request_input.py,sha256=gXNAJPOJektMqxJVyzNTFOeMQ7xUkO-wWMYH-r2Rdwk,942
 beswarm/tools/screenshot.py,sha256=u6t8FCgW5YHJ_Oc4coo8e0F3wTusWE_-H8dFh1rBq9Q,1011
 beswarm/tools/search_arxiv.py,sha256=caVIUOzMhFu-r_gVgJZrH2EO9xI5iV_qLAg0b3Ie9Xg,8095
-beswarm/tools/search_web.py,sha256=tLdw63doMTorrCG3ZoQkKvQPYBdx-m-SJskAXxfdim8,11958
+beswarm/tools/search_web.py,sha256=eEE_aRcocttAwWfkcQdElI_BZw73xiRIEfbHDWAoQqU,15996
 beswarm/tools/taskmanager.py,sha256=oB_768qy6Lb58JNIcSLVgbPrgNB3duIq9DawbVHRbrg,6270
 beswarm/tools/worker.py,sha256=Vwn1XuTZ2dIStd5dQ6DhJ4f7LmwDc-Sx9PwLG0Xw-MQ,24062
-beswarm-0.2.24.dist-info/METADATA,sha256=PbLmlEKhy1mc1phhuG61O-XtynVlNWpG0y0NKB9Szho,3847
-beswarm-0.2.24.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-beswarm-0.2.24.dist-info/top_level.txt,sha256=pJw4O87wvt5882smuSO6DfByJz7FJ8SxxT8h9fHCmpo,8
-beswarm-0.2.24.dist-info/RECORD,,
+beswarm-0.2.25.dist-info/METADATA,sha256=Wwq8gqLH0xv1KMUkNsbfZEw6XRGKNORHzNTMXyPLjxs,3847
+beswarm-0.2.25.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+beswarm-0.2.25.dist-info/top_level.txt,sha256=pJw4O87wvt5882smuSO6DfByJz7FJ8SxxT8h9fHCmpo,8
+beswarm-0.2.25.dist-info/RECORD,,
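
The sha256= values in RECORD follow the wheel format (PEP 376/PEP 427): the urlsafe base64 encoding of the file's SHA-256 digest, with trailing '=' padding stripped. A sketch for verifying an entry against a locally extracted wheel (the path is whichever file you want to check):

    import base64
    import hashlib

    def record_hash(path: str) -> str:
        with open(path, "rb") as f:
            digest = hashlib.sha256(f.read()).digest()
        return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

    # record_hash("beswarm/tools/search_web.py") should print
    # "sha256=eEE_aRcocttAwWfkcQdElI_BZw73xiRIEfbHDWAoQqU" for the 0.2.25 wheel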