beswarm 0.2.24__py3-none-any.whl → 0.2.25__py3-none-any.whl
This diff shows the changes between two package versions as published to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- beswarm/tools/search_web.py +92 -20
- {beswarm-0.2.24.dist-info → beswarm-0.2.25.dist-info}/METADATA +1 -1
- {beswarm-0.2.24.dist-info → beswarm-0.2.25.dist-info}/RECORD +5 -5
- {beswarm-0.2.24.dist-info → beswarm-0.2.25.dist-info}/WHEEL +0 -0
- {beswarm-0.2.24.dist-info → beswarm-0.2.25.dist-info}/top_level.txt +0 -0
beswarm/tools/search_web.py
CHANGED
@@ -4,6 +4,7 @@ import json
 import httpx
 from urllib.parse import quote_plus
 import threading
+import time
 
 from ..aient.src.aient.plugins import register_tool, get_url_content # Assuming a similar plugin structure
 
@@ -101,6 +102,17 @@ async def search_web(query: str):
         except ValueError:
             pass
 
+        # 2.5. Decode Unicode escape sequences (e.g. \u003d -> =)
+        try:
+            def replace_unicode(match):
+                return chr(int(match.group(1), 16))
+            # Only find and replace sequences in \uXXXX format
+            url_str = re.sub(r'\\u([0-9a-fA-F]{4})', replace_unicode, url_str)
+        except (ValueError, TypeError):
+            # If conversion fails (e.g. a malformed sequence), ignore it
+            print(f"Error decoding Unicode escape sequence in URL: {url_str}")
+            pass
+
         # 3. Decode HTML entities (e.g. &amp; -> &)
         url_str = html.unescape(url_str)
 
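For illustration, here is the added decoding step as a standalone sketch (the sample URL is made up; the regex and replacement mirror the diff):

import re

def decode_unicode_escapes(url_str: str) -> str:
    # Turn literal \uXXXX sequences back into the characters they encode.
    def replace_unicode(match):
        return chr(int(match.group(1), 16))
    return re.sub(r'\\u([0-9a-fA-F]{4})', replace_unicode, url_str)

# Hypothetical redirect URL with escaped '=' and '&':
print(decode_unicode_escapes(r"https://example.com/?q\u003dtest\u0026page\u003d2"))
# -> https://example.com/?q=test&page=2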
@@ -142,20 +154,35 @@ async def search_web(query: str):
     excluded_domains = [
         "www.w3.org",
         "www.google.com",
-        "ssl.gstatic.com",
         "translate.google.com",
+        "id.google.com",
+        "lens.google.com",
+        "ssl.gstatic.com",
         "www.googleadservices.com",
         "gstatic.com",
-        "lens.google.com",
         "schema.org",
-        "id.google.com",
         "maps.google.com",
         "clients6.google.com",
         "ogs.google.com",
         "policies.google.com",
         "support.google.com",
         "tpc.googlesyndication.com",
-        "adssettings.google.com"
+        "adssettings.google.com",
+    ]
+
+    full_excluded_urls = [
+        "https://google.com",
+        "https://patents.google.com",
+        "https://patentpc.com",
+        "https://www.mdpi.com",
+        "https://trackobit.com",
+        "https://www.researchgate.net",
+        "https://www.sciencedirect.com",
+        "https://rosap.ntl.bts.gov",
+        "https://portal.unifiedpatents.com",
+        "https://ieeexplore.ieee.org",
+        "https://files-backend.assets.thrillshare.com",
+        "https://patentimages.storage.googleapis.com",
     ]
 
     final_urls_before_dedup = []
@@ -171,7 +198,7 @@ async def search_web(query: str):
         if normalized_url and not normalized_url.startswith(('http://', 'https://')):
             normalized_url = 'https://' + normalized_url
 
-        if normalized_url:
+        if normalized_url and normalized_url not in full_excluded_urls:
            final_urls_before_dedup.append(normalized_url)
 
    # 10. Deduplicate
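The effect of the changed condition, as a minimal standalone sketch (the URLs are made up; the list name matches the diff):

full_excluded_urls = ["https://patents.google.com"]  # abbreviated

final_urls_before_dedup = []
for url in ["patents.google.com", "example.org/paper"]:
    normalized_url = url if url.startswith(('http://', 'https://')) else 'https://' + url
    # New in 0.2.25: exact-match URLs are dropped in addition to the domain filter.
    if normalized_url and normalized_url not in full_excluded_urls:
        final_urls_before_dedup.append(normalized_url)

print(final_urls_before_dedup)  # ['https://example.org/paper']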
@@ -188,16 +215,16 @@ async def search_web(query: str):
     if results and isinstance(results, list) and len(results) > 0:
         # print(f"Fetching content for {len(results)} URLs...")
 
-
+        threads_with_links = []
         for i, link in enumerate(results):
             print(f"Processing URL {i + 1}/{len(results)}: {link}")
             # Assuming get_url_content is synchronous and returns a string or None
             # content_text = get_url_content(link)
             url_search_thread = ThreadWithReturnValue(target=get_url_content, args=(link,))
             url_search_thread.start()
-
+            threads_with_links.append((url_search_thread, link))
 
-        for thread in
+        for thread, link in threads_with_links:
             content_text = thread.join()
             # content_text = thread.get_result()
             if content_text and len(content_text.split("\n\n")) > 10: # Ensure content_text is not None or empty before adding
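`ThreadWithReturnValue` is not defined in this diff; a common minimal implementation of the pattern (a sketch, not necessarily beswarm's actual class) is a `threading.Thread` subclass whose `join()` returns the target's result:

import threading

class ThreadWithReturnValue(threading.Thread):
    """Thread whose join() returns the target function's return value."""
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._return = None

    def run(self):
        if self._target is not None:
            self._return = self._target(*self._args, **self._kwargs)

    def join(self, timeout=None):
        super().join(timeout)
        return self._return

# Usage mirroring the patched loop (fetch is a stand-in for get_url_content):
def fetch(link):
    return f"content of {link}"

threads_with_links = []
for link in ["https://a.example", "https://b.example"]:
    t = ThreadWithReturnValue(target=fetch, args=(link,))
    t.start()
    threads_with_links.append((t, link))

for t, link in threads_with_links:
    print(link, "->", t.join())

Collecting the (thread, link) pairs before joining is what lets all fetches run concurrently instead of serializing on each join, which is the point of this fix.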
@@ -229,6 +256,7 @@ async def search_web(query: str):
     to_keep_flags = [True] * n # Flags to mark which items to keep
 
     # print("Starting similarity comparison...")
+    # start_time = time.time()
     for i in range(n):
         if not to_keep_flags[i]: # Skip if item i is already marked for discard
             continue
@@ -246,14 +274,16 @@ async def search_web(query: str):
                 content_j = str(content_j) # Fallback
 
             similarity = calculate_similarity(content_i, content_j)
+            # print(f"Similarity between {web_contents_raw[i]['url']} and {web_contents_raw[j]['url']}: {similarity:.4f}")
 
-            if similarity > 0.
+            if similarity > 0.5:
                 # print(f"Similarity > 0.9 ({similarity:.4f}) between content from '{web_contents_raw[i]['url']}' and '{web_contents_raw[j]['url']}'. Discarding the latter.")
                 to_keep_flags[j] = False # Discard the second item (item j)
 
     final_web_content = [web_contents_raw[i] for i in range(n) if to_keep_flags[i]]
     # print(f"Number of items after filtering: {len(final_web_content)}")
-
+    # end_time = time.time()
+    # print(f"Time taken: {end_time - start_time:.2f} seconds")
     # output_filename = "web_content_filtered.json"
     # with open(output_filename, "w", encoding="utf-8") as f:
     #     json.dump(final_web_content, f, indent=2, ensure_ascii=False)
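The surrounding loop (only partially visible in this hunk) performs pairwise O(n²) dedup: for each kept item i, every later item j whose similarity exceeds 0.5 is flagged for discard. A minimal sketch of that pattern with toy strings, using difflib directly in place of calculate_similarity:

import difflib

def dedup_by_similarity(items, threshold=0.5):
    # Keep the first of any pair whose similarity exceeds the threshold.
    n = len(items)
    keep = [True] * n
    for i in range(n):
        if not keep[i]:
            continue
        for j in range(i + 1, n):
            if keep[j] and difflib.SequenceMatcher(None, items[i], items[j]).ratio() > threshold:
                keep[j] = False  # discard the later near-duplicate
    return [items[i] for i in range(n) if keep[i]]

print(dedup_by_similarity(["apple pie recipe", "apple pie recipe!", "quantum flux data"]))
# -> ['apple pie recipe', 'quantum flux data'] (the near-duplicate is dropped)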
@@ -270,16 +300,56 @@ import difflib
 
 
 def calculate_similarity(string1: str, string2: str) -> float:
-    """Calculates the similarity ratio between two strings.
-
-    Args:
-        string1: The first string.
-        string2: The second string.
-
-    Returns:
-        A float between 0 and 1, where 1 means the strings are identical
-        and 0 means they are completely different.
     """
+    Per your final feedback, this integrates a multi-stage filtering strategy to optimize performance, with all changes kept inside the function.
+
+    Optimization approach:
+    1. Length filter: a fast check using the min/max ratio method. If the mathematical upper bound
+       of the difflib similarity is already below the threshold used in the main loop (0.5), exit immediately.
+    2. Chunk filter: adopts the chunking idea you proposed. The shorter string is split into 20 chunks,
+       and we quickly count how many of them also appear in the other string. This content pre-filter costs far less than difflib.
+       - If the overlap is high (>80%), we can conclude "similar" early.
+       - If the overlap is low (<20%), we can conclude "not similar" early.
+    3. Final exact computation: only when the similarity falls into the "gray zone" that the first
+       two stages cannot decide do we invoke the most accurate but most expensive difflib for the final verdict.
+    """
+    len1, len2 = len(string1), len(string2)
+
+    # Stage 1 filter: length check (very cheap)
+    # 2.0 * min(len1, len2) / (len1 + len2) is the mathematical upper bound of difflib.ratio().
+    # The 0.5 threshold here must stay consistent with `if similarity > 0.5:` in the main loop.
+    # print(len1, len2, (2.0 * min(len1, len2) / (len1 + len2)))
+    if not len1 or not len2 or (2.0 * min(len1, len2) / (len1 + len2)) < 0.5:
+        return 0.0
+
+    # Chunking is pointless for short strings; compare directly
+    if len1 < 40 or len2 < 40:
+        return difflib.SequenceMatcher(None, string1, string2).ratio()
+
+    # Stage 2 filter: chunk check (moderate cost)
+    shorter_str, longer_str = (string1, string2) if len1 < len2 else (string2, string1)
+
+    num_chunks = 1000
+    chunk_size = len(shorter_str) // num_chunks
+
+    # Because of the len < 40 check above, chunk_size cannot be 0 here, so the previous `if chunk_size == 0` was redundant.
+    matching_chunks = 0
+    for i in range(num_chunks):
+        start = i * chunk_size
+        chunk = shorter_str[start:start+chunk_size]
+        if chunk in longer_str:
+            matching_chunks += 1
+
+    match_ratio = matching_chunks / num_chunks
+    # print(matching_chunks, match_ratio)
+
+    # Decide based on the chunk match ratio; these thresholds are empirical heuristics.
+    if match_ratio > 0.8:  # More than 80% of chunks match: almost certainly highly similar
+        return match_ratio  # Return a high value that is sure to pass the main-loop check
+    if match_ratio < 0.2:  # Fewer than 20% of chunks match: similarity is very unlikely
+        return match_ratio
+
+    # Stage 3: final exact computation (expensive)
     return difflib.SequenceMatcher(None, string1, string2).ratio()
 
 if __name__ == '__main__':
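Two things are worth flagging in this hunk: the docstring says 20 chunks while the code sets num_chunks = 1000, and with num_chunks = 1000 the len < 40 guard does not actually prevent chunk_size == 0 (any shorter string under 1000 characters yields chunk_size = 0, and empty chunks trivially match, driving match_ratio to 1.0). Below is a standalone sketch of the same three-stage idea with an explicit chunk_size guard, for experimentation only, not a representation of the released code:

import difflib

def calculate_similarity_sketch(s1: str, s2: str, threshold: float = 0.5) -> float:
    len1, len2 = len(s1), len(s2)
    # Stage 1: 2*min/(len1+len2) is the upper bound of difflib.ratio(); bail out early.
    if not len1 or not len2 or (2.0 * min(len1, len2) / (len1 + len2)) < threshold:
        return 0.0
    shorter, longer = (s1, s2) if len1 < len2 else (s2, s1)
    num_chunks = 1000
    chunk_size = len(shorter) // num_chunks
    if chunk_size == 0:
        # Short strings: chunking would produce empty (always-matching) chunks.
        return difflib.SequenceMatcher(None, s1, s2).ratio()
    # Stage 2: cheap containment pre-filter on fixed-size chunks of the shorter string.
    matching = sum(
        1 for i in range(num_chunks)
        if shorter[i * chunk_size:(i + 1) * chunk_size] in longer
    )
    match_ratio = matching / num_chunks
    if match_ratio > 0.8 or match_ratio < 0.2:
        return match_ratio
    # Stage 3: exact but expensive.
    return difflib.SequenceMatcher(None, s1, s2).ratio()

print(calculate_similarity_sketch("lorem ipsum " * 200, "lorem ipsum " * 200))  # 1.0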
@@ -289,7 +359,9 @@ if __name__ == '__main__':
     async def main():
         # Example usage
         # search_query = "美国"
-        search_query = "machine learning models for higher heating value prediction using proximate vs ultimate analysis"
+        # search_query = "machine learning models for higher heating value prediction using proximate vs ultimate analysis"
+        # search_query = "patent driver cognitive load monitoring micro-expression thermal imaging fusion"
+        search_query = "patent predictive driver fatigue warning V2X data fusion driving behavior sequence"
         print(f"Performing web search for: '{search_query}'")
         results = await search_web(search_query) # results is a list of URLs
 
{beswarm-0.2.24.dist-info → beswarm-0.2.25.dist-info}/RECORD
CHANGED
@@ -133,10 +133,10 @@ beswarm/tools/repomap.py,sha256=YsTPq5MXfn_Ds5begcvHDnY_Xp2d4jH-xmWqNMHnNHY,4523
 beswarm/tools/request_input.py,sha256=gXNAJPOJektMqxJVyzNTFOeMQ7xUkO-wWMYH-r2Rdwk,942
 beswarm/tools/screenshot.py,sha256=u6t8FCgW5YHJ_Oc4coo8e0F3wTusWE_-H8dFh1rBq9Q,1011
 beswarm/tools/search_arxiv.py,sha256=caVIUOzMhFu-r_gVgJZrH2EO9xI5iV_qLAg0b3Ie9Xg,8095
-beswarm/tools/search_web.py,sha256=
+beswarm/tools/search_web.py,sha256=eEE_aRcocttAwWfkcQdElI_BZw73xiRIEfbHDWAoQqU,15996
 beswarm/tools/taskmanager.py,sha256=oB_768qy6Lb58JNIcSLVgbPrgNB3duIq9DawbVHRbrg,6270
 beswarm/tools/worker.py,sha256=Vwn1XuTZ2dIStd5dQ6DhJ4f7LmwDc-Sx9PwLG0Xw-MQ,24062
-beswarm-0.2.
-beswarm-0.2.
-beswarm-0.2.
-beswarm-0.2.
+beswarm-0.2.25.dist-info/METADATA,sha256=Wwq8gqLH0xv1KMUkNsbfZEw6XRGKNORHzNTMXyPLjxs,3847
+beswarm-0.2.25.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+beswarm-0.2.25.dist-info/top_level.txt,sha256=pJw4O87wvt5882smuSO6DfByJz7FJ8SxxT8h9fHCmpo,8
+beswarm-0.2.25.dist-info/RECORD,,
{beswarm-0.2.24.dist-info → beswarm-0.2.25.dist-info}/WHEEL: File without changes
{beswarm-0.2.24.dist-info → beswarm-0.2.25.dist-info}/top_level.txt: File without changes