signalwire-agents 0.1.54__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- signalwire_agents/__init__.py +1 -1
- signalwire_agents/cli/build_search.py +3 -3
- signalwire_agents/search/pgvector_backend.py +17 -11
- signalwire_agents/search/query_processor.py +28 -31
- signalwire_agents/search/search_engine.py +8 -8
- signalwire_agents/search/search_service.py +44 -11
- signalwire_agents/skills/native_vector_search/skill.py +16 -10
- signalwire_agents/skills/web_search/skill.py +519 -124
- {signalwire_agents-0.1.54.dist-info → signalwire_agents-1.0.0.dist-info}/METADATA +1 -1
- {signalwire_agents-0.1.54.dist-info → signalwire_agents-1.0.0.dist-info}/RECORD +14 -14
- {signalwire_agents-0.1.54.dist-info → signalwire_agents-1.0.0.dist-info}/WHEEL +0 -0
- {signalwire_agents-0.1.54.dist-info → signalwire_agents-1.0.0.dist-info}/entry_points.txt +0 -0
- {signalwire_agents-0.1.54.dist-info → signalwire_agents-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {signalwire_agents-0.1.54.dist-info → signalwire_agents-1.0.0.dist-info}/top_level.txt +0 -0
signalwire_agents/skills/web_search/skill.py

@@ -10,17 +10,18 @@ See LICENSE file in the project root for full license information.
 import os
 import requests
 import time
+import re
 from urllib.parse import urljoin, urlparse
 from bs4 import BeautifulSoup
 import json
-from typing import Optional, List, Dict, Any
+from typing import Optional, List, Dict, Any, Tuple

 from signalwire_agents.core.skill_base import SkillBase
 from signalwire_agents.core.function_result import SwaigFunctionResult

 class GoogleSearchScraper:
-    """Google Search and Web Scraping functionality"""
-
+    """Google Search and Web Scraping functionality with quality scoring"""
+
     def __init__(self, api_key: str, search_engine_id: str, max_content_length: int = 32768):
         self.api_key = api_key
         self.search_engine_id = search_engine_id
@@ -33,22 +34,22 @@ class GoogleSearchScraper:
     def search_google(self, query: str, num_results: int = 5) -> list:
         """Search Google using Custom Search JSON API"""
         url = "https://www.googleapis.com/customsearch/v1"
-
+
         params = {
             'key': self.api_key,
             'cx': self.search_engine_id,
             'q': query,
             'num': min(num_results, 10)
         }
-
+
         try:
             response = self.session.get(url, params=params)
             response.raise_for_status()
             data = response.json()
-
+
             if 'items' not in data:
                 return []
-
+
             results = []
             for item in data['items'][:num_results]:
                 results.append({
@@ -56,19 +57,149 @@ class GoogleSearchScraper:
                     'url': item.get('link', ''),
                     'snippet': item.get('snippet', '')
                 })
-
+
             return results
-
+
         except Exception as e:
             return []

-    def
-        """
+    def is_reddit_url(self, url: str) -> bool:
+        """Check if URL is from Reddit"""
+        domain = urlparse(url).netloc.lower()
+        return 'reddit.com' in domain or 'redd.it' in domain

-
-
-
-
+    def extract_reddit_content(self, url: str, content_limit: int = None, timeout: int = 10) -> Tuple[str, Dict[str, Any]]:
+        """
+        Extract Reddit content using JSON API for better quality
+
+        Returns:
+            Tuple of (text_content, quality_metrics)
+        """
+        try:
+            # Convert to JSON endpoint
+            if not url.endswith('.json'):
+                json_url = url.rstrip('/') + '.json'
+            else:
+                json_url = url
+
+            # Fetch with proper headers (Reddit requires User-Agent)
+            headers = {'User-Agent': 'SignalWire-WebSearch/2.0'}
+            response = requests.get(json_url, headers=headers, timeout=timeout)
+            response.raise_for_status()
+
+            data = response.json()
+
+            # Extract post information
+            if not data or not isinstance(data, list) or len(data) < 1:
+                return "", {"error": "Invalid Reddit JSON structure", "quality_score": 0}
+
+            # First element is the post, second (if exists) contains comments
+            post_data = data[0]['data']['children'][0]['data']
+
+            # Build content from post
+            content_parts = []
+
+            # Add post title and metadata
+            title = post_data.get('title', 'No title')
+            author = post_data.get('author', 'unknown')
+            score = post_data.get('score', 0)
+            num_comments = post_data.get('num_comments', 0)
+            subreddit = post_data.get('subreddit', '')
+
+            content_parts.append(f"Reddit r/{subreddit} Discussion")
+            content_parts.append(f"\nPost: {title}")
+            content_parts.append(f"Author: {author} | Score: {score} | Comments: {num_comments}")
+
+            # Add original post text if it's a text post
+            selftext = post_data.get('selftext', '').strip()
+            if selftext and selftext != '[removed]' and selftext != '[deleted]':
+                content_parts.append(f"\nOriginal Post:\n{selftext[:1000]}")  # Limit post text
+
+            # Extract top comments if available
+            valid_comments = []
+            if len(data) > 1 and 'data' in data[1] and 'children' in data[1]['data']:
+                comments = data[1]['data']['children']
+
+                # Filter and sort comments by score
+                for comment in comments[:20]:  # Look at top 20 comments
+                    if comment.get('kind') == 't1':  # t1 = comment
+                        comment_data = comment.get('data', {})
+                        body = comment_data.get('body', '').strip()
+                        if (body and
+                            body != '[removed]' and
+                            body != '[deleted]' and
+                            len(body) > 50):  # Skip very short comments
+                            valid_comments.append({
+                                'body': body,
+                                'author': comment_data.get('author', 'unknown'),
+                                'score': comment_data.get('score', 0)
+                            })
+
+                # Sort by score and take top comments
+                valid_comments.sort(key=lambda x: x['score'], reverse=True)
+
+            if valid_comments:
+                content_parts.append("\n--- Top Discussion ---")
+                for i, comment in enumerate(valid_comments[:5], 1):
+                    # Truncate long comments
+                    comment_text = comment['body'][:500]
+                    if len(comment['body']) > 500:
+                        comment_text += "..."
+
+                    content_parts.append(f"\nComment {i} (Score: {comment['score']}, Author: {comment['author']}):")
+                    content_parts.append(comment_text)
+
+            # Join all content
+            text = '\n'.join(content_parts)
+
+            # Calculate quality metrics specifically for Reddit content
+            metrics = {
+                'text_length': len(text),
+                'score': score,
+                'num_comments': num_comments,
+                'domain': urlparse(url).netloc.lower(),
+                'is_reddit': True
+            }
+
+            # Quality score based on Reddit-specific factors
+            length_score = min(1.0, len(text) / 2000)  # Want at least 2000 chars
+            engagement_score = min(1.0, (score + num_comments) / 100)  # High engagement is good
+            has_comments = 1.0 if len(valid_comments) > 0 else 0.3  # Heavily penalize if no good comments
+
+            quality_score = (
+                length_score * 0.4 +
+                engagement_score * 0.3 +
+                has_comments * 0.3
+            )
+
+            metrics['quality_score'] = round(quality_score, 3)
+
+            # Limit content if needed
+            limit = content_limit if content_limit is not None else self.max_content_length
+            if len(text) > limit:
+                text = text[:limit]
+
+            return text, metrics
+
+        except Exception as e:
+            # Fall back to HTML extraction if JSON fails
+            return self.extract_html_content(url, content_limit, timeout)
+
+    def extract_text_from_url(self, url: str, content_limit: int = None, timeout: int = 10) -> Tuple[str, Dict[str, Any]]:
+        """
+        Main extraction method that routes to appropriate extractor
+
+        Returns:
+            Tuple of (text_content, quality_metrics)
+        """
+        if self.is_reddit_url(url):
+            return self.extract_reddit_content(url, content_limit, timeout)
+        else:
+            return self.extract_html_content(url, content_limit, timeout)
+
+    def extract_html_content(self, url: str, content_limit: int = None, timeout: int = 10) -> Tuple[str, Dict[str, Any]]:
+        """
+        Original HTML extraction method (renamed from extract_text_from_url)
         """
         try:
             response = self.session.get(url, timeout=timeout)
@@ -76,95 +207,347 @@ class GoogleSearchScraper:

             soup = BeautifulSoup(response.content, 'html.parser')

-            #
-
-
+            # Extract main content areas (common content selectors)
+            main_content = None
+            content_selectors = [
+                'article', 'main', '[role="main"]', '.content', '#content',
+                '.post', '.entry-content', '.article-body', '.story-body',
+                '.markdown-body', '.wiki-body', '.documentation'
+            ]
+
+            for selector in content_selectors:
+                main_content = soup.select_one(selector)
+                if main_content:
+                    break
+
+            # If no main content found, use the whole body
+            if not main_content:
+                main_content = soup.find('body') or soup
+
+            # Clone for processing
+            content_soup = BeautifulSoup(str(main_content), 'html.parser')
+
+            # Remove unwanted elements from the content area
+            unwanted_tags = ["script", "style", "nav", "footer", "header", "aside", "noscript", "iframe"]
+            for tag in unwanted_tags:
+                for element in content_soup(tag):
+                    element.decompose()

-
+            # Remove elements with unwanted classes/ids
+            unwanted_patterns = [
+                'sidebar', 'navigation', 'menu', 'advertisement', 'ads', 'banner',
+                'popup', 'modal', 'cookie', 'gdpr', 'subscribe', 'newsletter',
+                'comments', 'related', 'share', 'social'
+            ]
+
+            for pattern in unwanted_patterns:
+                # Remove by class
+                for element in content_soup.find_all(class_=re.compile(pattern, re.I)):
+                    element.decompose()
+                # Remove by id
+                for element in content_soup.find_all(id=re.compile(pattern, re.I)):
+                    element.decompose()
+
+            # Extract text
+            text = content_soup.get_text()

             # Clean up the text
-            lines =
-
-
+            lines = [line.strip() for line in text.splitlines()]
+            # Remove empty lines and join
+            lines = [line for line in lines if line]
+            text = ' '.join(lines)
+
+            # Remove excessive whitespace
+            text = re.sub(r'\s+', ' ', text).strip()
+
+            # Calculate quality metrics (need to pass query for relevance)
+            quality_metrics = self._calculate_content_quality(text, url, "")

             # Limit text length
             limit = content_limit if content_limit is not None else self.max_content_length
             if len(text) > limit:
                 text = text[:limit]

-            return text
+            return text, quality_metrics

         except Exception as e:
-            return ""
+            return "", {"error": str(e), "quality_score": 0}

-    def
-        """
+    def _calculate_content_quality(self, text: str, url: str, query: str = "") -> Dict[str, Any]:
+        """
+        Calculate quality metrics for extracted content
+
+        Quality factors:
+        - Text length (substantive content)
+        - Word diversity (not repetitive)
+        - Sentence structure (proper formatting)
+        - Lack of boilerplate phrases
+        - Domain reputation
+        """
+        if not text:
+            return {"quality_score": 0, "text_length": 0}
+
+        metrics = {}
+
+        # Text length (MUCH stricter - prefer 2000-10000 chars of actual content)
+        text_length = len(text)
+        metrics['text_length'] = text_length
+        if text_length < 500:
+            length_score = 0  # Too short to be useful
+        elif text_length < 2000:
+            length_score = (text_length - 500) / 1500 * 0.5  # Scale to 0.5 max
+        elif text_length <= 10000:
+            length_score = 1.0  # Ideal range
+        else:
+            length_score = max(0.8, 1.0 - (text_length - 10000) / 20000)
+
+        # Word diversity (unique words / total words)
+        words = text.lower().split()
+        if words:
+            unique_words = len(set(words))
+            total_words = len(words)
+            diversity_score = min(1.0, unique_words / (total_words * 0.3))  # Expect 30% unique
+            metrics['word_diversity'] = unique_words / total_words if total_words > 0 else 0
+        else:
+            diversity_score = 0
+            metrics['word_diversity'] = 0
+
+        # Check for boilerplate/navigation text (MUCH stricter)
+        boilerplate_phrases = [
+            'cookie', 'privacy policy', 'terms of service', 'subscribe',
+            'sign up', 'log in', 'advertisement', 'sponsored', 'copyright',
+            'all rights reserved', 'skip to', 'navigation', 'breadcrumb',
+            'reddit inc', 'google llc', 'expand navigation', 'members •',
+            'archived post', 'votes cannot be cast', 'r/', 'subreddit',
+            'youtube', 'facebook', 'twitter', 'instagram', 'pinterest'
+        ]
+
+        lower_text = text.lower()
+        boilerplate_count = sum(1 for phrase in boilerplate_phrases if phrase in lower_text)
+        boilerplate_penalty = max(0, 1.0 - (boilerplate_count * 0.15))  # -15% per boilerplate phrase
+        metrics['boilerplate_count'] = boilerplate_count
+
+        # Sentence detection (need MORE sentences for quality content)
+        sentences = re.split(r'[.!?]+', text)
+        sentence_count = len([s for s in sentences if len(s.strip()) > 30])  # Longer min sentence
+        sentence_score = min(1.0, sentence_count / 10)  # Need at least 10 proper sentences
+        metrics['sentence_count'] = sentence_count
+
+        # Domain quality (MUCH harsher on social media)
+        domain = urlparse(url).netloc.lower()
+        quality_domains = [
+            'wikipedia.org', 'starwars.fandom.com', 'imdb.com',
+            'screenrant.com', 'denofgeek.com', 'ign.com',
+            'hollywoodreporter.com', 'variety.com', 'ew.com',
+            'stackexchange.com', 'stackoverflow.com',
+            'github.com', 'medium.com', 'dev.to', 'arxiv.org',
+            'nature.com', 'sciencedirect.com', 'ieee.org'
+        ]
+
+        low_quality_domains = [
+            'reddit.com', 'youtube.com', 'facebook.com', 'twitter.com',
+            'instagram.com', 'pinterest.com', 'tiktok.com', 'x.com'
+        ]
+
+        if any(d in domain for d in quality_domains):
+            domain_score = 1.5  # Strong bonus for quality domains
+        elif any(d in domain for d in low_quality_domains):
+            domain_score = 0.1  # Severe penalty for social media
+        else:
+            domain_score = 1.0
+
+        metrics['domain'] = domain
+
+        # Query relevance scoring - check for query terms in content
+        relevance_score = 0
+        if query:
+            # Split query into meaningful words (skip common words)
+            stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were'}
+            query_words = [w.lower() for w in query.split() if w.lower() not in stop_words and len(w) > 2]

-
-
+            if query_words:
+                lower_content = text.lower()
+                # Count how many query words appear in content
+                words_found = sum(1 for word in query_words if word in lower_content)
+
+                # Also check for exact phrase matches (bonus points)
+                exact_phrase_bonus = 0
+                if len(query_words) > 1:
+                    # Check for consecutive word pairs
+                    for i in range(len(query_words) - 1):
+                        phrase = f"{query_words[i]} {query_words[i+1]}"
+                        if phrase in lower_content:
+                            exact_phrase_bonus += 0.2
+
+                relevance_score = min(1.0, (words_found / len(query_words)) + exact_phrase_bonus)
+                metrics['query_relevance'] = round(relevance_score, 3)
+                metrics['query_words_found'] = f"{words_found}/{len(query_words)}"
+        else:
+            relevance_score = 0.5  # Neutral if no query provided
+            metrics['query_relevance'] = 0.5
+
+        # Calculate final quality score (adjusted weights with relevance)
+        quality_score = (
+            length_score * 0.25 +        # Length important
+            diversity_score * 0.10 +     # Less weight
+            boilerplate_penalty * 0.10 + # Less weight
+            sentence_score * 0.15 +      # Sentences important
+            domain_score * 0.15 +        # Domain important
+            relevance_score * 0.25       # Query relevance important
+        )
+
+        metrics['quality_score'] = round(quality_score, 3)
+        metrics['length_score'] = round(length_score, 3)
+        metrics['diversity_score'] = round(diversity_score, 3)
+        metrics['boilerplate_penalty'] = round(boilerplate_penalty, 3)
+        metrics['sentence_score'] = round(sentence_score, 3)
+        metrics['domain_score'] = round(domain_score, 3)
+
+        return metrics
+
+    def search_and_scrape_best(self, query: str, num_results: int = 3,
+                               oversample_factor: float = 4.0, delay: float = 0.5,
+                               min_quality_score: float = 0.2) -> str:
         """
-
+        Search and scrape with quality filtering and source diversity
+
+        Args:
+            query: Search query
+            num_results: Number of best results to return
+            oversample_factor: How many extra results to fetch (e.g., 4.0 = fetch 4x)
+            delay: Delay between requests
+            min_quality_score: Minimum quality score to include a result
+
+        Returns:
+            Formatted string with the best N results from diverse sources
+        """
+        # Fetch more results than requested (increased to 4x for better pool)
+        fetch_count = min(10, int(num_results * oversample_factor))
+        search_results = self.search_google(query, fetch_count)

         if not search_results:
             return f"No search results found for query: {query}"

-        #
-
-
-
-
+        # Process all results and collect quality metrics
+        processed_results = []
+
+        for result in search_results:
+            # Extract content and quality metrics (pass query for relevance scoring)
+            page_text, metrics = self.extract_text_from_url(result['url'])
+
+            # Recalculate metrics with query relevance
+            if page_text:
+                metrics = self._calculate_content_quality(page_text, result['url'], query)
+
+            if metrics.get('quality_score', 0) >= min_quality_score and page_text:
+                processed_results.append({
+                    'title': result['title'],
+                    'url': result['url'],
+                    'snippet': result['snippet'],
+                    'content': page_text,
+                    'metrics': metrics,
+                    'quality_score': metrics.get('quality_score', 0),
+                    'domain': metrics.get('domain', '')
+                })
+
+            # Small delay between requests
+            if delay > 0:
+                time.sleep(delay)

-
-
+        if not processed_results:
+            return f"No quality results found for query: {query}. All results were below quality threshold."

+        # Sort by quality score
+        processed_results.sort(key=lambda x: x['quality_score'], reverse=True)
+
+        # Select diverse results (prefer different domains)
+        best_results = []
+        seen_domains = set()
+
+        # First pass: Add highest quality result from each unique domain
+        for result in processed_results:
+            domain = result['domain']
+            if domain not in seen_domains and len(best_results) < num_results:
+                best_results.append(result)
+                seen_domains.add(domain)
+
+        # Second pass: If we need more results, add remaining high-quality ones
+        if len(best_results) < num_results:
+            for result in processed_results:
+                if result not in best_results and len(best_results) < num_results:
+                    best_results.append(result)
+
+        if not best_results:
+            return f"No quality results found for query: {query}. Try a different search term."
+
+        # Calculate per-result content budget for the best results
+        estimated_overhead_per_result = 400  # Including quality info
+        total_overhead = len(best_results) * estimated_overhead_per_result
+        available_for_content = self.max_content_length - total_overhead
+        per_result_limit = max(2000, available_for_content // len(best_results))  # Increased minimum
+
+        # Format the best results
         all_text = []
+        all_text.append(f"Found {len(processed_results)} results meeting quality threshold from {len(search_results)} searched.")
+        all_text.append(f"Showing top {len(best_results)} from diverse sources:\n")

-        for i, result in enumerate(
-            text_content = f"=== RESULT {i} ===\n"
+        for i, result in enumerate(best_results, 1):
+            text_content = f"=== RESULT {i} (Quality: {result['quality_score']:.2f}) ===\n"
             text_content += f"Title: {result['title']}\n"
             text_content += f"URL: {result['url']}\n"
+            text_content += f"Source: {result['domain']}\n"
             text_content += f"Snippet: {result['snippet']}\n"
-            text_content += f"Content:\n"

-            #
-
+            # Add quality indicators
+            metrics = result['metrics']
+            text_content += f"Content Stats: {metrics.get('text_length', 0)} chars, "
+            text_content += f"{metrics.get('sentence_count', 0)} sentences\n"
+            text_content += f"Query Relevance: {metrics.get('query_relevance', 0):.2f} "
+            text_content += f"(keywords: {metrics.get('query_words_found', 'N/A')})\n"
+            text_content += f"Content:\n"

-            if
-
-
-
+            # Truncate content if needed
+            content = result['content']
+            if len(content) > per_result_limit:
+                content = content[:per_result_limit] + "..."
+            text_content += content

             text_content += f"\n{'='*50}\n\n"
             all_text.append(text_content)

-            if i < len(search_results):
-                time.sleep(delay)
-
         return '\n'.join(all_text)

+    def search_and_scrape(self, query: str, num_results: int = 3, delay: float = 0.5) -> str:
+        """
+        Backward compatible method that uses the improved search
+        """
+        return self.search_and_scrape_best(
+            query=query,
+            num_results=num_results,
+            oversample_factor=4.0,
+            delay=delay,
+            min_quality_score=0.2
+        )
+

 class WebSearchSkill(SkillBase):
-    """Web search capability using Google Custom Search API"""
-
+    """Web search capability using Google Custom Search API with quality filtering"""
+
     SKILL_NAME = "web_search"
     SKILL_DESCRIPTION = "Search the web for information using Google Custom Search API"
-    SKILL_VERSION = "
+    SKILL_VERSION = "2.0.0"  # Bumped version for improved functionality
     REQUIRED_PACKAGES = ["bs4", "requests"]
-    REQUIRED_ENV_VARS = []
-
+    REQUIRED_ENV_VARS = []
+
     # Enable multiple instances support
     SUPPORTS_MULTIPLE_INSTANCES = True
-
+
     def get_instance_key(self) -> str:
-        """
-        Get the key used to track this skill instance
-
-        For web search, we use the search_engine_id to differentiate instances
-        """
+        """Get the key used to track this skill instance"""
         search_engine_id = self.params.get('search_engine_id', 'default')
         tool_name = self.params.get('tool_name', 'web_search')
         return f"{self.SKILL_NAME}_{search_engine_id}_{tool_name}"
-
+
     def setup(self) -> bool:
         """Setup the web search skill"""
         # Validate required parameters
@@ -173,41 +556,46 @@ class WebSearchSkill(SkillBase):
         if missing_params:
             self.logger.error(f"Missing required parameters: {missing_params}")
             return False
-
+
         if not self.validate_packages():
             return False
-
+
         # Set parameters from config
         self.api_key = self.params['api_key']
         self.search_engine_id = self.params['search_engine_id']
-
+
         # Set default parameters
-        self.default_num_results = self.params.get('num_results',
-        self.default_delay = self.params.get('delay', 0)
+        self.default_num_results = self.params.get('num_results', 3)
+        self.default_delay = self.params.get('delay', 0.5)
         self.max_content_length = self.params.get('max_content_length', 32768)
-
-
-
+
+        # Quality control parameters (new)
+        self.oversample_factor = self.params.get('oversample_factor', 2.5)
+        self.min_quality_score = self.params.get('min_quality_score', 0.3)
+
+        self.no_results_message = self.params.get('no_results_message',
+            "I couldn't find quality results for '{query}'. "
+            "The search returned only low-quality or inaccessible pages. "
             "Try rephrasing your search or asking about a different topic."
         )
-
+
         # Tool name (for multiple instances)
         self.tool_name = self.params.get('tool_name', 'web_search')
-
-        # Initialize the search scraper
+
+        # Initialize the improved search scraper
         self.search_scraper = GoogleSearchScraper(
             api_key=self.api_key,
             search_engine_id=self.search_engine_id,
             max_content_length=self.max_content_length
         )
-
+
         return True
-
+
     def register_tools(self) -> None:
         """Register web search tool with the agent"""
         self.define_tool(
             name=self.tool_name,
-            description="Search the web for information
+            description="Search the web for high-quality information, automatically filtering low-quality results",
             parameters={
                 "query": {
                     "type": "string",
@@ -216,105 +604,96 @@ class WebSearchSkill(SkillBase):
             },
             handler=self._web_search_handler
         )
-
+
     def _web_search_handler(self, args, raw_data):
-        """Handler for web search tool"""
+        """Handler for web search tool with quality filtering"""
         query = args.get("query", "").strip()
-
+
         if not query:
             return SwaigFunctionResult(
                 "Please provide a search query. What would you like me to search for?"
             )
-
-        # Use the configured number of results
+
+        # Use the configured number of results
         num_results = self.default_num_results
-
-        self.logger.info(f"Web search requested: '{query}' ({num_results} results)")
-
-        # Perform the search
+
+        self.logger.info(f"Web search requested: '{query}' (requesting {num_results} quality results)")
+
+        # Perform the improved search
         try:
-            search_results = self.search_scraper.
+            search_results = self.search_scraper.search_and_scrape_best(
                 query=query,
                 num_results=num_results,
-
+                oversample_factor=self.oversample_factor,
+                delay=self.default_delay,
+                min_quality_score=self.min_quality_score
             )
-
-            if not search_results or "No search results found" in search_results:
-                # Format the no results message with the query if it contains a placeholder
+
+            if not search_results or "No quality results found" in search_results or "No search results found" in search_results:
                 formatted_message = self.no_results_message.format(query=query) if '{query}' in self.no_results_message else self.no_results_message
                 return SwaigFunctionResult(formatted_message)
-
-            response = f"
+
+            response = f"Quality web search results for '{query}':\n\n{search_results}"
             return SwaigFunctionResult(response)
-
+
         except Exception as e:
             self.logger.error(f"Error performing web search: {e}")
             return SwaigFunctionResult(
                 "Sorry, I encountered an error while searching. Please try again later."
             )
-
+
     def get_hints(self) -> List[str]:
         """Return speech recognition hints"""
-        # Currently no hints provided, but you could add them like:
-        # return [
-        #     "Google", "search", "internet", "web", "information",
-        #     "find", "look up", "research", "query", "results"
-        # ]
         return []
-
+
     def get_global_data(self) -> Dict[str, Any]:
         """Return global data for agent context"""
         return {
             "web_search_enabled": True,
-            "search_provider": "Google Custom Search"
+            "search_provider": "Google Custom Search",
+            "quality_filtering": True
         }
-
+
     def get_prompt_sections(self) -> List[Dict[str, Any]]:
         """Return prompt sections to add to agent"""
         return [
             {
-                "title": "Web Search Capability",
-                "body": f"You can search the internet for
+                "title": "Web Search Capability (Quality Enhanced)",
+                "body": f"You can search the internet for high-quality information using the {self.tool_name} tool.",
                 "bullets": [
                     f"Use the {self.tool_name} tool when users ask for information you need to look up",
-                    "
-                    "
-                    "
+                    "The search automatically filters out low-quality results like empty pages",
+                    "Results are ranked by content quality, relevance, and domain reputation",
+                    "Summarize the high-quality results in a clear, helpful way"
                 ]
             }
         ]
-
+
     @classmethod
     def get_parameter_schema(cls) -> Dict[str, Dict[str, Any]]:
-        """
-        Get the parameter schema for the web search skill
-
-        Returns all configurable parameters for web search including
-        API credentials, search settings, and response customization.
-        """
-        # Get base schema from parent
+        """Get the parameter schema for the web search skill"""
         schema = super().get_parameter_schema()
-
+
         # Add web search specific parameters
         schema.update({
             "api_key": {
                 "type": "string",
                 "description": "Google Custom Search API key",
                 "required": True,
-                "hidden": True,
+                "hidden": True,
                 "env_var": "GOOGLE_SEARCH_API_KEY"
             },
             "search_engine_id": {
                 "type": "string",
                 "description": "Google Custom Search Engine ID",
                 "required": True,
-                "hidden": True,
+                "hidden": True,
                 "env_var": "GOOGLE_SEARCH_ENGINE_ID"
             },
             "num_results": {
                 "type": "integer",
-                "description": "
-                "default":
+                "description": "Number of high-quality results to return",
+                "default": 3,
                 "required": False,
                 "min": 1,
                 "max": 10
@@ -322,23 +701,39 @@ class WebSearchSkill(SkillBase):
             "delay": {
                 "type": "number",
                 "description": "Delay between scraping pages in seconds",
-                "default": 0,
+                "default": 0.5,
                 "required": False,
                 "min": 0
             },
             "max_content_length": {
                 "type": "integer",
-                "description": "Maximum total response size in characters
+                "description": "Maximum total response size in characters",
                 "default": 32768,
                 "required": False,
                 "min": 1000
             },
+            "oversample_factor": {
+                "type": "number",
+                "description": "How many extra results to fetch for quality filtering (e.g., 2.5 = fetch 2.5x requested)",
+                "default": 2.5,
+                "required": False,
+                "min": 1.0,
+                "max": 3.5
+            },
+            "min_quality_score": {
+                "type": "number",
+                "description": "Minimum quality score (0-1) for including a result",
+                "default": 0.3,
+                "required": False,
+                "min": 0.0,
+                "max": 1.0
+            },
             "no_results_message": {
                 "type": "string",
-                "description": "Message to show when no results are found. Use {query} as placeholder.",
-                "default": "I couldn't find
+                "description": "Message to show when no quality results are found. Use {query} as placeholder.",
+                "default": "I couldn't find quality results for '{query}'. The search returned only low-quality or inaccessible pages. Try rephrasing your search or asking about a different topic.",
                 "required": False
             }
         })
-
-        return schema
+
+        return schema