signalwire-agents 0.1.54__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,17 +10,18 @@ See LICENSE file in the project root for full license information.
 import os
 import requests
 import time
+import re
 from urllib.parse import urljoin, urlparse
 from bs4 import BeautifulSoup
 import json
-from typing import Optional, List, Dict, Any
+from typing import Optional, List, Dict, Any, Tuple
 
 from signalwire_agents.core.skill_base import SkillBase
 from signalwire_agents.core.function_result import SwaigFunctionResult
 
 class GoogleSearchScraper:
-    """Google Search and Web Scraping functionality"""
-
+    """Google Search and Web Scraping functionality with quality scoring"""
+
     def __init__(self, api_key: str, search_engine_id: str, max_content_length: int = 32768):
         self.api_key = api_key
         self.search_engine_id = search_engine_id
@@ -33,22 +34,22 @@ class GoogleSearchScraper:
     def search_google(self, query: str, num_results: int = 5) -> list:
         """Search Google using Custom Search JSON API"""
         url = "https://www.googleapis.com/customsearch/v1"
-
+
         params = {
             'key': self.api_key,
             'cx': self.search_engine_id,
             'q': query,
             'num': min(num_results, 10)
         }
-
+
         try:
             response = self.session.get(url, params=params)
             response.raise_for_status()
             data = response.json()
-
+
             if 'items' not in data:
                 return []
-
+
             results = []
             for item in data['items'][:num_results]:
                 results.append({
@@ -56,19 +57,149 @@ class GoogleSearchScraper:
                     'url': item.get('link', ''),
                     'snippet': item.get('snippet', '')
                 })
-
+
             return results
-
+
         except Exception as e:
             return []
 
-    def extract_text_from_url(self, url: str, content_limit: int = None, timeout: int = 10) -> str:
-        """Scrape a URL and extract readable text content
+    def is_reddit_url(self, url: str) -> bool:
+        """Check if URL is from Reddit"""
+        domain = urlparse(url).netloc.lower()
+        return 'reddit.com' in domain or 'redd.it' in domain
 
-        Args:
-            url: URL to scrape
-            content_limit: Maximum characters to return (uses self.max_content_length if not provided)
-            timeout: Request timeout in seconds
+    def extract_reddit_content(self, url: str, content_limit: int = None, timeout: int = 10) -> Tuple[str, Dict[str, Any]]:
+        """
+        Extract Reddit content using JSON API for better quality
+
+        Returns:
+            Tuple of (text_content, quality_metrics)
+        """
+        try:
+            # Convert to JSON endpoint
+            if not url.endswith('.json'):
+                json_url = url.rstrip('/') + '.json'
+            else:
+                json_url = url
+
+            # Fetch with proper headers (Reddit requires User-Agent)
+            headers = {'User-Agent': 'SignalWire-WebSearch/2.0'}
+            response = requests.get(json_url, headers=headers, timeout=timeout)
+            response.raise_for_status()
+
+            data = response.json()
+
+            # Extract post information
+            if not data or not isinstance(data, list) or len(data) < 1:
+                return "", {"error": "Invalid Reddit JSON structure", "quality_score": 0}
+
+            # First element is the post, second (if exists) contains comments
+            post_data = data[0]['data']['children'][0]['data']
+
+            # Build content from post
+            content_parts = []
+
+            # Add post title and metadata
+            title = post_data.get('title', 'No title')
+            author = post_data.get('author', 'unknown')
+            score = post_data.get('score', 0)
+            num_comments = post_data.get('num_comments', 0)
+            subreddit = post_data.get('subreddit', '')
+
+            content_parts.append(f"Reddit r/{subreddit} Discussion")
+            content_parts.append(f"\nPost: {title}")
+            content_parts.append(f"Author: {author} | Score: {score} | Comments: {num_comments}")
+
+            # Add original post text if it's a text post
+            selftext = post_data.get('selftext', '').strip()
+            if selftext and selftext != '[removed]' and selftext != '[deleted]':
+                content_parts.append(f"\nOriginal Post:\n{selftext[:1000]}") # Limit post text
+
+            # Extract top comments if available
+            valid_comments = []
+            if len(data) > 1 and 'data' in data[1] and 'children' in data[1]['data']:
+                comments = data[1]['data']['children']
+
+                # Filter and sort comments by score
+                for comment in comments[:20]: # Look at top 20 comments
+                    if comment.get('kind') == 't1': # t1 = comment
+                        comment_data = comment.get('data', {})
+                        body = comment_data.get('body', '').strip()
+                        if (body and
+                            body != '[removed]' and
+                            body != '[deleted]' and
+                            len(body) > 50): # Skip very short comments
+                            valid_comments.append({
+                                'body': body,
+                                'author': comment_data.get('author', 'unknown'),
+                                'score': comment_data.get('score', 0)
+                            })
+
+                # Sort by score and take top comments
+                valid_comments.sort(key=lambda x: x['score'], reverse=True)
+
+                if valid_comments:
+                    content_parts.append("\n--- Top Discussion ---")
+                    for i, comment in enumerate(valid_comments[:5], 1):
+                        # Truncate long comments
+                        comment_text = comment['body'][:500]
+                        if len(comment['body']) > 500:
+                            comment_text += "..."
+
+                        content_parts.append(f"\nComment {i} (Score: {comment['score']}, Author: {comment['author']}):")
+                        content_parts.append(comment_text)
+
+            # Join all content
+            text = '\n'.join(content_parts)
+
+            # Calculate quality metrics specifically for Reddit content
+            metrics = {
+                'text_length': len(text),
+                'score': score,
+                'num_comments': num_comments,
+                'domain': urlparse(url).netloc.lower(),
+                'is_reddit': True
+            }
+
+            # Quality score based on Reddit-specific factors
+            length_score = min(1.0, len(text) / 2000) # Want at least 2000 chars
+            engagement_score = min(1.0, (score + num_comments) / 100) # High engagement is good
+            has_comments = 1.0 if len(valid_comments) > 0 else 0.3 # Heavily penalize if no good comments
+
+            quality_score = (
+                length_score * 0.4 +
+                engagement_score * 0.3 +
+                has_comments * 0.3
+            )
+
+            metrics['quality_score'] = round(quality_score, 3)
+
+            # Limit content if needed
+            limit = content_limit if content_limit is not None else self.max_content_length
+            if len(text) > limit:
+                text = text[:limit]
+
+            return text, metrics
+
+        except Exception as e:
+            # Fall back to HTML extraction if JSON fails
+            return self.extract_html_content(url, content_limit, timeout)
+
+    def extract_text_from_url(self, url: str, content_limit: int = None, timeout: int = 10) -> Tuple[str, Dict[str, Any]]:
+        """
+        Main extraction method that routes to appropriate extractor
+
+        Returns:
+            Tuple of (text_content, quality_metrics)
+        """
+        if self.is_reddit_url(url):
+            return self.extract_reddit_content(url, content_limit, timeout)
+        else:
+            return self.extract_html_content(url, content_limit, timeout)
+
+    def extract_html_content(self, url: str, content_limit: int = None, timeout: int = 10) -> Tuple[str, Dict[str, Any]]:
+        """
+        Original HTML extraction method (renamed from extract_text_from_url)
         """
         try:
             response = self.session.get(url, timeout=timeout)
@@ -76,95 +207,347 @@ class GoogleSearchScraper:
 
             soup = BeautifulSoup(response.content, 'html.parser')
 
-            # Remove unwanted elements
-            for script in soup(["script", "style", "nav", "footer", "header", "aside"]):
-                script.decompose()
+            # Extract main content areas (common content selectors)
+            main_content = None
+            content_selectors = [
+                'article', 'main', '[role="main"]', '.content', '#content',
+                '.post', '.entry-content', '.article-body', '.story-body',
+                '.markdown-body', '.wiki-body', '.documentation'
+            ]
+
+            for selector in content_selectors:
+                main_content = soup.select_one(selector)
+                if main_content:
+                    break
+
+            # If no main content found, use the whole body
+            if not main_content:
+                main_content = soup.find('body') or soup
+
+            # Clone for processing
+            content_soup = BeautifulSoup(str(main_content), 'html.parser')
+
+            # Remove unwanted elements from the content area
+            unwanted_tags = ["script", "style", "nav", "footer", "header", "aside", "noscript", "iframe"]
+            for tag in unwanted_tags:
+                for element in content_soup(tag):
+                    element.decompose()
 
-            text = soup.get_text()
+            # Remove elements with unwanted classes/ids
+            unwanted_patterns = [
+                'sidebar', 'navigation', 'menu', 'advertisement', 'ads', 'banner',
+                'popup', 'modal', 'cookie', 'gdpr', 'subscribe', 'newsletter',
+                'comments', 'related', 'share', 'social'
+            ]
+
+            for pattern in unwanted_patterns:
+                # Remove by class
+                for element in content_soup.find_all(class_=re.compile(pattern, re.I)):
+                    element.decompose()
+                # Remove by id
+                for element in content_soup.find_all(id=re.compile(pattern, re.I)):
+                    element.decompose()
+
+            # Extract text
+            text = content_soup.get_text()
 
             # Clean up the text
-            lines = (line.strip() for line in text.splitlines())
-            chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
-            text = ' '.join(chunk for chunk in chunks if chunk)
+            lines = [line.strip() for line in text.splitlines()]
+            # Remove empty lines and join
+            lines = [line for line in lines if line]
+            text = ' '.join(lines)
+
+            # Remove excessive whitespace
+            text = re.sub(r'\s+', ' ', text).strip()
+
+            # Calculate quality metrics (need to pass query for relevance)
+            quality_metrics = self._calculate_content_quality(text, url, "")
 
             # Limit text length
             limit = content_limit if content_limit is not None else self.max_content_length
             if len(text) > limit:
                 text = text[:limit]
 
-            return text
+            return text, quality_metrics
 
         except Exception as e:
-            return ""
+            return "", {"error": str(e), "quality_score": 0}
 
-    def search_and_scrape(self, query: str, num_results: int = 3, delay: float = 0.5) -> str:
-        """Main function: search Google and scrape the resulting pages
+    def _calculate_content_quality(self, text: str, url: str, query: str = "") -> Dict[str, Any]:
+        """
+        Calculate quality metrics for extracted content
+
+        Quality factors:
+        - Text length (substantive content)
+        - Word diversity (not repetitive)
+        - Sentence structure (proper formatting)
+        - Lack of boilerplate phrases
+        - Domain reputation
+        """
+        if not text:
+            return {"quality_score": 0, "text_length": 0}
+
+        metrics = {}
+
+        # Text length (MUCH stricter - prefer 2000-10000 chars of actual content)
+        text_length = len(text)
+        metrics['text_length'] = text_length
+        if text_length < 500:
+            length_score = 0 # Too short to be useful
+        elif text_length < 2000:
+            length_score = (text_length - 500) / 1500 * 0.5 # Scale to 0.5 max
+        elif text_length <= 10000:
+            length_score = 1.0 # Ideal range
+        else:
+            length_score = max(0.8, 1.0 - (text_length - 10000) / 20000)
+
+        # Word diversity (unique words / total words)
+        words = text.lower().split()
+        if words:
+            unique_words = len(set(words))
+            total_words = len(words)
+            diversity_score = min(1.0, unique_words / (total_words * 0.3)) # Expect 30% unique
+            metrics['word_diversity'] = unique_words / total_words if total_words > 0 else 0
+        else:
+            diversity_score = 0
+            metrics['word_diversity'] = 0
+
+        # Check for boilerplate/navigation text (MUCH stricter)
+        boilerplate_phrases = [
+            'cookie', 'privacy policy', 'terms of service', 'subscribe',
+            'sign up', 'log in', 'advertisement', 'sponsored', 'copyright',
+            'all rights reserved', 'skip to', 'navigation', 'breadcrumb',
+            'reddit inc', 'google llc', 'expand navigation', 'members •',
+            'archived post', 'votes cannot be cast', 'r/', 'subreddit',
+            'youtube', 'facebook', 'twitter', 'instagram', 'pinterest'
+        ]
+
+        lower_text = text.lower()
+        boilerplate_count = sum(1 for phrase in boilerplate_phrases if phrase in lower_text)
+        boilerplate_penalty = max(0, 1.0 - (boilerplate_count * 0.15)) # -15% per boilerplate phrase
+        metrics['boilerplate_count'] = boilerplate_count
+
+        # Sentence detection (need MORE sentences for quality content)
+        sentences = re.split(r'[.!?]+', text)
+        sentence_count = len([s for s in sentences if len(s.strip()) > 30]) # Longer min sentence
+        sentence_score = min(1.0, sentence_count / 10) # Need at least 10 proper sentences
+        metrics['sentence_count'] = sentence_count
+
+        # Domain quality (MUCH harsher on social media)
+        domain = urlparse(url).netloc.lower()
+        quality_domains = [
+            'wikipedia.org', 'starwars.fandom.com', 'imdb.com',
+            'screenrant.com', 'denofgeek.com', 'ign.com',
+            'hollywoodreporter.com', 'variety.com', 'ew.com',
+            'stackexchange.com', 'stackoverflow.com',
+            'github.com', 'medium.com', 'dev.to', 'arxiv.org',
+            'nature.com', 'sciencedirect.com', 'ieee.org'
+        ]
+
+        low_quality_domains = [
+            'reddit.com', 'youtube.com', 'facebook.com', 'twitter.com',
+            'instagram.com', 'pinterest.com', 'tiktok.com', 'x.com'
+        ]
+
+        if any(d in domain for d in quality_domains):
+            domain_score = 1.5 # Strong bonus for quality domains
+        elif any(d in domain for d in low_quality_domains):
+            domain_score = 0.1 # Severe penalty for social media
+        else:
+            domain_score = 1.0
+
+        metrics['domain'] = domain
+
+        # Query relevance scoring - check for query terms in content
+        relevance_score = 0
+        if query:
+            # Split query into meaningful words (skip common words)
+            stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were'}
+            query_words = [w.lower() for w in query.split() if w.lower() not in stop_words and len(w) > 2]
 
-        Dynamically calculates per-result content limit based on total max_content_length
-        and number of results to ensure total response stays within bounds.
+            if query_words:
+                lower_content = text.lower()
+                # Count how many query words appear in content
+                words_found = sum(1 for word in query_words if word in lower_content)
+
+                # Also check for exact phrase matches (bonus points)
+                exact_phrase_bonus = 0
+                if len(query_words) > 1:
+                    # Check for consecutive word pairs
+                    for i in range(len(query_words) - 1):
+                        phrase = f"{query_words[i]} {query_words[i+1]}"
+                        if phrase in lower_content:
+                            exact_phrase_bonus += 0.2
+
+                relevance_score = min(1.0, (words_found / len(query_words)) + exact_phrase_bonus)
+                metrics['query_relevance'] = round(relevance_score, 3)
+                metrics['query_words_found'] = f"{words_found}/{len(query_words)}"
+        else:
+            relevance_score = 0.5 # Neutral if no query provided
+            metrics['query_relevance'] = 0.5
+
+        # Calculate final quality score (adjusted weights with relevance)
+        quality_score = (
+            length_score * 0.25 + # Length important
+            diversity_score * 0.10 + # Less weight
+            boilerplate_penalty * 0.10 + # Less weight
+            sentence_score * 0.15 + # Sentences important
+            domain_score * 0.15 + # Domain important
+            relevance_score * 0.25 # Query relevance important
+        )
+
+        metrics['quality_score'] = round(quality_score, 3)
+        metrics['length_score'] = round(length_score, 3)
+        metrics['diversity_score'] = round(diversity_score, 3)
+        metrics['boilerplate_penalty'] = round(boilerplate_penalty, 3)
+        metrics['sentence_score'] = round(sentence_score, 3)
+        metrics['domain_score'] = round(domain_score, 3)
+
+        return metrics
+
+    def search_and_scrape_best(self, query: str, num_results: int = 3,
+                               oversample_factor: float = 4.0, delay: float = 0.5,
+                               min_quality_score: float = 0.2) -> str:
         """
-        search_results = self.search_google(query, num_results)
+        Search and scrape with quality filtering and source diversity
+
+        Args:
+            query: Search query
+            num_results: Number of best results to return
+            oversample_factor: How many extra results to fetch (e.g., 4.0 = fetch 4x)
+            delay: Delay between requests
+            min_quality_score: Minimum quality score to include a result
+
+        Returns:
+            Formatted string with the best N results from diverse sources
+        """
+        # Fetch more results than requested (increased to 4x for better pool)
+        fetch_count = min(10, int(num_results * oversample_factor))
+        search_results = self.search_google(query, fetch_count)
 
         if not search_results:
             return f"No search results found for query: {query}"
 
-        # Calculate per-result content budget
-        # Reserve ~300 chars per result for overhead (titles, URLs, snippets, formatting)
-        estimated_overhead_per_result = 300
-        total_overhead = num_results * estimated_overhead_per_result
-        available_for_content = self.max_content_length - total_overhead
+        # Process all results and collect quality metrics
+        processed_results = []
+
+        for result in search_results:
+            # Extract content and quality metrics (pass query for relevance scoring)
+            page_text, metrics = self.extract_text_from_url(result['url'])
+
+            # Recalculate metrics with query relevance
+            if page_text:
+                metrics = self._calculate_content_quality(page_text, result['url'], query)
+
+            if metrics.get('quality_score', 0) >= min_quality_score and page_text:
+                processed_results.append({
+                    'title': result['title'],
+                    'url': result['url'],
+                    'snippet': result['snippet'],
+                    'content': page_text,
+                    'metrics': metrics,
+                    'quality_score': metrics.get('quality_score', 0),
+                    'domain': metrics.get('domain', '')
+                })
+
+            # Small delay between requests
+            if delay > 0:
+                time.sleep(delay)
 
-        # Ensure we have at least 1000 chars per result
-        per_result_limit = max(1000, available_for_content // num_results)
+        if not processed_results:
+            return f"No quality results found for query: {query}. All results were below quality threshold."
 
+        # Sort by quality score
+        processed_results.sort(key=lambda x: x['quality_score'], reverse=True)
+
+        # Select diverse results (prefer different domains)
+        best_results = []
+        seen_domains = set()
+
+        # First pass: Add highest quality result from each unique domain
+        for result in processed_results:
+            domain = result['domain']
+            if domain not in seen_domains and len(best_results) < num_results:
+                best_results.append(result)
+                seen_domains.add(domain)
+
+        # Second pass: If we need more results, add remaining high-quality ones
+        if len(best_results) < num_results:
+            for result in processed_results:
+                if result not in best_results and len(best_results) < num_results:
+                    best_results.append(result)
+
+        if not best_results:
+            return f"No quality results found for query: {query}. Try a different search term."
+
+        # Calculate per-result content budget for the best results
+        estimated_overhead_per_result = 400 # Including quality info
+        total_overhead = len(best_results) * estimated_overhead_per_result
+        available_for_content = self.max_content_length - total_overhead
+        per_result_limit = max(2000, available_for_content // len(best_results)) # Increased minimum
+
+        # Format the best results
         all_text = []
+        all_text.append(f"Found {len(processed_results)} results meeting quality threshold from {len(search_results)} searched.")
+        all_text.append(f"Showing top {len(best_results)} from diverse sources:\n")
 
-        for i, result in enumerate(search_results, 1):
-            text_content = f"=== RESULT {i} ===\n"
+        for i, result in enumerate(best_results, 1):
+            text_content = f"=== RESULT {i} (Quality: {result['quality_score']:.2f}) ===\n"
             text_content += f"Title: {result['title']}\n"
             text_content += f"URL: {result['url']}\n"
+            text_content += f"Source: {result['domain']}\n"
             text_content += f"Snippet: {result['snippet']}\n"
-            text_content += f"Content:\n"
 
-            # Pass the calculated per-result limit
-            page_text = self.extract_text_from_url(result['url'], content_limit=per_result_limit)
+            # Add quality indicators
+            metrics = result['metrics']
+            text_content += f"Content Stats: {metrics.get('text_length', 0)} chars, "
+            text_content += f"{metrics.get('sentence_count', 0)} sentences\n"
+            text_content += f"Query Relevance: {metrics.get('query_relevance', 0):.2f} "
+            text_content += f"(keywords: {metrics.get('query_words_found', 'N/A')})\n"
+            text_content += f"Content:\n"
 
-            if page_text:
-                text_content += page_text
-            else:
-                text_content += "Failed to extract content from this page."
+            # Truncate content if needed
+            content = result['content']
+            if len(content) > per_result_limit:
+                content = content[:per_result_limit] + "..."
+            text_content += content
 
             text_content += f"\n{'='*50}\n\n"
             all_text.append(text_content)
 
-            if i < len(search_results):
-                time.sleep(delay)
-
         return '\n'.join(all_text)
 
+    def search_and_scrape(self, query: str, num_results: int = 3, delay: float = 0.5) -> str:
+        """
+        Backward compatible method that uses the improved search
+        """
+        return self.search_and_scrape_best(
+            query=query,
+            num_results=num_results,
+            oversample_factor=4.0,
+            delay=delay,
+            min_quality_score=0.2
+        )
+
 
 class WebSearchSkill(SkillBase):
-    """Web search capability using Google Custom Search API"""
-
+    """Web search capability using Google Custom Search API with quality filtering"""
+
     SKILL_NAME = "web_search"
     SKILL_DESCRIPTION = "Search the web for information using Google Custom Search API"
-    SKILL_VERSION = "1.0.0"
+    SKILL_VERSION = "2.0.0" # Bumped version for improved functionality
     REQUIRED_PACKAGES = ["bs4", "requests"]
-    REQUIRED_ENV_VARS = [] # No required env vars since all config comes from params
-
+    REQUIRED_ENV_VARS = []
+
     # Enable multiple instances support
     SUPPORTS_MULTIPLE_INSTANCES = True
-
+
     def get_instance_key(self) -> str:
-        """
-        Get the key used to track this skill instance
-
-        For web search, we use the search_engine_id to differentiate instances
-        """
+        """Get the key used to track this skill instance"""
         search_engine_id = self.params.get('search_engine_id', 'default')
         tool_name = self.params.get('tool_name', 'web_search')
         return f"{self.SKILL_NAME}_{search_engine_id}_{tool_name}"
-
+
     def setup(self) -> bool:
         """Setup the web search skill"""
         # Validate required parameters
@@ -173,41 +556,46 @@ class WebSearchSkill(SkillBase):
         if missing_params:
             self.logger.error(f"Missing required parameters: {missing_params}")
             return False
-
+
         if not self.validate_packages():
             return False
-
+
         # Set parameters from config
         self.api_key = self.params['api_key']
         self.search_engine_id = self.params['search_engine_id']
-
+
         # Set default parameters
-        self.default_num_results = self.params.get('num_results', 1)
-        self.default_delay = self.params.get('delay', 0)
+        self.default_num_results = self.params.get('num_results', 3)
+        self.default_delay = self.params.get('delay', 0.5)
         self.max_content_length = self.params.get('max_content_length', 32768)
-        self.no_results_message = self.params.get('no_results_message',
-            "I couldn't find any results for '{query}'. "
-            "This might be due to a very specific query or temporary issues. "
+
+        # Quality control parameters (new)
+        self.oversample_factor = self.params.get('oversample_factor', 2.5)
+        self.min_quality_score = self.params.get('min_quality_score', 0.3)
+
+        self.no_results_message = self.params.get('no_results_message',
+            "I couldn't find quality results for '{query}'. "
+            "The search returned only low-quality or inaccessible pages. "
            "Try rephrasing your search or asking about a different topic."
         )
-
+
         # Tool name (for multiple instances)
         self.tool_name = self.params.get('tool_name', 'web_search')
-
-        # Initialize the search scraper
+
+        # Initialize the improved search scraper
         self.search_scraper = GoogleSearchScraper(
             api_key=self.api_key,
             search_engine_id=self.search_engine_id,
             max_content_length=self.max_content_length
         )
-
+
         return True
-
+
     def register_tools(self) -> None:
         """Register web search tool with the agent"""
         self.define_tool(
             name=self.tool_name,
-            description="Search the web for information on any topic and return detailed results with content from multiple sources",
+            description="Search the web for high-quality information, automatically filtering low-quality results",
             parameters={
                 "query": {
                     "type": "string",
@@ -216,105 +604,96 @@ class WebSearchSkill(SkillBase):
             },
             handler=self._web_search_handler
         )
-
+
     def _web_search_handler(self, args, raw_data):
-        """Handler for web search tool"""
+        """Handler for web search tool with quality filtering"""
         query = args.get("query", "").strip()
-
+
         if not query:
             return SwaigFunctionResult(
                 "Please provide a search query. What would you like me to search for?"
             )
-
-        # Use the configured number of results (no longer a parameter)
+
+        # Use the configured number of results
         num_results = self.default_num_results
-
-        self.logger.info(f"Web search requested: '{query}' ({num_results} results)")
-
-        # Perform the search
+
+        self.logger.info(f"Web search requested: '{query}' (requesting {num_results} quality results)")
+
+        # Perform the improved search
         try:
-            search_results = self.search_scraper.search_and_scrape(
+            search_results = self.search_scraper.search_and_scrape_best(
                 query=query,
                 num_results=num_results,
-                delay=self.default_delay
+                oversample_factor=self.oversample_factor,
+                delay=self.default_delay,
+                min_quality_score=self.min_quality_score
             )
-
-            if not search_results or "No search results found" in search_results:
-                # Format the no results message with the query if it contains a placeholder
+
+            if not search_results or "No quality results found" in search_results or "No search results found" in search_results:
                 formatted_message = self.no_results_message.format(query=query) if '{query}' in self.no_results_message else self.no_results_message
                 return SwaigFunctionResult(formatted_message)
-
-            response = f"Here are {num_results} results for '{query}':\n\nReiterate them to the user in a concise summary format\n\n{search_results}"
+
+            response = f"Quality web search results for '{query}':\n\n{search_results}"
             return SwaigFunctionResult(response)
-
+
         except Exception as e:
             self.logger.error(f"Error performing web search: {e}")
             return SwaigFunctionResult(
                 "Sorry, I encountered an error while searching. Please try again later."
             )
-
+
     def get_hints(self) -> List[str]:
         """Return speech recognition hints"""
-        # Currently no hints provided, but you could add them like:
-        # return [
-        # "Google", "search", "internet", "web", "information",
-        # "find", "look up", "research", "query", "results"
-        # ]
         return []
-
+
     def get_global_data(self) -> Dict[str, Any]:
         """Return global data for agent context"""
         return {
             "web_search_enabled": True,
-            "search_provider": "Google Custom Search"
+            "search_provider": "Google Custom Search",
+            "quality_filtering": True
         }
-
+
     def get_prompt_sections(self) -> List[Dict[str, Any]]:
         """Return prompt sections to add to agent"""
         return [
             {
-                "title": "Web Search Capability",
-                "body": f"You can search the internet for current, accurate information on any topic using the {self.tool_name} tool.",
+                "title": "Web Search Capability (Quality Enhanced)",
+                "body": f"You can search the internet for high-quality information using the {self.tool_name} tool.",
                 "bullets": [
                     f"Use the {self.tool_name} tool when users ask for information you need to look up",
-                    "Search for news, current events, product information, or any current data",
-                    "Summarize search results in a clear, helpful way",
-                    "Include relevant URLs so users can read more if interested"
+                    "The search automatically filters out low-quality results like empty pages",
+                    "Results are ranked by content quality, relevance, and domain reputation",
+                    "Summarize the high-quality results in a clear, helpful way"
                 ]
             }
         ]
-
+
     @classmethod
     def get_parameter_schema(cls) -> Dict[str, Dict[str, Any]]:
-        """
-        Get the parameter schema for the web search skill
-
-        Returns all configurable parameters for web search including
-        API credentials, search settings, and response customization.
-        """
-        # Get base schema from parent
+        """Get the parameter schema for the web search skill"""
         schema = super().get_parameter_schema()
-
+
         # Add web search specific parameters
         schema.update({
             "api_key": {
                 "type": "string",
                 "description": "Google Custom Search API key",
                 "required": True,
-                "hidden": True, # Mark as hidden since it's a secret
+                "hidden": True,
                 "env_var": "GOOGLE_SEARCH_API_KEY"
             },
             "search_engine_id": {
                 "type": "string",
                 "description": "Google Custom Search Engine ID",
                 "required": True,
-                "hidden": True, # Also a secret
+                "hidden": True,
                 "env_var": "GOOGLE_SEARCH_ENGINE_ID"
            },
             "num_results": {
                 "type": "integer",
-                "description": "Default number of search results to return",
-                "default": 1,
+                "description": "Number of high-quality results to return",
+                "default": 3,
                 "required": False,
                 "min": 1,
                 "max": 10
@@ -322,23 +701,39 @@ class WebSearchSkill(SkillBase):
             "delay": {
                 "type": "number",
                 "description": "Delay between scraping pages in seconds",
-                "default": 0,
+                "default": 0.5,
                 "required": False,
                 "min": 0
             },
             "max_content_length": {
                 "type": "integer",
-                "description": "Maximum total response size in characters (distributed across all results)",
+                "description": "Maximum total response size in characters",
                 "default": 32768,
                 "required": False,
                 "min": 1000
             },
+            "oversample_factor": {
+                "type": "number",
+                "description": "How many extra results to fetch for quality filtering (e.g., 2.5 = fetch 2.5x requested)",
+                "default": 2.5,
+                "required": False,
+                "min": 1.0,
+                "max": 3.5
+            },
+            "min_quality_score": {
+                "type": "number",
+                "description": "Minimum quality score (0-1) for including a result",
+                "default": 0.3,
+                "required": False,
+                "min": 0.0,
+                "max": 1.0
+            },
             "no_results_message": {
                 "type": "string",
-                "description": "Message to show when no results are found. Use {query} as placeholder.",
-                "default": "I couldn't find any results for '{query}'. This might be due to a very specific query or temporary issues. Try rephrasing your search or asking about a different topic.",
+                "description": "Message to show when no quality results are found. Use {query} as placeholder.",
+                "default": "I couldn't find quality results for '{query}'. The search returned only low-quality or inaccessible pages. Try rephrasing your search or asking about a different topic.",
                 "required": False
             }
         })
-
-        return schema
+
+        return schema
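
Usage note: a minimal sketch of how the 1.0.0 skill's new quality-control parameters might be supplied when configuring the skill. The parameter names and defaults come from the schema changes in the diff above; the agent setup (AgentBase, add_skill) and the credential placeholders are assumptions about the surrounding SDK, not something this diff shows.

# Hypothetical configuration sketch; AgentBase/add_skill usage and values are assumptions.
from signalwire_agents import AgentBase

agent = AgentBase(name="search-assistant")
agent.add_skill("web_search", {
    "api_key": "<GOOGLE_SEARCH_API_KEY>",             # required, treated as secret
    "search_engine_id": "<GOOGLE_SEARCH_ENGINE_ID>",  # required, treated as secret
    "num_results": 3,          # number of high-quality results to return (new default)
    "delay": 0.5,              # seconds between page scrapes (new default)
    "oversample_factor": 2.5,  # fetch 2.5x the requested results, then filter by quality
    "min_quality_score": 0.3,  # drop results scoring below this threshold
})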