signalwire-agents 0.1.6__py3-none-any.whl → 1.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (140)
  1. signalwire_agents/__init__.py +130 -4
  2. signalwire_agents/agent_server.py +438 -32
  3. signalwire_agents/agents/bedrock.py +296 -0
  4. signalwire_agents/cli/__init__.py +18 -0
  5. signalwire_agents/cli/build_search.py +1367 -0
  6. signalwire_agents/cli/config.py +80 -0
  7. signalwire_agents/cli/core/__init__.py +10 -0
  8. signalwire_agents/cli/core/agent_loader.py +470 -0
  9. signalwire_agents/cli/core/argparse_helpers.py +179 -0
  10. signalwire_agents/cli/core/dynamic_config.py +71 -0
  11. signalwire_agents/cli/core/service_loader.py +303 -0
  12. signalwire_agents/cli/execution/__init__.py +10 -0
  13. signalwire_agents/cli/execution/datamap_exec.py +446 -0
  14. signalwire_agents/cli/execution/webhook_exec.py +134 -0
  15. signalwire_agents/cli/init_project.py +1225 -0
  16. signalwire_agents/cli/output/__init__.py +10 -0
  17. signalwire_agents/cli/output/output_formatter.py +255 -0
  18. signalwire_agents/cli/output/swml_dump.py +186 -0
  19. signalwire_agents/cli/simulation/__init__.py +10 -0
  20. signalwire_agents/cli/simulation/data_generation.py +374 -0
  21. signalwire_agents/cli/simulation/data_overrides.py +200 -0
  22. signalwire_agents/cli/simulation/mock_env.py +282 -0
  23. signalwire_agents/cli/swaig_test_wrapper.py +52 -0
  24. signalwire_agents/cli/test_swaig.py +809 -0
  25. signalwire_agents/cli/types.py +81 -0
  26. signalwire_agents/core/__init__.py +2 -2
  27. signalwire_agents/core/agent/__init__.py +12 -0
  28. signalwire_agents/core/agent/config/__init__.py +12 -0
  29. signalwire_agents/core/agent/deployment/__init__.py +9 -0
  30. signalwire_agents/core/agent/deployment/handlers/__init__.py +9 -0
  31. signalwire_agents/core/agent/prompt/__init__.py +14 -0
  32. signalwire_agents/core/agent/prompt/manager.py +306 -0
  33. signalwire_agents/core/agent/routing/__init__.py +9 -0
  34. signalwire_agents/core/agent/security/__init__.py +9 -0
  35. signalwire_agents/core/agent/swml/__init__.py +9 -0
  36. signalwire_agents/core/agent/tools/__init__.py +15 -0
  37. signalwire_agents/core/agent/tools/decorator.py +97 -0
  38. signalwire_agents/core/agent/tools/registry.py +210 -0
  39. signalwire_agents/core/agent_base.py +959 -2166
  40. signalwire_agents/core/auth_handler.py +233 -0
  41. signalwire_agents/core/config_loader.py +259 -0
  42. signalwire_agents/core/contexts.py +707 -0
  43. signalwire_agents/core/data_map.py +487 -0
  44. signalwire_agents/core/function_result.py +1150 -1
  45. signalwire_agents/core/logging_config.py +376 -0
  46. signalwire_agents/core/mixins/__init__.py +28 -0
  47. signalwire_agents/core/mixins/ai_config_mixin.py +442 -0
  48. signalwire_agents/core/mixins/auth_mixin.py +287 -0
  49. signalwire_agents/core/mixins/prompt_mixin.py +358 -0
  50. signalwire_agents/core/mixins/serverless_mixin.py +368 -0
  51. signalwire_agents/core/mixins/skill_mixin.py +55 -0
  52. signalwire_agents/core/mixins/state_mixin.py +153 -0
  53. signalwire_agents/core/mixins/tool_mixin.py +230 -0
  54. signalwire_agents/core/mixins/web_mixin.py +1134 -0
  55. signalwire_agents/core/security/session_manager.py +174 -86
  56. signalwire_agents/core/security_config.py +333 -0
  57. signalwire_agents/core/skill_base.py +200 -0
  58. signalwire_agents/core/skill_manager.py +244 -0
  59. signalwire_agents/core/swaig_function.py +33 -9
  60. signalwire_agents/core/swml_builder.py +212 -12
  61. signalwire_agents/core/swml_handler.py +43 -13
  62. signalwire_agents/core/swml_renderer.py +123 -297
  63. signalwire_agents/core/swml_service.py +277 -260
  64. signalwire_agents/prefabs/concierge.py +6 -2
  65. signalwire_agents/prefabs/info_gatherer.py +149 -33
  66. signalwire_agents/prefabs/receptionist.py +14 -22
  67. signalwire_agents/prefabs/survey.py +6 -2
  68. signalwire_agents/schema.json +9218 -5489
  69. signalwire_agents/search/__init__.py +137 -0
  70. signalwire_agents/search/document_processor.py +1223 -0
  71. signalwire_agents/search/index_builder.py +804 -0
  72. signalwire_agents/search/migration.py +418 -0
  73. signalwire_agents/search/models.py +30 -0
  74. signalwire_agents/search/pgvector_backend.py +752 -0
  75. signalwire_agents/search/query_processor.py +502 -0
  76. signalwire_agents/search/search_engine.py +1264 -0
  77. signalwire_agents/search/search_service.py +574 -0
  78. signalwire_agents/skills/README.md +452 -0
  79. signalwire_agents/skills/__init__.py +23 -0
  80. signalwire_agents/skills/api_ninjas_trivia/README.md +215 -0
  81. signalwire_agents/skills/api_ninjas_trivia/__init__.py +12 -0
  82. signalwire_agents/skills/api_ninjas_trivia/skill.py +237 -0
  83. signalwire_agents/skills/datasphere/README.md +210 -0
  84. signalwire_agents/skills/datasphere/__init__.py +12 -0
  85. signalwire_agents/skills/datasphere/skill.py +310 -0
  86. signalwire_agents/skills/datasphere_serverless/README.md +258 -0
  87. signalwire_agents/skills/datasphere_serverless/__init__.py +10 -0
  88. signalwire_agents/skills/datasphere_serverless/skill.py +237 -0
  89. signalwire_agents/skills/datetime/README.md +132 -0
  90. signalwire_agents/skills/datetime/__init__.py +10 -0
  91. signalwire_agents/skills/datetime/skill.py +126 -0
  92. signalwire_agents/skills/joke/README.md +149 -0
  93. signalwire_agents/skills/joke/__init__.py +10 -0
  94. signalwire_agents/skills/joke/skill.py +109 -0
  95. signalwire_agents/skills/math/README.md +161 -0
  96. signalwire_agents/skills/math/__init__.py +10 -0
  97. signalwire_agents/skills/math/skill.py +105 -0
  98. signalwire_agents/skills/mcp_gateway/README.md +230 -0
  99. signalwire_agents/skills/mcp_gateway/__init__.py +10 -0
  100. signalwire_agents/skills/mcp_gateway/skill.py +421 -0
  101. signalwire_agents/skills/native_vector_search/README.md +210 -0
  102. signalwire_agents/skills/native_vector_search/__init__.py +10 -0
  103. signalwire_agents/skills/native_vector_search/skill.py +820 -0
  104. signalwire_agents/skills/play_background_file/README.md +218 -0
  105. signalwire_agents/skills/play_background_file/__init__.py +12 -0
  106. signalwire_agents/skills/play_background_file/skill.py +242 -0
  107. signalwire_agents/skills/registry.py +459 -0
  108. signalwire_agents/skills/spider/README.md +236 -0
  109. signalwire_agents/skills/spider/__init__.py +13 -0
  110. signalwire_agents/skills/spider/skill.py +598 -0
  111. signalwire_agents/skills/swml_transfer/README.md +395 -0
  112. signalwire_agents/skills/swml_transfer/__init__.py +10 -0
  113. signalwire_agents/skills/swml_transfer/skill.py +359 -0
  114. signalwire_agents/skills/weather_api/README.md +178 -0
  115. signalwire_agents/skills/weather_api/__init__.py +12 -0
  116. signalwire_agents/skills/weather_api/skill.py +191 -0
  117. signalwire_agents/skills/web_search/README.md +163 -0
  118. signalwire_agents/skills/web_search/__init__.py +10 -0
  119. signalwire_agents/skills/web_search/skill.py +739 -0
  120. signalwire_agents/skills/wikipedia_search/README.md +228 -0
  121. signalwire_agents/{core/state → skills/wikipedia_search}/__init__.py +5 -4
  122. signalwire_agents/skills/wikipedia_search/skill.py +210 -0
  123. signalwire_agents/utils/__init__.py +14 -0
  124. signalwire_agents/utils/schema_utils.py +111 -44
  125. signalwire_agents/web/__init__.py +17 -0
  126. signalwire_agents/web/web_service.py +559 -0
  127. signalwire_agents-1.0.7.data/data/share/man/man1/sw-agent-init.1 +307 -0
  128. signalwire_agents-1.0.7.data/data/share/man/man1/sw-search.1 +483 -0
  129. signalwire_agents-1.0.7.data/data/share/man/man1/swaig-test.1 +308 -0
  130. signalwire_agents-1.0.7.dist-info/METADATA +992 -0
  131. signalwire_agents-1.0.7.dist-info/RECORD +142 -0
  132. {signalwire_agents-0.1.6.dist-info → signalwire_agents-1.0.7.dist-info}/WHEEL +1 -1
  133. signalwire_agents-1.0.7.dist-info/entry_points.txt +4 -0
  134. signalwire_agents/core/state/file_state_manager.py +0 -219
  135. signalwire_agents/core/state/state_manager.py +0 -101
  136. signalwire_agents-0.1.6.data/data/schema.json +0 -5611
  137. signalwire_agents-0.1.6.dist-info/METADATA +0 -199
  138. signalwire_agents-0.1.6.dist-info/RECORD +0 -34
  139. {signalwire_agents-0.1.6.dist-info → signalwire_agents-1.0.7.dist-info}/licenses/LICENSE +0 -0
  140. {signalwire_agents-0.1.6.dist-info → signalwire_agents-1.0.7.dist-info}/top_level.txt +0 -0
signalwire_agents/skills/web_search/skill.py
@@ -0,0 +1,739 @@
+ """
+ Copyright (c) 2025 SignalWire
+
+ This file is part of the SignalWire AI Agents SDK.
+
+ Licensed under the MIT License.
+ See LICENSE file in the project root for full license information.
+ """
+
+ import os
+ import requests
+ import time
+ import re
+ from urllib.parse import urljoin, urlparse
+ from bs4 import BeautifulSoup
+ import json
+ from typing import Optional, List, Dict, Any, Tuple
+
+ from signalwire_agents.core.skill_base import SkillBase
+ from signalwire_agents.core.function_result import SwaigFunctionResult
+
+ class GoogleSearchScraper:
+     """Google Search and Web Scraping functionality with quality scoring"""
+
+     def __init__(self, api_key: str, search_engine_id: str, max_content_length: int = 32768):
+         self.api_key = api_key
+         self.search_engine_id = search_engine_id
+         self.max_content_length = max_content_length
+         self.session = requests.Session()
+         self.session.headers.update({
+             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+         })
+
+     def search_google(self, query: str, num_results: int = 5) -> list:
+         """Search Google using Custom Search JSON API"""
+         url = "https://www.googleapis.com/customsearch/v1"
+
+         params = {
+             'key': self.api_key,
+             'cx': self.search_engine_id,
+             'q': query,
+             'num': min(num_results, 10)
+         }
+
+         try:
+             response = self.session.get(url, params=params)
+             response.raise_for_status()
+             data = response.json()
+
+             if 'items' not in data:
+                 return []
+
+             results = []
+             for item in data['items'][:num_results]:
+                 results.append({
+                     'title': item.get('title', ''),
+                     'url': item.get('link', ''),
+                     'snippet': item.get('snippet', '')
+                 })
+
+             return results
+
+         except Exception as e:
+             return []
+
+     def is_reddit_url(self, url: str) -> bool:
+         """Check if URL is from Reddit"""
+         domain = urlparse(url).netloc.lower()
+         return 'reddit.com' in domain or 'redd.it' in domain
+
+     def extract_reddit_content(self, url: str, content_limit: int = None, timeout: int = 10) -> Tuple[str, Dict[str, Any]]:
+         """
+         Extract Reddit content using JSON API for better quality
+
+         Returns:
+             Tuple of (text_content, quality_metrics)
+         """
+         try:
+             # Convert to JSON endpoint
+             if not url.endswith('.json'):
+                 json_url = url.rstrip('/') + '.json'
+             else:
+                 json_url = url
+
+             # Fetch with proper headers (Reddit requires User-Agent)
+             headers = {'User-Agent': 'SignalWire-WebSearch/2.0'}
+             response = requests.get(json_url, headers=headers, timeout=timeout)
+             response.raise_for_status()
+
+             data = response.json()
+
+             # Extract post information
+             if not data or not isinstance(data, list) or len(data) < 1:
+                 return "", {"error": "Invalid Reddit JSON structure", "quality_score": 0}
+
+             # First element is the post, second (if exists) contains comments
+             post_data = data[0]['data']['children'][0]['data']
+
+             # Build content from post
+             content_parts = []
+
+             # Add post title and metadata
+             title = post_data.get('title', 'No title')
+             author = post_data.get('author', 'unknown')
+             score = post_data.get('score', 0)
+             num_comments = post_data.get('num_comments', 0)
+             subreddit = post_data.get('subreddit', '')
+
+             content_parts.append(f"Reddit r/{subreddit} Discussion")
+             content_parts.append(f"\nPost: {title}")
+             content_parts.append(f"Author: {author} | Score: {score} | Comments: {num_comments}")
+
+             # Add original post text if it's a text post
+             selftext = post_data.get('selftext', '').strip()
+             if selftext and selftext != '[removed]' and selftext != '[deleted]':
+                 content_parts.append(f"\nOriginal Post:\n{selftext[:1000]}")  # Limit post text
+
+             # Extract top comments if available
+             valid_comments = []
+             if len(data) > 1 and 'data' in data[1] and 'children' in data[1]['data']:
+                 comments = data[1]['data']['children']
+
+                 # Filter and sort comments by score
+                 for comment in comments[:20]:  # Look at top 20 comments
+                     if comment.get('kind') == 't1':  # t1 = comment
+                         comment_data = comment.get('data', {})
+                         body = comment_data.get('body', '').strip()
+                         if (body and
+                             body != '[removed]' and
+                             body != '[deleted]' and
+                             len(body) > 50):  # Skip very short comments
+                             valid_comments.append({
+                                 'body': body,
+                                 'author': comment_data.get('author', 'unknown'),
+                                 'score': comment_data.get('score', 0)
+                             })
+
+                 # Sort by score and take top comments
+                 valid_comments.sort(key=lambda x: x['score'], reverse=True)
+
+                 if valid_comments:
+                     content_parts.append("\n--- Top Discussion ---")
+                     for i, comment in enumerate(valid_comments[:5], 1):
+                         # Truncate long comments
+                         comment_text = comment['body'][:500]
+                         if len(comment['body']) > 500:
+                             comment_text += "..."
+
+                         content_parts.append(f"\nComment {i} (Score: {comment['score']}, Author: {comment['author']}):")
+                         content_parts.append(comment_text)
+
+             # Join all content
+             text = '\n'.join(content_parts)
+
+             # Calculate quality metrics specifically for Reddit content
+             metrics = {
+                 'text_length': len(text),
+                 'score': score,
+                 'num_comments': num_comments,
+                 'domain': urlparse(url).netloc.lower(),
+                 'is_reddit': True
+             }
+
+             # Quality score based on Reddit-specific factors
+             length_score = min(1.0, len(text) / 2000)  # Want at least 2000 chars
+             engagement_score = min(1.0, (score + num_comments) / 100)  # High engagement is good
+             has_comments = 1.0 if len(valid_comments) > 0 else 0.3  # Heavily penalize if no good comments
+
+             quality_score = (
+                 length_score * 0.4 +
+                 engagement_score * 0.3 +
+                 has_comments * 0.3
+             )
+
+             metrics['quality_score'] = round(quality_score, 3)
+
+             # Limit content if needed
+             limit = content_limit if content_limit is not None else self.max_content_length
+             if len(text) > limit:
+                 text = text[:limit]
+
+             return text, metrics
+
+         except Exception as e:
+             # Fall back to HTML extraction if JSON fails
+             return self.extract_html_content(url, content_limit, timeout)
+
+     def extract_text_from_url(self, url: str, content_limit: int = None, timeout: int = 10) -> Tuple[str, Dict[str, Any]]:
+         """
+         Main extraction method that routes to appropriate extractor
+
+         Returns:
+             Tuple of (text_content, quality_metrics)
+         """
+         if self.is_reddit_url(url):
+             return self.extract_reddit_content(url, content_limit, timeout)
+         else:
+             return self.extract_html_content(url, content_limit, timeout)
+
+     def extract_html_content(self, url: str, content_limit: int = None, timeout: int = 10) -> Tuple[str, Dict[str, Any]]:
+         """
+         Original HTML extraction method (renamed from extract_text_from_url)
+         """
+         try:
+             response = self.session.get(url, timeout=timeout)
+             response.raise_for_status()
+
+             soup = BeautifulSoup(response.content, 'html.parser')
+
+             # Extract main content areas (common content selectors)
+             main_content = None
+             content_selectors = [
+                 'article', 'main', '[role="main"]', '.content', '#content',
+                 '.post', '.entry-content', '.article-body', '.story-body',
+                 '.markdown-body', '.wiki-body', '.documentation'
+             ]
+
+             for selector in content_selectors:
+                 main_content = soup.select_one(selector)
+                 if main_content:
+                     break
+
+             # If no main content found, use the whole body
+             if not main_content:
+                 main_content = soup.find('body') or soup
+
+             # Clone for processing
+             content_soup = BeautifulSoup(str(main_content), 'html.parser')
+
+             # Remove unwanted elements from the content area
+             unwanted_tags = ["script", "style", "nav", "footer", "header", "aside", "noscript", "iframe"]
+             for tag in unwanted_tags:
+                 for element in content_soup(tag):
+                     element.decompose()
+
+             # Remove elements with unwanted classes/ids
+             unwanted_patterns = [
+                 'sidebar', 'navigation', 'menu', 'advertisement', 'ads', 'banner',
+                 'popup', 'modal', 'cookie', 'gdpr', 'subscribe', 'newsletter',
+                 'comments', 'related', 'share', 'social'
+             ]
+
+             for pattern in unwanted_patterns:
+                 # Remove by class
+                 for element in content_soup.find_all(class_=re.compile(pattern, re.I)):
+                     element.decompose()
+                 # Remove by id
+                 for element in content_soup.find_all(id=re.compile(pattern, re.I)):
+                     element.decompose()
+
+             # Extract text
+             text = content_soup.get_text()
+
+             # Clean up the text
+             lines = [line.strip() for line in text.splitlines()]
+             # Remove empty lines and join
+             lines = [line for line in lines if line]
+             text = ' '.join(lines)
+
+             # Remove excessive whitespace
+             text = re.sub(r'\s+', ' ', text).strip()
+
+             # Calculate quality metrics (need to pass query for relevance)
+             quality_metrics = self._calculate_content_quality(text, url, "")
+
+             # Limit text length
+             limit = content_limit if content_limit is not None else self.max_content_length
+             if len(text) > limit:
+                 text = text[:limit]
+
+             return text, quality_metrics
+
+         except Exception as e:
+             return "", {"error": str(e), "quality_score": 0}
+
+     def _calculate_content_quality(self, text: str, url: str, query: str = "") -> Dict[str, Any]:
+         """
+         Calculate quality metrics for extracted content
+
+         Quality factors:
+         - Text length (substantive content)
+         - Word diversity (not repetitive)
+         - Sentence structure (proper formatting)
+         - Lack of boilerplate phrases
+         - Domain reputation
+         """
+         if not text:
+             return {"quality_score": 0, "text_length": 0}
+
+         metrics = {}
+
+         # Text length (MUCH stricter - prefer 2000-10000 chars of actual content)
+         text_length = len(text)
+         metrics['text_length'] = text_length
+         if text_length < 500:
+             length_score = 0  # Too short to be useful
+         elif text_length < 2000:
+             length_score = (text_length - 500) / 1500 * 0.5  # Scale to 0.5 max
+         elif text_length <= 10000:
+             length_score = 1.0  # Ideal range
+         else:
+             length_score = max(0.8, 1.0 - (text_length - 10000) / 20000)
+
+         # Word diversity (unique words / total words)
+         words = text.lower().split()
+         if words:
+             unique_words = len(set(words))
+             total_words = len(words)
+             diversity_score = min(1.0, unique_words / (total_words * 0.3))  # Expect 30% unique
+             metrics['word_diversity'] = unique_words / total_words if total_words > 0 else 0
+         else:
+             diversity_score = 0
+             metrics['word_diversity'] = 0
+
+         # Check for boilerplate/navigation text (MUCH stricter)
+         boilerplate_phrases = [
+             'cookie', 'privacy policy', 'terms of service', 'subscribe',
+             'sign up', 'log in', 'advertisement', 'sponsored', 'copyright',
+             'all rights reserved', 'skip to', 'navigation', 'breadcrumb',
+             'reddit inc', 'google llc', 'expand navigation', 'members •',
+             'archived post', 'votes cannot be cast', 'r/', 'subreddit',
+             'youtube', 'facebook', 'twitter', 'instagram', 'pinterest'
+         ]
+
+         lower_text = text.lower()
+         boilerplate_count = sum(1 for phrase in boilerplate_phrases if phrase in lower_text)
+         boilerplate_penalty = max(0, 1.0 - (boilerplate_count * 0.15))  # -15% per boilerplate phrase
+         metrics['boilerplate_count'] = boilerplate_count
+
+         # Sentence detection (need MORE sentences for quality content)
+         sentences = re.split(r'[.!?]+', text)
+         sentence_count = len([s for s in sentences if len(s.strip()) > 30])  # Longer min sentence
+         sentence_score = min(1.0, sentence_count / 10)  # Need at least 10 proper sentences
+         metrics['sentence_count'] = sentence_count
+
+         # Domain quality (MUCH harsher on social media)
+         domain = urlparse(url).netloc.lower()
+         quality_domains = [
+             'wikipedia.org', 'starwars.fandom.com', 'imdb.com',
+             'screenrant.com', 'denofgeek.com', 'ign.com',
+             'hollywoodreporter.com', 'variety.com', 'ew.com',
+             'stackexchange.com', 'stackoverflow.com',
+             'github.com', 'medium.com', 'dev.to', 'arxiv.org',
+             'nature.com', 'sciencedirect.com', 'ieee.org'
+         ]
+
+         low_quality_domains = [
+             'reddit.com', 'youtube.com', 'facebook.com', 'twitter.com',
+             'instagram.com', 'pinterest.com', 'tiktok.com', 'x.com'
+         ]
+
+         if any(d in domain for d in quality_domains):
+             domain_score = 1.5  # Strong bonus for quality domains
+         elif any(d in domain for d in low_quality_domains):
+             domain_score = 0.1  # Severe penalty for social media
+         else:
+             domain_score = 1.0
+
+         metrics['domain'] = domain
+
+         # Query relevance scoring - check for query terms in content
+         relevance_score = 0
+         if query:
+             # Split query into meaningful words (skip common words)
+             stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were'}
+             query_words = [w.lower() for w in query.split() if w.lower() not in stop_words and len(w) > 2]
+
+             if query_words:
+                 lower_content = text.lower()
+                 # Count how many query words appear in content
+                 words_found = sum(1 for word in query_words if word in lower_content)
+
+                 # Also check for exact phrase matches (bonus points)
+                 exact_phrase_bonus = 0
+                 if len(query_words) > 1:
+                     # Check for consecutive word pairs
+                     for i in range(len(query_words) - 1):
+                         phrase = f"{query_words[i]} {query_words[i+1]}"
+                         if phrase in lower_content:
+                             exact_phrase_bonus += 0.2
+
+                 relevance_score = min(1.0, (words_found / len(query_words)) + exact_phrase_bonus)
+                 metrics['query_relevance'] = round(relevance_score, 3)
+                 metrics['query_words_found'] = f"{words_found}/{len(query_words)}"
+         else:
+             relevance_score = 0.5  # Neutral if no query provided
+             metrics['query_relevance'] = 0.5
+
+         # Calculate final quality score (adjusted weights with relevance)
+         quality_score = (
+             length_score * 0.25 +        # Length important
+             diversity_score * 0.10 +     # Less weight
+             boilerplate_penalty * 0.10 + # Less weight
+             sentence_score * 0.15 +      # Sentences important
+             domain_score * 0.15 +        # Domain important
+             relevance_score * 0.25       # Query relevance important
+         )
+
+         metrics['quality_score'] = round(quality_score, 3)
+         metrics['length_score'] = round(length_score, 3)
+         metrics['diversity_score'] = round(diversity_score, 3)
+         metrics['boilerplate_penalty'] = round(boilerplate_penalty, 3)
+         metrics['sentence_score'] = round(sentence_score, 3)
+         metrics['domain_score'] = round(domain_score, 3)
+
+         return metrics
+
+     def search_and_scrape_best(self, query: str, num_results: int = 3,
+                                oversample_factor: float = 4.0, delay: float = 0.5,
+                                min_quality_score: float = 0.2) -> str:
+         """
+         Search and scrape with quality filtering and source diversity
+
+         Args:
+             query: Search query
+             num_results: Number of best results to return
+             oversample_factor: How many extra results to fetch (e.g., 4.0 = fetch 4x)
+             delay: Delay between requests
+             min_quality_score: Minimum quality score to include a result
+
+         Returns:
+             Formatted string with the best N results from diverse sources
+         """
+         # Fetch more results than requested (increased to 4x for better pool)
+         fetch_count = min(10, int(num_results * oversample_factor))
+         search_results = self.search_google(query, fetch_count)
+
+         if not search_results:
+             return f"No search results found for query: {query}"
+
+         # Process all results and collect quality metrics
+         processed_results = []
+
+         for result in search_results:
+             # Extract content and quality metrics (pass query for relevance scoring)
+             page_text, metrics = self.extract_text_from_url(result['url'])
+
+             # Recalculate metrics with query relevance
+             if page_text:
+                 metrics = self._calculate_content_quality(page_text, result['url'], query)
+
+             if metrics.get('quality_score', 0) >= min_quality_score and page_text:
+                 processed_results.append({
+                     'title': result['title'],
+                     'url': result['url'],
+                     'snippet': result['snippet'],
+                     'content': page_text,
+                     'metrics': metrics,
+                     'quality_score': metrics.get('quality_score', 0),
+                     'domain': metrics.get('domain', '')
+                 })
+
+             # Small delay between requests
+             if delay > 0:
+                 time.sleep(delay)
+
+         if not processed_results:
+             return f"No quality results found for query: {query}. All results were below quality threshold."
+
+         # Sort by quality score
+         processed_results.sort(key=lambda x: x['quality_score'], reverse=True)
+
+         # Select diverse results (prefer different domains)
+         best_results = []
+         seen_domains = set()
+
+         # First pass: Add highest quality result from each unique domain
+         for result in processed_results:
+             domain = result['domain']
+             if domain not in seen_domains and len(best_results) < num_results:
+                 best_results.append(result)
+                 seen_domains.add(domain)
+
+         # Second pass: If we need more results, add remaining high-quality ones
+         if len(best_results) < num_results:
+             for result in processed_results:
+                 if result not in best_results and len(best_results) < num_results:
+                     best_results.append(result)
+
+         if not best_results:
+             return f"No quality results found for query: {query}. Try a different search term."
+
+         # Calculate per-result content budget for the best results
+         estimated_overhead_per_result = 400  # Including quality info
+         total_overhead = len(best_results) * estimated_overhead_per_result
+         available_for_content = self.max_content_length - total_overhead
+         per_result_limit = max(2000, available_for_content // len(best_results))  # Increased minimum
+
+         # Format the best results
+         all_text = []
+         all_text.append(f"Found {len(processed_results)} results meeting quality threshold from {len(search_results)} searched.")
+         all_text.append(f"Showing top {len(best_results)} from diverse sources:\n")
+
+         for i, result in enumerate(best_results, 1):
+             text_content = f"=== RESULT {i} (Quality: {result['quality_score']:.2f}) ===\n"
+             text_content += f"Title: {result['title']}\n"
+             text_content += f"URL: {result['url']}\n"
+             text_content += f"Source: {result['domain']}\n"
+             text_content += f"Snippet: {result['snippet']}\n"
+
+             # Add quality indicators
+             metrics = result['metrics']
+             text_content += f"Content Stats: {metrics.get('text_length', 0)} chars, "
+             text_content += f"{metrics.get('sentence_count', 0)} sentences\n"
+             text_content += f"Query Relevance: {metrics.get('query_relevance', 0):.2f} "
+             text_content += f"(keywords: {metrics.get('query_words_found', 'N/A')})\n"
+             text_content += f"Content:\n"
+
+             # Truncate content if needed
+             content = result['content']
+             if len(content) > per_result_limit:
+                 content = content[:per_result_limit] + "..."
+             text_content += content
+
+             text_content += f"\n{'='*50}\n\n"
+             all_text.append(text_content)
+
+         return '\n'.join(all_text)
+
+     def search_and_scrape(self, query: str, num_results: int = 3, delay: float = 0.5) -> str:
+         """
+         Backward compatible method that uses the improved search
+         """
+         return self.search_and_scrape_best(
+             query=query,
+             num_results=num_results,
+             oversample_factor=4.0,
+             delay=delay,
+             min_quality_score=0.2
+         )
+
+
+ class WebSearchSkill(SkillBase):
+     """Web search capability using Google Custom Search API with quality filtering"""
+
+     SKILL_NAME = "web_search"
+     SKILL_DESCRIPTION = "Search the web for information using Google Custom Search API"
+     SKILL_VERSION = "2.0.0"  # Bumped version for improved functionality
+     REQUIRED_PACKAGES = ["bs4", "requests"]
+     REQUIRED_ENV_VARS = []
+
+     # Enable multiple instances support
+     SUPPORTS_MULTIPLE_INSTANCES = True
+
+     def get_instance_key(self) -> str:
+         """Get the key used to track this skill instance"""
+         search_engine_id = self.params.get('search_engine_id', 'default')
+         tool_name = self.params.get('tool_name', 'web_search')
+         return f"{self.SKILL_NAME}_{search_engine_id}_{tool_name}"
+
+     def setup(self) -> bool:
+         """Setup the web search skill"""
+         # Validate required parameters
+         required_params = ['api_key', 'search_engine_id']
+         missing_params = [param for param in required_params if not self.params.get(param)]
+         if missing_params:
+             self.logger.error(f"Missing required parameters: {missing_params}")
+             return False
+
+         if not self.validate_packages():
+             return False
+
+         # Set parameters from config
+         self.api_key = self.params['api_key']
+         self.search_engine_id = self.params['search_engine_id']
+
+         # Set default parameters
+         self.default_num_results = self.params.get('num_results', 3)
+         self.default_delay = self.params.get('delay', 0.5)
+         self.max_content_length = self.params.get('max_content_length', 32768)
+
+         # Quality control parameters (new)
+         self.oversample_factor = self.params.get('oversample_factor', 2.5)
+         self.min_quality_score = self.params.get('min_quality_score', 0.3)
+
+         self.no_results_message = self.params.get('no_results_message',
+             "I couldn't find quality results for '{query}'. "
+             "The search returned only low-quality or inaccessible pages. "
+             "Try rephrasing your search or asking about a different topic."
+         )
+
+         # Tool name (for multiple instances)
+         self.tool_name = self.params.get('tool_name', 'web_search')
+
+         # Initialize the improved search scraper
+         self.search_scraper = GoogleSearchScraper(
+             api_key=self.api_key,
+             search_engine_id=self.search_engine_id,
+             max_content_length=self.max_content_length
+         )
+
+         return True
+
+     def register_tools(self) -> None:
+         """Register web search tool with the agent"""
+         self.define_tool(
+             name=self.tool_name,
+             description="Search the web for high-quality information, automatically filtering low-quality results",
+             parameters={
+                 "query": {
+                     "type": "string",
+                     "description": "The search query - what you want to find information about"
+                 }
+             },
+             handler=self._web_search_handler
+         )
+
+     def _web_search_handler(self, args, raw_data):
+         """Handler for web search tool with quality filtering"""
+         query = args.get("query", "").strip()
+
+         if not query:
+             return SwaigFunctionResult(
+                 "Please provide a search query. What would you like me to search for?"
+             )
+
+         # Use the configured number of results
+         num_results = self.default_num_results
+
+         self.logger.info(f"Web search requested: '{query}' (requesting {num_results} quality results)")
+
+         # Perform the improved search
+         try:
+             search_results = self.search_scraper.search_and_scrape_best(
+                 query=query,
+                 num_results=num_results,
+                 oversample_factor=self.oversample_factor,
+                 delay=self.default_delay,
+                 min_quality_score=self.min_quality_score
+             )
+
+             if not search_results or "No quality results found" in search_results or "No search results found" in search_results:
+                 formatted_message = self.no_results_message.format(query=query) if '{query}' in self.no_results_message else self.no_results_message
+                 return SwaigFunctionResult(formatted_message)
+
+             response = f"Quality web search results for '{query}':\n\n{search_results}"
+             return SwaigFunctionResult(response)
+
+         except Exception as e:
+             self.logger.error(f"Error performing web search: {e}")
+             return SwaigFunctionResult(
+                 "Sorry, I encountered an error while searching. Please try again later."
+             )
+
+     def get_hints(self) -> List[str]:
+         """Return speech recognition hints"""
+         return []
+
+     def get_global_data(self) -> Dict[str, Any]:
+         """Return global data for agent context"""
+         return {
+             "web_search_enabled": True,
+             "search_provider": "Google Custom Search",
+             "quality_filtering": True
+         }
+
+     def get_prompt_sections(self) -> List[Dict[str, Any]]:
+         """Return prompt sections to add to agent"""
+         return [
+             {
+                 "title": "Web Search Capability (Quality Enhanced)",
+                 "body": f"You can search the internet for high-quality information using the {self.tool_name} tool.",
+                 "bullets": [
+                     f"Use the {self.tool_name} tool when users ask for information you need to look up",
+                     "The search automatically filters out low-quality results like empty pages",
+                     "Results are ranked by content quality, relevance, and domain reputation",
+                     "Summarize the high-quality results in a clear, helpful way"
+                 ]
+             }
+         ]
+
+     @classmethod
+     def get_parameter_schema(cls) -> Dict[str, Dict[str, Any]]:
+         """Get the parameter schema for the web search skill"""
+         schema = super().get_parameter_schema()
+
+         # Add web search specific parameters
+         schema.update({
+             "api_key": {
+                 "type": "string",
+                 "description": "Google Custom Search API key",
+                 "required": True,
+                 "hidden": True,
+                 "env_var": "GOOGLE_SEARCH_API_KEY"
+             },
+             "search_engine_id": {
+                 "type": "string",
+                 "description": "Google Custom Search Engine ID",
+                 "required": True,
+                 "hidden": True,
+                 "env_var": "GOOGLE_SEARCH_ENGINE_ID"
+             },
+             "num_results": {
+                 "type": "integer",
+                 "description": "Number of high-quality results to return",
+                 "default": 3,
+                 "required": False,
+                 "min": 1,
+                 "max": 10
+             },
+             "delay": {
+                 "type": "number",
+                 "description": "Delay between scraping pages in seconds",
+                 "default": 0.5,
+                 "required": False,
+                 "min": 0
+             },
+             "max_content_length": {
+                 "type": "integer",
+                 "description": "Maximum total response size in characters",
+                 "default": 32768,
+                 "required": False,
+                 "min": 1000
+             },
+             "oversample_factor": {
+                 "type": "number",
+                 "description": "How many extra results to fetch for quality filtering (e.g., 2.5 = fetch 2.5x requested)",
+                 "default": 2.5,
+                 "required": False,
+                 "min": 1.0,
+                 "max": 3.5
+             },
+             "min_quality_score": {
+                 "type": "number",
+                 "description": "Minimum quality score (0-1) for including a result",
+                 "default": 0.3,
+                 "required": False,
+                 "min": 0.0,
+                 "max": 1.0
+             },
+             "no_results_message": {
+                 "type": "string",
+                 "description": "Message to show when no quality results are found. Use {query} as placeholder.",
+                 "default": "I couldn't find quality results for '{query}'. The search returned only low-quality or inaccessible pages. Try rephrasing your search or asking about a different topic.",
+                 "required": False
+             }
+         })
+
+         return schema