signalwire-agents 0.1.13__py3-none-any.whl → 1.0.17.dev4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- signalwire_agents/__init__.py +99 -15
- signalwire_agents/agent_server.py +248 -60
- signalwire_agents/agents/bedrock.py +296 -0
- signalwire_agents/cli/__init__.py +9 -0
- signalwire_agents/cli/build_search.py +951 -41
- signalwire_agents/cli/config.py +80 -0
- signalwire_agents/cli/core/__init__.py +10 -0
- signalwire_agents/cli/core/agent_loader.py +470 -0
- signalwire_agents/cli/core/argparse_helpers.py +179 -0
- signalwire_agents/cli/core/dynamic_config.py +71 -0
- signalwire_agents/cli/core/service_loader.py +303 -0
- signalwire_agents/cli/dokku.py +2320 -0
- signalwire_agents/cli/execution/__init__.py +10 -0
- signalwire_agents/cli/execution/datamap_exec.py +446 -0
- signalwire_agents/cli/execution/webhook_exec.py +134 -0
- signalwire_agents/cli/init_project.py +2636 -0
- signalwire_agents/cli/output/__init__.py +10 -0
- signalwire_agents/cli/output/output_formatter.py +255 -0
- signalwire_agents/cli/output/swml_dump.py +186 -0
- signalwire_agents/cli/simulation/__init__.py +10 -0
- signalwire_agents/cli/simulation/data_generation.py +374 -0
- signalwire_agents/cli/simulation/data_overrides.py +200 -0
- signalwire_agents/cli/simulation/mock_env.py +282 -0
- signalwire_agents/cli/swaig_test_wrapper.py +52 -0
- signalwire_agents/cli/test_swaig.py +566 -2366
- signalwire_agents/cli/types.py +81 -0
- signalwire_agents/core/__init__.py +2 -2
- signalwire_agents/core/agent/__init__.py +12 -0
- signalwire_agents/core/agent/config/__init__.py +12 -0
- signalwire_agents/core/agent/deployment/__init__.py +9 -0
- signalwire_agents/core/agent/deployment/handlers/__init__.py +9 -0
- signalwire_agents/core/agent/prompt/__init__.py +14 -0
- signalwire_agents/core/agent/prompt/manager.py +306 -0
- signalwire_agents/core/agent/routing/__init__.py +9 -0
- signalwire_agents/core/agent/security/__init__.py +9 -0
- signalwire_agents/core/agent/swml/__init__.py +9 -0
- signalwire_agents/core/agent/tools/__init__.py +15 -0
- signalwire_agents/core/agent/tools/decorator.py +97 -0
- signalwire_agents/core/agent/tools/registry.py +210 -0
- signalwire_agents/core/agent_base.py +845 -2916
- signalwire_agents/core/auth_handler.py +233 -0
- signalwire_agents/core/config_loader.py +259 -0
- signalwire_agents/core/contexts.py +418 -0
- signalwire_agents/core/data_map.py +3 -15
- signalwire_agents/core/function_result.py +116 -44
- signalwire_agents/core/logging_config.py +162 -18
- signalwire_agents/core/mixins/__init__.py +28 -0
- signalwire_agents/core/mixins/ai_config_mixin.py +442 -0
- signalwire_agents/core/mixins/auth_mixin.py +280 -0
- signalwire_agents/core/mixins/prompt_mixin.py +358 -0
- signalwire_agents/core/mixins/serverless_mixin.py +460 -0
- signalwire_agents/core/mixins/skill_mixin.py +55 -0
- signalwire_agents/core/mixins/state_mixin.py +153 -0
- signalwire_agents/core/mixins/tool_mixin.py +230 -0
- signalwire_agents/core/mixins/web_mixin.py +1142 -0
- signalwire_agents/core/security_config.py +333 -0
- signalwire_agents/core/skill_base.py +84 -1
- signalwire_agents/core/skill_manager.py +62 -20
- signalwire_agents/core/swaig_function.py +18 -5
- signalwire_agents/core/swml_builder.py +207 -11
- signalwire_agents/core/swml_handler.py +27 -21
- signalwire_agents/core/swml_renderer.py +123 -312
- signalwire_agents/core/swml_service.py +171 -203
- signalwire_agents/mcp_gateway/__init__.py +29 -0
- signalwire_agents/mcp_gateway/gateway_service.py +564 -0
- signalwire_agents/mcp_gateway/mcp_manager.py +513 -0
- signalwire_agents/mcp_gateway/session_manager.py +218 -0
- signalwire_agents/prefabs/concierge.py +0 -3
- signalwire_agents/prefabs/faq_bot.py +0 -3
- signalwire_agents/prefabs/info_gatherer.py +0 -3
- signalwire_agents/prefabs/receptionist.py +0 -3
- signalwire_agents/prefabs/survey.py +0 -3
- signalwire_agents/schema.json +9218 -5489
- signalwire_agents/search/__init__.py +7 -1
- signalwire_agents/search/document_processor.py +490 -31
- signalwire_agents/search/index_builder.py +307 -37
- signalwire_agents/search/migration.py +418 -0
- signalwire_agents/search/models.py +30 -0
- signalwire_agents/search/pgvector_backend.py +748 -0
- signalwire_agents/search/query_processor.py +162 -31
- signalwire_agents/search/search_engine.py +916 -35
- signalwire_agents/search/search_service.py +376 -53
- signalwire_agents/skills/README.md +452 -0
- signalwire_agents/skills/__init__.py +14 -2
- signalwire_agents/skills/api_ninjas_trivia/README.md +215 -0
- signalwire_agents/skills/api_ninjas_trivia/__init__.py +12 -0
- signalwire_agents/skills/api_ninjas_trivia/skill.py +237 -0
- signalwire_agents/skills/datasphere/README.md +210 -0
- signalwire_agents/skills/datasphere/skill.py +84 -3
- signalwire_agents/skills/datasphere_serverless/README.md +258 -0
- signalwire_agents/skills/datasphere_serverless/__init__.py +9 -0
- signalwire_agents/skills/datasphere_serverless/skill.py +82 -1
- signalwire_agents/skills/datetime/README.md +132 -0
- signalwire_agents/skills/datetime/__init__.py +9 -0
- signalwire_agents/skills/datetime/skill.py +20 -7
- signalwire_agents/skills/joke/README.md +149 -0
- signalwire_agents/skills/joke/__init__.py +9 -0
- signalwire_agents/skills/joke/skill.py +21 -0
- signalwire_agents/skills/math/README.md +161 -0
- signalwire_agents/skills/math/__init__.py +9 -0
- signalwire_agents/skills/math/skill.py +18 -4
- signalwire_agents/skills/mcp_gateway/README.md +230 -0
- signalwire_agents/skills/mcp_gateway/__init__.py +10 -0
- signalwire_agents/skills/mcp_gateway/skill.py +421 -0
- signalwire_agents/skills/native_vector_search/README.md +210 -0
- signalwire_agents/skills/native_vector_search/__init__.py +9 -0
- signalwire_agents/skills/native_vector_search/skill.py +569 -101
- signalwire_agents/skills/play_background_file/README.md +218 -0
- signalwire_agents/skills/play_background_file/__init__.py +12 -0
- signalwire_agents/skills/play_background_file/skill.py +242 -0
- signalwire_agents/skills/registry.py +395 -40
- signalwire_agents/skills/spider/README.md +236 -0
- signalwire_agents/skills/spider/__init__.py +13 -0
- signalwire_agents/skills/spider/skill.py +598 -0
- signalwire_agents/skills/swml_transfer/README.md +395 -0
- signalwire_agents/skills/swml_transfer/__init__.py +10 -0
- signalwire_agents/skills/swml_transfer/skill.py +359 -0
- signalwire_agents/skills/weather_api/README.md +178 -0
- signalwire_agents/skills/weather_api/__init__.py +12 -0
- signalwire_agents/skills/weather_api/skill.py +191 -0
- signalwire_agents/skills/web_search/README.md +163 -0
- signalwire_agents/skills/web_search/__init__.py +9 -0
- signalwire_agents/skills/web_search/skill.py +586 -112
- signalwire_agents/skills/wikipedia_search/README.md +228 -0
- signalwire_agents/{core/state → skills/wikipedia_search}/__init__.py +5 -4
- signalwire_agents/skills/{wikipedia → wikipedia_search}/skill.py +33 -3
- signalwire_agents/web/__init__.py +17 -0
- signalwire_agents/web/web_service.py +559 -0
- signalwire_agents-1.0.17.dev4.data/data/share/man/man1/sw-agent-init.1 +400 -0
- signalwire_agents-1.0.17.dev4.data/data/share/man/man1/sw-search.1 +483 -0
- signalwire_agents-1.0.17.dev4.data/data/share/man/man1/swaig-test.1 +308 -0
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/METADATA +347 -215
- signalwire_agents-1.0.17.dev4.dist-info/RECORD +147 -0
- signalwire_agents-1.0.17.dev4.dist-info/entry_points.txt +6 -0
- signalwire_agents/core/state/file_state_manager.py +0 -219
- signalwire_agents/core/state/state_manager.py +0 -101
- signalwire_agents/skills/wikipedia/__init__.py +0 -9
- signalwire_agents-0.1.13.data/data/schema.json +0 -5611
- signalwire_agents-0.1.13.dist-info/RECORD +0 -67
- signalwire_agents-0.1.13.dist-info/entry_points.txt +0 -3
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/WHEEL +0 -0
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/licenses/LICENSE +0 -0
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/top_level.txt +0 -0
signalwire_agents/skills/web_search/skill.py

@@ -10,18 +10,19 @@ See LICENSE file in the project root for full license information.
 import os
 import requests
 import time
+import re
 from urllib.parse import urljoin, urlparse
 from bs4 import BeautifulSoup
 import json
-from typing import Optional, List, Dict, Any
+from typing import Optional, List, Dict, Any, Tuple
 
 from signalwire_agents.core.skill_base import SkillBase
 from signalwire_agents.core.function_result import SwaigFunctionResult
 
 class GoogleSearchScraper:
-    """Google Search and Web Scraping functionality"""
-
-    def __init__(self, api_key: str, search_engine_id: str, max_content_length: int =
+    """Google Search and Web Scraping functionality with quality scoring"""
+
+    def __init__(self, api_key: str, search_engine_id: str, max_content_length: int = 32768):
         self.api_key = api_key
         self.search_engine_id = search_engine_id
         self.max_content_length = max_content_length

@@ -33,22 +34,22 @@ class GoogleSearchScraper:
     def search_google(self, query: str, num_results: int = 5) -> list:
         """Search Google using Custom Search JSON API"""
         url = "https://www.googleapis.com/customsearch/v1"
-
+
         params = {
             'key': self.api_key,
             'cx': self.search_engine_id,
             'q': query,
             'num': min(num_results, 10)
         }
-
+
         try:
             response = self.session.get(url, params=params)
             response.raise_for_status()
             data = response.json()
-
+
             if 'items' not in data:
                 return []
-
+
             results = []
             for item in data['items'][:num_results]:
                 results.append({

@@ -56,94 +57,497 @@ class GoogleSearchScraper:
                     'url': item.get('link', ''),
                     'snippet': item.get('snippet', '')
                 })
-
+
             return results
-
+
         except Exception as e:
             return []
 
-    def
-        """
+    def is_reddit_url(self, url: str) -> bool:
+        """Check if URL is from Reddit"""
+        domain = urlparse(url).netloc.lower()
+        return 'reddit.com' in domain or 'redd.it' in domain
+
+    def extract_reddit_content(self, url: str, content_limit: int = None, timeout: int = 10) -> Tuple[str, Dict[str, Any]]:
+        """
+        Extract Reddit content using JSON API for better quality
+
+        Returns:
+            Tuple of (text_content, quality_metrics)
+        """
+        try:
+            # Convert to JSON endpoint
+            if not url.endswith('.json'):
+                json_url = url.rstrip('/') + '.json'
+            else:
+                json_url = url
+
+            # Fetch with proper headers (Reddit requires User-Agent)
+            headers = {'User-Agent': 'SignalWire-WebSearch/2.0'}
+            response = requests.get(json_url, headers=headers, timeout=timeout)
+            response.raise_for_status()
+
+            data = response.json()
+
+            # Extract post information
+            if not data or not isinstance(data, list) or len(data) < 1:
+                return "", {"error": "Invalid Reddit JSON structure", "quality_score": 0}
+
+            # First element is the post, second (if exists) contains comments
+            post_data = data[0]['data']['children'][0]['data']
+
+            # Build content from post
+            content_parts = []
+
+            # Add post title and metadata
+            title = post_data.get('title', 'No title')
+            author = post_data.get('author', 'unknown')
+            score = post_data.get('score', 0)
+            num_comments = post_data.get('num_comments', 0)
+            subreddit = post_data.get('subreddit', '')
+
+            content_parts.append(f"Reddit r/{subreddit} Discussion")
+            content_parts.append(f"\nPost: {title}")
+            content_parts.append(f"Author: {author} | Score: {score} | Comments: {num_comments}")
+
+            # Add original post text if it's a text post
+            selftext = post_data.get('selftext', '').strip()
+            if selftext and selftext != '[removed]' and selftext != '[deleted]':
+                content_parts.append(f"\nOriginal Post:\n{selftext[:1000]}")  # Limit post text
+
+            # Extract top comments if available
+            valid_comments = []
+            if len(data) > 1 and 'data' in data[1] and 'children' in data[1]['data']:
+                comments = data[1]['data']['children']
+
+                # Filter and sort comments by score
+                for comment in comments[:20]:  # Look at top 20 comments
+                    if comment.get('kind') == 't1':  # t1 = comment
+                        comment_data = comment.get('data', {})
+                        body = comment_data.get('body', '').strip()
+                        if (body and
+                            body != '[removed]' and
+                            body != '[deleted]' and
+                            len(body) > 50):  # Skip very short comments
+                            valid_comments.append({
+                                'body': body,
+                                'author': comment_data.get('author', 'unknown'),
+                                'score': comment_data.get('score', 0)
+                            })
+
+                # Sort by score and take top comments
+                valid_comments.sort(key=lambda x: x['score'], reverse=True)
+
+            if valid_comments:
+                content_parts.append("\n--- Top Discussion ---")
+                for i, comment in enumerate(valid_comments[:5], 1):
+                    # Truncate long comments
+                    comment_text = comment['body'][:500]
+                    if len(comment['body']) > 500:
+                        comment_text += "..."
+
+                    content_parts.append(f"\nComment {i} (Score: {comment['score']}, Author: {comment['author']}):")
+                    content_parts.append(comment_text)
+
+            # Join all content
+            text = '\n'.join(content_parts)
+
+            # Calculate quality metrics specifically for Reddit content
+            metrics = {
+                'text_length': len(text),
+                'score': score,
+                'num_comments': num_comments,
+                'domain': urlparse(url).netloc.lower(),
+                'is_reddit': True
+            }
+
+            # Quality score based on Reddit-specific factors
+            length_score = min(1.0, len(text) / 2000)  # Want at least 2000 chars
+            engagement_score = min(1.0, (score + num_comments) / 100)  # High engagement is good
+            has_comments = 1.0 if len(valid_comments) > 0 else 0.3  # Heavily penalize if no good comments
+
+            quality_score = (
+                length_score * 0.4 +
+                engagement_score * 0.3 +
+                has_comments * 0.3
+            )
+
+            metrics['quality_score'] = round(quality_score, 3)
+
+            # Limit content if needed
+            limit = content_limit if content_limit is not None else self.max_content_length
+            if len(text) > limit:
+                text = text[:limit]
+
+            return text, metrics
+
+        except Exception as e:
+            # Fall back to HTML extraction if JSON fails
+            return self.extract_html_content(url, content_limit, timeout)
+
+    def extract_text_from_url(self, url: str, content_limit: int = None, timeout: int = 10) -> Tuple[str, Dict[str, Any]]:
+        """
+        Main extraction method that routes to appropriate extractor
+
+        Returns:
+            Tuple of (text_content, quality_metrics)
+        """
+        if self.is_reddit_url(url):
+            return self.extract_reddit_content(url, content_limit, timeout)
+        else:
+            return self.extract_html_content(url, content_limit, timeout)
+
+    def extract_html_content(self, url: str, content_limit: int = None, timeout: int = 10) -> Tuple[str, Dict[str, Any]]:
+        """
+        Original HTML extraction method (renamed from extract_text_from_url)
+        """
         try:
             response = self.session.get(url, timeout=timeout)
             response.raise_for_status()
-
+
             soup = BeautifulSoup(response.content, 'html.parser')
-
-            #
-
-
-
-
-
+
+            # Extract main content areas (common content selectors)
+            main_content = None
+            content_selectors = [
+                'article', 'main', '[role="main"]', '.content', '#content',
+                '.post', '.entry-content', '.article-body', '.story-body',
+                '.markdown-body', '.wiki-body', '.documentation'
+            ]
+
+            for selector in content_selectors:
+                main_content = soup.select_one(selector)
+                if main_content:
+                    break
+
+            # If no main content found, use the whole body
+            if not main_content:
+                main_content = soup.find('body') or soup
+
+            # Clone for processing
+            content_soup = BeautifulSoup(str(main_content), 'html.parser')
+
+            # Remove unwanted elements from the content area
+            unwanted_tags = ["script", "style", "nav", "footer", "header", "aside", "noscript", "iframe"]
+            for tag in unwanted_tags:
+                for element in content_soup(tag):
+                    element.decompose()
+
+            # Remove elements with unwanted classes/ids
+            unwanted_patterns = [
+                'sidebar', 'navigation', 'menu', 'advertisement', 'ads', 'banner',
+                'popup', 'modal', 'cookie', 'gdpr', 'subscribe', 'newsletter',
+                'comments', 'related', 'share', 'social'
+            ]
+
+            for pattern in unwanted_patterns:
+                # Remove by class
+                for element in content_soup.find_all(class_=re.compile(pattern, re.I)):
+                    element.decompose()
+                # Remove by id
+                for element in content_soup.find_all(id=re.compile(pattern, re.I)):
+                    element.decompose()
+
+            # Extract text
+            text = content_soup.get_text()
+
             # Clean up the text
-            lines =
-
-
-
+            lines = [line.strip() for line in text.splitlines()]
+            # Remove empty lines and join
+            lines = [line for line in lines if line]
+            text = ' '.join(lines)
+
+            # Remove excessive whitespace
+            text = re.sub(r'\s+', ' ', text).strip()
+
+            # Calculate quality metrics (need to pass query for relevance)
+            quality_metrics = self._calculate_content_quality(text, url, "")
+
             # Limit text length
-            if
-
-
-
-
+            limit = content_limit if content_limit is not None else self.max_content_length
+            if len(text) > limit:
+                text = text[:limit]
+
+            return text, quality_metrics
+
         except Exception as e:
-            return ""
+            return "", {"error": str(e), "quality_score": 0}
+
+    def _calculate_content_quality(self, text: str, url: str, query: str = "") -> Dict[str, Any]:
+        """
+        Calculate quality metrics for extracted content
+
+        Quality factors:
+        - Text length (substantive content)
+        - Word diversity (not repetitive)
+        - Sentence structure (proper formatting)
+        - Lack of boilerplate phrases
+        - Domain reputation
+        """
+        if not text:
+            return {"quality_score": 0, "text_length": 0}
+
+        metrics = {}
+
+        # Text length (MUCH stricter - prefer 2000-10000 chars of actual content)
+        text_length = len(text)
+        metrics['text_length'] = text_length
+        if text_length < 500:
+            length_score = 0  # Too short to be useful
+        elif text_length < 2000:
+            length_score = (text_length - 500) / 1500 * 0.5  # Scale to 0.5 max
+        elif text_length <= 10000:
+            length_score = 1.0  # Ideal range
+        else:
+            length_score = max(0.8, 1.0 - (text_length - 10000) / 20000)
+
+        # Word diversity (unique words / total words)
+        words = text.lower().split()
+        if words:
+            unique_words = len(set(words))
+            total_words = len(words)
+            diversity_score = min(1.0, unique_words / (total_words * 0.3))  # Expect 30% unique
+            metrics['word_diversity'] = unique_words / total_words if total_words > 0 else 0
+        else:
+            diversity_score = 0
+            metrics['word_diversity'] = 0
+
+        # Check for boilerplate/navigation text (MUCH stricter)
+        boilerplate_phrases = [
+            'cookie', 'privacy policy', 'terms of service', 'subscribe',
+            'sign up', 'log in', 'advertisement', 'sponsored', 'copyright',
+            'all rights reserved', 'skip to', 'navigation', 'breadcrumb',
+            'reddit inc', 'google llc', 'expand navigation', 'members •',
+            'archived post', 'votes cannot be cast', 'r/', 'subreddit',
+            'youtube', 'facebook', 'twitter', 'instagram', 'pinterest'
+        ]
+
+        lower_text = text.lower()
+        boilerplate_count = sum(1 for phrase in boilerplate_phrases if phrase in lower_text)
+        boilerplate_penalty = max(0, 1.0 - (boilerplate_count * 0.15))  # -15% per boilerplate phrase
+        metrics['boilerplate_count'] = boilerplate_count
+
+        # Sentence detection (need MORE sentences for quality content)
+        sentences = re.split(r'[.!?]+', text)
+        sentence_count = len([s for s in sentences if len(s.strip()) > 30])  # Longer min sentence
+        sentence_score = min(1.0, sentence_count / 10)  # Need at least 10 proper sentences
+        metrics['sentence_count'] = sentence_count
+
+        # Domain quality (MUCH harsher on social media)
+        domain = urlparse(url).netloc.lower()
+        quality_domains = [
+            'wikipedia.org', 'starwars.fandom.com', 'imdb.com',
+            'screenrant.com', 'denofgeek.com', 'ign.com',
+            'hollywoodreporter.com', 'variety.com', 'ew.com',
+            'stackexchange.com', 'stackoverflow.com',
+            'github.com', 'medium.com', 'dev.to', 'arxiv.org',
+            'nature.com', 'sciencedirect.com', 'ieee.org'
+        ]
+
+        low_quality_domains = [
+            'reddit.com', 'youtube.com', 'facebook.com', 'twitter.com',
+            'instagram.com', 'pinterest.com', 'tiktok.com', 'x.com'
+        ]
+
+        if any(d in domain for d in quality_domains):
+            domain_score = 1.5  # Strong bonus for quality domains
+        elif any(d in domain for d in low_quality_domains):
+            domain_score = 0.1  # Severe penalty for social media
+        else:
+            domain_score = 1.0
+
+        metrics['domain'] = domain
+
+        # Query relevance scoring - check for query terms in content
+        relevance_score = 0
+        if query:
+            # Split query into meaningful words (skip common words)
+            stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were'}
+            query_words = [w.lower() for w in query.split() if w.lower() not in stop_words and len(w) > 2]
+
+            if query_words:
+                lower_content = text.lower()
+                # Count how many query words appear in content
+                words_found = sum(1 for word in query_words if word in lower_content)
+
+                # Also check for exact phrase matches (bonus points)
+                exact_phrase_bonus = 0
+                if len(query_words) > 1:
+                    # Check for consecutive word pairs
+                    for i in range(len(query_words) - 1):
+                        phrase = f"{query_words[i]} {query_words[i+1]}"
+                        if phrase in lower_content:
+                            exact_phrase_bonus += 0.2
+
+                relevance_score = min(1.0, (words_found / len(query_words)) + exact_phrase_bonus)
+                metrics['query_relevance'] = round(relevance_score, 3)
+                metrics['query_words_found'] = f"{words_found}/{len(query_words)}"
+        else:
+            relevance_score = 0.5  # Neutral if no query provided
+            metrics['query_relevance'] = 0.5
+
+        # Calculate final quality score (adjusted weights with relevance)
+        quality_score = (
+            length_score * 0.25 +        # Length important
+            diversity_score * 0.10 +     # Less weight
+            boilerplate_penalty * 0.10 + # Less weight
+            sentence_score * 0.15 +      # Sentences important
+            domain_score * 0.15 +        # Domain important
+            relevance_score * 0.25       # Query relevance important
+        )
+
+        metrics['quality_score'] = round(quality_score, 3)
+        metrics['length_score'] = round(length_score, 3)
+        metrics['diversity_score'] = round(diversity_score, 3)
+        metrics['boilerplate_penalty'] = round(boilerplate_penalty, 3)
+        metrics['sentence_score'] = round(sentence_score, 3)
+        metrics['domain_score'] = round(domain_score, 3)
+
+        return metrics
+
+    def search_and_scrape_best(self, query: str, num_results: int = 3,
+                               oversample_factor: float = 4.0, delay: float = 0.5,
+                               min_quality_score: float = 0.2) -> str:
+        """
+        Search and scrape with quality filtering and source diversity
+
+        Args:
+            query: Search query
+            num_results: Number of best results to return
+            oversample_factor: How many extra results to fetch (e.g., 4.0 = fetch 4x)
+            delay: Delay between requests
+            min_quality_score: Minimum quality score to include a result
+
+        Returns:
+            Formatted string with the best N results from diverse sources
+        """
+        # Fetch more results than requested (increased to 4x for better pool)
+        fetch_count = min(10, int(num_results * oversample_factor))
+        search_results = self.search_google(query, fetch_count)
 
-    def search_and_scrape(self, query: str, num_results: int = 3, delay: float = 0.5) -> str:
-        """Main function: search Google and scrape the resulting pages"""
-        search_results = self.search_google(query, num_results)
-
         if not search_results:
             return f"No search results found for query: {query}"
-
+
+        # Process all results and collect quality metrics
+        processed_results = []
+
+        for result in search_results:
+            # Extract content and quality metrics (pass query for relevance scoring)
+            page_text, metrics = self.extract_text_from_url(result['url'])
+
+            # Recalculate metrics with query relevance
+            if page_text:
+                metrics = self._calculate_content_quality(page_text, result['url'], query)
+
+            if metrics.get('quality_score', 0) >= min_quality_score and page_text:
+                processed_results.append({
+                    'title': result['title'],
+                    'url': result['url'],
+                    'snippet': result['snippet'],
+                    'content': page_text,
+                    'metrics': metrics,
+                    'quality_score': metrics.get('quality_score', 0),
+                    'domain': metrics.get('domain', '')
+                })
+
+            # Small delay between requests
+            if delay > 0:
+                time.sleep(delay)
+
+        if not processed_results:
+            return f"No quality results found for query: {query}. All results were below quality threshold."
+
+        # Sort by quality score
+        processed_results.sort(key=lambda x: x['quality_score'], reverse=True)
+
+        # Select diverse results (prefer different domains)
+        best_results = []
+        seen_domains = set()
+
+        # First pass: Add highest quality result from each unique domain
+        for result in processed_results:
+            domain = result['domain']
+            if domain not in seen_domains and len(best_results) < num_results:
+                best_results.append(result)
+                seen_domains.add(domain)
+
+        # Second pass: If we need more results, add remaining high-quality ones
+        if len(best_results) < num_results:
+            for result in processed_results:
+                if result not in best_results and len(best_results) < num_results:
+                    best_results.append(result)
+
+        if not best_results:
+            return f"No quality results found for query: {query}. Try a different search term."
+
+        # Calculate per-result content budget for the best results
+        estimated_overhead_per_result = 400  # Including quality info
+        total_overhead = len(best_results) * estimated_overhead_per_result
+        available_for_content = self.max_content_length - total_overhead
+        per_result_limit = max(2000, available_for_content // len(best_results))  # Increased minimum
+
+        # Format the best results
         all_text = []
-
-
-
+        all_text.append(f"Found {len(processed_results)} results meeting quality threshold from {len(search_results)} searched.")
+        all_text.append(f"Showing top {len(best_results)} from diverse sources:\n")
+
+        for i, result in enumerate(best_results, 1):
+            text_content = f"=== RESULT {i} (Quality: {result['quality_score']:.2f}) ===\n"
             text_content += f"Title: {result['title']}\n"
             text_content += f"URL: {result['url']}\n"
+            text_content += f"Source: {result['domain']}\n"
             text_content += f"Snippet: {result['snippet']}\n"
+
+            # Add quality indicators
+            metrics = result['metrics']
+            text_content += f"Content Stats: {metrics.get('text_length', 0)} chars, "
+            text_content += f"{metrics.get('sentence_count', 0)} sentences\n"
+            text_content += f"Query Relevance: {metrics.get('query_relevance', 0):.2f} "
+            text_content += f"(keywords: {metrics.get('query_words_found', 'N/A')})\n"
             text_content += f"Content:\n"
-
-
-
-            if
-
-
-
-
+
+            # Truncate content if needed
+            content = result['content']
+            if len(content) > per_result_limit:
+                content = content[:per_result_limit] + "..."
+            text_content += content
+
             text_content += f"\n{'='*50}\n\n"
             all_text.append(text_content)
-
-            if i < len(search_results):
-                time.sleep(delay)
-
+
         return '\n'.join(all_text)
 
+    def search_and_scrape(self, query: str, num_results: int = 3, delay: float = 0.5) -> str:
+        """
+        Backward compatible method that uses the improved search
+        """
+        return self.search_and_scrape_best(
+            query=query,
+            num_results=num_results,
+            oversample_factor=4.0,
+            delay=delay,
+            min_quality_score=0.2
+        )
+
 
 class WebSearchSkill(SkillBase):
-    """Web search capability using Google Custom Search API"""
-
+    """Web search capability using Google Custom Search API with quality filtering"""
+
     SKILL_NAME = "web_search"
     SKILL_DESCRIPTION = "Search the web for information using Google Custom Search API"
-    SKILL_VERSION = "
+    SKILL_VERSION = "2.0.0"  # Bumped version for improved functionality
     REQUIRED_PACKAGES = ["bs4", "requests"]
-    REQUIRED_ENV_VARS = []
-
+    REQUIRED_ENV_VARS = []
+
     # Enable multiple instances support
     SUPPORTS_MULTIPLE_INSTANCES = True
-
+
     def get_instance_key(self) -> str:
-        """
-        Get the key used to track this skill instance
-
-        For web search, we use the search_engine_id to differentiate instances
-        """
+        """Get the key used to track this skill instance"""
         search_engine_id = self.params.get('search_engine_id', 'default')
         tool_name = self.params.get('tool_name', 'web_search')
         return f"{self.SKILL_NAME}_{search_engine_id}_{tool_name}"
-
+
     def setup(self) -> bool:
         """Setup the web search skill"""
         # Validate required parameters

@@ -152,114 +556,184 @@ class WebSearchSkill(SkillBase):
         if missing_params:
             self.logger.error(f"Missing required parameters: {missing_params}")
             return False
-
+
         if not self.validate_packages():
             return False
-
+
         # Set parameters from config
         self.api_key = self.params['api_key']
         self.search_engine_id = self.params['search_engine_id']
-
+
         # Set default parameters
-        self.default_num_results = self.params.get('num_results',
-        self.default_delay = self.params.get('delay', 0)
-        self.max_content_length = self.params.get('max_content_length',
-
-
-
+        self.default_num_results = self.params.get('num_results', 3)
+        self.default_delay = self.params.get('delay', 0.5)
+        self.max_content_length = self.params.get('max_content_length', 32768)
+
+        # Quality control parameters (new)
+        self.oversample_factor = self.params.get('oversample_factor', 2.5)
+        self.min_quality_score = self.params.get('min_quality_score', 0.3)
+
+        self.no_results_message = self.params.get('no_results_message',
+            "I couldn't find quality results for '{query}'. "
+            "The search returned only low-quality or inaccessible pages. "
            "Try rephrasing your search or asking about a different topic."
         )
-
+
         # Tool name (for multiple instances)
         self.tool_name = self.params.get('tool_name', 'web_search')
-
-        # Initialize the search scraper
+
+        # Initialize the improved search scraper
        self.search_scraper = GoogleSearchScraper(
             api_key=self.api_key,
             search_engine_id=self.search_engine_id,
             max_content_length=self.max_content_length
         )
-
+
         return True
-
+
     def register_tools(self) -> None:
         """Register web search tool with the agent"""
-        self.
+        self.define_tool(
             name=self.tool_name,
-            description="Search the web for information
+            description="Search the web for high-quality information, automatically filtering low-quality results",
             parameters={
                 "query": {
                     "type": "string",
                     "description": "The search query - what you want to find information about"
                 }
             },
-            handler=self._web_search_handler
-            **self.swaig_fields
+            handler=self._web_search_handler
         )
-
+
     def _web_search_handler(self, args, raw_data):
-        """Handler for web search tool"""
+        """Handler for web search tool with quality filtering"""
         query = args.get("query", "").strip()
-
+
         if not query:
             return SwaigFunctionResult(
                 "Please provide a search query. What would you like me to search for?"
             )
-
-        # Use the configured number of results
+
+        # Use the configured number of results
         num_results = self.default_num_results
-
-        self.logger.info(f"Web search requested: '{query}' ({num_results} results)")
-
-        # Perform the search
+
+        self.logger.info(f"Web search requested: '{query}' (requesting {num_results} quality results)")
+
+        # Perform the improved search
         try:
-            search_results = self.search_scraper.
+            search_results = self.search_scraper.search_and_scrape_best(
                 query=query,
                 num_results=num_results,
-
+                oversample_factor=self.oversample_factor,
+                delay=self.default_delay,
+                min_quality_score=self.min_quality_score
             )
-
-            if not search_results or "No search results found" in search_results:
-                # Format the no results message with the query if it contains a placeholder
+
+            if not search_results or "No quality results found" in search_results or "No search results found" in search_results:
                 formatted_message = self.no_results_message.format(query=query) if '{query}' in self.no_results_message else self.no_results_message
                 return SwaigFunctionResult(formatted_message)
-
-            response = f"
+
+            response = f"Quality web search results for '{query}':\n\n{search_results}"
             return SwaigFunctionResult(response)
-
+
         except Exception as e:
             self.logger.error(f"Error performing web search: {e}")
             return SwaigFunctionResult(
                 "Sorry, I encountered an error while searching. Please try again later."
             )
-
+
     def get_hints(self) -> List[str]:
         """Return speech recognition hints"""
-        # Currently no hints provided, but you could add them like:
-        # return [
-        #     "Google", "search", "internet", "web", "information",
-        #     "find", "look up", "research", "query", "results"
-        # ]
         return []
-
+
     def get_global_data(self) -> Dict[str, Any]:
         """Return global data for agent context"""
         return {
             "web_search_enabled": True,
-            "search_provider": "Google Custom Search"
+            "search_provider": "Google Custom Search",
+            "quality_filtering": True
         }
-
+
     def get_prompt_sections(self) -> List[Dict[str, Any]]:
         """Return prompt sections to add to agent"""
         return [
             {
-                "title": "Web Search Capability",
-                "body": f"You can search the internet for
+                "title": "Web Search Capability (Quality Enhanced)",
+                "body": f"You can search the internet for high-quality information using the {self.tool_name} tool.",
                 "bullets": [
                     f"Use the {self.tool_name} tool when users ask for information you need to look up",
-                    "
-                    "
-                    "
+                    "The search automatically filters out low-quality results like empty pages",
+                    "Results are ranked by content quality, relevance, and domain reputation",
+                    "Summarize the high-quality results in a clear, helpful way"
                 ]
             }
-        ]
+        ]
+
+    @classmethod
+    def get_parameter_schema(cls) -> Dict[str, Dict[str, Any]]:
+        """Get the parameter schema for the web search skill"""
+        schema = super().get_parameter_schema()
+
+        # Add web search specific parameters
+        schema.update({
+            "api_key": {
+                "type": "string",
+                "description": "Google Custom Search API key",
+                "required": True,
+                "hidden": True,
+                "env_var": "GOOGLE_SEARCH_API_KEY"
+            },
+            "search_engine_id": {
+                "type": "string",
+                "description": "Google Custom Search Engine ID",
+                "required": True,
+                "hidden": True,
+                "env_var": "GOOGLE_SEARCH_ENGINE_ID"
+            },
+            "num_results": {
+                "type": "integer",
+                "description": "Number of high-quality results to return",
+                "default": 3,
+                "required": False,
+                "min": 1,
+                "max": 10
+            },
+            "delay": {
+                "type": "number",
+                "description": "Delay between scraping pages in seconds",
+                "default": 0.5,
+                "required": False,
+                "min": 0
+            },
+            "max_content_length": {
+                "type": "integer",
+                "description": "Maximum total response size in characters",
+                "default": 32768,
+                "required": False,
+                "min": 1000
+            },
+            "oversample_factor": {
+                "type": "number",
+                "description": "How many extra results to fetch for quality filtering (e.g., 2.5 = fetch 2.5x requested)",
+                "default": 2.5,
+                "required": False,
+                "min": 1.0,
+                "max": 3.5
+            },
+            "min_quality_score": {
+                "type": "number",
+                "description": "Minimum quality score (0-1) for including a result",
+                "default": 0.3,
+                "required": False,
+                "min": 0.0,
+                "max": 1.0
+            },
+            "no_results_message": {
+                "type": "string",
+                "description": "Message to show when no quality results are found. Use {query} as placeholder.",
+                "default": "I couldn't find quality results for '{query}'. The search returned only low-quality or inaccessible pages. Try rephrasing your search or asking about a different topic.",
+                "required": False
+            }
+        })
+
+        return schema