signalwire-agents 0.1.23__py3-none-any.whl → 0.1.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- signalwire_agents/__init__.py +1 -1
- signalwire_agents/agent_server.py +2 -1
- signalwire_agents/cli/config.py +61 -0
- signalwire_agents/cli/core/__init__.py +1 -0
- signalwire_agents/cli/core/agent_loader.py +254 -0
- signalwire_agents/cli/core/argparse_helpers.py +164 -0
- signalwire_agents/cli/core/dynamic_config.py +62 -0
- signalwire_agents/cli/execution/__init__.py +1 -0
- signalwire_agents/cli/execution/datamap_exec.py +437 -0
- signalwire_agents/cli/execution/webhook_exec.py +125 -0
- signalwire_agents/cli/output/__init__.py +1 -0
- signalwire_agents/cli/output/output_formatter.py +132 -0
- signalwire_agents/cli/output/swml_dump.py +177 -0
- signalwire_agents/cli/simulation/__init__.py +1 -0
- signalwire_agents/cli/simulation/data_generation.py +365 -0
- signalwire_agents/cli/simulation/data_overrides.py +187 -0
- signalwire_agents/cli/simulation/mock_env.py +271 -0
- signalwire_agents/cli/test_swaig.py +522 -2539
- signalwire_agents/cli/types.py +72 -0
- signalwire_agents/core/agent/__init__.py +1 -3
- signalwire_agents/core/agent/config/__init__.py +1 -3
- signalwire_agents/core/agent/prompt/manager.py +25 -7
- signalwire_agents/core/agent/tools/decorator.py +2 -0
- signalwire_agents/core/agent/tools/registry.py +8 -0
- signalwire_agents/core/agent_base.py +492 -3053
- signalwire_agents/core/function_result.py +31 -42
- signalwire_agents/core/mixins/__init__.py +28 -0
- signalwire_agents/core/mixins/ai_config_mixin.py +373 -0
- signalwire_agents/core/mixins/auth_mixin.py +287 -0
- signalwire_agents/core/mixins/prompt_mixin.py +345 -0
- signalwire_agents/core/mixins/serverless_mixin.py +368 -0
- signalwire_agents/core/mixins/skill_mixin.py +55 -0
- signalwire_agents/core/mixins/state_mixin.py +219 -0
- signalwire_agents/core/mixins/tool_mixin.py +295 -0
- signalwire_agents/core/mixins/web_mixin.py +1130 -0
- signalwire_agents/core/skill_manager.py +3 -1
- signalwire_agents/core/swaig_function.py +10 -1
- signalwire_agents/core/swml_service.py +140 -58
- signalwire_agents/skills/README.md +452 -0
- signalwire_agents/skills/api_ninjas_trivia/README.md +215 -0
- signalwire_agents/skills/datasphere/README.md +210 -0
- signalwire_agents/skills/datasphere_serverless/README.md +258 -0
- signalwire_agents/skills/datetime/README.md +132 -0
- signalwire_agents/skills/joke/README.md +149 -0
- signalwire_agents/skills/math/README.md +161 -0
- signalwire_agents/skills/native_vector_search/skill.py +33 -13
- signalwire_agents/skills/play_background_file/README.md +218 -0
- signalwire_agents/skills/spider/README.md +236 -0
- signalwire_agents/skills/spider/__init__.py +4 -0
- signalwire_agents/skills/spider/skill.py +479 -0
- signalwire_agents/skills/swml_transfer/README.md +395 -0
- signalwire_agents/skills/swml_transfer/__init__.py +1 -0
- signalwire_agents/skills/swml_transfer/skill.py +257 -0
- signalwire_agents/skills/weather_api/README.md +178 -0
- signalwire_agents/skills/web_search/README.md +163 -0
- signalwire_agents/skills/wikipedia_search/README.md +228 -0
- {signalwire_agents-0.1.23.dist-info → signalwire_agents-0.1.24.dist-info}/METADATA +47 -2
- {signalwire_agents-0.1.23.dist-info → signalwire_agents-0.1.24.dist-info}/RECORD +62 -22
- {signalwire_agents-0.1.23.dist-info → signalwire_agents-0.1.24.dist-info}/entry_points.txt +1 -1
- signalwire_agents/core/agent/config/ephemeral.py +0 -176
- signalwire_agents-0.1.23.data/data/schema.json +0 -5611
- {signalwire_agents-0.1.23.dist-info → signalwire_agents-0.1.24.dist-info}/WHEEL +0 -0
- {signalwire_agents-0.1.23.dist-info → signalwire_agents-0.1.24.dist-info}/licenses/LICENSE +0 -0
- {signalwire_agents-0.1.23.dist-info → signalwire_agents-0.1.24.dist-info}/top_level.txt +0 -0
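The largest single addition in this release is the new spider skill, whose full source appears in the hunk below. As a minimal, hedged sketch of how it might be attached to an agent — assuming the SDK's `AgentBase.add_skill(name, params)` interface and using only the parameter names the skill itself reads from `params` — one could write:

# Hypothetical usage sketch -- not taken from the package. It assumes
# AgentBase.add_skill(name, params) and uses the parameter names that
# SpiderSkill reads from self.params in the diff below.
from signalwire_agents import AgentBase

agent = AgentBase(name="scraper-agent")

# Registering the skill defines the scrape_url, crawl_site and
# extract_structured_data tools via register_tools().
agent.add_skill("spider", {
    "delay": 0.5,                # seconds between requests while crawling
    "max_pages": 5,              # page limit enforced in _crawl_site_handler
    "max_depth": 1,              # link-following depth
    "extract_type": "markdown",  # fast_text (default), markdown, or structured
    "max_text_length": 3000,     # truncation threshold for extracted text
})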
signalwire_agents/skills/spider/skill.py (new file)
@@ -0,0 +1,479 @@
"""Spider skill for fast web scraping with SignalWire AI Agents."""

import re
import logging
from typing import Dict, Any, Optional, List
from urllib.parse import urljoin, urlparse
import requests
from lxml import html
from lxml.etree import XPathEvalError

from signalwire_agents.core.skill_base import SkillBase
from signalwire_agents.core.function_result import SwaigFunctionResult


class SpiderSkill(SkillBase):
    """Fast web scraping skill optimized for speed and token efficiency."""

    SKILL_NAME = "spider"
    SKILL_DESCRIPTION = "Fast web scraping and crawling capabilities"
    SKILL_VERSION = "1.0.0"
    REQUIRED_PACKAGES = ["lxml"]  # beautifulsoup4 and requests are in base dependencies
    REQUIRED_ENV_VARS = []  # No required env vars by default
    SUPPORTS_MULTIPLE_INSTANCES = True

    # Compiled regex for performance
    WHITESPACE_REGEX = re.compile(r'\s+')

    def __init__(self, agent, params: Dict[str, Any]):
        """Initialize the spider skill with configuration parameters."""
        super().__init__(agent, params)

        # Performance settings
        self.delay = self.params.get('delay', 0.1)
        self.concurrent_requests = self.params.get('concurrent_requests', 5)
        self.timeout = self.params.get('timeout', 5)

        # Crawling limits
        self.max_pages = self.params.get('max_pages', 1)
        self.max_depth = self.params.get('max_depth', 0)

        # Content processing
        self.extract_type = self.params.get('extract_type', 'fast_text')
        self.max_text_length = self.params.get('max_text_length', 3000)
        self.clean_text = self.params.get('clean_text', True)

        # Features
        self.cache_enabled = self.params.get('cache_enabled', True)
        self.follow_robots_txt = self.params.get('follow_robots_txt', False)
        self.user_agent = self.params.get('user_agent', 'Spider/1.0 (SignalWire AI Agent)')

        # Optional headers
        self.headers = self.params.get('headers', {})
        self.headers['User-Agent'] = self.user_agent

        # Session for connection pooling
        self.session = requests.Session()
        self.session.headers.update(self.headers)

        # Cache for responses
        self.cache = {} if self.cache_enabled else None

        # XPath expressions for unwanted elements
        self.remove_xpaths = [
            '//script', '//style', '//nav', '//header',
            '//footer', '//aside', '//noscript'
        ]

    def get_instance_key(self) -> str:
        """Return unique key for this skill instance."""
        tool_name = self.params.get('tool_name', self.SKILL_NAME)
        return f"{self.SKILL_NAME}_{tool_name}"

    def setup(self) -> bool:
        """Validate configuration and setup the skill."""
        # Validate delay is reasonable
        if self.delay < 0:
            self.logger.error("Delay cannot be negative")
            return False

        # Validate concurrent requests
        if not 1 <= self.concurrent_requests <= 20:
            self.logger.error("Concurrent requests must be between 1 and 20")
            return False

        # Validate max pages and depth
        if self.max_pages < 1:
            self.logger.error("Max pages must be at least 1")
            return False

        if self.max_depth < 0:
            self.logger.error("Max depth cannot be negative")
            return False

        self.logger.info(f"Spider skill configured: delay={self.delay}s, max_pages={self.max_pages}, max_depth={self.max_depth}")
        return True

    def register_tools(self) -> None:
        """Register the web scraping tools with the agent."""
        # Tool name prefix for multiple instances
        tool_prefix = self.params.get('tool_name', '')
        if tool_prefix:
            tool_prefix = f"{tool_prefix}_"

        # Register scrape_url tool
        self.agent.define_tool(
            name=f"{tool_prefix}scrape_url",
            description="Extract text content from a single web page",
            parameters={
                "url": {
                    "type": "string",
                    "description": "The URL to scrape"
                }
            },
            required=["url"],
            handler=self._scrape_url_handler,
            **self.swaig_fields
        )

        # Register crawl_site tool
        self.agent.define_tool(
            name=f"{tool_prefix}crawl_site",
            description="Crawl multiple pages starting from a URL",
            parameters={
                "start_url": {
                    "type": "string",
                    "description": "Starting URL for the crawl"
                }
            },
            required=["start_url"],
            handler=self._crawl_site_handler,
            **self.swaig_fields
        )

        # Register extract_structured_data tool
        self.agent.define_tool(
            name=f"{tool_prefix}extract_structured_data",
            description="Extract specific data from a web page using selectors",
            parameters={
                "url": {
                    "type": "string",
                    "description": "The URL to scrape"
                }
            },
            required=["url"],
            handler=self._extract_structured_handler,
            **self.swaig_fields
        )

    def _fetch_url(self, url: str) -> Optional[requests.Response]:
        """Fetch a URL with caching and error handling."""
        # Check cache first
        if self.cache_enabled and url in self.cache:
            self.logger.debug(f"Cache hit for {url}")
            return self.cache[url]

        try:
            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()

            # Cache successful responses
            if self.cache_enabled:
                self.cache[url] = response

            return response

        except requests.exceptions.Timeout:
            self.logger.error(f"Timeout fetching {url}")
            return None
        except requests.exceptions.RequestException as e:
            self.logger.error(f"Error fetching {url}: {e}")
            return None

    def _fast_text_extract(self, response: requests.Response) -> str:
        """Ultra-fast text extraction using lxml."""
        try:
            # Parse HTML with lxml
            tree = html.fromstring(response.content)

            # Remove unwanted elements in one pass
            for xpath in self.remove_xpaths:
                for elem in tree.xpath(xpath):
                    elem.drop_tree()

            # Extract text
            text = tree.text_content()

            # Clean whitespace if requested
            if self.clean_text:
                text = self.WHITESPACE_REGEX.sub(' ', text).strip()

            # Smart truncation
            if len(text) > self.max_text_length:
                keep_start = self.max_text_length * 2 // 3
                keep_end = self.max_text_length // 3
                text = (
                    text[:keep_start] +
                    "\n\n[...CONTENT TRUNCATED...]\n\n" +
                    text[-keep_end:]
                )

            return text

        except Exception as e:
            self.logger.error(f"Error extracting text: {e}")
            return ""

    def _markdown_extract(self, response: requests.Response) -> str:
        """Extract content in markdown format."""
        try:
            from bs4 import BeautifulSoup

            soup = BeautifulSoup(response.content, 'html.parser')

            # Remove unwanted tags
            for tag in ['script', 'style', 'nav', 'header', 'footer', 'aside']:
                for elem in soup.find_all(tag):
                    elem.decompose()

            # Convert to markdown-like format
            text_parts = []

            # Title
            title = soup.find('title')
            if title:
                text_parts.append(f"# {title.get_text().strip()}\n")

            # Main content
            for elem in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li', 'code', 'pre']):
                if elem.name.startswith('h'):
                    level = int(elem.name[1])
                    text_parts.append(f"\n{'#' * level} {elem.get_text().strip()}\n")
                elif elem.name == 'p':
                    text_parts.append(f"\n{elem.get_text().strip()}\n")
                elif elem.name == 'li':
                    text_parts.append(f"- {elem.get_text().strip()}")
                elif elem.name in ['code', 'pre']:
                    text_parts.append(f"\n```\n{elem.get_text().strip()}\n```\n")

            text = '\n'.join(text_parts)

            # Truncate if needed
            if len(text) > self.max_text_length:
                text = text[:self.max_text_length] + "\n\n[...TRUNCATED...]"

            return text

        except ImportError:
            self.logger.warning("BeautifulSoup not available, falling back to fast_text")
            return self._fast_text_extract(response)
        except Exception as e:
            self.logger.error(f"Error in markdown extraction: {e}")
            return self._fast_text_extract(response)

    def _structured_extract(self, response: requests.Response, selectors: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
        """Extract structured data using selectors."""
        try:
            tree = html.fromstring(response.content)
            result = {
                "url": response.url,
                "status_code": response.status_code,
                "title": "",
                "data": {}
            }

            # Get title
            title_elem = tree.xpath('//title/text()')
            if title_elem:
                result["title"] = title_elem[0].strip()

            # Extract using provided selectors
            if selectors:
                for field, selector in selectors.items():
                    try:
                        if selector.startswith('/'):  # XPath
                            values = tree.xpath(selector)
                        else:  # CSS selector
                            from lxml.cssselect import CSSSelector
                            sel = CSSSelector(selector)
                            values = sel(tree)

                        # Extract text from elements
                        if values:
                            if len(values) == 1:
                                result["data"][field] = values[0].text_content().strip()
                            else:
                                result["data"][field] = [v.text_content().strip() for v in values]
                    except (XPathEvalError, Exception) as e:
                        self.logger.warning(f"Error with selector {selector}: {e}")
                        result["data"][field] = None

            return result

        except Exception as e:
            self.logger.error(f"Error in structured extraction: {e}")
            return {"error": str(e)}

    def _scrape_url_handler(self, args: Dict[str, Any], raw_data: Dict[str, Any]) -> SwaigFunctionResult:
        """Handle single page scraping."""
        url = args.get('url', '').strip()
        if not url:
            return SwaigFunctionResult("Please provide a URL to scrape")

        # Validate URL
        parsed = urlparse(url)
        if not parsed.scheme or not parsed.netloc:
            return SwaigFunctionResult(f"Invalid URL: {url}")

        # Fetch the page
        response = self._fetch_url(url)
        if not response:
            return SwaigFunctionResult(f"Failed to fetch {url}")

        # Extract content based on configured type (not from args)
        extract_type = self.extract_type

        try:
            if extract_type == 'structured':
                # For structured extraction, use predefined selectors from config if available
                selectors = self.params.get('selectors', {})
                result = self._structured_extract(response, selectors)
                return SwaigFunctionResult(f"Extracted structured data from {url}: {result}")
            elif extract_type == 'markdown':
                content = self._markdown_extract(response)
            else:  # fast_text (default)
                content = self._fast_text_extract(response)

            if not content:
                return SwaigFunctionResult(f"No content extracted from {url}")

            # Format response
            char_count = len(content)
            header = f"Content from {url} ({char_count} characters):\n\n"

            return SwaigFunctionResult(header + content)

        except Exception as e:
            self.logger.error(f"Error processing {url}: {e}")
            return SwaigFunctionResult(f"Error processing {url}: {str(e)}")

    def _crawl_site_handler(self, args: Dict[str, Any], raw_data: Dict[str, Any]) -> SwaigFunctionResult:
        """Handle multi-page crawling."""
        start_url = args.get('start_url', '').strip()
        if not start_url:
            return SwaigFunctionResult("Please provide a starting URL for the crawl")

        # Use configured parameters (not from args)
        max_depth = self.max_depth
        max_pages = self.max_pages
        follow_patterns = self.params.get('follow_patterns', [])

        # Validate parameters
        if max_depth < 0:
            return SwaigFunctionResult("Max depth cannot be negative")
        if max_pages < 1:
            return SwaigFunctionResult("Max pages must be at least 1")

        # Simple breadth-first crawl
        visited = set()
        to_visit = [(start_url, 0)]  # (url, depth)
        results = []

        while to_visit and len(visited) < max_pages:
            if not to_visit:
                break

            url, depth = to_visit.pop(0)

            # Skip if already visited or depth exceeded
            if url in visited or depth > max_depth:
                continue

            # Fetch and process page
            response = self._fetch_url(url)
            if not response:
                continue

            visited.add(url)

            # Extract content
            content = self._fast_text_extract(response)
            if content:
                results.append({
                    'url': url,
                    'depth': depth,
                    'content_length': len(content),
                    'summary': content[:500] + '...' if len(content) > 500 else content
                })

            # Extract links if not at max depth
            if depth < max_depth:
                try:
                    tree = html.fromstring(response.content)
                    links = tree.xpath('//a[@href]/@href')

                    for link in links:
                        absolute_url = urljoin(url, link)

                        # Check if we should follow this link
                        if follow_patterns:
                            if not any(re.search(pattern, absolute_url) for pattern in follow_patterns):
                                continue

                        # Only follow same domain by default
                        if urlparse(absolute_url).netloc == urlparse(start_url).netloc:
                            if absolute_url not in visited:
                                to_visit.append((absolute_url, depth + 1))

                except Exception as e:
                    self.logger.warning(f"Error extracting links from {url}: {e}")

            # Respect delay between requests
            if self.delay > 0 and len(visited) < max_pages:
                import time
                time.sleep(self.delay)

        # Format results
        if not results:
            return SwaigFunctionResult(f"No pages could be crawled from {start_url}")

        summary = f"Crawled {len(results)} pages from {urlparse(start_url).netloc}:\n\n"

        for i, result in enumerate(results, 1):
            summary += f"{i}. {result['url']} (depth: {result['depth']}, {result['content_length']} chars)\n"
            summary += f"   Summary: {result['summary'][:100]}...\n\n"

        total_chars = sum(r['content_length'] for r in results)
        summary += f"\nTotal content: {total_chars:,} characters across {len(results)} pages"

        return SwaigFunctionResult(summary)

    def _extract_structured_handler(self, args: Dict[str, Any], raw_data: Dict[str, Any]) -> SwaigFunctionResult:
        """Handle structured data extraction."""
        url = args.get('url', '').strip()

        if not url:
            return SwaigFunctionResult("Please provide a URL")

        # Use configured selectors from params
        selectors = self.params.get('selectors', {})
        if not selectors:
            return SwaigFunctionResult("No selectors configured for structured data extraction")

        # Fetch the page
        response = self._fetch_url(url)
        if not response:
            return SwaigFunctionResult(f"Failed to fetch {url}")

        # Extract structured data
        result = self._structured_extract(response, selectors)

        if 'error' in result:
            return SwaigFunctionResult(f"Error extracting data: {result['error']}")

        # Format the response
        output = f"Extracted data from {url}:\n\n"
        output += f"Title: {result.get('title', 'N/A')}\n\n"

        if result.get('data'):
            output += "Data:\n"
            for field, value in result['data'].items():
                output += f"- {field}: {value}\n"
        else:
            output += "No data extracted with provided selectors"

        return SwaigFunctionResult(output)

    def get_hints(self) -> List[str]:
        """Return speech recognition hints for this skill."""
        return [
            "scrape", "crawl", "extract", "web page", "website",
            "get content from", "fetch data from", "spider"
        ]

    def cleanup(self) -> None:
        """Clean up resources when skill is unloaded."""
        if hasattr(self, 'session'):
            self.session.close()
        # cache is None when caching is disabled, so guard before clearing
        if getattr(self, 'cache', None):
            self.cache.clear()
        self.logger.info("Spider skill cleaned up")
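Because `SUPPORTS_MULTIPLE_INSTANCES` is `True`, `get_instance_key()` keys instances by `tool_name`, and `register_tools()` prefixes each tool with that name, two differently configured spider instances can plausibly coexist on one agent. The sketch below is hedged: it again assumes the `add_skill()` interface accepts repeated registrations for multi-instance skills, and the selector values, URLs, and patterns are illustrative only; the parameter names (`tool_name`, `selectors`, `follow_patterns`) come from the code above.

# Hypothetical sketch: two spider instances with distinct tool prefixes.
# Per the tool_prefix logic above, tools become docs_scrape_url,
# docs_crawl_site, ... and prices_extract_structured_data.
agent.add_skill("spider", {
    "tool_name": "docs",
    "extract_type": "markdown",
    "max_pages": 10,
    "max_depth": 2,
    "follow_patterns": [r"/docs/"],   # regexes matched against links in crawl_site
})

agent.add_skill("spider", {
    "tool_name": "prices",
    "extract_type": "structured",
    "selectors": {                    # XPath if it starts with '/', otherwise CSS
        "product": "//h1",
        "price": ".price",
    },
})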