signalwire-agents 0.1.23__py3-none-any.whl → 0.1.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. signalwire_agents/__init__.py +1 -1
  2. signalwire_agents/agent_server.py +2 -1
  3. signalwire_agents/cli/config.py +61 -0
  4. signalwire_agents/cli/core/__init__.py +1 -0
  5. signalwire_agents/cli/core/agent_loader.py +254 -0
  6. signalwire_agents/cli/core/argparse_helpers.py +164 -0
  7. signalwire_agents/cli/core/dynamic_config.py +62 -0
  8. signalwire_agents/cli/execution/__init__.py +1 -0
  9. signalwire_agents/cli/execution/datamap_exec.py +437 -0
  10. signalwire_agents/cli/execution/webhook_exec.py +125 -0
  11. signalwire_agents/cli/output/__init__.py +1 -0
  12. signalwire_agents/cli/output/output_formatter.py +132 -0
  13. signalwire_agents/cli/output/swml_dump.py +177 -0
  14. signalwire_agents/cli/simulation/__init__.py +1 -0
  15. signalwire_agents/cli/simulation/data_generation.py +365 -0
  16. signalwire_agents/cli/simulation/data_overrides.py +187 -0
  17. signalwire_agents/cli/simulation/mock_env.py +271 -0
  18. signalwire_agents/cli/test_swaig.py +522 -2539
  19. signalwire_agents/cli/types.py +72 -0
  20. signalwire_agents/core/agent/__init__.py +1 -3
  21. signalwire_agents/core/agent/config/__init__.py +1 -3
  22. signalwire_agents/core/agent/prompt/manager.py +25 -7
  23. signalwire_agents/core/agent/tools/decorator.py +2 -0
  24. signalwire_agents/core/agent/tools/registry.py +8 -0
  25. signalwire_agents/core/agent_base.py +492 -3053
  26. signalwire_agents/core/function_result.py +31 -42
  27. signalwire_agents/core/mixins/__init__.py +28 -0
  28. signalwire_agents/core/mixins/ai_config_mixin.py +373 -0
  29. signalwire_agents/core/mixins/auth_mixin.py +287 -0
  30. signalwire_agents/core/mixins/prompt_mixin.py +345 -0
  31. signalwire_agents/core/mixins/serverless_mixin.py +368 -0
  32. signalwire_agents/core/mixins/skill_mixin.py +55 -0
  33. signalwire_agents/core/mixins/state_mixin.py +219 -0
  34. signalwire_agents/core/mixins/tool_mixin.py +295 -0
  35. signalwire_agents/core/mixins/web_mixin.py +1130 -0
  36. signalwire_agents/core/skill_manager.py +3 -1
  37. signalwire_agents/core/swaig_function.py +10 -1
  38. signalwire_agents/core/swml_service.py +140 -58
  39. signalwire_agents/skills/README.md +452 -0
  40. signalwire_agents/skills/api_ninjas_trivia/README.md +215 -0
  41. signalwire_agents/skills/datasphere/README.md +210 -0
  42. signalwire_agents/skills/datasphere_serverless/README.md +258 -0
  43. signalwire_agents/skills/datetime/README.md +132 -0
  44. signalwire_agents/skills/joke/README.md +149 -0
  45. signalwire_agents/skills/math/README.md +161 -0
  46. signalwire_agents/skills/native_vector_search/skill.py +33 -13
  47. signalwire_agents/skills/play_background_file/README.md +218 -0
  48. signalwire_agents/skills/spider/README.md +236 -0
  49. signalwire_agents/skills/spider/__init__.py +4 -0
  50. signalwire_agents/skills/spider/skill.py +479 -0
  51. signalwire_agents/skills/swml_transfer/README.md +395 -0
  52. signalwire_agents/skills/swml_transfer/__init__.py +1 -0
  53. signalwire_agents/skills/swml_transfer/skill.py +257 -0
  54. signalwire_agents/skills/weather_api/README.md +178 -0
  55. signalwire_agents/skills/web_search/README.md +163 -0
  56. signalwire_agents/skills/wikipedia_search/README.md +228 -0
  57. {signalwire_agents-0.1.23.dist-info → signalwire_agents-0.1.24.dist-info}/METADATA +47 -2
  58. {signalwire_agents-0.1.23.dist-info → signalwire_agents-0.1.24.dist-info}/RECORD +62 -22
  59. {signalwire_agents-0.1.23.dist-info → signalwire_agents-0.1.24.dist-info}/entry_points.txt +1 -1
  60. signalwire_agents/core/agent/config/ephemeral.py +0 -176
  61. signalwire_agents-0.1.23.data/data/schema.json +0 -5611
  62. {signalwire_agents-0.1.23.dist-info → signalwire_agents-0.1.24.dist-info}/WHEEL +0 -0
  63. {signalwire_agents-0.1.23.dist-info → signalwire_agents-0.1.24.dist-info}/licenses/LICENSE +0 -0
  64. {signalwire_agents-0.1.23.dist-info → signalwire_agents-0.1.24.dist-info}/top_level.txt +0 -0
signalwire_agents/skills/spider/skill.py (new file)
@@ -0,0 +1,479 @@
+"""Spider skill for fast web scraping with SignalWire AI Agents."""
+import re
+import logging
+from typing import Dict, Any, Optional, List
+from urllib.parse import urljoin, urlparse
+import requests
+from lxml import html
+from lxml.etree import XPathEvalError
+
+from signalwire_agents.core.skill_base import SkillBase
+from signalwire_agents.core.function_result import SwaigFunctionResult
+
+
+class SpiderSkill(SkillBase):
+    """Fast web scraping skill optimized for speed and token efficiency."""
+
+    SKILL_NAME = "spider"
+    SKILL_DESCRIPTION = "Fast web scraping and crawling capabilities"
+    SKILL_VERSION = "1.0.0"
+    REQUIRED_PACKAGES = ["lxml"]  # beautifulsoup4 and requests are in base dependencies
+    REQUIRED_ENV_VARS = []  # No required env vars by default
+    SUPPORTS_MULTIPLE_INSTANCES = True
+
+    # Compiled regex for performance
+    WHITESPACE_REGEX = re.compile(r'\s+')
+
+    def __init__(self, agent, params: Dict[str, Any]):
+        """Initialize the spider skill with configuration parameters."""
+        super().__init__(agent, params)
+
+        # Performance settings
+        self.delay = self.params.get('delay', 0.1)
+        self.concurrent_requests = self.params.get('concurrent_requests', 5)
+        self.timeout = self.params.get('timeout', 5)
+
+        # Crawling limits
+        self.max_pages = self.params.get('max_pages', 1)
+        self.max_depth = self.params.get('max_depth', 0)
+
+        # Content processing
+        self.extract_type = self.params.get('extract_type', 'fast_text')
+        self.max_text_length = self.params.get('max_text_length', 3000)
+        self.clean_text = self.params.get('clean_text', True)
+
+        # Features
+        self.cache_enabled = self.params.get('cache_enabled', True)
+        self.follow_robots_txt = self.params.get('follow_robots_txt', False)
+        self.user_agent = self.params.get('user_agent', 'Spider/1.0 (SignalWire AI Agent)')
+
+        # Optional headers
+        self.headers = self.params.get('headers', {})
+        self.headers['User-Agent'] = self.user_agent
+
+        # Session for connection pooling
+        self.session = requests.Session()
+        self.session.headers.update(self.headers)
+
+        # Cache for responses
+        self.cache = {} if self.cache_enabled else None
+
+        # XPath expressions for unwanted elements
+        self.remove_xpaths = [
+            '//script', '//style', '//nav', '//header',
+            '//footer', '//aside', '//noscript'
+        ]
+
+    def get_instance_key(self) -> str:
+        """Return unique key for this skill instance."""
+        tool_name = self.params.get('tool_name', self.SKILL_NAME)
+        return f"{self.SKILL_NAME}_{tool_name}"
+
+    def setup(self) -> bool:
+        """Validate configuration and setup the skill."""
+        # Validate delay is reasonable
+        if self.delay < 0:
+            self.logger.error("Delay cannot be negative")
+            return False
+
+        # Validate concurrent requests
+        if not 1 <= self.concurrent_requests <= 20:
+            self.logger.error("Concurrent requests must be between 1 and 20")
+            return False
+
+        # Validate max pages and depth
+        if self.max_pages < 1:
+            self.logger.error("Max pages must be at least 1")
+            return False
+
+        if self.max_depth < 0:
+            self.logger.error("Max depth cannot be negative")
+            return False
+
+        self.logger.info(f"Spider skill configured: delay={self.delay}s, max_pages={self.max_pages}, max_depth={self.max_depth}")
+        return True
+
+    def register_tools(self) -> None:
+        """Register the web scraping tools with the agent."""
+        # Tool name prefix for multiple instances
+        tool_prefix = self.params.get('tool_name', '')
+        if tool_prefix:
+            tool_prefix = f"{tool_prefix}_"
+
+        # Register scrape_url tool
+        self.agent.define_tool(
+            name=f"{tool_prefix}scrape_url",
+            description="Extract text content from a single web page",
+            parameters={
+                "url": {
+                    "type": "string",
+                    "description": "The URL to scrape"
+                }
+            },
+            required=["url"],
+            handler=self._scrape_url_handler,
+            **self.swaig_fields
+        )
+
+        # Register crawl_site tool
+        self.agent.define_tool(
+            name=f"{tool_prefix}crawl_site",
+            description="Crawl multiple pages starting from a URL",
+            parameters={
+                "start_url": {
+                    "type": "string",
+                    "description": "Starting URL for the crawl"
+                }
+            },
+            required=["start_url"],
+            handler=self._crawl_site_handler,
+            **self.swaig_fields
+        )
+
+        # Register extract_structured_data tool
+        self.agent.define_tool(
+            name=f"{tool_prefix}extract_structured_data",
+            description="Extract specific data from a web page using selectors",
+            parameters={
+                "url": {
+                    "type": "string",
+                    "description": "The URL to scrape"
+                }
+            },
+            required=["url"],
+            handler=self._extract_structured_handler,
+            **self.swaig_fields
+        )
+
+    def _fetch_url(self, url: str) -> Optional[requests.Response]:
+        """Fetch a URL with caching and error handling."""
+        # Check cache first
+        if self.cache_enabled and url in self.cache:
+            self.logger.debug(f"Cache hit for {url}")
+            return self.cache[url]
+
+        try:
+            response = self.session.get(url, timeout=self.timeout)
+            response.raise_for_status()
+
+            # Cache successful responses
+            if self.cache_enabled:
+                self.cache[url] = response
+
+            return response
+
+        except requests.exceptions.Timeout:
+            self.logger.error(f"Timeout fetching {url}")
+            return None
+        except requests.exceptions.RequestException as e:
+            self.logger.error(f"Error fetching {url}: {e}")
+            return None
+
+    def _fast_text_extract(self, response: requests.Response) -> str:
+        """Ultra-fast text extraction using lxml."""
+        try:
+            # Parse HTML with lxml
+            tree = html.fromstring(response.content)
+
+            # Remove unwanted elements in one pass
+            for xpath in self.remove_xpaths:
+                for elem in tree.xpath(xpath):
+                    elem.drop_tree()
+
+            # Extract text
+            text = tree.text_content()
+
+            # Clean whitespace if requested
+            if self.clean_text:
+                text = self.WHITESPACE_REGEX.sub(' ', text).strip()
+
+            # Smart truncation
+            if len(text) > self.max_text_length:
+                keep_start = self.max_text_length * 2 // 3
+                keep_end = self.max_text_length // 3
+                text = (
+                    text[:keep_start] +
+                    "\n\n[...CONTENT TRUNCATED...]\n\n" +
+                    text[-keep_end:]
+                )
+
+            return text
+
+        except Exception as e:
+            self.logger.error(f"Error extracting text: {e}")
+            return ""
+
+    def _markdown_extract(self, response: requests.Response) -> str:
+        """Extract content in markdown format."""
+        try:
+            from bs4 import BeautifulSoup
+
+            soup = BeautifulSoup(response.content, 'html.parser')
+
+            # Remove unwanted tags
+            for tag in ['script', 'style', 'nav', 'header', 'footer', 'aside']:
+                for elem in soup.find_all(tag):
+                    elem.decompose()
+
+            # Convert to markdown-like format
+            text_parts = []
+
+            # Title
+            title = soup.find('title')
+            if title:
+                text_parts.append(f"# {title.get_text().strip()}\n")
+
+            # Main content
+            for elem in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li', 'code', 'pre']):
+                if elem.name.startswith('h'):
+                    level = int(elem.name[1])
+                    text_parts.append(f"\n{'#' * level} {elem.get_text().strip()}\n")
+                elif elem.name == 'p':
+                    text_parts.append(f"\n{elem.get_text().strip()}\n")
+                elif elem.name == 'li':
+                    text_parts.append(f"- {elem.get_text().strip()}")
+                elif elem.name in ['code', 'pre']:
+                    text_parts.append(f"\n```\n{elem.get_text().strip()}\n```\n")
+
+            text = '\n'.join(text_parts)
+
+            # Truncate if needed
+            if len(text) > self.max_text_length:
+                text = text[:self.max_text_length] + "\n\n[...TRUNCATED...]"
+
+            return text
+
+        except ImportError:
+            self.logger.warning("BeautifulSoup not available, falling back to fast_text")
+            return self._fast_text_extract(response)
+        except Exception as e:
+            self.logger.error(f"Error in markdown extraction: {e}")
+            return self._fast_text_extract(response)
+
+    def _structured_extract(self, response: requests.Response, selectors: Dict[str, str] = None) -> Dict[str, Any]:
+        """Extract structured data using selectors."""
+        try:
+            tree = html.fromstring(response.content)
+            result = {
+                "url": response.url,
+                "status_code": response.status_code,
+                "title": "",
+                "data": {}
+            }
+
+            # Get title
+            title_elem = tree.xpath('//title/text()')
+            if title_elem:
+                result["title"] = title_elem[0].strip()
+
+            # Extract using provided selectors
+            if selectors:
+                for field, selector in selectors.items():
+                    try:
+                        if selector.startswith('/'):  # XPath
+                            values = tree.xpath(selector)
+                        else:  # CSS selector
+                            from lxml.cssselect import CSSSelector
+                            sel = CSSSelector(selector)
+                            values = sel(tree)
+
+                        # Extract text from elements
+                        if values:
+                            if len(values) == 1:
+                                result["data"][field] = values[0].text_content().strip()
+                            else:
+                                result["data"][field] = [v.text_content().strip() for v in values]
+                    except (XPathEvalError, Exception) as e:
+                        self.logger.warning(f"Error with selector {selector}: {e}")
+                        result["data"][field] = None
+
+            return result
+
+        except Exception as e:
+            self.logger.error(f"Error in structured extraction: {e}")
+            return {"error": str(e)}
+
+    def _scrape_url_handler(self, args: Dict[str, Any], raw_data: Dict[str, Any]) -> SwaigFunctionResult:
+        """Handle single page scraping."""
+        url = args.get('url', '').strip()
+        if not url:
+            return SwaigFunctionResult("Please provide a URL to scrape")
+
+        # Validate URL
+        parsed = urlparse(url)
+        if not parsed.scheme or not parsed.netloc:
+            return SwaigFunctionResult(f"Invalid URL: {url}")
+
+        # Fetch the page
+        response = self._fetch_url(url)
+        if not response:
+            return SwaigFunctionResult(f"Failed to fetch {url}")
+
+        # Extract content based on configured type (not from args)
+        extract_type = self.extract_type
+
+        try:
+            if extract_type == 'structured':
+                # For structured extraction, use predefined selectors from config if available
+                selectors = self.params.get('selectors', {})
+                result = self._structured_extract(response, selectors)
+                return SwaigFunctionResult(f"Extracted structured data from {url}: {result}")
+            elif extract_type == 'markdown':
+                content = self._markdown_extract(response)
+            else:  # fast_text (default)
+                content = self._fast_text_extract(response)
+
+            if not content:
+                return SwaigFunctionResult(f"No content extracted from {url}")
+
+            # Format response
+            char_count = len(content)
+            header = f"Content from {url} ({char_count} characters):\n\n"
+
+            return SwaigFunctionResult(header + content)
+
+        except Exception as e:
+            self.logger.error(f"Error processing {url}: {e}")
+            return SwaigFunctionResult(f"Error processing {url}: {str(e)}")
+
+    def _crawl_site_handler(self, args: Dict[str, Any], raw_data: Dict[str, Any]) -> SwaigFunctionResult:
+        """Handle multi-page crawling."""
+        start_url = args.get('start_url', '').strip()
+        if not start_url:
+            return SwaigFunctionResult("Please provide a starting URL for the crawl")
+
+        # Use configured parameters (not from args)
+        max_depth = self.max_depth
+        max_pages = self.max_pages
+        follow_patterns = self.params.get('follow_patterns', [])
+
+        # Validate parameters
+        if max_depth < 0:
+            return SwaigFunctionResult("Max depth cannot be negative")
+        if max_pages < 1:
+            return SwaigFunctionResult("Max pages must be at least 1")
+
+        # Simple breadth-first crawl
+        visited = set()
+        to_visit = [(start_url, 0)]  # (url, depth)
+        results = []
+
+        while to_visit and len(visited) < max_pages:
+            if not to_visit:
+                break
+
+            url, depth = to_visit.pop(0)
+
+            # Skip if already visited or depth exceeded
+            if url in visited or depth > max_depth:
+                continue
+
+            # Fetch and process page
+            response = self._fetch_url(url)
+            if not response:
+                continue
+
+            visited.add(url)
+
+            # Extract content
+            content = self._fast_text_extract(response)
+            if content:
+                results.append({
+                    'url': url,
+                    'depth': depth,
+                    'content_length': len(content),
+                    'summary': content[:500] + '...' if len(content) > 500 else content
+                })
+
+            # Extract links if not at max depth
+            if depth < max_depth:
+                try:
+                    tree = html.fromstring(response.content)
+                    links = tree.xpath('//a[@href]/@href')
+
+                    for link in links:
+                        absolute_url = urljoin(url, link)
+
+                        # Check if we should follow this link
+                        if follow_patterns:
+                            if not any(re.search(pattern, absolute_url) for pattern in follow_patterns):
+                                continue
+
+                        # Only follow same domain by default
+                        if urlparse(absolute_url).netloc == urlparse(start_url).netloc:
+                            if absolute_url not in visited:
+                                to_visit.append((absolute_url, depth + 1))
+
+                except Exception as e:
+                    self.logger.warning(f"Error extracting links from {url}: {e}")
+
+            # Respect delay between requests
+            if self.delay > 0 and len(visited) < max_pages:
+                import time
+                time.sleep(self.delay)
+
+        # Format results
+        if not results:
+            return SwaigFunctionResult(f"No pages could be crawled from {start_url}")
+
+        summary = f"Crawled {len(results)} pages from {urlparse(start_url).netloc}:\n\n"
+
+        for i, result in enumerate(results, 1):
+            summary += f"{i}. {result['url']} (depth: {result['depth']}, {result['content_length']} chars)\n"
+            summary += f" Summary: {result['summary'][:100]}...\n\n"
+
+        total_chars = sum(r['content_length'] for r in results)
+        summary += f"\nTotal content: {total_chars:,} characters across {len(results)} pages"
+
+        return SwaigFunctionResult(summary)
+
+    def _extract_structured_handler(self, args: Dict[str, Any], raw_data: Dict[str, Any]) -> SwaigFunctionResult:
+        """Handle structured data extraction."""
+        url = args.get('url', '').strip()
+
+        if not url:
+            return SwaigFunctionResult("Please provide a URL")
+
+        # Use configured selectors from params
+        selectors = self.params.get('selectors', {})
+        if not selectors:
+            return SwaigFunctionResult("No selectors configured for structured data extraction")
+
+        # Fetch the page
+        response = self._fetch_url(url)
+        if not response:
+            return SwaigFunctionResult(f"Failed to fetch {url}")
+
+        # Extract structured data
+        result = self._structured_extract(response, selectors)
+
+        if 'error' in result:
+            return SwaigFunctionResult(f"Error extracting data: {result['error']}")
+
+        # Format the response
+        output = f"Extracted data from {url}:\n\n"
+        output += f"Title: {result.get('title', 'N/A')}\n\n"
+
+        if result.get('data'):
+            output += "Data:\n"
+            for field, value in result['data'].items():
+                output += f"- {field}: {value}\n"
+        else:
+            output += "No data extracted with provided selectors"
+
+        return SwaigFunctionResult(output)
+
+    def get_hints(self) -> List[str]:
+        """Return speech recognition hints for this skill."""
+        return [
+            "scrape", "crawl", "extract", "web page", "website",
+            "get content from", "fetch data from", "spider"
+        ]
+
+    def cleanup(self) -> None:
+        """Clean up resources when skill is unloaded."""
+        if hasattr(self, 'session'):
+            self.session.close()
+        if hasattr(self, 'cache'):
+            self.cache.clear()
+        self.logger.info("Spider skill cleaned up")