signalwire-agents 0.1.13__py3-none-any.whl → 1.0.17.dev4__py3-none-any.whl

This diff shows the changes between publicly available package versions as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (143)
  1. signalwire_agents/__init__.py +99 -15
  2. signalwire_agents/agent_server.py +248 -60
  3. signalwire_agents/agents/bedrock.py +296 -0
  4. signalwire_agents/cli/__init__.py +9 -0
  5. signalwire_agents/cli/build_search.py +951 -41
  6. signalwire_agents/cli/config.py +80 -0
  7. signalwire_agents/cli/core/__init__.py +10 -0
  8. signalwire_agents/cli/core/agent_loader.py +470 -0
  9. signalwire_agents/cli/core/argparse_helpers.py +179 -0
  10. signalwire_agents/cli/core/dynamic_config.py +71 -0
  11. signalwire_agents/cli/core/service_loader.py +303 -0
  12. signalwire_agents/cli/dokku.py +2320 -0
  13. signalwire_agents/cli/execution/__init__.py +10 -0
  14. signalwire_agents/cli/execution/datamap_exec.py +446 -0
  15. signalwire_agents/cli/execution/webhook_exec.py +134 -0
  16. signalwire_agents/cli/init_project.py +2636 -0
  17. signalwire_agents/cli/output/__init__.py +10 -0
  18. signalwire_agents/cli/output/output_formatter.py +255 -0
  19. signalwire_agents/cli/output/swml_dump.py +186 -0
  20. signalwire_agents/cli/simulation/__init__.py +10 -0
  21. signalwire_agents/cli/simulation/data_generation.py +374 -0
  22. signalwire_agents/cli/simulation/data_overrides.py +200 -0
  23. signalwire_agents/cli/simulation/mock_env.py +282 -0
  24. signalwire_agents/cli/swaig_test_wrapper.py +52 -0
  25. signalwire_agents/cli/test_swaig.py +566 -2366
  26. signalwire_agents/cli/types.py +81 -0
  27. signalwire_agents/core/__init__.py +2 -2
  28. signalwire_agents/core/agent/__init__.py +12 -0
  29. signalwire_agents/core/agent/config/__init__.py +12 -0
  30. signalwire_agents/core/agent/deployment/__init__.py +9 -0
  31. signalwire_agents/core/agent/deployment/handlers/__init__.py +9 -0
  32. signalwire_agents/core/agent/prompt/__init__.py +14 -0
  33. signalwire_agents/core/agent/prompt/manager.py +306 -0
  34. signalwire_agents/core/agent/routing/__init__.py +9 -0
  35. signalwire_agents/core/agent/security/__init__.py +9 -0
  36. signalwire_agents/core/agent/swml/__init__.py +9 -0
  37. signalwire_agents/core/agent/tools/__init__.py +15 -0
  38. signalwire_agents/core/agent/tools/decorator.py +97 -0
  39. signalwire_agents/core/agent/tools/registry.py +210 -0
  40. signalwire_agents/core/agent_base.py +845 -2916
  41. signalwire_agents/core/auth_handler.py +233 -0
  42. signalwire_agents/core/config_loader.py +259 -0
  43. signalwire_agents/core/contexts.py +418 -0
  44. signalwire_agents/core/data_map.py +3 -15
  45. signalwire_agents/core/function_result.py +116 -44
  46. signalwire_agents/core/logging_config.py +162 -18
  47. signalwire_agents/core/mixins/__init__.py +28 -0
  48. signalwire_agents/core/mixins/ai_config_mixin.py +442 -0
  49. signalwire_agents/core/mixins/auth_mixin.py +280 -0
  50. signalwire_agents/core/mixins/prompt_mixin.py +358 -0
  51. signalwire_agents/core/mixins/serverless_mixin.py +460 -0
  52. signalwire_agents/core/mixins/skill_mixin.py +55 -0
  53. signalwire_agents/core/mixins/state_mixin.py +153 -0
  54. signalwire_agents/core/mixins/tool_mixin.py +230 -0
  55. signalwire_agents/core/mixins/web_mixin.py +1142 -0
  56. signalwire_agents/core/security_config.py +333 -0
  57. signalwire_agents/core/skill_base.py +84 -1
  58. signalwire_agents/core/skill_manager.py +62 -20
  59. signalwire_agents/core/swaig_function.py +18 -5
  60. signalwire_agents/core/swml_builder.py +207 -11
  61. signalwire_agents/core/swml_handler.py +27 -21
  62. signalwire_agents/core/swml_renderer.py +123 -312
  63. signalwire_agents/core/swml_service.py +171 -203
  64. signalwire_agents/mcp_gateway/__init__.py +29 -0
  65. signalwire_agents/mcp_gateway/gateway_service.py +564 -0
  66. signalwire_agents/mcp_gateway/mcp_manager.py +513 -0
  67. signalwire_agents/mcp_gateway/session_manager.py +218 -0
  68. signalwire_agents/prefabs/concierge.py +0 -3
  69. signalwire_agents/prefabs/faq_bot.py +0 -3
  70. signalwire_agents/prefabs/info_gatherer.py +0 -3
  71. signalwire_agents/prefabs/receptionist.py +0 -3
  72. signalwire_agents/prefabs/survey.py +0 -3
  73. signalwire_agents/schema.json +9218 -5489
  74. signalwire_agents/search/__init__.py +7 -1
  75. signalwire_agents/search/document_processor.py +490 -31
  76. signalwire_agents/search/index_builder.py +307 -37
  77. signalwire_agents/search/migration.py +418 -0
  78. signalwire_agents/search/models.py +30 -0
  79. signalwire_agents/search/pgvector_backend.py +748 -0
  80. signalwire_agents/search/query_processor.py +162 -31
  81. signalwire_agents/search/search_engine.py +916 -35
  82. signalwire_agents/search/search_service.py +376 -53
  83. signalwire_agents/skills/README.md +452 -0
  84. signalwire_agents/skills/__init__.py +14 -2
  85. signalwire_agents/skills/api_ninjas_trivia/README.md +215 -0
  86. signalwire_agents/skills/api_ninjas_trivia/__init__.py +12 -0
  87. signalwire_agents/skills/api_ninjas_trivia/skill.py +237 -0
  88. signalwire_agents/skills/datasphere/README.md +210 -0
  89. signalwire_agents/skills/datasphere/skill.py +84 -3
  90. signalwire_agents/skills/datasphere_serverless/README.md +258 -0
  91. signalwire_agents/skills/datasphere_serverless/__init__.py +9 -0
  92. signalwire_agents/skills/datasphere_serverless/skill.py +82 -1
  93. signalwire_agents/skills/datetime/README.md +132 -0
  94. signalwire_agents/skills/datetime/__init__.py +9 -0
  95. signalwire_agents/skills/datetime/skill.py +20 -7
  96. signalwire_agents/skills/joke/README.md +149 -0
  97. signalwire_agents/skills/joke/__init__.py +9 -0
  98. signalwire_agents/skills/joke/skill.py +21 -0
  99. signalwire_agents/skills/math/README.md +161 -0
  100. signalwire_agents/skills/math/__init__.py +9 -0
  101. signalwire_agents/skills/math/skill.py +18 -4
  102. signalwire_agents/skills/mcp_gateway/README.md +230 -0
  103. signalwire_agents/skills/mcp_gateway/__init__.py +10 -0
  104. signalwire_agents/skills/mcp_gateway/skill.py +421 -0
  105. signalwire_agents/skills/native_vector_search/README.md +210 -0
  106. signalwire_agents/skills/native_vector_search/__init__.py +9 -0
  107. signalwire_agents/skills/native_vector_search/skill.py +569 -101
  108. signalwire_agents/skills/play_background_file/README.md +218 -0
  109. signalwire_agents/skills/play_background_file/__init__.py +12 -0
  110. signalwire_agents/skills/play_background_file/skill.py +242 -0
  111. signalwire_agents/skills/registry.py +395 -40
  112. signalwire_agents/skills/spider/README.md +236 -0
  113. signalwire_agents/skills/spider/__init__.py +13 -0
  114. signalwire_agents/skills/spider/skill.py +598 -0
  115. signalwire_agents/skills/swml_transfer/README.md +395 -0
  116. signalwire_agents/skills/swml_transfer/__init__.py +10 -0
  117. signalwire_agents/skills/swml_transfer/skill.py +359 -0
  118. signalwire_agents/skills/weather_api/README.md +178 -0
  119. signalwire_agents/skills/weather_api/__init__.py +12 -0
  120. signalwire_agents/skills/weather_api/skill.py +191 -0
  121. signalwire_agents/skills/web_search/README.md +163 -0
  122. signalwire_agents/skills/web_search/__init__.py +9 -0
  123. signalwire_agents/skills/web_search/skill.py +586 -112
  124. signalwire_agents/skills/wikipedia_search/README.md +228 -0
  125. signalwire_agents/{core/state → skills/wikipedia_search}/__init__.py +5 -4
  126. signalwire_agents/skills/{wikipedia → wikipedia_search}/skill.py +33 -3
  127. signalwire_agents/web/__init__.py +17 -0
  128. signalwire_agents/web/web_service.py +559 -0
  129. signalwire_agents-1.0.17.dev4.data/data/share/man/man1/sw-agent-init.1 +400 -0
  130. signalwire_agents-1.0.17.dev4.data/data/share/man/man1/sw-search.1 +483 -0
  131. signalwire_agents-1.0.17.dev4.data/data/share/man/man1/swaig-test.1 +308 -0
  132. {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/METADATA +347 -215
  133. signalwire_agents-1.0.17.dev4.dist-info/RECORD +147 -0
  134. signalwire_agents-1.0.17.dev4.dist-info/entry_points.txt +6 -0
  135. signalwire_agents/core/state/file_state_manager.py +0 -219
  136. signalwire_agents/core/state/state_manager.py +0 -101
  137. signalwire_agents/skills/wikipedia/__init__.py +0 -9
  138. signalwire_agents-0.1.13.data/data/schema.json +0 -5611
  139. signalwire_agents-0.1.13.dist-info/RECORD +0 -67
  140. signalwire_agents-0.1.13.dist-info/entry_points.txt +0 -3
  141. {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/WHEEL +0 -0
  142. {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/licenses/LICENSE +0 -0
  143. {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/top_level.txt +0 -0
signalwire_agents/skills/spider/skill.py (new file)
@@ -0,0 +1,598 @@
+"""
+Copyright (c) 2025 SignalWire
+
+This file is part of the SignalWire AI Agents SDK.
+
+Licensed under the MIT License.
+See LICENSE file in the project root for full license information.
+"""
+
+"""Spider skill for fast web scraping with SignalWire AI Agents."""
+import re
+import logging
+from typing import Dict, Any, Optional, List
+from urllib.parse import urljoin, urlparse
+import requests
+from lxml import html
+from lxml.etree import XPathEvalError
+
+from signalwire_agents.core.skill_base import SkillBase
+from signalwire_agents.core.function_result import SwaigFunctionResult
+
+
+class SpiderSkill(SkillBase):
+    """Fast web scraping skill optimized for speed and token efficiency."""
+
+    SKILL_NAME = "spider"
+    SKILL_DESCRIPTION = "Fast web scraping and crawling capabilities"
+    SKILL_VERSION = "1.0.0"
+    REQUIRED_PACKAGES = ["lxml"]  # beautifulsoup4 and requests are in base dependencies
+    REQUIRED_ENV_VARS = []  # No required env vars by default
+    SUPPORTS_MULTIPLE_INSTANCES = True
+
+    # Compiled regex for performance
+    WHITESPACE_REGEX = re.compile(r'\s+')
+
+    @classmethod
+    def get_parameter_schema(cls) -> Dict[str, Dict[str, Any]]:
+        """Get parameter schema for Spider skill"""
+        schema = super().get_parameter_schema()
+        schema.update({
+            "delay": {
+                "type": "number",
+                "description": "Delay between requests in seconds",
+                "default": 0.1,
+                "required": False,
+                "minimum": 0.0
+            },
+            "concurrent_requests": {
+                "type": "integer",
+                "description": "Number of concurrent requests allowed",
+                "default": 5,
+                "required": False,
+                "minimum": 1,
+                "maximum": 20
+            },
+            "timeout": {
+                "type": "integer",
+                "description": "Request timeout in seconds",
+                "default": 5,
+                "required": False,
+                "minimum": 1,
+                "maximum": 60
+            },
+            "max_pages": {
+                "type": "integer",
+                "description": "Maximum number of pages to scrape",
+                "default": 1,
+                "required": False,
+                "minimum": 1,
+                "maximum": 100
+            },
+            "max_depth": {
+                "type": "integer",
+                "description": "Maximum crawl depth (0 = single page only)",
+                "default": 0,
+                "required": False,
+                "minimum": 0,
+                "maximum": 5
+            },
+            "extract_type": {
+                "type": "string",
+                "description": "Content extraction method",
+                "default": "fast_text",
+                "required": False,
+                "enum": ["fast_text", "clean_text", "full_text", "html", "custom"]
+            },
+            "max_text_length": {
+                "type": "integer",
+                "description": "Maximum text length to return",
+                "default": 10000,
+                "required": False,
+                "minimum": 100,
+                "maximum": 100000
+            },
+            "clean_text": {
+                "type": "boolean",
+                "description": "Whether to clean extracted text",
+                "default": True,
+                "required": False
+            },
+            "selectors": {
+                "type": "object",
+                "description": "Custom CSS/XPath selectors for extraction",
+                "default": {},
+                "required": False,
+                "additionalProperties": {
+                    "type": "string"
+                }
+            },
+            "follow_patterns": {
+                "type": "array",
+                "description": "URL patterns to follow when crawling",
+                "default": [],
+                "required": False,
+                "items": {
+                    "type": "string"
+                }
+            },
+            "user_agent": {
+                "type": "string",
+                "description": "User agent string for requests",
+                "default": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
+                "required": False
+            },
+            "headers": {
+                "type": "object",
+                "description": "Additional HTTP headers",
+                "default": {},
+                "required": False,
+                "additionalProperties": {
+                    "type": "string"
+                }
+            },
+            "follow_robots_txt": {
+                "type": "boolean",
+                "description": "Whether to respect robots.txt",
+                "default": True,
+                "required": False
+            },
+            "cache_enabled": {
+                "type": "boolean",
+                "description": "Whether to cache scraped pages",
+                "default": True,
+                "required": False
+            }
+        })
+        return schema
+
+    def __init__(self, agent, params: Dict[str, Any]):
+        """Initialize the spider skill with configuration parameters."""
+        super().__init__(agent, params)
+
+        # Performance settings
+        self.delay = self.params.get('delay', 0.1)
+        self.concurrent_requests = self.params.get('concurrent_requests', 5)
+        self.timeout = self.params.get('timeout', 5)
+
+        # Crawling limits
+        self.max_pages = self.params.get('max_pages', 1)
+        self.max_depth = self.params.get('max_depth', 0)
+
+        # Content processing
+        self.extract_type = self.params.get('extract_type', 'fast_text')
+        self.max_text_length = self.params.get('max_text_length', 3000)
+        self.clean_text = self.params.get('clean_text', True)
+
+        # Features
+        self.cache_enabled = self.params.get('cache_enabled', True)
+        self.follow_robots_txt = self.params.get('follow_robots_txt', False)
+        self.user_agent = self.params.get('user_agent', 'Spider/1.0 (SignalWire AI Agent)')
+
+        # Optional headers
+        self.headers = self.params.get('headers', {})
+        self.headers['User-Agent'] = self.user_agent
+
+        # Session for connection pooling
+        self.session = requests.Session()
+        self.session.headers.update(self.headers)
+
+        # Cache for responses
+        self.cache = {} if self.cache_enabled else None
+
+        # XPath expressions for unwanted elements
+        self.remove_xpaths = [
+            '//script', '//style', '//nav', '//header',
+            '//footer', '//aside', '//noscript'
+        ]
+
+    def get_instance_key(self) -> str:
+        """Return unique key for this skill instance."""
+        tool_name = self.params.get('tool_name', self.SKILL_NAME)
+        return f"{self.SKILL_NAME}_{tool_name}"
+
+    def setup(self) -> bool:
+        """Validate configuration and setup the skill."""
+        # Validate delay is reasonable
+        if self.delay < 0:
+            self.logger.error("Delay cannot be negative")
+            return False
+
+        # Validate concurrent requests
+        if not 1 <= self.concurrent_requests <= 20:
+            self.logger.error("Concurrent requests must be between 1 and 20")
+            return False
+
+        # Validate max pages and depth
+        if self.max_pages < 1:
+            self.logger.error("Max pages must be at least 1")
+            return False
+
+        if self.max_depth < 0:
+            self.logger.error("Max depth cannot be negative")
+            return False
+
+        self.logger.info(f"Spider skill configured: delay={self.delay}s, max_pages={self.max_pages}, max_depth={self.max_depth}")
+        return True
+
+    def register_tools(self) -> None:
+        """Register the web scraping tools with the agent."""
+        # Tool name prefix for multiple instances
+        tool_prefix = self.params.get('tool_name', '')
+        if tool_prefix:
+            tool_prefix = f"{tool_prefix}_"
+
+        # Register scrape_url tool
+        self.define_tool(
+            name=f"{tool_prefix}scrape_url",
+            description="Extract text content from a single web page",
+            parameters={
+                "url": {
+                    "type": "string",
+                    "description": "The URL to scrape"
+                }
+            },
+            required=["url"],
+            handler=self._scrape_url_handler
+        )
+
+        # Register crawl_site tool
+        self.define_tool(
+            name=f"{tool_prefix}crawl_site",
+            description="Crawl multiple pages starting from a URL",
+            parameters={
+                "start_url": {
+                    "type": "string",
+                    "description": "Starting URL for the crawl"
+                }
+            },
+            required=["start_url"],
+            handler=self._crawl_site_handler
+        )
+
+        # Register extract_structured_data tool
+        self.define_tool(
+            name=f"{tool_prefix}extract_structured_data",
+            description="Extract specific data from a web page using selectors",
+            parameters={
+                "url": {
+                    "type": "string",
+                    "description": "The URL to scrape"
+                }
+            },
+            required=["url"],
+            handler=self._extract_structured_handler
+        )
+
+    def _fetch_url(self, url: str) -> Optional[requests.Response]:
+        """Fetch a URL with caching and error handling."""
+        # Check cache first
+        if self.cache_enabled and url in self.cache:
+            self.logger.debug(f"Cache hit for {url}")
+            return self.cache[url]
+
+        try:
+            response = self.session.get(url, timeout=self.timeout)
+            response.raise_for_status()
+
+            # Cache successful responses
+            if self.cache_enabled:
+                self.cache[url] = response
+
+            return response
+
+        except requests.exceptions.Timeout:
+            self.logger.error(f"Timeout fetching {url}")
+            return None
+        except requests.exceptions.RequestException as e:
+            self.logger.error(f"Error fetching {url}: {e}")
+            return None
+
+    def _fast_text_extract(self, response: requests.Response) -> str:
+        """Ultra-fast text extraction using lxml."""
+        try:
+            # Parse HTML with lxml
+            tree = html.fromstring(response.content)
+
+            # Remove unwanted elements in one pass
+            for xpath in self.remove_xpaths:
+                for elem in tree.xpath(xpath):
+                    elem.drop_tree()
+
+            # Extract text
+            text = tree.text_content()
+
+            # Clean whitespace if requested
+            if self.clean_text:
+                text = self.WHITESPACE_REGEX.sub(' ', text).strip()
+
+            # Smart truncation
+            if len(text) > self.max_text_length:
+                keep_start = self.max_text_length * 2 // 3
+                keep_end = self.max_text_length // 3
+                text = (
+                    text[:keep_start] +
+                    "\n\n[...CONTENT TRUNCATED...]\n\n" +
+                    text[-keep_end:]
+                )
+
+            return text
+
+        except Exception as e:
+            self.logger.error(f"Error extracting text: {e}")
+            return ""
+
+    def _markdown_extract(self, response: requests.Response) -> str:
+        """Extract content in markdown format."""
+        try:
+            from bs4 import BeautifulSoup
+
+            soup = BeautifulSoup(response.content, 'html.parser')
+
+            # Remove unwanted tags
+            for tag in ['script', 'style', 'nav', 'header', 'footer', 'aside']:
+                for elem in soup.find_all(tag):
+                    elem.decompose()
+
+            # Convert to markdown-like format
+            text_parts = []
+
+            # Title
+            title = soup.find('title')
+            if title:
+                text_parts.append(f"# {title.get_text().strip()}\n")
+
+            # Main content
+            for elem in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li', 'code', 'pre']):
+                if elem.name.startswith('h'):
+                    level = int(elem.name[1])
+                    text_parts.append(f"\n{'#' * level} {elem.get_text().strip()}\n")
+                elif elem.name == 'p':
+                    text_parts.append(f"\n{elem.get_text().strip()}\n")
+                elif elem.name == 'li':
+                    text_parts.append(f"- {elem.get_text().strip()}")
+                elif elem.name in ['code', 'pre']:
+                    text_parts.append(f"\n```\n{elem.get_text().strip()}\n```\n")
+
+            text = '\n'.join(text_parts)
+
+            # Truncate if needed
+            if len(text) > self.max_text_length:
+                text = text[:self.max_text_length] + "\n\n[...TRUNCATED...]"
+
+            return text
+
+        except ImportError:
+            self.logger.warning("BeautifulSoup not available, falling back to fast_text")
+            return self._fast_text_extract(response)
+        except Exception as e:
+            self.logger.error(f"Error in markdown extraction: {e}")
+            return self._fast_text_extract(response)
+
+    def _structured_extract(self, response: requests.Response, selectors: Dict[str, str] = None) -> Dict[str, Any]:
+        """Extract structured data using selectors."""
+        try:
+            tree = html.fromstring(response.content)
+            result = {
+                "url": response.url,
+                "status_code": response.status_code,
+                "title": "",
+                "data": {}
+            }
+
+            # Get title
+            title_elem = tree.xpath('//title/text()')
+            if title_elem:
+                result["title"] = title_elem[0].strip()
+
+            # Extract using provided selectors
+            if selectors:
+                for field, selector in selectors.items():
+                    try:
+                        if selector.startswith('/'):  # XPath
+                            values = tree.xpath(selector)
+                        else:  # CSS selector
+                            from lxml.cssselect import CSSSelector
+                            sel = CSSSelector(selector)
+                            values = sel(tree)
+
+                        # Extract text from elements
+                        if values:
+                            if len(values) == 1:
+                                result["data"][field] = values[0].text_content().strip()
+                            else:
+                                result["data"][field] = [v.text_content().strip() for v in values]
+                    except (XPathEvalError, Exception) as e:
+                        self.logger.warning(f"Error with selector {selector}: {e}")
+                        result["data"][field] = None
+
+            return result
+
+        except Exception as e:
+            self.logger.error(f"Error in structured extraction: {e}")
+            return {"error": str(e)}
+
+    def _scrape_url_handler(self, args: Dict[str, Any], raw_data: Dict[str, Any]) -> SwaigFunctionResult:
+        """Handle single page scraping."""
+        url = args.get('url', '').strip()
+        if not url:
+            return SwaigFunctionResult("Please provide a URL to scrape")
+
+        # Validate URL
+        parsed = urlparse(url)
+        if not parsed.scheme or not parsed.netloc:
+            return SwaigFunctionResult(f"Invalid URL: {url}")
+
+        # Fetch the page
+        response = self._fetch_url(url)
+        if not response:
+            return SwaigFunctionResult(f"Failed to fetch {url}")
+
+        # Extract content based on configured type (not from args)
+        extract_type = self.extract_type
+
+        try:
+            if extract_type == 'structured':
+                # For structured extraction, use predefined selectors from config if available
+                selectors = self.params.get('selectors', {})
+                result = self._structured_extract(response, selectors)
+                return SwaigFunctionResult(f"Extracted structured data from {url}: {result}")
+            elif extract_type == 'markdown':
+                content = self._markdown_extract(response)
+            else:  # fast_text (default)
+                content = self._fast_text_extract(response)
+
+            if not content:
+                return SwaigFunctionResult(f"No content extracted from {url}")
+
+            # Format response
+            char_count = len(content)
+            header = f"Content from {url} ({char_count} characters):\n\n"
+
+            return SwaigFunctionResult(header + content)
+
+        except Exception as e:
+            self.logger.error(f"Error processing {url}: {e}")
+            return SwaigFunctionResult(f"Error processing {url}: {str(e)}")
+
+    def _crawl_site_handler(self, args: Dict[str, Any], raw_data: Dict[str, Any]) -> SwaigFunctionResult:
+        """Handle multi-page crawling."""
+        start_url = args.get('start_url', '').strip()
+        if not start_url:
+            return SwaigFunctionResult("Please provide a starting URL for the crawl")
+
+        # Use configured parameters (not from args)
+        max_depth = self.max_depth
+        max_pages = self.max_pages
+        follow_patterns = self.params.get('follow_patterns', [])
+
+        # Validate parameters
+        if max_depth < 0:
+            return SwaigFunctionResult("Max depth cannot be negative")
+        if max_pages < 1:
+            return SwaigFunctionResult("Max pages must be at least 1")
+
+        # Simple breadth-first crawl
+        visited = set()
+        to_visit = [(start_url, 0)]  # (url, depth)
+        results = []
+
+        while to_visit and len(visited) < max_pages:
+            if not to_visit:
+                break
+
+            url, depth = to_visit.pop(0)
+
+            # Skip if already visited or depth exceeded
+            if url in visited or depth > max_depth:
+                continue
+
+            # Fetch and process page
+            response = self._fetch_url(url)
+            if not response:
+                continue
+
+            visited.add(url)
+
+            # Extract content
+            content = self._fast_text_extract(response)
+            if content:
+                results.append({
+                    'url': url,
+                    'depth': depth,
+                    'content_length': len(content),
+                    'summary': content[:500] + '...' if len(content) > 500 else content
+                })
+
+            # Extract links if not at max depth
+            if depth < max_depth:
+                try:
+                    tree = html.fromstring(response.content)
+                    links = tree.xpath('//a[@href]/@href')
+
+                    for link in links:
+                        absolute_url = urljoin(url, link)
+
+                        # Check if we should follow this link
+                        if follow_patterns:
+                            if not any(re.search(pattern, absolute_url) for pattern in follow_patterns):
+                                continue
+
+                        # Only follow same domain by default
+                        if urlparse(absolute_url).netloc == urlparse(start_url).netloc:
+                            if absolute_url not in visited:
+                                to_visit.append((absolute_url, depth + 1))
+
+                except Exception as e:
+                    self.logger.warning(f"Error extracting links from {url}: {e}")
+
+            # Respect delay between requests
+            if self.delay > 0 and len(visited) < max_pages:
+                import time
+                time.sleep(self.delay)
+
+        # Format results
+        if not results:
+            return SwaigFunctionResult(f"No pages could be crawled from {start_url}")
+
+        summary = f"Crawled {len(results)} pages from {urlparse(start_url).netloc}:\n\n"
+
+        for i, result in enumerate(results, 1):
+            summary += f"{i}. {result['url']} (depth: {result['depth']}, {result['content_length']} chars)\n"
+            summary += f" Summary: {result['summary'][:100]}...\n\n"
+
+        total_chars = sum(r['content_length'] for r in results)
+        summary += f"\nTotal content: {total_chars:,} characters across {len(results)} pages"
+
+        return SwaigFunctionResult(summary)
+
+    def _extract_structured_handler(self, args: Dict[str, Any], raw_data: Dict[str, Any]) -> SwaigFunctionResult:
+        """Handle structured data extraction."""
+        url = args.get('url', '').strip()
+
+        if not url:
+            return SwaigFunctionResult("Please provide a URL")
+
+        # Use configured selectors from params
+        selectors = self.params.get('selectors', {})
+        if not selectors:
+            return SwaigFunctionResult("No selectors configured for structured data extraction")
+
+        # Fetch the page
+        response = self._fetch_url(url)
+        if not response:
+            return SwaigFunctionResult(f"Failed to fetch {url}")
+
+        # Extract structured data
+        result = self._structured_extract(response, selectors)
+
+        if 'error' in result:
+            return SwaigFunctionResult(f"Error extracting data: {result['error']}")
+
+        # Format the response
+        output = f"Extracted data from {url}:\n\n"
+        output += f"Title: {result.get('title', 'N/A')}\n\n"
+
+        if result.get('data'):
+            output += "Data:\n"
+            for field, value in result['data'].items():
+                output += f"- {field}: {value}\n"
+        else:
+            output += "No data extracted with provided selectors"
+
+        return SwaigFunctionResult(output)
+
+    def get_hints(self) -> List[str]:
+        """Return speech recognition hints for this skill."""
+        return [
+            "scrape", "crawl", "extract", "web page", "website",
+            "get content from", "fetch data from", "spider"
+        ]
+
+    def cleanup(self) -> None:
+        """Clean up resources when skill is unloaded."""
+        if hasattr(self, 'session'):
+            self.session.close()
+        if hasattr(self, 'cache'):
+            self.cache.clear()
+        self.logger.info("Spider skill cleaned up")