signalwire-agents 0.1.37__py3-none-any.whl → 0.1.38__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- signalwire_agents/__init__.py +1 -1
- signalwire_agents/cli/build_search.py +95 -19
- signalwire_agents/core/agent_base.py +38 -0
- signalwire_agents/core/mixins/ai_config_mixin.py +120 -0
- signalwire_agents/core/skill_manager.py +47 -0
- signalwire_agents/search/index_builder.py +105 -10
- signalwire_agents/search/pgvector_backend.py +523 -0
- signalwire_agents/search/search_engine.py +41 -4
- signalwire_agents/search/search_service.py +86 -35
- signalwire_agents/skills/api_ninjas_trivia/skill.py +37 -1
- signalwire_agents/skills/datasphere/skill.py +82 -0
- signalwire_agents/skills/datasphere_serverless/skill.py +82 -0
- signalwire_agents/skills/joke/skill.py +21 -0
- signalwire_agents/skills/mcp_gateway/skill.py +82 -0
- signalwire_agents/skills/native_vector_search/README.md +210 -0
- signalwire_agents/skills/native_vector_search/skill.py +197 -7
- signalwire_agents/skills/play_background_file/skill.py +36 -0
- signalwire_agents/skills/registry.py +36 -0
- signalwire_agents/skills/spider/skill.py +113 -0
- signalwire_agents/skills/swml_transfer/skill.py +90 -0
- signalwire_agents/skills/weather_api/skill.py +28 -0
- signalwire_agents/skills/wikipedia_search/skill.py +22 -0
- {signalwire_agents-0.1.37.dist-info → signalwire_agents-0.1.38.dist-info}/METADATA +53 -1
- {signalwire_agents-0.1.37.dist-info → signalwire_agents-0.1.38.dist-info}/RECORD +28 -26
- {signalwire_agents-0.1.37.dist-info → signalwire_agents-0.1.38.dist-info}/WHEEL +0 -0
- {signalwire_agents-0.1.37.dist-info → signalwire_agents-0.1.38.dist-info}/entry_points.txt +0 -0
- {signalwire_agents-0.1.37.dist-info → signalwire_agents-0.1.38.dist-info}/licenses/LICENSE +0 -0
- {signalwire_agents-0.1.37.dist-info → signalwire_agents-0.1.38.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,210 @@
|
|
1
|
+
# Native Vector Search Skill
|
2
|
+
|
3
|
+
The Native Vector Search skill provides document search capabilities using vector similarity and keyword search. It supports multiple storage backends, including SQLite (local `.swsearch` files) and PostgreSQL with the pgvector extension, and can also query a remote search server.
|
4
|
+
|
5
|
+
## Features
|
6
|
+
|
7
|
+
- **Hybrid Search**: Combines vector similarity and keyword search for better results
|
8
|
+
- **Multiple Backends**: SQLite for local deployment, pgvector for scalable production use
|
9
|
+
- **Remote Search**: Connect to remote search servers
|
10
|
+
- **Auto-indexing**: Automatically build indexes from source directories
|
11
|
+
- **NLP Enhancement**: Query expansion and synonym matching
|
12
|
+
- **Tag Filtering**: Filter results by document tags
|
13
|
+
|
14
|
+
## Backends
|
15
|
+
|
16
|
+
### SQLite Backend (Default)
|
17
|
+
- Stores indexes in `.swsearch` files
|
18
|
+
- Good for single-agent deployments
|
19
|
+
- Portable and self-contained
|
20
|
+
- No external dependencies
|
21
|
+
|
22
|
+
### pgvector Backend
|
23
|
+
- Uses PostgreSQL with pgvector extension
|
24
|
+
- Scalable for multi-agent deployments
|
25
|
+
- Real-time updates capability
|
26
|
+
- Efficient similarity search with specialized indexes
|
27
|
+
|
28
|
+
### Remote Search Server
|
29
|
+
- Connect to centralized search API
|
30
|
+
- Lower memory usage per agent
|
31
|
+
- Shared knowledge base
|
32
|
+
|
33
|
+
## Configuration Parameters
|
34
|
+
|
35
|
+
### Basic Parameters
|
36
|
+
- `tool_name`: Name of the search tool (default: "search_knowledge")
|
37
|
+
- `description`: Tool description for the AI
|
38
|
+
- `count`: Number of results to return (default: 5; allowed range: 1-20)
|
39
|
+
- `distance_threshold`: Minimum similarity score a result must reach to be returned, from 0.0 to 1.0 (default: 0.0, which applies no limit)
|
40
|
+
- `tags`: Filter results by these tags
|
41
|
+
|
42
|
+
### Backend Selection
|
43
|
+
- `backend`: Storage backend for local search - "sqlite" or "pgvector" (default: "sqlite"); ignored when `remote_url` is set
|
44
|
+
|
45
|
+
### SQLite Backend
|
46
|
+
- `index_file`: Path to .swsearch index file
|
47
|
+
- `build_index`: Auto-build index from source (default: false)
|
48
|
+
- `source_dir`: Directory containing the documents to index (required when `build_index` is true)
|
49
|
+
|
50
|
+
### pgvector Backend
|
51
|
+
- `connection_string`: PostgreSQL connection string (required)
|
52
|
+
- `collection_name`: Name of the collection to search (required)
|
53
|
+
|
54
|
+
### Remote Backend
|
55
|
+
- `remote_url`: URL of remote search server
|
56
|
+
- `index_name`: Name of the index on the remote server (default: "default"; only used together with `remote_url`)
|
57
|
+
|
58
|
+
### Response Formatting
|
59
|
+
- `response_prefix`: Text to prepend to results
|
60
|
+
- `response_postfix`: Text to append to results
|
61
|
+
- `no_results_message`: Message when no results found
|
62
|
+
|
63
|
+
### NLP Configuration
|
64
|
+
- `query_nlp_backend`: NLP backend for queries ("nltk" or "spacy")
|
65
|
+
- `index_nlp_backend`: NLP backend for indexing ("nltk" or "spacy")
|
66
|
+
|
67
|
+
## Usage Examples
|
68
|
+
|
69
|
+
### SQLite Backend (Local File)
|
70
|
+
```python
|
71
|
+
agent.add_skill("native_vector_search", {
|
72
|
+
"tool_name": "search_docs",
|
73
|
+
"description": "Search technical documentation",
|
74
|
+
"index_file": "docs.swsearch",
|
75
|
+
"count": 5
|
76
|
+
})
|
77
|
+
```
|
78
|
+
|
79
|
+
### pgvector Backend (PostgreSQL)
|
80
|
+
```python
|
81
|
+
agent.add_skill("native_vector_search", {
|
82
|
+
"tool_name": "search_knowledge",
|
83
|
+
"description": "Search the knowledge base",
|
84
|
+
"backend": "pgvector",
|
85
|
+
"connection_string": "postgresql://user:pass@localhost:5432/knowledge",
|
86
|
+
"collection_name": "docs_collection",
|
87
|
+
"count": 5
|
88
|
+
})
|
89
|
+
```
|
90
|
+
|
91
|
+
### Remote Search Server
|
92
|
+
```python
|
93
|
+
agent.add_skill("native_vector_search", {
|
94
|
+
"tool_name": "search_api",
|
95
|
+
"description": "Search API documentation",
|
96
|
+
"remote_url": "http://search-server:8001",
|
97
|
+
"index_name": "api_docs"
|
98
|
+
})
|
99
|
+
```
|
100
|
+
|
101
|
+
### Auto-build Index
|
102
|
+
```python
|
103
|
+
agent.add_skill("native_vector_search", {
|
104
|
+
"tool_name": "search_local",
|
105
|
+
"build_index": True,
|
106
|
+
"source_dir": "./documentation",
|
107
|
+
"file_types": ["md", "txt"],
|
108
|
+
"index_file": "auto_docs.swsearch"
|
109
|
+
})
|
110
|
+
```
|
111
|
+
|
112
|
+
### Multiple Search Instances
|
113
|
+
```python
|
114
|
+
# Documentation search
|
115
|
+
agent.add_skill("native_vector_search", {
|
116
|
+
"tool_name": "search_docs",
|
117
|
+
"index_file": "docs.swsearch",
|
118
|
+
"description": "Search documentation"
|
119
|
+
})
|
120
|
+
|
121
|
+
# Code examples search
|
122
|
+
agent.add_skill("native_vector_search", {
|
123
|
+
"tool_name": "search_examples",
|
124
|
+
"backend": "pgvector",
|
125
|
+
"connection_string": "postgresql://localhost/knowledge",
|
126
|
+
"collection_name": "examples",
|
127
|
+
"description": "Search code examples"
|
128
|
+
})
|
129
|
+
```
|
130
|
+
|
131
|
+
## Installation
|
132
|
+
|
133
|
+
### For SQLite Backend
|
134
|
+
```bash
|
135
|
+
pip install signalwire-agents[search]
|
136
|
+
```
|
137
|
+
|
138
|
+
### For pgvector Backend
|
139
|
+
```bash
|
140
|
+
pip install signalwire-agents[search,pgvector]
|
141
|
+
```
|
142
|
+
|
143
|
+
### For All Features
|
144
|
+
```bash
|
145
|
+
pip install signalwire-agents[search-all]
|
146
|
+
```
|
147
|
+
|
148
|
+
## Building Indexes
|
149
|
+
|
150
|
+
### Using sw-search CLI
|
151
|
+
|
152
|
+
#### SQLite Backend
|
153
|
+
```bash
|
154
|
+
sw-search ./docs --output docs.swsearch
|
155
|
+
```
|
156
|
+
|
157
|
+
#### pgvector Backend
|
158
|
+
```bash
|
159
|
+
sw-search ./docs \
|
160
|
+
--backend pgvector \
|
161
|
+
--connection-string "postgresql://localhost/knowledge" \
|
162
|
+
--output docs_collection
|
163
|
+
```
|
164
|
+
|
165
|
+
## Performance Considerations
|
166
|
+
|
167
|
+
### SQLite
|
168
|
+
- Fast for small to medium datasets (<100k documents)
|
169
|
+
- Linear search for vector similarity
|
170
|
+
- Single-file deployment
|
171
|
+
|
172
|
+
### pgvector
|
173
|
+
- Efficient for large datasets
|
174
|
+
- Uses IVFFlat or HNSW indexes
|
175
|
+
- Handles concurrent access well
|
176
|
+
- Requires PostgreSQL server
|
177
|
+
|
178
|
+
### NLP Backends
|
179
|
+
- `nltk`: Fast, good for most use cases (~50-100ms)
|
180
|
+
- `spacy`: Better quality, slower (~150-300ms)
|
181
|
+
|
182
|
+
## Environment Variables
|
183
|
+
|
184
|
+
None required - all configuration comes through skill parameters.
|
185
|
+
|
186
|
+
## Troubleshooting
|
187
|
+
|
188
|
+
### "Search dependencies not available"
|
189
|
+
Install the search extras:
|
190
|
+
```bash
|
191
|
+
pip install signalwire-agents[search]
|
192
|
+
```
|
193
|
+
|
194
|
+
### "pgvector dependencies not available"
|
195
|
+
Install pgvector support:
|
196
|
+
```bash
|
197
|
+
pip install signalwire-agents[pgvector]
|
198
|
+
```
|
199
|
+
|
200
|
+
### "Failed to connect to pgvector"
|
201
|
+
1. Ensure PostgreSQL is running
|
202
|
+
2. Check the connection string
|
203
|
+
3. Verify pgvector extension is installed
|
204
|
+
4. Check that the collection exists
|
205
|
+
|
206
|
+
### Poor Search Results
|
207
|
+
1. Try different NLP backends
|
208
|
+
2. Adjust distance_threshold
|
209
|
+
3. Check document preprocessing
|
210
|
+
4. Verify index quality
|
@@ -28,6 +28,174 @@ class NativeVectorSearchSkill(SkillBase):
|
|
28
28
|
# Enable multiple instances support
|
29
29
|
SUPPORTS_MULTIPLE_INSTANCES = True
|
30
30
|
|
31
|
+
@classmethod
|
32
|
+
def get_parameter_schema(cls) -> Dict[str, Dict[str, Any]]:
|
33
|
+
"""Get parameter schema for Native Vector Search skill
|
34
|
+
|
35
|
+
This skill supports three modes of operation:
|
36
|
+
1. Network Mode: Set 'remote_url' to connect to a remote search server
|
37
|
+
2. Local pgvector: Set backend='pgvector' with connection_string and collection_name
|
38
|
+
3. Local SQLite: Set 'index_file' to use a local .swsearch file (default)
|
39
|
+
"""
|
40
|
+
schema = super().get_parameter_schema()
|
41
|
+
schema.update({
|
42
|
+
"index_file": {
|
43
|
+
"type": "string",
|
44
|
+
"description": "Path to .swsearch index file (SQLite backend only). Use this for local file-based search",
|
45
|
+
"required": False
|
46
|
+
},
|
47
|
+
"build_index": {
|
48
|
+
"type": "boolean",
|
49
|
+
"description": "Whether to build index from source files",
|
50
|
+
"default": False,
|
51
|
+
"required": False
|
52
|
+
},
|
53
|
+
"source_dir": {
|
54
|
+
"type": "string",
|
55
|
+
"description": "Directory containing documents to index (required if build_index=True)",
|
56
|
+
"required": False
|
57
|
+
},
|
58
|
+
"remote_url": {
|
59
|
+
"type": "string",
|
60
|
+
"description": "URL of remote search server for network mode (e.g., http://localhost:8001). Use this instead of index_file or pgvector for centralized search",
|
61
|
+
"required": False
|
62
|
+
},
|
63
|
+
"index_name": {
|
64
|
+
"type": "string",
|
65
|
+
"description": "Name of index on remote server (network mode only, used with remote_url)",
|
66
|
+
"default": "default",
|
67
|
+
"required": False
|
68
|
+
},
|
69
|
+
"count": {
|
70
|
+
"type": "integer",
|
71
|
+
"description": "Number of search results to return",
|
72
|
+
"default": 5,
|
73
|
+
"required": False,
|
74
|
+
"minimum": 1,
|
75
|
+
"maximum": 20
|
76
|
+
},
|
77
|
+
"distance_threshold": {
|
78
|
+
"type": "number",
|
79
|
+
"description": "Maximum distance threshold for results (0.0 = no limit)",
|
80
|
+
"default": 0.0,
|
81
|
+
"required": False,
|
82
|
+
"minimum": 0.0,
|
83
|
+
"maximum": 1.0
|
84
|
+
},
|
85
|
+
"tags": {
|
86
|
+
"type": "array",
|
87
|
+
"description": "Tags to filter search results",
|
88
|
+
"default": [],
|
89
|
+
"required": False,
|
90
|
+
"items": {
|
91
|
+
"type": "string"
|
92
|
+
}
|
93
|
+
},
|
94
|
+
"global_tags": {
|
95
|
+
"type": "array",
|
96
|
+
"description": "Tags to apply to all indexed documents",
|
97
|
+
"default": [],
|
98
|
+
"required": False,
|
99
|
+
"items": {
|
100
|
+
"type": "string"
|
101
|
+
}
|
102
|
+
},
|
103
|
+
"file_types": {
|
104
|
+
"type": "array",
|
105
|
+
"description": "File extensions to include when building index",
|
106
|
+
"default": ["md", "txt", "pdf", "docx", "html"],
|
107
|
+
"required": False,
|
108
|
+
"items": {
|
109
|
+
"type": "string"
|
110
|
+
}
|
111
|
+
},
|
112
|
+
"exclude_patterns": {
|
113
|
+
"type": "array",
|
114
|
+
"description": "Patterns to exclude when building index",
|
115
|
+
"default": ["**/node_modules/**", "**/.git/**", "**/dist/**", "**/build/**"],
|
116
|
+
"required": False,
|
117
|
+
"items": {
|
118
|
+
"type": "string"
|
119
|
+
}
|
120
|
+
},
|
121
|
+
"no_results_message": {
|
122
|
+
"type": "string",
|
123
|
+
"description": "Message when no results are found",
|
124
|
+
"default": "No information found for '{query}'",
|
125
|
+
"required": False
|
126
|
+
},
|
127
|
+
"response_prefix": {
|
128
|
+
"type": "string",
|
129
|
+
"description": "Prefix to add to search results",
|
130
|
+
"default": "",
|
131
|
+
"required": False
|
132
|
+
},
|
133
|
+
"response_postfix": {
|
134
|
+
"type": "string",
|
135
|
+
"description": "Postfix to add to search results",
|
136
|
+
"default": "",
|
137
|
+
"required": False
|
138
|
+
},
|
139
|
+
"description": {
|
140
|
+
"type": "string",
|
141
|
+
"description": "Tool description",
|
142
|
+
"default": "Search the knowledge base for information",
|
143
|
+
"required": False
|
144
|
+
},
|
145
|
+
"hints": {
|
146
|
+
"type": "array",
|
147
|
+
"description": "Speech recognition hints",
|
148
|
+
"default": [],
|
149
|
+
"required": False,
|
150
|
+
"items": {
|
151
|
+
"type": "string"
|
152
|
+
}
|
153
|
+
},
|
154
|
+
"nlp_backend": {
|
155
|
+
"type": "string",
|
156
|
+
"description": "NLP backend for query processing",
|
157
|
+
"default": "basic",
|
158
|
+
"required": False,
|
159
|
+
"enum": ["basic", "spacy", "nltk"]
|
160
|
+
},
|
161
|
+
"query_nlp_backend": {
|
162
|
+
"type": "string",
|
163
|
+
"description": "NLP backend for query expansion",
|
164
|
+
"required": False,
|
165
|
+
"enum": ["basic", "spacy", "nltk"]
|
166
|
+
},
|
167
|
+
"index_nlp_backend": {
|
168
|
+
"type": "string",
|
169
|
+
"description": "NLP backend for indexing",
|
170
|
+
"required": False,
|
171
|
+
"enum": ["basic", "spacy", "nltk"]
|
172
|
+
},
|
173
|
+
"backend": {
|
174
|
+
"type": "string",
|
175
|
+
"description": "Storage backend for local database mode: 'sqlite' for file-based or 'pgvector' for PostgreSQL. Ignored if remote_url is set",
|
176
|
+
"default": "sqlite",
|
177
|
+
"required": False,
|
178
|
+
"enum": ["sqlite", "pgvector"]
|
179
|
+
},
|
180
|
+
"connection_string": {
|
181
|
+
"type": "string",
|
182
|
+
"description": "PostgreSQL connection string (pgvector backend only, e.g., 'postgresql://user:pass@localhost:5432/dbname'). Required when backend='pgvector'",
|
183
|
+
"required": False
|
184
|
+
},
|
185
|
+
"collection_name": {
|
186
|
+
"type": "string",
|
187
|
+
"description": "Collection/table name in PostgreSQL (pgvector backend only). Required when backend='pgvector'",
|
188
|
+
"required": False
|
189
|
+
},
|
190
|
+
"verbose": {
|
191
|
+
"type": "boolean",
|
192
|
+
"description": "Enable verbose logging",
|
193
|
+
"default": False,
|
194
|
+
"required": False
|
195
|
+
}
|
196
|
+
})
|
197
|
+
return schema
|
198
|
+
|
31
199
|
def get_instance_key(self) -> str:
|
32
200
|
"""
|
33
201
|
Get the key used to track this skill instance
|
@@ -43,6 +211,9 @@ class NativeVectorSearchSkill(SkillBase):
|
|
43
211
|
|
44
212
|
# Get configuration first
|
45
213
|
self.tool_name = self.params.get('tool_name', 'search_knowledge')
|
214
|
+
self.backend = self.params.get('backend', 'sqlite')
|
215
|
+
self.connection_string = self.params.get('connection_string')
|
216
|
+
self.collection_name = self.params.get('collection_name')
|
46
217
|
self.index_file = self.params.get('index_file')
|
47
218
|
self.build_index = self.params.get('build_index', False)
|
48
219
|
self.source_dir = self.params.get('source_dir')
|
@@ -153,13 +324,32 @@ class NativeVectorSearchSkill(SkillBase):
|
|
153
324
|
|
154
325
|
# Initialize local search engine
|
155
326
|
self.search_engine = None
|
156
|
-
if self.search_available
|
157
|
-
|
158
|
-
|
159
|
-
self.
|
160
|
-
|
161
|
-
|
162
|
-
|
327
|
+
if self.search_available:
|
328
|
+
if self.backend == 'pgvector':
|
329
|
+
# Initialize pgvector backend
|
330
|
+
if self.connection_string and self.collection_name:
|
331
|
+
try:
|
332
|
+
from signalwire_agents.search import SearchEngine
|
333
|
+
self.search_engine = SearchEngine(
|
334
|
+
backend='pgvector',
|
335
|
+
connection_string=self.connection_string,
|
336
|
+
collection_name=self.collection_name
|
337
|
+
)
|
338
|
+
self.logger.info(f"Connected to pgvector collection: {self.collection_name}")
|
339
|
+
except Exception as e:
|
340
|
+
self.logger.error(f"Failed to connect to pgvector: {e}")
|
341
|
+
self.search_available = False
|
342
|
+
else:
|
343
|
+
self.logger.error("pgvector backend requires connection_string and collection_name")
|
344
|
+
self.search_available = False
|
345
|
+
elif self.index_file and os.path.exists(self.index_file):
|
346
|
+
# Initialize SQLite backend
|
347
|
+
try:
|
348
|
+
from signalwire_agents.search import SearchEngine
|
349
|
+
self.search_engine = SearchEngine(backend='sqlite', index_path=self.index_file)
|
350
|
+
except Exception as e:
|
351
|
+
self.logger.error(f"Failed to load search index {self.index_file}: {e}")
|
352
|
+
self.search_available = False
|
163
353
|
|
164
354
|
return True
|
165
355
|
|
@@ -48,6 +48,42 @@ class PlayBackgroundFileSkill(SkillBase):
|
|
48
48
|
SKILL_DESCRIPTION = "Control background file playback"
|
49
49
|
SUPPORTS_MULTIPLE_INSTANCES = True
|
50
50
|
|
51
|
+
@classmethod
|
52
|
+
def get_parameter_schema(cls) -> Dict[str, Dict[str, Any]]:
|
53
|
+
"""Get parameter schema for Play Background File skill"""
|
54
|
+
schema = super().get_parameter_schema()
|
55
|
+
schema.update({
|
56
|
+
"files": {
|
57
|
+
"type": "array",
|
58
|
+
"description": "Array of file configurations to make available for playback",
|
59
|
+
"required": True,
|
60
|
+
"items": {
|
61
|
+
"type": "object",
|
62
|
+
"properties": {
|
63
|
+
"key": {
|
64
|
+
"type": "string",
|
65
|
+
"description": "Unique identifier for the file"
|
66
|
+
},
|
67
|
+
"description": {
|
68
|
+
"type": "string",
|
69
|
+
"description": "Human-readable description of the file"
|
70
|
+
},
|
71
|
+
"url": {
|
72
|
+
"type": "string",
|
73
|
+
"description": "URL of the audio/video file to play"
|
74
|
+
},
|
75
|
+
"wait": {
|
76
|
+
"type": "boolean",
|
77
|
+
"description": "Whether to wait for file to finish playing",
|
78
|
+
"default": False
|
79
|
+
}
|
80
|
+
},
|
81
|
+
"required": ["key", "description", "url"]
|
82
|
+
}
|
83
|
+
}
|
84
|
+
})
|
85
|
+
return schema
|
86
|
+
|
51
87
|
def __init__(self, agent, params: Dict[str, Any] = None):
|
52
88
|
"""
|
53
89
|
Initialize the skill with configuration parameters.
|
@@ -128,6 +128,42 @@ class SkillRegistry:
|
|
128
128
|
if not hasattr(skill_class, 'SKILL_NAME') or skill_class.SKILL_NAME is None:
|
129
129
|
raise ValueError(f"{skill_class} must define SKILL_NAME")
|
130
130
|
|
131
|
+
# Validate that the skill has a proper parameter schema
|
132
|
+
if not hasattr(skill_class, 'get_parameter_schema') or not callable(getattr(skill_class, 'get_parameter_schema')):
|
133
|
+
raise ValueError(f"{skill_class.__name__} must have get_parameter_schema() classmethod")
|
134
|
+
|
135
|
+
# Try to call get_parameter_schema to ensure it's properly implemented
|
136
|
+
try:
|
137
|
+
schema = skill_class.get_parameter_schema()
|
138
|
+
if not isinstance(schema, dict):
|
139
|
+
raise ValueError(f"{skill_class.__name__}.get_parameter_schema() must return a dictionary, got {type(schema)}")
|
140
|
+
|
141
|
+
# Ensure it's not an empty schema (skills should at least have the base parameters)
|
142
|
+
if not schema:
|
143
|
+
raise ValueError(f"{skill_class.__name__}.get_parameter_schema() returned an empty dictionary. Skills should at least call super().get_parameter_schema()")
|
144
|
+
|
145
|
+
# Check if the skill has overridden the method (not just inherited base)
|
146
|
+
skill_method = getattr(skill_class, 'get_parameter_schema', None)
|
147
|
+
base_method = getattr(SkillBase, 'get_parameter_schema', None)
|
148
|
+
|
149
|
+
if skill_method and base_method:
|
150
|
+
# For class methods, check the underlying function
|
151
|
+
skill_func = skill_method.__func__ if hasattr(skill_method, '__func__') else skill_method
|
152
|
+
base_func = base_method.__func__ if hasattr(base_method, '__func__') else base_method
|
153
|
+
|
154
|
+
if skill_func is base_func:
|
155
|
+
# Get base schema to check if skill added any parameters
|
156
|
+
base_schema = SkillBase.get_parameter_schema()
|
157
|
+
if set(schema.keys()) == set(base_schema.keys()):
|
158
|
+
raise ValueError(f"{skill_class.__name__} must override get_parameter_schema() to define its specific parameters")
|
159
|
+
|
160
|
+
except AttributeError as e:
|
161
|
+
raise ValueError(f"{skill_class.__name__} must properly implement get_parameter_schema() classmethod")
|
162
|
+
except ValueError:
|
163
|
+
raise # Re-raise our validation errors
|
164
|
+
except Exception as e:
|
165
|
+
raise ValueError(f"{skill_class.__name__}.get_parameter_schema() failed: {e}")
|
166
|
+
|
131
167
|
if skill_class.SKILL_NAME in self._skills:
|
132
168
|
self.logger.warning(f"Skill '{skill_class.SKILL_NAME}' already registered")
|
133
169
|
return
|
@@ -33,6 +33,119 @@ class SpiderSkill(SkillBase):
|
|
33
33
|
# Compiled regex for performance
|
34
34
|
WHITESPACE_REGEX = re.compile(r'\s+')
|
35
35
|
|
36
|
+
@classmethod
|
37
|
+
def get_parameter_schema(cls) -> Dict[str, Dict[str, Any]]:
|
38
|
+
"""Get parameter schema for Spider skill"""
|
39
|
+
schema = super().get_parameter_schema()
|
40
|
+
schema.update({
|
41
|
+
"delay": {
|
42
|
+
"type": "number",
|
43
|
+
"description": "Delay between requests in seconds",
|
44
|
+
"default": 0.1,
|
45
|
+
"required": False,
|
46
|
+
"minimum": 0.0
|
47
|
+
},
|
48
|
+
"concurrent_requests": {
|
49
|
+
"type": "integer",
|
50
|
+
"description": "Number of concurrent requests allowed",
|
51
|
+
"default": 5,
|
52
|
+
"required": False,
|
53
|
+
"minimum": 1,
|
54
|
+
"maximum": 20
|
55
|
+
},
|
56
|
+
"timeout": {
|
57
|
+
"type": "integer",
|
58
|
+
"description": "Request timeout in seconds",
|
59
|
+
"default": 5,
|
60
|
+
"required": False,
|
61
|
+
"minimum": 1,
|
62
|
+
"maximum": 60
|
63
|
+
},
|
64
|
+
"max_pages": {
|
65
|
+
"type": "integer",
|
66
|
+
"description": "Maximum number of pages to scrape",
|
67
|
+
"default": 1,
|
68
|
+
"required": False,
|
69
|
+
"minimum": 1,
|
70
|
+
"maximum": 100
|
71
|
+
},
|
72
|
+
"max_depth": {
|
73
|
+
"type": "integer",
|
74
|
+
"description": "Maximum crawl depth (0 = single page only)",
|
75
|
+
"default": 0,
|
76
|
+
"required": False,
|
77
|
+
"minimum": 0,
|
78
|
+
"maximum": 5
|
79
|
+
},
|
80
|
+
"extract_type": {
|
81
|
+
"type": "string",
|
82
|
+
"description": "Content extraction method",
|
83
|
+
"default": "fast_text",
|
84
|
+
"required": False,
|
85
|
+
"enum": ["fast_text", "clean_text", "full_text", "html", "custom"]
|
86
|
+
},
|
87
|
+
"max_text_length": {
|
88
|
+
"type": "integer",
|
89
|
+
"description": "Maximum text length to return",
|
90
|
+
"default": 10000,
|
91
|
+
"required": False,
|
92
|
+
"minimum": 100,
|
93
|
+
"maximum": 100000
|
94
|
+
},
|
95
|
+
"clean_text": {
|
96
|
+
"type": "boolean",
|
97
|
+
"description": "Whether to clean extracted text",
|
98
|
+
"default": True,
|
99
|
+
"required": False
|
100
|
+
},
|
101
|
+
"selectors": {
|
102
|
+
"type": "object",
|
103
|
+
"description": "Custom CSS/XPath selectors for extraction",
|
104
|
+
"default": {},
|
105
|
+
"required": False,
|
106
|
+
"additionalProperties": {
|
107
|
+
"type": "string"
|
108
|
+
}
|
109
|
+
},
|
110
|
+
"follow_patterns": {
|
111
|
+
"type": "array",
|
112
|
+
"description": "URL patterns to follow when crawling",
|
113
|
+
"default": [],
|
114
|
+
"required": False,
|
115
|
+
"items": {
|
116
|
+
"type": "string"
|
117
|
+
}
|
118
|
+
},
|
119
|
+
"user_agent": {
|
120
|
+
"type": "string",
|
121
|
+
"description": "User agent string for requests",
|
122
|
+
"default": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
|
123
|
+
"required": False
|
124
|
+
},
|
125
|
+
"headers": {
|
126
|
+
"type": "object",
|
127
|
+
"description": "Additional HTTP headers",
|
128
|
+
"default": {},
|
129
|
+
"required": False,
|
130
|
+
"additionalProperties": {
|
131
|
+
"type": "string"
|
132
|
+
}
|
133
|
+
},
|
134
|
+
"follow_robots_txt": {
|
135
|
+
"type": "boolean",
|
136
|
+
"description": "Whether to respect robots.txt",
|
137
|
+
"default": True,
|
138
|
+
"required": False
|
139
|
+
},
|
140
|
+
"cache_enabled": {
|
141
|
+
"type": "boolean",
|
142
|
+
"description": "Whether to cache scraped pages",
|
143
|
+
"default": True,
|
144
|
+
"required": False
|
145
|
+
}
|
146
|
+
})
|
147
|
+
return schema
|
148
|
+
|
36
149
|
def __init__(self, agent, params: Dict[str, Any]):
|
37
150
|
"""Initialize the spider skill with configuration parameters."""
|
38
151
|
super().__init__(agent, params)
|