signalwire-agents 0.1.46__py3-none-any.whl → 0.1.48__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- signalwire_agents/__init__.py +1 -1
- signalwire_agents/cli/build_search.py +522 -13
- signalwire_agents/core/agent_base.py +29 -37
- signalwire_agents/core/mixins/ai_config_mixin.py +32 -87
- signalwire_agents/core/swaig_function.py +2 -2
- signalwire_agents/search/__init__.py +7 -1
- signalwire_agents/search/document_processor.py +105 -1
- signalwire_agents/search/index_builder.py +113 -14
- signalwire_agents/search/migration.py +418 -0
- signalwire_agents/search/models.py +30 -0
- signalwire_agents/search/pgvector_backend.py +236 -13
- signalwire_agents/search/query_processor.py +87 -9
- signalwire_agents/search/search_engine.py +835 -31
- signalwire_agents/search/search_service.py +56 -6
- signalwire_agents/skills/native_vector_search/skill.py +208 -33
- signalwire_agents/skills/weather_api/skill.py +2 -2
- {signalwire_agents-0.1.46.dist-info → signalwire_agents-0.1.48.dist-info}/METADATA +12 -7
- {signalwire_agents-0.1.46.dist-info → signalwire_agents-0.1.48.dist-info}/RECORD +22 -20
- {signalwire_agents-0.1.46.dist-info → signalwire_agents-0.1.48.dist-info}/WHEEL +0 -0
- {signalwire_agents-0.1.46.dist-info → signalwire_agents-0.1.48.dist-info}/entry_points.txt +0 -0
- {signalwire_agents-0.1.46.dist-info → signalwire_agents-0.1.48.dist-info}/licenses/LICENSE +0 -0
- {signalwire_agents-0.1.46.dist-info → signalwire_agents-0.1.48.dist-info}/top_level.txt +0 -0
signalwire_agents/search/migration.py (new file)
@@ -0,0 +1,418 @@

"""
Copyright (c) 2025 SignalWire

This file is part of the SignalWire AI Agents SDK.

Licensed under the MIT License.
See LICENSE file in the project root for full license information.
"""

import sqlite3
import json
import logging
from typing import Dict, Any, Optional, List
from pathlib import Path
from datetime import datetime

try:
    import numpy as np
except ImportError:
    np = None

logger = logging.getLogger(__name__)


class SearchIndexMigrator:
    """Migrate search indexes between different backends"""

    def __init__(self, verbose: bool = False):
        """
        Initialize the migrator

        Args:
            verbose: Enable verbose output
        """
        self.verbose = verbose

    def migrate_sqlite_to_pgvector(
        self,
        sqlite_path: str,
        connection_string: str,
        collection_name: str,
        overwrite: bool = False,
        batch_size: int = 100
    ) -> Dict[str, Any]:
        """
        Migrate a .swsearch SQLite index to pgvector

        Args:
            sqlite_path: Path to .swsearch file
            connection_string: PostgreSQL connection string
            collection_name: Name for the pgvector collection
            overwrite: Whether to overwrite existing collection
            batch_size: Number of chunks to insert at once

        Returns:
            Migration statistics
        """
        if not Path(sqlite_path).exists():
            raise FileNotFoundError(f"SQLite index not found: {sqlite_path}")

        # Import pgvector backend
        from .pgvector_backend import PgVectorBackend

        stats = {
            'source': sqlite_path,
            'target': collection_name,
            'chunks_migrated': 0,
            'errors': 0,
            'config': {}
        }

        try:
            # Connect to SQLite
            if self.verbose:
                print(f"Opening SQLite index: {sqlite_path}")

            sqlite_conn = sqlite3.connect(sqlite_path)
            cursor = sqlite_conn.cursor()

            # Load configuration
            cursor.execute("SELECT key, value FROM config")
            config_rows = cursor.fetchall()
            config = dict(config_rows)
            stats['config'] = config

            # Get important config values
            model_name = config.get('embedding_model', 'sentence-transformers/all-mpnet-base-v2')
            embedding_dim = int(config.get('embedding_dimensions', 768))

            if self.verbose:
                print(f"Source configuration:")
                print(f"  Model: {model_name}")
                print(f"  Dimensions: {embedding_dim}")
                print(f"  Created: {config.get('created_at', 'Unknown')}")

            # Initialize pgvector backend
            pgvector = PgVectorBackend(connection_string)

            try:
                # Handle existing collection
                if overwrite:
                    if self.verbose:
                        print(f"Dropping existing collection: {collection_name}")
                    pgvector.delete_collection(collection_name)

                # Create schema
                if self.verbose:
                    print(f"Creating pgvector collection: {collection_name}")

                pgvector.create_schema(collection_name, embedding_dim)

                # Prepare collection config
                collection_config = {
                    'model_name': model_name,
                    'embedding_dimensions': embedding_dim,
                    'chunking_strategy': config.get('chunking_strategy', 'sentence'),
                    'languages': json.loads(config.get('languages', '["en"]')),
                    'metadata': {
                        'migrated_from': sqlite_path,
                        'original_created': config.get('created_at'),
                        'source_dir': config.get('source_dir'),
                        'file_types': json.loads(config.get('file_types', '[]'))
                    }
                }

                # Count total chunks
                cursor.execute("SELECT COUNT(*) FROM chunks")
                total_chunks = cursor.fetchone()[0]

                if self.verbose:
                    print(f"Migrating {total_chunks} chunks...")

                # Check if metadata_text column exists (do this once)
                cursor.execute("PRAGMA table_info(chunks)")
                columns = [col[1] for col in cursor.fetchall()]
                has_metadata_text = 'metadata_text' in columns

                # Migrate chunks in batches
                offset = 0
                while offset < total_chunks:
                    # Fetch batch of chunks

                    if has_metadata_text:
                        cursor.execute("""
                            SELECT id, content, processed_content, keywords, language,
                                   embedding, filename, section, start_line, end_line,
                                   tags, metadata, metadata_text, chunk_hash
                            FROM chunks
                            ORDER BY id
                            LIMIT ? OFFSET ?
                        """, (batch_size, offset))
                    else:
                        cursor.execute("""
                            SELECT id, content, processed_content, keywords, language,
                                   embedding, filename, section, start_line, end_line,
                                   tags, metadata, chunk_hash
                            FROM chunks
                            ORDER BY id
                            LIMIT ? OFFSET ?
                        """, (batch_size, offset))

                    chunks_batch = []
                    for row in cursor.fetchall():
                        # Handle both old and new schema (with or without metadata_text)
                        if len(row) == 14:  # New schema with metadata_text
                            (chunk_id, content, processed_content, keywords_json, language,
                             embedding_blob, filename, section, start_line, end_line,
                             tags_json, metadata_json, metadata_text, chunk_hash) = row
                        else:  # Old schema without metadata_text
                            (chunk_id, content, processed_content, keywords_json, language,
                             embedding_blob, filename, section, start_line, end_line,
                             tags_json, metadata_json, chunk_hash) = row
                            metadata_text = None

                        # Convert embedding blob to numpy array if available
                        if embedding_blob and np:
                            embedding = np.frombuffer(embedding_blob, dtype=np.float32)
                        else:
                            embedding = embedding_blob

                        # Parse JSON fields
                        keywords = json.loads(keywords_json) if keywords_json else []
                        tags = json.loads(tags_json) if tags_json else []
                        metadata = json.loads(metadata_json) if metadata_json else {}

                        chunk = {
                            'content': content,
                            'processed_content': processed_content,
                            'keywords': keywords,
                            'language': language,
                            'embedding': embedding,
                            'filename': filename,
                            'section': section,
                            'start_line': start_line,
                            'end_line': end_line,
                            'tags': tags,
                            'metadata': metadata,
                            'metadata_text': metadata_text,  # Will be regenerated if None
                            'chunk_hash': chunk_hash
                        }

                        chunks_batch.append(chunk)

                    # Store batch in pgvector
                    if chunks_batch:
                        try:
                            pgvector.store_chunks(chunks_batch, collection_name, collection_config)
                            stats['chunks_migrated'] += len(chunks_batch)

                            if self.verbose:
                                progress = (offset + len(chunks_batch)) / total_chunks * 100
                                print(f"  Progress: {stats['chunks_migrated']}/{total_chunks} ({progress:.1f}%)")
                        except Exception as e:
                            logger.error(f"Error storing batch at offset {offset}: {e}")
                            stats['errors'] += len(chunks_batch)

                    offset += batch_size

                # Success
                if self.verbose:
                    print(f"\nMigration completed successfully!")
                    print(f"  Chunks migrated: {stats['chunks_migrated']}")
                    print(f"  Errors: {stats['errors']}")

            finally:
                pgvector.close()

        except Exception as e:
            logger.error(f"Migration failed: {e}")
            raise
        finally:
            sqlite_conn.close()

        return stats

    def migrate_pgvector_to_sqlite(
        self,
        connection_string: str,
        collection_name: str,
        output_path: str,
        batch_size: int = 100
    ) -> Dict[str, Any]:
        """
        Migrate a pgvector collection to SQLite .swsearch format

        Args:
            connection_string: PostgreSQL connection string
            collection_name: Name of the pgvector collection
            output_path: Output .swsearch file path
            batch_size: Number of chunks to fetch at once

        Returns:
            Migration statistics
        """
        from .pgvector_backend import PgVectorBackend
        from .index_builder import IndexBuilder

        # Ensure output has .swsearch extension
        if not output_path.endswith('.swsearch'):
            output_path += '.swsearch'

        stats = {
            'source': f"{collection_name} (pgvector)",
            'target': output_path,
            'chunks_migrated': 0,
            'errors': 0,
            'config': {}
        }

        # Connect to pgvector
        if self.verbose:
            print(f"Connecting to pgvector collection: {collection_name}")

        pgvector = PgVectorBackend(connection_string)

        try:
            # Get collection stats and config
            pg_stats = pgvector.get_stats(collection_name)
            config = pg_stats.get('config', {})
            stats['config'] = config

            total_chunks = pg_stats.get('total_chunks', 0)

            if self.verbose:
                print(f"Source configuration:")
                print(f"  Model: {config.get('model_name', 'Unknown')}")
                print(f"  Dimensions: {config.get('embedding_dimensions', 'Unknown')}")
                print(f"  Total chunks: {total_chunks}")

            # Create SQLite database structure
            # We'll manually create it to match the expected format
            if Path(output_path).exists():
                Path(output_path).unlink()

            conn = sqlite3.connect(output_path)
            cursor = conn.cursor()

            # Create schema (matching index_builder.py)
            cursor.execute('''
                CREATE TABLE chunks (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    content TEXT NOT NULL,
                    processed_content TEXT NOT NULL,
                    keywords TEXT,
                    language TEXT DEFAULT 'en',
                    embedding BLOB NOT NULL,
                    filename TEXT NOT NULL,
                    section TEXT,
                    start_line INTEGER,
                    end_line INTEGER,
                    tags TEXT,
                    metadata TEXT,
                    chunk_hash TEXT UNIQUE,
                    created_at TEXT DEFAULT CURRENT_TIMESTAMP
                )
            ''')

            cursor.execute('''
                CREATE VIRTUAL TABLE chunks_fts USING fts5(
                    processed_content,
                    keywords,
                    content='chunks',
                    content_rowid='id'
                )
            ''')

            cursor.execute('''
                CREATE TABLE synonyms (
                    word TEXT,
                    pos_tag TEXT,
                    synonyms TEXT,
                    language TEXT DEFAULT 'en',
                    PRIMARY KEY (word, pos_tag, language)
                )
            ''')

            cursor.execute('''
                CREATE TABLE config (
                    key TEXT PRIMARY KEY,
                    value TEXT
                )
            ''')

            # Create indexes
            cursor.execute('CREATE INDEX idx_chunks_filename ON chunks(filename)')
            cursor.execute('CREATE INDEX idx_chunks_language ON chunks(language)')
            cursor.execute('CREATE INDEX idx_chunks_tags ON chunks(tags)')

            # Insert config
            config_data = {
                'embedding_model': config.get('model_name', 'sentence-transformers/all-mpnet-base-v2'),
                'embedding_dimensions': str(config.get('embedding_dimensions', 768)),
                'chunk_size': str(config.get('metadata', {}).get('chunk_size', 50)),
                'chunk_overlap': str(config.get('metadata', {}).get('chunk_overlap', 10)),
                'preprocessing_version': '1.0',
                'languages': json.dumps(config.get('languages', ['en'])),
                'created_at': datetime.now().isoformat(),
                'source_dir': config.get('metadata', {}).get('source_dir', 'pgvector_migration'),
                'file_types': json.dumps(config.get('metadata', {}).get('file_types', []))
            }

            for key, value in config_data.items():
                cursor.execute('INSERT INTO config (key, value) VALUES (?, ?)', (key, value))

            # TODO: Implement chunk fetching from pgvector
            # This would require adding a method to PgVectorBackend to fetch chunks
            # For now, we'll note this as a limitation

            if self.verbose:
                print("\nNote: pgvector to SQLite migration requires implementing chunk fetching in PgVectorBackend")
                print("This feature is planned for future development.")

            conn.commit()
            conn.close()

        finally:
            pgvector.close()

        return stats

    def get_index_info(self, index_path: str) -> Dict[str, Any]:
        """
        Get information about a search index

        Args:
            index_path: Path to index file or pgvector collection identifier

        Returns:
            Index information including type, config, and statistics
        """
        info = {}

        if index_path.endswith('.swsearch') and Path(index_path).exists():
            # SQLite index
            info['type'] = 'sqlite'
            info['path'] = index_path

            conn = sqlite3.connect(index_path)
            cursor = conn.cursor()

            # Get config
            cursor.execute("SELECT key, value FROM config")
            info['config'] = dict(cursor.fetchall())

            # Get stats
            cursor.execute("SELECT COUNT(*) FROM chunks")
            info['total_chunks'] = cursor.fetchone()[0]

            cursor.execute("SELECT COUNT(DISTINCT filename) FROM chunks")
            info['total_files'] = cursor.fetchone()[0]

            conn.close()

        else:
            info['type'] = 'unknown'
            info['path'] = index_path

        return info
signalwire_agents/search/models.py (new file)
@@ -0,0 +1,30 @@

"""
Copyright (c) 2025 SignalWire

This file is part of the SignalWire AI Agents SDK.

Licensed under the MIT License.
See LICENSE file in the project root for full license information.
"""

# Embedding model configuration
MODEL_ALIASES = {
    'mini': 'sentence-transformers/all-MiniLM-L6-v2',    # 384 dims, ~5x faster
    'base': 'sentence-transformers/all-mpnet-base-v2',   # 768 dims, balanced
    'large': 'sentence-transformers/all-mpnet-base-v2',  # Same as base for now
}

# Default model for new indexes
DEFAULT_MODEL = MODEL_ALIASES['mini']

def resolve_model_alias(model_name: str) -> str:
    """
    Resolve model alias to full model name

    Args:
        model_name: Model name or alias (mini, base, large)

    Returns:
        Full model name
    """
    return MODEL_ALIASES.get(model_name, model_name)