signalwire-agents 0.1.46__py3-none-any.whl → 0.1.48__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,418 @@
+"""
+Copyright (c) 2025 SignalWire
+
+This file is part of the SignalWire AI Agents SDK.
+
+Licensed under the MIT License.
+See LICENSE file in the project root for full license information.
+"""
+
+import sqlite3
+import json
+import logging
+from typing import Dict, Any, Optional, List
+from pathlib import Path
+from datetime import datetime
+
+try:
+    import numpy as np
+except ImportError:
+    np = None
+
+logger = logging.getLogger(__name__)
+
+
+class SearchIndexMigrator:
+    """Migrate search indexes between different backends"""
+
+    def __init__(self, verbose: bool = False):
+        """
+        Initialize the migrator
+
+        Args:
+            verbose: Enable verbose output
+        """
+        self.verbose = verbose
+
+    def migrate_sqlite_to_pgvector(
+        self,
+        sqlite_path: str,
+        connection_string: str,
+        collection_name: str,
+        overwrite: bool = False,
+        batch_size: int = 100
+    ) -> Dict[str, Any]:
+        """
+        Migrate a .swsearch SQLite index to pgvector
+
+        Args:
+            sqlite_path: Path to .swsearch file
+            connection_string: PostgreSQL connection string
+            collection_name: Name for the pgvector collection
+            overwrite: Whether to overwrite existing collection
+            batch_size: Number of chunks to insert at once
+
+        Returns:
+            Migration statistics
+        """
+        if not Path(sqlite_path).exists():
+            raise FileNotFoundError(f"SQLite index not found: {sqlite_path}")
+
+        # Import pgvector backend
+        from .pgvector_backend import PgVectorBackend
+
+        stats = {
+            'source': sqlite_path,
+            'target': collection_name,
+            'chunks_migrated': 0,
+            'errors': 0,
+            'config': {}
+        }
+
+        try:
+            # Connect to SQLite
+            if self.verbose:
+                print(f"Opening SQLite index: {sqlite_path}")
+
+            sqlite_conn = sqlite3.connect(sqlite_path)
+            cursor = sqlite_conn.cursor()
+
+            # Load configuration
+            cursor.execute("SELECT key, value FROM config")
+            config_rows = cursor.fetchall()
+            config = dict(config_rows)
+            stats['config'] = config
+
+            # Get important config values
+            model_name = config.get('embedding_model', 'sentence-transformers/all-mpnet-base-v2')
+            embedding_dim = int(config.get('embedding_dimensions', 768))
+
+            if self.verbose:
+                print(f"Source configuration:")
+                print(f" Model: {model_name}")
+                print(f" Dimensions: {embedding_dim}")
+                print(f" Created: {config.get('created_at', 'Unknown')}")
+
+            # Initialize pgvector backend
+            pgvector = PgVectorBackend(connection_string)
+
+            try:
+                # Handle existing collection
+                if overwrite:
+                    if self.verbose:
+                        print(f"Dropping existing collection: {collection_name}")
+                    pgvector.delete_collection(collection_name)
+
+                # Create schema
+                if self.verbose:
+                    print(f"Creating pgvector collection: {collection_name}")
+
+                pgvector.create_schema(collection_name, embedding_dim)
+
+                # Prepare collection config
+                collection_config = {
+                    'model_name': model_name,
+                    'embedding_dimensions': embedding_dim,
+                    'chunking_strategy': config.get('chunking_strategy', 'sentence'),
+                    'languages': json.loads(config.get('languages', '["en"]')),
+                    'metadata': {
+                        'migrated_from': sqlite_path,
+                        'original_created': config.get('created_at'),
+                        'source_dir': config.get('source_dir'),
+                        'file_types': json.loads(config.get('file_types', '[]'))
+                    }
+                }
+
+                # Count total chunks
+                cursor.execute("SELECT COUNT(*) FROM chunks")
+                total_chunks = cursor.fetchone()[0]
+
+                if self.verbose:
+                    print(f"Migrating {total_chunks} chunks...")
+
+                # Check if metadata_text column exists (do this once)
+                cursor.execute("PRAGMA table_info(chunks)")
+                columns = [col[1] for col in cursor.fetchall()]
+                has_metadata_text = 'metadata_text' in columns
+
+                # Migrate chunks in batches
+                offset = 0
+                while offset < total_chunks:
+                    # Fetch batch of chunks
+
+                    if has_metadata_text:
+                        cursor.execute("""
+                            SELECT id, content, processed_content, keywords, language,
+                                   embedding, filename, section, start_line, end_line,
+                                   tags, metadata, metadata_text, chunk_hash
+                            FROM chunks
+                            ORDER BY id
+                            LIMIT ? OFFSET ?
+                        """, (batch_size, offset))
+                    else:
+                        cursor.execute("""
+                            SELECT id, content, processed_content, keywords, language,
+                                   embedding, filename, section, start_line, end_line,
+                                   tags, metadata, chunk_hash
+                            FROM chunks
+                            ORDER BY id
+                            LIMIT ? OFFSET ?
+                        """, (batch_size, offset))
+
+                    chunks_batch = []
+                    for row in cursor.fetchall():
+                        # Handle both old and new schema (with or without metadata_text)
+                        if len(row) == 14:  # New schema with metadata_text
+                            (chunk_id, content, processed_content, keywords_json, language,
+                             embedding_blob, filename, section, start_line, end_line,
+                             tags_json, metadata_json, metadata_text, chunk_hash) = row
+                        else:  # Old schema without metadata_text
+                            (chunk_id, content, processed_content, keywords_json, language,
+                             embedding_blob, filename, section, start_line, end_line,
+                             tags_json, metadata_json, chunk_hash) = row
+                            metadata_text = None
+
+                        # Convert embedding blob to numpy array if available
+                        if embedding_blob and np:
+                            embedding = np.frombuffer(embedding_blob, dtype=np.float32)
+                        else:
+                            embedding = embedding_blob
+
+                        # Parse JSON fields
+                        keywords = json.loads(keywords_json) if keywords_json else []
+                        tags = json.loads(tags_json) if tags_json else []
+                        metadata = json.loads(metadata_json) if metadata_json else {}
+
+                        chunk = {
+                            'content': content,
+                            'processed_content': processed_content,
+                            'keywords': keywords,
+                            'language': language,
+                            'embedding': embedding,
+                            'filename': filename,
+                            'section': section,
+                            'start_line': start_line,
+                            'end_line': end_line,
+                            'tags': tags,
+                            'metadata': metadata,
+                            'metadata_text': metadata_text,  # Will be regenerated if None
+                            'chunk_hash': chunk_hash
+                        }
+
+                        chunks_batch.append(chunk)
+
+                    # Store batch in pgvector
+                    if chunks_batch:
+                        try:
+                            pgvector.store_chunks(chunks_batch, collection_name, collection_config)
+                            stats['chunks_migrated'] += len(chunks_batch)
+
+                            if self.verbose:
+                                progress = (offset + len(chunks_batch)) / total_chunks * 100
+                                print(f" Progress: {stats['chunks_migrated']}/{total_chunks} ({progress:.1f}%)")
+                        except Exception as e:
+                            logger.error(f"Error storing batch at offset {offset}: {e}")
+                            stats['errors'] += len(chunks_batch)
+
+                    offset += batch_size
+
+                # Success
+                if self.verbose:
+                    print(f"\nMigration completed successfully!")
+                    print(f" Chunks migrated: {stats['chunks_migrated']}")
+                    print(f" Errors: {stats['errors']}")
+
+            finally:
+                pgvector.close()
+
+        except Exception as e:
+            logger.error(f"Migration failed: {e}")
+            raise
+        finally:
+            sqlite_conn.close()
+
+        return stats
+
+    def migrate_pgvector_to_sqlite(
+        self,
+        connection_string: str,
+        collection_name: str,
+        output_path: str,
+        batch_size: int = 100
+    ) -> Dict[str, Any]:
+        """
+        Migrate a pgvector collection to SQLite .swsearch format
+
+        Args:
+            connection_string: PostgreSQL connection string
+            collection_name: Name of the pgvector collection
+            output_path: Output .swsearch file path
+            batch_size: Number of chunks to fetch at once
+
+        Returns:
+            Migration statistics
+        """
+        from .pgvector_backend import PgVectorBackend
+        from .index_builder import IndexBuilder
+
+        # Ensure output has .swsearch extension
+        if not output_path.endswith('.swsearch'):
+            output_path += '.swsearch'
+
+        stats = {
+            'source': f"{collection_name} (pgvector)",
+            'target': output_path,
+            'chunks_migrated': 0,
+            'errors': 0,
+            'config': {}
+        }
+
+        # Connect to pgvector
+        if self.verbose:
+            print(f"Connecting to pgvector collection: {collection_name}")
+
+        pgvector = PgVectorBackend(connection_string)
+
+        try:
+            # Get collection stats and config
+            pg_stats = pgvector.get_stats(collection_name)
+            config = pg_stats.get('config', {})
+            stats['config'] = config
+
+            total_chunks = pg_stats.get('total_chunks', 0)
+
+            if self.verbose:
+                print(f"Source configuration:")
+                print(f" Model: {config.get('model_name', 'Unknown')}")
+                print(f" Dimensions: {config.get('embedding_dimensions', 'Unknown')}")
+                print(f" Total chunks: {total_chunks}")
+
+            # Create SQLite database structure
+            # We'll manually create it to match the expected format
+            if Path(output_path).exists():
+                Path(output_path).unlink()
+
+            conn = sqlite3.connect(output_path)
+            cursor = conn.cursor()
+
+            # Create schema (matching index_builder.py)
+            cursor.execute('''
+                CREATE TABLE chunks (
+                    id INTEGER PRIMARY KEY AUTOINCREMENT,
+                    content TEXT NOT NULL,
+                    processed_content TEXT NOT NULL,
+                    keywords TEXT,
+                    language TEXT DEFAULT 'en',
+                    embedding BLOB NOT NULL,
+                    filename TEXT NOT NULL,
+                    section TEXT,
+                    start_line INTEGER,
+                    end_line INTEGER,
+                    tags TEXT,
+                    metadata TEXT,
+                    chunk_hash TEXT UNIQUE,
+                    created_at TEXT DEFAULT CURRENT_TIMESTAMP
+                )
+            ''')
+
+            cursor.execute('''
+                CREATE VIRTUAL TABLE chunks_fts USING fts5(
+                    processed_content,
+                    keywords,
+                    content='chunks',
+                    content_rowid='id'
+                )
+            ''')
+
+            cursor.execute('''
+                CREATE TABLE synonyms (
+                    word TEXT,
+                    pos_tag TEXT,
+                    synonyms TEXT,
+                    language TEXT DEFAULT 'en',
+                    PRIMARY KEY (word, pos_tag, language)
+                )
+            ''')
+
+            cursor.execute('''
+                CREATE TABLE config (
+                    key TEXT PRIMARY KEY,
+                    value TEXT
+                )
+            ''')
+
+            # Create indexes
+            cursor.execute('CREATE INDEX idx_chunks_filename ON chunks(filename)')
+            cursor.execute('CREATE INDEX idx_chunks_language ON chunks(language)')
+            cursor.execute('CREATE INDEX idx_chunks_tags ON chunks(tags)')
+
+            # Insert config
+            config_data = {
+                'embedding_model': config.get('model_name', 'sentence-transformers/all-mpnet-base-v2'),
+                'embedding_dimensions': str(config.get('embedding_dimensions', 768)),
+                'chunk_size': str(config.get('metadata', {}).get('chunk_size', 50)),
+                'chunk_overlap': str(config.get('metadata', {}).get('chunk_overlap', 10)),
+                'preprocessing_version': '1.0',
+                'languages': json.dumps(config.get('languages', ['en'])),
+                'created_at': datetime.now().isoformat(),
+                'source_dir': config.get('metadata', {}).get('source_dir', 'pgvector_migration'),
+                'file_types': json.dumps(config.get('metadata', {}).get('file_types', []))
+            }
+
+            for key, value in config_data.items():
+                cursor.execute('INSERT INTO config (key, value) VALUES (?, ?)', (key, value))
+
+            # TODO: Implement chunk fetching from pgvector
+            # This would require adding a method to PgVectorBackend to fetch chunks
+            # For now, we'll note this as a limitation
+
+            if self.verbose:
+                print("\nNote: pgvector to SQLite migration requires implementing chunk fetching in PgVectorBackend")
+                print("This feature is planned for future development.")
+
+            conn.commit()
+            conn.close()
+
+        finally:
+            pgvector.close()
+
+        return stats
+
+    def get_index_info(self, index_path: str) -> Dict[str, Any]:
+        """
+        Get information about a search index
+
+        Args:
+            index_path: Path to index file or pgvector collection identifier
+
+        Returns:
+            Index information including type, config, and statistics
+        """
+        info = {}
+
+        if index_path.endswith('.swsearch') and Path(index_path).exists():
+            # SQLite index
+            info['type'] = 'sqlite'
+            info['path'] = index_path
+
+            conn = sqlite3.connect(index_path)
+            cursor = conn.cursor()
+
+            # Get config
+            cursor.execute("SELECT key, value FROM config")
+            info['config'] = dict(cursor.fetchall())
+
+            # Get stats
+            cursor.execute("SELECT COUNT(*) FROM chunks")
+            info['total_chunks'] = cursor.fetchone()[0]
+
+            cursor.execute("SELECT COUNT(DISTINCT filename) FROM chunks")
+            info['total_files'] = cursor.fetchone()[0]
+
+            conn.close()
+
+        else:
+            info['type'] = 'unknown'
+            info['path'] = index_path
+
+        return info
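For orientation, here is a minimal usage sketch of the migrator class added above. It is not part of the package diff, and the import path and connection string are assumptions (the module's location within the package is not shown here):

    # Hypothetical usage sketch -- the import path below is assumed, not taken from this diff
    from signalwire_agents.search.migration import SearchIndexMigrator

    migrator = SearchIndexMigrator(verbose=True)

    # Inspect an existing .swsearch index before migrating
    info = migrator.get_index_info("docs.swsearch")
    print(info.get("total_chunks"), info.get("config", {}).get("embedding_model"))

    # Copy the SQLite index into a pgvector collection (placeholder connection string)
    stats = migrator.migrate_sqlite_to_pgvector(
        sqlite_path="docs.swsearch",
        connection_string="postgresql://user:pass@localhost:5432/knowledge",
        collection_name="docs",
        overwrite=True,
        batch_size=100,
    )
    print(f"migrated={stats['chunks_migrated']} errors={stats['errors']}")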
@@ -0,0 +1,30 @@
+"""
+Copyright (c) 2025 SignalWire
+
+This file is part of the SignalWire AI Agents SDK.
+
+Licensed under the MIT License.
+See LICENSE file in the project root for full license information.
+"""
+
+# Embedding model configuration
+MODEL_ALIASES = {
+    'mini': 'sentence-transformers/all-MiniLM-L6-v2',   # 384 dims, ~5x faster
+    'base': 'sentence-transformers/all-mpnet-base-v2',  # 768 dims, balanced
+    'large': 'sentence-transformers/all-mpnet-base-v2', # Same as base for now
+}
+
+# Default model for new indexes
+DEFAULT_MODEL = MODEL_ALIASES['mini']
+
+def resolve_model_alias(model_name: str) -> str:
+    """
+    Resolve model alias to full model name
+
+    Args:
+        model_name: Model name or alias (mini, base, large)
+
+    Returns:
+        Full model name
+    """
+    return MODEL_ALIASES.get(model_name, model_name)
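For reference, a short sketch of how the alias helper resolves names (again not part of the diff; the import path is assumed):

    # Hypothetical usage -- the import path is assumed, not taken from this diff
    from signalwire_agents.search.models import MODEL_ALIASES, DEFAULT_MODEL, resolve_model_alias

    resolve_model_alias("mini")   # 'sentence-transformers/all-MiniLM-L6-v2'
    resolve_model_alias("base")   # 'sentence-transformers/all-mpnet-base-v2'

    # Unknown names pass through unchanged, so full model ids keep working
    resolve_model_alias("sentence-transformers/all-mpnet-base-v2")

    print(DEFAULT_MODEL)          # new indexes default to the 'mini' alias target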