signalwire-agents 0.1.47__py3-none-any.whl → 0.1.48__py3-none-any.whl

This diff shows the changes between two package versions that have been publicly released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
@@ -68,6 +68,8 @@ if _SEARCH_AVAILABLE:
         from .index_builder import IndexBuilder
         from .search_engine import SearchEngine
         from .search_service import SearchService
+        from .models import MODEL_ALIASES, DEFAULT_MODEL, resolve_model_alias
+        from .migration import SearchIndexMigrator

         __all__ = [
             'preprocess_query',
@@ -75,7 +77,11 @@ if _SEARCH_AVAILABLE:
             'DocumentProcessor',
             'IndexBuilder',
             'SearchEngine',
-            'SearchService'
+            'SearchService',
+            'MODEL_ALIASES',
+            'DEFAULT_MODEL',
+            'resolve_model_alias',
+            'SearchIndexMigrator'
         ]
     except ImportError as e:
         # Some search components failed to import
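For orientation, the newly exported names could be exercised as below. This is a hedged sketch: the `signalwire_agents.search` import path and the exact behavior of `resolve_model_alias` are inferred from the names in this hunk, not confirmed elsewhere in the diff.

```python
from signalwire_agents.search import (
    MODEL_ALIASES,        # assumed: mapping of short alias -> full embedding model name
    DEFAULT_MODEL,        # assumed: fallback embedding model identifier
    resolve_model_alias,
    SearchIndexMigrator,
)

print(DEFAULT_MODEL)
print(MODEL_ALIASES)

# Assumed behavior: returns the full model name for a known alias,
# otherwise falls back to the input or DEFAULT_MODEL.
print(resolve_model_alias("base"))  # hypothetical alias string
```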
@@ -1075,7 +1075,7 @@ class DocumentProcessor:
             json_metadata = json_chunk.get('metadata', {})
             chunk_type = json_chunk.get('type', 'content')

-            # Build chunk metadata
+            # Build chunk metadata (excluding tags which go at top level)
             metadata = {
                 'chunk_method': 'json',
                 'chunk_index': idx,
@@ -1083,7 +1083,11 @@ class DocumentProcessor:
                 'original_chunk_id': json_chunk.get('chunk_id', f'chunk_{idx}')
             }

-            # Merge JSON metadata
+            # Extract tags before merging metadata
+            tags = json_metadata.get('tags', [])
+
+            # Merge JSON metadata (this includes all fields including tags)
+            # We'll keep tags in metadata for backward compatibility but also set at top level
             metadata.update(json_metadata)

             # Determine section name
@@ -1100,12 +1104,11 @@ class DocumentProcessor:
                 metadata=metadata
             )

-            # Add any additional fields from JSON
-            if 'tags' in json_chunk:
-                chunk['tags'] = json_chunk['tags']
-
-            # For TOC entries, we might want to add special tags
-            if chunk_type == 'toc' and 'tags' not in chunk:
+            # Set tags at the top level for proper tag filtering
+            if tags:
+                chunk['tags'] = tags
+            elif chunk_type == 'toc':
+                # For TOC entries, add special tags if none provided
                 chunk['tags'] = ['toc', 'navigation']

             chunks.append(chunk)
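To illustrate the effect of this change: tags supplied in a JSON chunk's metadata now also land at the top level of the produced chunk. The input below is a hypothetical pre-chunked document; only the field names read in this hunk are assumed.

```python
# Hypothetical JSON chunk, shaped like the fields this hunk reads
json_chunk = {
    "chunk_id": "intro-1",
    "type": "content",
    "content": "SignalWire agents overview...",
    "metadata": {"tags": ["overview", "agents"], "author": "docs-team"},
}

# Resulting chunk (conceptually):
#   chunk["metadata"]["tags"] == ["overview", "agents"]   # kept for backward compatibility
#   chunk["tags"] == ["overview", "agents"]               # now exposed for tag filtering
# A "toc" chunk with no tags instead gets ["toc", "navigation"].
```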
@@ -85,9 +85,6 @@ class IndexBuilder:
         if self.backend not in ['sqlite', 'pgvector']:
             raise ValueError(f"Invalid backend '{self.backend}'. Must be 'sqlite' or 'pgvector'")

-        if self.backend == 'pgvector' and not self.connection_string:
-            raise ValueError("connection_string is required for pgvector backend")
-
         # Validate NLP backend
         if self.index_nlp_backend not in ['nltk', 'spacy']:
             logger.warning(f"Invalid index_nlp_backend '{self.index_nlp_backend}', using 'nltk'")
@@ -105,6 +102,50 @@ class IndexBuilder:
             topic_threshold=self.topic_threshold
         )

+    def _extract_metadata_from_json_content(self, content: str) -> tuple[Dict[str, Any], str]:
+        """
+        Extract metadata from JSON content if present
+
+        Returns:
+            (metadata_dict, metadata_text)
+        """
+        metadata_dict = {}
+
+        # Try to extract metadata from JSON structure in content
+        if '"metadata":' in content:
+            try:
+                # Look for metadata object in content
+                import re
+                # Find all metadata objects
+                pattern = r'"metadata"\s*:\s*(\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\})'
+                matches = re.finditer(pattern, content)
+
+                for match in matches:
+                    try:
+                        json_metadata = json.loads(match.group(1))
+                        # Merge all found metadata
+                        if isinstance(json_metadata, dict):
+                            metadata_dict.update(json_metadata)
+                    except:
+                        pass
+            except Exception as e:
+                logger.debug(f"Error extracting JSON metadata: {e}")
+
+        # Create searchable text from all metadata keys and values
+        metadata_text_parts = []
+        for key, value in metadata_dict.items():
+            # Add key
+            metadata_text_parts.append(str(key))
+            # Add value(s)
+            if isinstance(value, list):
+                metadata_text_parts.extend(str(v) for v in value)
+            else:
+                metadata_text_parts.append(str(value))
+
+        metadata_text = ' '.join(metadata_text_parts).lower()
+
+        return metadata_dict, metadata_text
+
     def _load_model(self):
         """Load embedding model (lazy loading)"""
         if self.model is None:
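The new helper is regex-driven; the standalone sketch below applies the same pattern from the hunk above to a sample string, without calling any package internals, to show what gets extracted and how it is flattened into searchable text.

```python
import json
import re

content = '{"text": "Install the SDK", "metadata": {"tags": ["install", "sdk"], "team": "docs"}}'

# Same pattern the new helper uses to find embedded metadata objects
pattern = r'"metadata"\s*:\s*(\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\})'
metadata_dict = {}
for match in re.finditer(pattern, content):
    try:
        found = json.loads(match.group(1))
        if isinstance(found, dict):
            metadata_dict.update(found)
    except json.JSONDecodeError:
        pass

# Flatten keys and values into a lowercase searchable string, mirroring metadata_text
parts = []
for key, value in metadata_dict.items():
    parts.append(str(key))
    if isinstance(value, list):
        parts.extend(str(v) for v in value)
    else:
        parts.append(str(value))

print(metadata_dict)            # {'tags': ['install', 'sdk'], 'team': 'docs'}
print(' '.join(parts).lower())  # tags install sdk team docs
```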
@@ -147,6 +188,7 @@ class IndexBuilder:

         # Process documents
         chunks = []
+        print(f"Processing {len(files)} files...")
         for file_path in files:
             try:
                 # For individual files, use the file's parent as the base directory
@@ -154,8 +196,8 @@ class IndexBuilder:
                 base_dir = self._get_base_directory_for_file(file_path, sources)
                 file_chunks = self._process_file(file_path, base_dir, tags)
                 chunks.extend(file_chunks)
-                if self.verbose:
-                    print(f"Processed {file_path}: {len(file_chunks)} chunks")
+                if self.verbose or file_path.suffix == '.json':
+                    print(f" {file_path}: {len(file_chunks)} chunks")
             except Exception as e:
                 logger.error(f"Error processing {file_path}: {e}")
                 if self.verbose:
@@ -171,7 +213,9 @@ class IndexBuilder:
         # Generate embeddings
         self._load_model()
         if self.verbose:
-            print("Generating embeddings...")
+            print(f"Generating embeddings for {len(chunks)} chunks...")
+        else:
+            print(f"Generating embeddings for {len(chunks)} chunks...")

         for i, chunk in enumerate(chunks):
             try:
@@ -183,15 +227,33 @@ class IndexBuilder:
                 )

                 chunk['processed_content'] = processed['enhanced_text']
-                chunk['keywords'] = processed.get('keywords', [])
+
+                # Include tags in keywords for better search matching
+                keywords = processed.get('keywords', [])
+                chunk_tags = chunk.get('tags', [])
+                if chunk_tags:
+                    # Add tags to keywords list for FTS matching
+                    keywords.extend(chunk_tags)
+                    # Remove duplicates while preserving order
+                    keywords = list(dict.fromkeys(keywords))
+
+                chunk['keywords'] = keywords
+
+                # For embedding, include tags in the text for better semantic matching
+                embedding_text = processed['enhanced_text']
+                if chunk_tags:
+                    # Append tags to the text for embedding generation
+                    embedding_text += " " + " ".join(chunk_tags)

                 # Generate embedding (suppress progress bar)
-                embedding = self.model.encode(processed['enhanced_text'], show_progress_bar=False)
+                embedding = self.model.encode(embedding_text, show_progress_bar=False)
                 chunk['embedding'] = embedding.tobytes()

-                if self.verbose and (i + 1) % 50 == 0:
+                # Show progress more frequently
+                show_every = 50 if len(chunks) > 500 else max(10, len(chunks) // 10)
+                if (i + 1) % show_every == 0 or (i + 1) == len(chunks):
                     progress_pct = ((i + 1) / len(chunks)) * 100
-                    print(f"Generated embeddings: {i + 1}/{len(chunks)} chunks ({progress_pct:.1f}%)")
+                    print(f" Progress: {i + 1}/{len(chunks)} chunks ({progress_pct:.1f}%)")

             except Exception as e:
                 logger.error(f"Error processing chunk {i}: {e}")
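The tag-merging behavior above reduces to a few lines; this is a standalone sketch with made-up data, not package code.

```python
processed_keywords = ["vector", "search", "vector"]
chunk_tags = ["search", "toc"]

# Tags are appended to the keyword list, then de-duplicated while preserving order.
keywords = list(dict.fromkeys(processed_keywords + chunk_tags))
print(keywords)  # ['vector', 'search', 'toc']

# Tags are also appended to the text fed to the embedding model.
enhanced_text = "How to build a vector search index"
embedding_text = enhanced_text + " " + " ".join(chunk_tags)
print(embedding_text)
```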
@@ -485,6 +547,7 @@ class IndexBuilder:
                 end_line INTEGER,
                 tags TEXT,
                 metadata TEXT,
+                metadata_text TEXT,  -- Searchable text representation of all metadata
                 chunk_hash TEXT UNIQUE,
                 created_at TEXT DEFAULT CURRENT_TIMESTAMP
             )
@@ -494,6 +557,7 @@ class IndexBuilder:
             CREATE VIRTUAL TABLE chunks_fts USING fts5(
                 processed_content,
                 keywords,
+                metadata_text,
                 content='chunks',
                 content_rowid='id'
             )
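With `metadata_text` added to the FTS5 table, tag and metadata terms become matchable through full-text queries. A minimal sketch using the standard library follows; the index filename and query term are placeholders, only the schema names come from this diff.

```python
import sqlite3

conn = sqlite3.connect("docs.swsearch")  # placeholder index file
rows = conn.execute(
    """
    SELECT c.filename, c.section
    FROM chunks_fts
    JOIN chunks AS c ON c.id = chunks_fts.rowid
    WHERE chunks_fts MATCH 'metadata_text: navigation'  -- FTS5 column filter
    LIMIT 5
    """
).fetchall()
print(rows)
conn.close()
```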
@@ -555,13 +619,47 @@ class IndexBuilder:
             # Prepare data
             keywords_json = json.dumps(chunk.get('keywords', []))
             tags_json = json.dumps(chunk.get('tags', []))
-            metadata_json = json.dumps(chunk.get('metadata', {}))
+
+            # Extract metadata from JSON content and merge with chunk metadata
+            json_metadata, json_metadata_text = self._extract_metadata_from_json_content(chunk['content'])
+            chunk_metadata = chunk.get('metadata', {})
+
+            # Merge metadata: chunk metadata takes precedence
+            merged_metadata = {**json_metadata, **chunk_metadata}
+            metadata_json = json.dumps(merged_metadata)
+
+            # Create comprehensive metadata_text including tags
+            metadata_text_parts = []
+
+            # Add metadata text from JSON content
+            if json_metadata_text:
+                metadata_text_parts.append(json_metadata_text)
+
+            # Add tags
+            tags = chunk.get('tags', [])
+            if tags:
+                metadata_text_parts.extend(str(tag).lower() for tag in tags)
+
+            # Add section if present
+            if chunk.get('section'):
+                metadata_text_parts.append(chunk['section'].lower())
+
+            # Add any additional metadata values
+            for key, value in chunk_metadata.items():
+                if key not in json_metadata:  # Avoid duplicates
+                    metadata_text_parts.append(str(key).lower())
+                    if isinstance(value, list):
+                        metadata_text_parts.extend(str(v).lower() for v in value)
+                    else:
+                        metadata_text_parts.append(str(value).lower())
+
+            metadata_text = ' '.join(metadata_text_parts)

             cursor.execute('''
                 INSERT OR IGNORE INTO chunks (
                     content, processed_content, keywords, language, embedding,
-                    filename, section, start_line, end_line, tags, metadata, chunk_hash
-                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                    filename, section, start_line, end_line, tags, metadata, metadata_text, chunk_hash
+                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
             ''', (
                 chunk['content'],
                 chunk.get('processed_content', chunk['content']),
@@ -574,6 +672,7 @@ class IndexBuilder:
                 chunk.get('end_line'),
                 tags_json,
                 metadata_json,
+                metadata_text,
                 chunk_hash
             ))

@@ -0,0 +1,418 @@
+"""
+Copyright (c) 2025 SignalWire
+
+This file is part of the SignalWire AI Agents SDK.
+
+Licensed under the MIT License.
+See LICENSE file in the project root for full license information.
+"""
+
+import sqlite3
+import json
+import logging
+from typing import Dict, Any, Optional, List
+from pathlib import Path
+from datetime import datetime
+
+try:
+    import numpy as np
+except ImportError:
+    np = None
+
+logger = logging.getLogger(__name__)
+
+
+class SearchIndexMigrator:
+    """Migrate search indexes between different backends"""
+
+    def __init__(self, verbose: bool = False):
+        """
+        Initialize the migrator
+
+        Args:
+            verbose: Enable verbose output
+        """
+        self.verbose = verbose
+
+ def migrate_sqlite_to_pgvector(
38
+ self,
39
+ sqlite_path: str,
40
+ connection_string: str,
41
+ collection_name: str,
42
+ overwrite: bool = False,
43
+ batch_size: int = 100
44
+ ) -> Dict[str, Any]:
45
+ """
46
+ Migrate a .swsearch SQLite index to pgvector
47
+
48
+ Args:
49
+ sqlite_path: Path to .swsearch file
50
+ connection_string: PostgreSQL connection string
51
+ collection_name: Name for the pgvector collection
52
+ overwrite: Whether to overwrite existing collection
53
+ batch_size: Number of chunks to insert at once
54
+
55
+ Returns:
56
+ Migration statistics
57
+ """
58
+ if not Path(sqlite_path).exists():
59
+ raise FileNotFoundError(f"SQLite index not found: {sqlite_path}")
60
+
61
+ # Import pgvector backend
62
+ from .pgvector_backend import PgVectorBackend
63
+
64
+ stats = {
65
+ 'source': sqlite_path,
66
+ 'target': collection_name,
67
+ 'chunks_migrated': 0,
68
+ 'errors': 0,
69
+ 'config': {}
70
+ }
71
+
72
+ try:
73
+ # Connect to SQLite
74
+ if self.verbose:
75
+ print(f"Opening SQLite index: {sqlite_path}")
76
+
77
+ sqlite_conn = sqlite3.connect(sqlite_path)
78
+ cursor = sqlite_conn.cursor()
79
+
80
+ # Load configuration
81
+ cursor.execute("SELECT key, value FROM config")
82
+ config_rows = cursor.fetchall()
83
+ config = dict(config_rows)
84
+ stats['config'] = config
85
+
86
+ # Get important config values
87
+ model_name = config.get('embedding_model', 'sentence-transformers/all-mpnet-base-v2')
88
+ embedding_dim = int(config.get('embedding_dimensions', 768))
89
+
90
+ if self.verbose:
91
+ print(f"Source configuration:")
92
+ print(f" Model: {model_name}")
93
+ print(f" Dimensions: {embedding_dim}")
94
+ print(f" Created: {config.get('created_at', 'Unknown')}")
95
+
96
+ # Initialize pgvector backend
97
+ pgvector = PgVectorBackend(connection_string)
98
+
99
+ try:
100
+ # Handle existing collection
101
+ if overwrite:
102
+ if self.verbose:
103
+ print(f"Dropping existing collection: {collection_name}")
104
+ pgvector.delete_collection(collection_name)
105
+
106
+ # Create schema
107
+ if self.verbose:
108
+ print(f"Creating pgvector collection: {collection_name}")
109
+
110
+ pgvector.create_schema(collection_name, embedding_dim)
111
+
112
+ # Prepare collection config
113
+ collection_config = {
114
+ 'model_name': model_name,
115
+ 'embedding_dimensions': embedding_dim,
116
+ 'chunking_strategy': config.get('chunking_strategy', 'sentence'),
117
+ 'languages': json.loads(config.get('languages', '["en"]')),
118
+ 'metadata': {
119
+ 'migrated_from': sqlite_path,
120
+ 'original_created': config.get('created_at'),
121
+ 'source_dir': config.get('source_dir'),
122
+ 'file_types': json.loads(config.get('file_types', '[]'))
123
+ }
124
+ }
125
+
126
+ # Count total chunks
127
+ cursor.execute("SELECT COUNT(*) FROM chunks")
128
+ total_chunks = cursor.fetchone()[0]
129
+
130
+ if self.verbose:
131
+ print(f"Migrating {total_chunks} chunks...")
132
+
133
+ # Check if metadata_text column exists (do this once)
134
+ cursor.execute("PRAGMA table_info(chunks)")
135
+ columns = [col[1] for col in cursor.fetchall()]
136
+ has_metadata_text = 'metadata_text' in columns
137
+
138
+ # Migrate chunks in batches
139
+ offset = 0
140
+ while offset < total_chunks:
141
+ # Fetch batch of chunks
142
+
143
+ if has_metadata_text:
144
+ cursor.execute("""
145
+ SELECT id, content, processed_content, keywords, language,
146
+ embedding, filename, section, start_line, end_line,
147
+ tags, metadata, metadata_text, chunk_hash
148
+ FROM chunks
149
+ ORDER BY id
150
+ LIMIT ? OFFSET ?
151
+ """, (batch_size, offset))
152
+ else:
153
+ cursor.execute("""
154
+ SELECT id, content, processed_content, keywords, language,
155
+ embedding, filename, section, start_line, end_line,
156
+ tags, metadata, chunk_hash
157
+ FROM chunks
158
+ ORDER BY id
159
+ LIMIT ? OFFSET ?
160
+ """, (batch_size, offset))
161
+
162
+ chunks_batch = []
163
+ for row in cursor.fetchall():
164
+ # Handle both old and new schema (with or without metadata_text)
165
+ if len(row) == 14: # New schema with metadata_text
166
+ (chunk_id, content, processed_content, keywords_json, language,
167
+ embedding_blob, filename, section, start_line, end_line,
168
+ tags_json, metadata_json, metadata_text, chunk_hash) = row
169
+ else: # Old schema without metadata_text
170
+ (chunk_id, content, processed_content, keywords_json, language,
171
+ embedding_blob, filename, section, start_line, end_line,
172
+ tags_json, metadata_json, chunk_hash) = row
173
+ metadata_text = None
174
+
175
+ # Convert embedding blob to numpy array if available
176
+ if embedding_blob and np:
177
+ embedding = np.frombuffer(embedding_blob, dtype=np.float32)
178
+ else:
179
+ embedding = embedding_blob
180
+
181
+ # Parse JSON fields
182
+ keywords = json.loads(keywords_json) if keywords_json else []
183
+ tags = json.loads(tags_json) if tags_json else []
184
+ metadata = json.loads(metadata_json) if metadata_json else {}
185
+
186
+ chunk = {
187
+ 'content': content,
188
+ 'processed_content': processed_content,
189
+ 'keywords': keywords,
190
+ 'language': language,
191
+ 'embedding': embedding,
192
+ 'filename': filename,
193
+ 'section': section,
194
+ 'start_line': start_line,
195
+ 'end_line': end_line,
196
+ 'tags': tags,
197
+ 'metadata': metadata,
198
+ 'metadata_text': metadata_text, # Will be regenerated if None
199
+ 'chunk_hash': chunk_hash
200
+ }
201
+
202
+ chunks_batch.append(chunk)
203
+
204
+ # Store batch in pgvector
205
+ if chunks_batch:
206
+ try:
207
+ pgvector.store_chunks(chunks_batch, collection_name, collection_config)
208
+ stats['chunks_migrated'] += len(chunks_batch)
209
+
210
+ if self.verbose:
211
+ progress = (offset + len(chunks_batch)) / total_chunks * 100
212
+ print(f" Progress: {stats['chunks_migrated']}/{total_chunks} ({progress:.1f}%)")
213
+ except Exception as e:
214
+ logger.error(f"Error storing batch at offset {offset}: {e}")
215
+ stats['errors'] += len(chunks_batch)
216
+
217
+ offset += batch_size
218
+
219
+ # Success
220
+ if self.verbose:
221
+ print(f"\nMigration completed successfully!")
222
+ print(f" Chunks migrated: {stats['chunks_migrated']}")
223
+ print(f" Errors: {stats['errors']}")
224
+
225
+ finally:
226
+ pgvector.close()
227
+
228
+ except Exception as e:
229
+ logger.error(f"Migration failed: {e}")
230
+ raise
231
+ finally:
232
+ sqlite_conn.close()
233
+
234
+ return stats
235
+
+    def migrate_pgvector_to_sqlite(
+        self,
+        connection_string: str,
+        collection_name: str,
+        output_path: str,
+        batch_size: int = 100
+    ) -> Dict[str, Any]:
+        """
+        Migrate a pgvector collection to SQLite .swsearch format
+
+        Args:
+            connection_string: PostgreSQL connection string
+            collection_name: Name of the pgvector collection
+            output_path: Output .swsearch file path
+            batch_size: Number of chunks to fetch at once
+
+        Returns:
+            Migration statistics
+        """
+        from .pgvector_backend import PgVectorBackend
+        from .index_builder import IndexBuilder
+
+        # Ensure output has .swsearch extension
+        if not output_path.endswith('.swsearch'):
+            output_path += '.swsearch'
+
+        stats = {
+            'source': f"{collection_name} (pgvector)",
+            'target': output_path,
+            'chunks_migrated': 0,
+            'errors': 0,
+            'config': {}
+        }
+
+        # Connect to pgvector
+        if self.verbose:
+            print(f"Connecting to pgvector collection: {collection_name}")
+
+        pgvector = PgVectorBackend(connection_string)
+
+        try:
+            # Get collection stats and config
+            pg_stats = pgvector.get_stats(collection_name)
+            config = pg_stats.get('config', {})
+            stats['config'] = config
+
+            total_chunks = pg_stats.get('total_chunks', 0)
+
+            if self.verbose:
+                print(f"Source configuration:")
+                print(f" Model: {config.get('model_name', 'Unknown')}")
+                print(f" Dimensions: {config.get('embedding_dimensions', 'Unknown')}")
+                print(f" Total chunks: {total_chunks}")
+
+            # Create SQLite database structure
+            # We'll manually create it to match the expected format
+            if Path(output_path).exists():
+                Path(output_path).unlink()
+
+            conn = sqlite3.connect(output_path)
+            cursor = conn.cursor()
+
+            # Create schema (matching index_builder.py)
+            cursor.execute('''
+                CREATE TABLE chunks (
+                    id INTEGER PRIMARY KEY AUTOINCREMENT,
+                    content TEXT NOT NULL,
+                    processed_content TEXT NOT NULL,
+                    keywords TEXT,
+                    language TEXT DEFAULT 'en',
+                    embedding BLOB NOT NULL,
+                    filename TEXT NOT NULL,
+                    section TEXT,
+                    start_line INTEGER,
+                    end_line INTEGER,
+                    tags TEXT,
+                    metadata TEXT,
+                    chunk_hash TEXT UNIQUE,
+                    created_at TEXT DEFAULT CURRENT_TIMESTAMP
+                )
+            ''')
+
+            cursor.execute('''
+                CREATE VIRTUAL TABLE chunks_fts USING fts5(
+                    processed_content,
+                    keywords,
+                    content='chunks',
+                    content_rowid='id'
+                )
+            ''')
+
+            cursor.execute('''
+                CREATE TABLE synonyms (
+                    word TEXT,
+                    pos_tag TEXT,
+                    synonyms TEXT,
+                    language TEXT DEFAULT 'en',
+                    PRIMARY KEY (word, pos_tag, language)
+                )
+            ''')
+
+            cursor.execute('''
+                CREATE TABLE config (
+                    key TEXT PRIMARY KEY,
+                    value TEXT
+                )
+            ''')
+
+            # Create indexes
+            cursor.execute('CREATE INDEX idx_chunks_filename ON chunks(filename)')
+            cursor.execute('CREATE INDEX idx_chunks_language ON chunks(language)')
+            cursor.execute('CREATE INDEX idx_chunks_tags ON chunks(tags)')
+
+            # Insert config
+            config_data = {
+                'embedding_model': config.get('model_name', 'sentence-transformers/all-mpnet-base-v2'),
+                'embedding_dimensions': str(config.get('embedding_dimensions', 768)),
+                'chunk_size': str(config.get('metadata', {}).get('chunk_size', 50)),
+                'chunk_overlap': str(config.get('metadata', {}).get('chunk_overlap', 10)),
+                'preprocessing_version': '1.0',
+                'languages': json.dumps(config.get('languages', ['en'])),
+                'created_at': datetime.now().isoformat(),
+                'source_dir': config.get('metadata', {}).get('source_dir', 'pgvector_migration'),
+                'file_types': json.dumps(config.get('metadata', {}).get('file_types', []))
+            }
+
+            for key, value in config_data.items():
+                cursor.execute('INSERT INTO config (key, value) VALUES (?, ?)', (key, value))
+
+            # TODO: Implement chunk fetching from pgvector
+            # This would require adding a method to PgVectorBackend to fetch chunks
+            # For now, we'll note this as a limitation
+
+            if self.verbose:
+                print("\nNote: pgvector to SQLite migration requires implementing chunk fetching in PgVectorBackend")
+                print("This feature is planned for future development.")
+
+            conn.commit()
+            conn.close()
+
+        finally:
+            pgvector.close()
+
+        return stats
+
+    def get_index_info(self, index_path: str) -> Dict[str, Any]:
+        """
+        Get information about a search index
+
+        Args:
+            index_path: Path to index file or pgvector collection identifier
+
+        Returns:
+            Index information including type, config, and statistics
+        """
+        info = {}
+
+        if index_path.endswith('.swsearch') and Path(index_path).exists():
+            # SQLite index
+            info['type'] = 'sqlite'
+            info['path'] = index_path
+
+            conn = sqlite3.connect(index_path)
+            cursor = conn.cursor()
+
+            # Get config
+            cursor.execute("SELECT key, value FROM config")
+            info['config'] = dict(cursor.fetchall())
+
+            # Get stats
+            cursor.execute("SELECT COUNT(*) FROM chunks")
+            info['total_chunks'] = cursor.fetchone()[0]
+
+            cursor.execute("SELECT COUNT(DISTINCT filename) FROM chunks")
+            info['total_files'] = cursor.fetchone()[0]
+
+            conn.close()
+
+        else:
+            info['type'] = 'unknown'
+            info['path'] = index_path
+
+        return info
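As a usage illustration, the new migrator can be driven roughly as below. This is a minimal sketch: the import path follows the relative `.migration` module added above, the connection string and file paths are placeholders, and only calls whose signatures appear in this diff are used.

```python
from signalwire_agents.search.migration import SearchIndexMigrator  # assumed import path

migrator = SearchIndexMigrator(verbose=True)

# Migrate a local .swsearch index into a pgvector collection.
stats = migrator.migrate_sqlite_to_pgvector(
    sqlite_path="docs.swsearch",                                      # placeholder path
    connection_string="postgresql://user:pass@localhost/searchdb",    # placeholder DSN
    collection_name="docs",
    overwrite=True,
)
print(stats["chunks_migrated"], "chunks migrated,", stats["errors"], "errors")

# Inspect an index without migrating it.
print(migrator.get_index_info("docs.swsearch"))
```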