signalwire-agents 0.1.47__py3-none-any.whl → 0.1.48__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- signalwire_agents/__init__.py +1 -1
- signalwire_agents/cli/build_search.py +516 -12
- signalwire_agents/search/__init__.py +7 -1
- signalwire_agents/search/document_processor.py +11 -8
- signalwire_agents/search/index_builder.py +112 -13
- signalwire_agents/search/migration.py +418 -0
- signalwire_agents/search/models.py +30 -0
- signalwire_agents/search/pgvector_backend.py +236 -13
- signalwire_agents/search/query_processor.py +87 -9
- signalwire_agents/search/search_engine.py +835 -31
- signalwire_agents/search/search_service.py +56 -6
- signalwire_agents/skills/native_vector_search/skill.py +208 -33
- {signalwire_agents-0.1.47.dist-info → signalwire_agents-0.1.48.dist-info}/METADATA +1 -1
- {signalwire_agents-0.1.47.dist-info → signalwire_agents-0.1.48.dist-info}/RECORD +18 -16
- {signalwire_agents-0.1.47.dist-info → signalwire_agents-0.1.48.dist-info}/WHEEL +0 -0
- {signalwire_agents-0.1.47.dist-info → signalwire_agents-0.1.48.dist-info}/entry_points.txt +0 -0
- {signalwire_agents-0.1.47.dist-info → signalwire_agents-0.1.48.dist-info}/licenses/LICENSE +0 -0
- {signalwire_agents-0.1.47.dist-info → signalwire_agents-0.1.48.dist-info}/top_level.txt +0 -0
signalwire_agents/search/__init__.py

@@ -68,6 +68,8 @@ if _SEARCH_AVAILABLE:
         from .index_builder import IndexBuilder
         from .search_engine import SearchEngine
         from .search_service import SearchService
+        from .models import MODEL_ALIASES, DEFAULT_MODEL, resolve_model_alias
+        from .migration import SearchIndexMigrator

         __all__ = [
             'preprocess_query',
@@ -75,7 +77,11 @@ if _SEARCH_AVAILABLE:
             'DocumentProcessor',
             'IndexBuilder',
             'SearchEngine',
-            'SearchService'
+            'SearchService',
+            'MODEL_ALIASES',
+            'DEFAULT_MODEL',
+            'resolve_model_alias',
+            'SearchIndexMigrator'
         ]
     except ImportError as e:
         # Some search components failed to import
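For orientation, a minimal sketch of how the newly re-exported names might be used from `signalwire_agents.search`. This assumes the optional search dependencies are installed and that `MODEL_ALIASES` is a mapping of short alias names to full model identifiers; the exact fallback behavior of `resolve_model_alias` lives in `search/models.py`, which is not shown in this diff.

```python
from signalwire_agents.search import (
    MODEL_ALIASES,
    DEFAULT_MODEL,
    resolve_model_alias,
    SearchIndexMigrator,
)

# Assumption: MODEL_ALIASES maps short alias names to full model identifiers.
for alias in MODEL_ALIASES:
    print(alias, "->", resolve_model_alias(alias))

print("default model:", DEFAULT_MODEL)

# The migrator itself is defined in search/migration.py (see below).
migrator = SearchIndexMigrator(verbose=True)
```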
signalwire_agents/search/document_processor.py

@@ -1075,7 +1075,7 @@ class DocumentProcessor:
             json_metadata = json_chunk.get('metadata', {})
             chunk_type = json_chunk.get('type', 'content')

-            # Build chunk metadata
+            # Build chunk metadata (excluding tags which go at top level)
             metadata = {
                 'chunk_method': 'json',
                 'chunk_index': idx,
@@ -1083,7 +1083,11 @@ class DocumentProcessor:
                 'original_chunk_id': json_chunk.get('chunk_id', f'chunk_{idx}')
             }

-            #
+            # Extract tags before merging metadata
+            tags = json_metadata.get('tags', [])
+
+            # Merge JSON metadata (this includes all fields including tags)
+            # We'll keep tags in metadata for backward compatibility but also set at top level
             metadata.update(json_metadata)

             # Determine section name
@@ -1100,12 +1104,11 @@ class DocumentProcessor:
                 metadata=metadata
             )

-            #
-            if
-                chunk['tags'] =
-
-
-            if chunk_type == 'toc' and 'tags' not in chunk:
+            # Set tags at the top level for proper tag filtering
+            if tags:
+                chunk['tags'] = tags
+            elif chunk_type == 'toc':
+                # For TOC entries, add special tags if none provided
                 chunk['tags'] = ['toc', 'navigation']

             chunks.append(chunk)
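To make the new behavior concrete, here is a small self-contained sketch of the tag-promotion logic above, applied to a hypothetical JSON chunk (the input field names beyond `metadata`, `type`, and `chunk_id` are assumptions; the real processor builds a full chunk object):

```python
# Hypothetical JSON chunk as the processor might receive it.
json_chunk = {
    "chunk_id": "chunk_0",
    "type": "content",
    "metadata": {"tags": ["billing", "api"], "author": "docs-team"},
}

# Mirror of the diff: pull tags out of metadata, then promote them to the
# top level so tag filtering works; TOC chunks without tags get defaults.
tags = json_chunk.get("metadata", {}).get("tags", [])
chunk_type = json_chunk.get("type", "content")

chunk = {"metadata": dict(json_chunk["metadata"])}
if tags:
    chunk["tags"] = tags
elif chunk_type == "toc":
    chunk["tags"] = ["toc", "navigation"]

print(chunk["tags"])  # ['billing', 'api']
```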
signalwire_agents/search/index_builder.py

@@ -85,9 +85,6 @@ class IndexBuilder:
         if self.backend not in ['sqlite', 'pgvector']:
             raise ValueError(f"Invalid backend '{self.backend}'. Must be 'sqlite' or 'pgvector'")

-        if self.backend == 'pgvector' and not self.connection_string:
-            raise ValueError("connection_string is required for pgvector backend")
-
         # Validate NLP backend
         if self.index_nlp_backend not in ['nltk', 'spacy']:
             logger.warning(f"Invalid index_nlp_backend '{self.index_nlp_backend}', using 'nltk'")
@@ -105,6 +102,50 @@ class IndexBuilder:
             topic_threshold=self.topic_threshold
         )

+    def _extract_metadata_from_json_content(self, content: str) -> tuple[Dict[str, Any], str]:
+        """
+        Extract metadata from JSON content if present
+
+        Returns:
+            (metadata_dict, metadata_text)
+        """
+        metadata_dict = {}
+
+        # Try to extract metadata from JSON structure in content
+        if '"metadata":' in content:
+            try:
+                # Look for metadata object in content
+                import re
+                # Find all metadata objects
+                pattern = r'"metadata"\s*:\s*(\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\})'
+                matches = re.finditer(pattern, content)
+
+                for match in matches:
+                    try:
+                        json_metadata = json.loads(match.group(1))
+                        # Merge all found metadata
+                        if isinstance(json_metadata, dict):
+                            metadata_dict.update(json_metadata)
+                    except:
+                        pass
+            except Exception as e:
+                logger.debug(f"Error extracting JSON metadata: {e}")
+
+        # Create searchable text from all metadata keys and values
+        metadata_text_parts = []
+        for key, value in metadata_dict.items():
+            # Add key
+            metadata_text_parts.append(str(key))
+            # Add value(s)
+            if isinstance(value, list):
+                metadata_text_parts.extend(str(v) for v in value)
+            else:
+                metadata_text_parts.append(str(value))
+
+        metadata_text = ' '.join(metadata_text_parts).lower()
+
+        return metadata_dict, metadata_text
+
     def _load_model(self):
         """Load embedding model (lazy loading)"""
         if self.model is None:
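The new helper pulls embedded `"metadata": {...}` objects out of raw chunk text with a regex rather than parsing the whole document as JSON. A standalone sketch of the same pattern on a sample string (not the class method itself):

```python
import json
import re

# Same pattern as in the diff: a "metadata" object with at most one level
# of nested braces inside it.
pattern = r'"metadata"\s*:\s*(\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\})'

content = '{"text": "pricing table", "metadata": {"tags": ["pricing"], "page": 3}}'

metadata_dict = {}
for match in re.finditer(pattern, content):
    try:
        found = json.loads(match.group(1))
        if isinstance(found, dict):
            metadata_dict.update(found)
    except json.JSONDecodeError:
        pass

# Flatten keys and values into a lowercase searchable string, as the helper does.
parts = []
for key, value in metadata_dict.items():
    parts.append(str(key))
    if isinstance(value, list):
        parts.extend(str(v) for v in value)
    else:
        parts.append(str(value))

print(metadata_dict)            # {'tags': ['pricing'], 'page': 3}
print(' '.join(parts).lower())  # tags pricing page 3
```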
@@ -147,6 +188,7 @@ class IndexBuilder:

         # Process documents
         chunks = []
+        print(f"Processing {len(files)} files...")
         for file_path in files:
             try:
                 # For individual files, use the file's parent as the base directory
@@ -154,8 +196,8 @@ class IndexBuilder:
                 base_dir = self._get_base_directory_for_file(file_path, sources)
                 file_chunks = self._process_file(file_path, base_dir, tags)
                 chunks.extend(file_chunks)
-                if self.verbose:
-                    print(f"
+                if self.verbose or file_path.suffix == '.json':
+                    print(f"  {file_path}: {len(file_chunks)} chunks")
             except Exception as e:
                 logger.error(f"Error processing {file_path}: {e}")
                 if self.verbose:
@@ -171,7 +213,9 @@ class IndexBuilder:
         # Generate embeddings
         self._load_model()
         if self.verbose:
-            print("Generating embeddings...")
+            print(f"Generating embeddings for {len(chunks)} chunks...")
+        else:
+            print(f"Generating embeddings for {len(chunks)} chunks...")

         for i, chunk in enumerate(chunks):
             try:
@@ -183,15 +227,33 @@ class IndexBuilder:
                 )

                 chunk['processed_content'] = processed['enhanced_text']
-
+
+                # Include tags in keywords for better search matching
+                keywords = processed.get('keywords', [])
+                chunk_tags = chunk.get('tags', [])
+                if chunk_tags:
+                    # Add tags to keywords list for FTS matching
+                    keywords.extend(chunk_tags)
+                    # Remove duplicates while preserving order
+                    keywords = list(dict.fromkeys(keywords))
+
+                chunk['keywords'] = keywords
+
+                # For embedding, include tags in the text for better semantic matching
+                embedding_text = processed['enhanced_text']
+                if chunk_tags:
+                    # Append tags to the text for embedding generation
+                    embedding_text += " " + " ".join(chunk_tags)

                 # Generate embedding (suppress progress bar)
-                embedding = self.model.encode(
+                embedding = self.model.encode(embedding_text, show_progress_bar=False)
                 chunk['embedding'] = embedding.tobytes()

-
+                # Show progress more frequently
+                show_every = 50 if len(chunks) > 500 else max(10, len(chunks) // 10)
+                if (i + 1) % show_every == 0 or (i + 1) == len(chunks):
                     progress_pct = ((i + 1) / len(chunks)) * 100
-                    print(f"
+                    print(f"  Progress: {i + 1}/{len(chunks)} chunks ({progress_pct:.1f}%)")


             except Exception as e:
                 logger.error(f"Error processing chunk {i}: {e}")
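The tag handling above folds chunk tags into both the FTS keywords (with an order-preserving de-duplication via `dict.fromkeys`) and the text that gets embedded. A minimal sketch of that merge outside the builder:

```python
# Keywords produced by preprocessing, plus the chunk's tags.
keywords = ["invoice", "billing", "payment"]
chunk_tags = ["billing", "api"]

# Extend, then de-duplicate while preserving order (dict keys keep insertion order).
keywords.extend(chunk_tags)
keywords = list(dict.fromkeys(keywords))
print(keywords)  # ['invoice', 'billing', 'payment', 'api']

# Tags are also appended to the text used for the embedding, so they influence
# the vector as well as the keyword index.
enhanced_text = "How do I download an invoice?"
embedding_text = enhanced_text + " " + " ".join(chunk_tags)
print(embedding_text)  # How do I download an invoice? billing api
```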
@@ -485,6 +547,7 @@ class IndexBuilder:
                     end_line INTEGER,
                     tags TEXT,
                     metadata TEXT,
+                    metadata_text TEXT,  -- Searchable text representation of all metadata
                     chunk_hash TEXT UNIQUE,
                     created_at TEXT DEFAULT CURRENT_TIMESTAMP
                 )
@@ -494,6 +557,7 @@ class IndexBuilder:
             CREATE VIRTUAL TABLE chunks_fts USING fts5(
                 processed_content,
                 keywords,
+                metadata_text,
                 content='chunks',
                 content_rowid='id'
             )
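With `metadata_text` exposed as a third FTS5 column, tag and metadata terms become reachable through full-text queries. A rough sketch of querying it directly with the standard `sqlite3` module; the index path is a placeholder, and the real query logic lives in `search_engine.py` and may differ:

```python
import sqlite3

conn = sqlite3.connect("docs.swsearch")  # placeholder path to a built index
cursor = conn.cursor()

# FTS5 column filter: restrict the match to metadata_text; a bare MATCH would
# search processed_content, keywords, and metadata_text together.
cursor.execute(
    """
    SELECT c.filename, c.section
    FROM chunks_fts
    JOIN chunks c ON c.id = chunks_fts.rowid
    WHERE chunks_fts MATCH 'metadata_text:billing'
    LIMIT 5
    """
)
for filename, section in cursor.fetchall():
    print(filename, section)

conn.close()
```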
@@ -555,13 +619,47 @@ class IndexBuilder:
             # Prepare data
             keywords_json = json.dumps(chunk.get('keywords', []))
             tags_json = json.dumps(chunk.get('tags', []))
-
+
+            # Extract metadata from JSON content and merge with chunk metadata
+            json_metadata, json_metadata_text = self._extract_metadata_from_json_content(chunk['content'])
+            chunk_metadata = chunk.get('metadata', {})
+
+            # Merge metadata: chunk metadata takes precedence
+            merged_metadata = {**json_metadata, **chunk_metadata}
+            metadata_json = json.dumps(merged_metadata)
+
+            # Create comprehensive metadata_text including tags
+            metadata_text_parts = []
+
+            # Add metadata text from JSON content
+            if json_metadata_text:
+                metadata_text_parts.append(json_metadata_text)
+
+            # Add tags
+            tags = chunk.get('tags', [])
+            if tags:
+                metadata_text_parts.extend(str(tag).lower() for tag in tags)
+
+            # Add section if present
+            if chunk.get('section'):
+                metadata_text_parts.append(chunk['section'].lower())
+
+            # Add any additional metadata values
+            for key, value in chunk_metadata.items():
+                if key not in json_metadata:  # Avoid duplicates
+                    metadata_text_parts.append(str(key).lower())
+                    if isinstance(value, list):
+                        metadata_text_parts.extend(str(v).lower() for v in value)
+                    else:
+                        metadata_text_parts.append(str(value).lower())
+
+            metadata_text = ' '.join(metadata_text_parts)

             cursor.execute('''
                 INSERT OR IGNORE INTO chunks (
                     content, processed_content, keywords, language, embedding,
-                    filename, section, start_line, end_line, tags, metadata, chunk_hash
-                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                    filename, section, start_line, end_line, tags, metadata, metadata_text, chunk_hash
+                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
             ''', (
                 chunk['content'],
                 chunk.get('processed_content', chunk['content']),
@@ -574,6 +672,7 @@ class IndexBuilder:
                 chunk.get('end_line'),
                 tags_json,
                 metadata_json,
+                metadata_text,
                 chunk_hash
             ))

signalwire_agents/search/migration.py (new file)

@@ -0,0 +1,418 @@
+"""
+Copyright (c) 2025 SignalWire
+
+This file is part of the SignalWire AI Agents SDK.
+
+Licensed under the MIT License.
+See LICENSE file in the project root for full license information.
+"""
+
+import sqlite3
+import json
+import logging
+from typing import Dict, Any, Optional, List
+from pathlib import Path
+from datetime import datetime
+
+try:
+    import numpy as np
+except ImportError:
+    np = None
+
+logger = logging.getLogger(__name__)
+
+
+class SearchIndexMigrator:
+    """Migrate search indexes between different backends"""
+
+    def __init__(self, verbose: bool = False):
+        """
+        Initialize the migrator
+
+        Args:
+            verbose: Enable verbose output
+        """
+        self.verbose = verbose
+
+    def migrate_sqlite_to_pgvector(
+        self,
+        sqlite_path: str,
+        connection_string: str,
+        collection_name: str,
+        overwrite: bool = False,
+        batch_size: int = 100
+    ) -> Dict[str, Any]:
+        """
+        Migrate a .swsearch SQLite index to pgvector
+
+        Args:
+            sqlite_path: Path to .swsearch file
+            connection_string: PostgreSQL connection string
+            collection_name: Name for the pgvector collection
+            overwrite: Whether to overwrite existing collection
+            batch_size: Number of chunks to insert at once
+
+        Returns:
+            Migration statistics
+        """
+        if not Path(sqlite_path).exists():
+            raise FileNotFoundError(f"SQLite index not found: {sqlite_path}")
+
+        # Import pgvector backend
+        from .pgvector_backend import PgVectorBackend
+
+        stats = {
+            'source': sqlite_path,
+            'target': collection_name,
+            'chunks_migrated': 0,
+            'errors': 0,
+            'config': {}
+        }
+
+        try:
+            # Connect to SQLite
+            if self.verbose:
+                print(f"Opening SQLite index: {sqlite_path}")
+
+            sqlite_conn = sqlite3.connect(sqlite_path)
+            cursor = sqlite_conn.cursor()
+
+            # Load configuration
+            cursor.execute("SELECT key, value FROM config")
+            config_rows = cursor.fetchall()
+            config = dict(config_rows)
+            stats['config'] = config
+
+            # Get important config values
+            model_name = config.get('embedding_model', 'sentence-transformers/all-mpnet-base-v2')
+            embedding_dim = int(config.get('embedding_dimensions', 768))
+
+            if self.verbose:
+                print(f"Source configuration:")
+                print(f"  Model: {model_name}")
+                print(f"  Dimensions: {embedding_dim}")
+                print(f"  Created: {config.get('created_at', 'Unknown')}")
+
+            # Initialize pgvector backend
+            pgvector = PgVectorBackend(connection_string)
+
+            try:
+                # Handle existing collection
+                if overwrite:
+                    if self.verbose:
+                        print(f"Dropping existing collection: {collection_name}")
+                    pgvector.delete_collection(collection_name)
+
+                # Create schema
+                if self.verbose:
+                    print(f"Creating pgvector collection: {collection_name}")
+
+                pgvector.create_schema(collection_name, embedding_dim)
+
+                # Prepare collection config
+                collection_config = {
+                    'model_name': model_name,
+                    'embedding_dimensions': embedding_dim,
+                    'chunking_strategy': config.get('chunking_strategy', 'sentence'),
+                    'languages': json.loads(config.get('languages', '["en"]')),
+                    'metadata': {
+                        'migrated_from': sqlite_path,
+                        'original_created': config.get('created_at'),
+                        'source_dir': config.get('source_dir'),
+                        'file_types': json.loads(config.get('file_types', '[]'))
+                    }
+                }
+
+                # Count total chunks
+                cursor.execute("SELECT COUNT(*) FROM chunks")
+                total_chunks = cursor.fetchone()[0]
+
+                if self.verbose:
+                    print(f"Migrating {total_chunks} chunks...")
+
+                # Check if metadata_text column exists (do this once)
+                cursor.execute("PRAGMA table_info(chunks)")
+                columns = [col[1] for col in cursor.fetchall()]
+                has_metadata_text = 'metadata_text' in columns
+
+                # Migrate chunks in batches
+                offset = 0
+                while offset < total_chunks:
+                    # Fetch batch of chunks
+
+                    if has_metadata_text:
+                        cursor.execute("""
+                            SELECT id, content, processed_content, keywords, language,
+                                   embedding, filename, section, start_line, end_line,
+                                   tags, metadata, metadata_text, chunk_hash
+                            FROM chunks
+                            ORDER BY id
+                            LIMIT ? OFFSET ?
+                        """, (batch_size, offset))
+                    else:
+                        cursor.execute("""
+                            SELECT id, content, processed_content, keywords, language,
+                                   embedding, filename, section, start_line, end_line,
+                                   tags, metadata, chunk_hash
+                            FROM chunks
+                            ORDER BY id
+                            LIMIT ? OFFSET ?
+                        """, (batch_size, offset))
+
+                    chunks_batch = []
+                    for row in cursor.fetchall():
+                        # Handle both old and new schema (with or without metadata_text)
+                        if len(row) == 14:  # New schema with metadata_text
+                            (chunk_id, content, processed_content, keywords_json, language,
+                             embedding_blob, filename, section, start_line, end_line,
+                             tags_json, metadata_json, metadata_text, chunk_hash) = row
+                        else:  # Old schema without metadata_text
+                            (chunk_id, content, processed_content, keywords_json, language,
+                             embedding_blob, filename, section, start_line, end_line,
+                             tags_json, metadata_json, chunk_hash) = row
+                            metadata_text = None
+
+                        # Convert embedding blob to numpy array if available
+                        if embedding_blob and np:
+                            embedding = np.frombuffer(embedding_blob, dtype=np.float32)
+                        else:
+                            embedding = embedding_blob
+
+                        # Parse JSON fields
+                        keywords = json.loads(keywords_json) if keywords_json else []
+                        tags = json.loads(tags_json) if tags_json else []
+                        metadata = json.loads(metadata_json) if metadata_json else {}
+
+                        chunk = {
+                            'content': content,
+                            'processed_content': processed_content,
+                            'keywords': keywords,
+                            'language': language,
+                            'embedding': embedding,
+                            'filename': filename,
+                            'section': section,
+                            'start_line': start_line,
+                            'end_line': end_line,
+                            'tags': tags,
+                            'metadata': metadata,
+                            'metadata_text': metadata_text,  # Will be regenerated if None
+                            'chunk_hash': chunk_hash
+                        }
+
+                        chunks_batch.append(chunk)
+
+                    # Store batch in pgvector
+                    if chunks_batch:
+                        try:
+                            pgvector.store_chunks(chunks_batch, collection_name, collection_config)
+                            stats['chunks_migrated'] += len(chunks_batch)
+
+                            if self.verbose:
+                                progress = (offset + len(chunks_batch)) / total_chunks * 100
+                                print(f"  Progress: {stats['chunks_migrated']}/{total_chunks} ({progress:.1f}%)")
+                        except Exception as e:
+                            logger.error(f"Error storing batch at offset {offset}: {e}")
+                            stats['errors'] += len(chunks_batch)
+
+                    offset += batch_size
+
+                # Success
+                if self.verbose:
+                    print(f"\nMigration completed successfully!")
+                    print(f"  Chunks migrated: {stats['chunks_migrated']}")
+                    print(f"  Errors: {stats['errors']}")
+
+            finally:
+                pgvector.close()
+
+        except Exception as e:
+            logger.error(f"Migration failed: {e}")
+            raise
+        finally:
+            sqlite_conn.close()
+
+        return stats
+
+    def migrate_pgvector_to_sqlite(
+        self,
+        connection_string: str,
+        collection_name: str,
+        output_path: str,
+        batch_size: int = 100
+    ) -> Dict[str, Any]:
+        """
+        Migrate a pgvector collection to SQLite .swsearch format
+
+        Args:
+            connection_string: PostgreSQL connection string
+            collection_name: Name of the pgvector collection
+            output_path: Output .swsearch file path
+            batch_size: Number of chunks to fetch at once
+
+        Returns:
+            Migration statistics
+        """
+        from .pgvector_backend import PgVectorBackend
+        from .index_builder import IndexBuilder
+
+        # Ensure output has .swsearch extension
+        if not output_path.endswith('.swsearch'):
+            output_path += '.swsearch'
+
+        stats = {
+            'source': f"{collection_name} (pgvector)",
+            'target': output_path,
+            'chunks_migrated': 0,
+            'errors': 0,
+            'config': {}
+        }
+
+        # Connect to pgvector
+        if self.verbose:
+            print(f"Connecting to pgvector collection: {collection_name}")
+
+        pgvector = PgVectorBackend(connection_string)
+
+        try:
+            # Get collection stats and config
+            pg_stats = pgvector.get_stats(collection_name)
+            config = pg_stats.get('config', {})
+            stats['config'] = config
+
+            total_chunks = pg_stats.get('total_chunks', 0)
+
+            if self.verbose:
+                print(f"Source configuration:")
+                print(f"  Model: {config.get('model_name', 'Unknown')}")
+                print(f"  Dimensions: {config.get('embedding_dimensions', 'Unknown')}")
+                print(f"  Total chunks: {total_chunks}")
+
+            # Create SQLite database structure
+            # We'll manually create it to match the expected format
+            if Path(output_path).exists():
+                Path(output_path).unlink()
+
+            conn = sqlite3.connect(output_path)
+            cursor = conn.cursor()
+
+            # Create schema (matching index_builder.py)
+            cursor.execute('''
+                CREATE TABLE chunks (
+                    id INTEGER PRIMARY KEY AUTOINCREMENT,
+                    content TEXT NOT NULL,
+                    processed_content TEXT NOT NULL,
+                    keywords TEXT,
+                    language TEXT DEFAULT 'en',
+                    embedding BLOB NOT NULL,
+                    filename TEXT NOT NULL,
+                    section TEXT,
+                    start_line INTEGER,
+                    end_line INTEGER,
+                    tags TEXT,
+                    metadata TEXT,
+                    chunk_hash TEXT UNIQUE,
+                    created_at TEXT DEFAULT CURRENT_TIMESTAMP
+                )
+            ''')
+
+            cursor.execute('''
+                CREATE VIRTUAL TABLE chunks_fts USING fts5(
+                    processed_content,
+                    keywords,
+                    content='chunks',
+                    content_rowid='id'
+                )
+            ''')
+
+            cursor.execute('''
+                CREATE TABLE synonyms (
+                    word TEXT,
+                    pos_tag TEXT,
+                    synonyms TEXT,
+                    language TEXT DEFAULT 'en',
+                    PRIMARY KEY (word, pos_tag, language)
+                )
+            ''')
+
+            cursor.execute('''
+                CREATE TABLE config (
+                    key TEXT PRIMARY KEY,
+                    value TEXT
+                )
+            ''')
+
+            # Create indexes
+            cursor.execute('CREATE INDEX idx_chunks_filename ON chunks(filename)')
+            cursor.execute('CREATE INDEX idx_chunks_language ON chunks(language)')
+            cursor.execute('CREATE INDEX idx_chunks_tags ON chunks(tags)')
+
+            # Insert config
+            config_data = {
+                'embedding_model': config.get('model_name', 'sentence-transformers/all-mpnet-base-v2'),
+                'embedding_dimensions': str(config.get('embedding_dimensions', 768)),
+                'chunk_size': str(config.get('metadata', {}).get('chunk_size', 50)),
+                'chunk_overlap': str(config.get('metadata', {}).get('chunk_overlap', 10)),
+                'preprocessing_version': '1.0',
+                'languages': json.dumps(config.get('languages', ['en'])),
+                'created_at': datetime.now().isoformat(),
+                'source_dir': config.get('metadata', {}).get('source_dir', 'pgvector_migration'),
+                'file_types': json.dumps(config.get('metadata', {}).get('file_types', []))
+            }
+
+            for key, value in config_data.items():
+                cursor.execute('INSERT INTO config (key, value) VALUES (?, ?)', (key, value))
+
+            # TODO: Implement chunk fetching from pgvector
+            # This would require adding a method to PgVectorBackend to fetch chunks
+            # For now, we'll note this as a limitation
+
+            if self.verbose:
+                print("\nNote: pgvector to SQLite migration requires implementing chunk fetching in PgVectorBackend")
+                print("This feature is planned for future development.")
+
+            conn.commit()
+            conn.close()
+
+        finally:
+            pgvector.close()
+
+        return stats
+
+    def get_index_info(self, index_path: str) -> Dict[str, Any]:
+        """
+        Get information about a search index
+
+        Args:
+            index_path: Path to index file or pgvector collection identifier
+
+        Returns:
+            Index information including type, config, and statistics
+        """
+        info = {}
+
+        if index_path.endswith('.swsearch') and Path(index_path).exists():
+            # SQLite index
+            info['type'] = 'sqlite'
+            info['path'] = index_path
+
+            conn = sqlite3.connect(index_path)
+            cursor = conn.cursor()
+
+            # Get config
+            cursor.execute("SELECT key, value FROM config")
+            info['config'] = dict(cursor.fetchall())
+
+            # Get stats
+            cursor.execute("SELECT COUNT(*) FROM chunks")
+            info['total_chunks'] = cursor.fetchone()[0]
+
+            cursor.execute("SELECT COUNT(DISTINCT filename) FROM chunks")
+            info['total_files'] = cursor.fetchone()[0]
+
+            conn.close()
+
+        else:
+            info['type'] = 'unknown'
+            info['path'] = index_path
+
+        return info
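Finally, a short usage sketch for the new migrator based on the signatures above. The paths and connection string are placeholders; a reachable PostgreSQL instance with the pgvector extension is required, and the pgvector-to-SQLite direction is still stubbed out, as the TODO in the file notes.

```python
from signalwire_agents.search.migration import SearchIndexMigrator

migrator = SearchIndexMigrator(verbose=True)

# Inspect an existing SQLite index before migrating.
info = migrator.get_index_info("docs.swsearch")  # placeholder index file
print(info.get("total_chunks"), info.get("total_files"))

# Copy the index into a pgvector collection.
stats = migrator.migrate_sqlite_to_pgvector(
    sqlite_path="docs.swsearch",
    connection_string="postgresql://user:pass@localhost:5432/knowledge",  # placeholder
    collection_name="docs",
    overwrite=True,
)
print(stats["chunks_migrated"], "chunks migrated,", stats["errors"], "errors")
```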