signalwire-agents 0.1.10-py3-none-any.whl → 0.1.12-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- signalwire_agents/__init__.py +43 -4
- signalwire_agents/agent_server.py +268 -15
- signalwire_agents/cli/__init__.py +9 -0
- signalwire_agents/cli/build_search.py +457 -0
- signalwire_agents/cli/test_swaig.py +2609 -0
- signalwire_agents/core/agent_base.py +691 -82
- signalwire_agents/core/contexts.py +289 -0
- signalwire_agents/core/data_map.py +499 -0
- signalwire_agents/core/function_result.py +57 -10
- signalwire_agents/core/logging_config.py +232 -0
- signalwire_agents/core/skill_base.py +27 -37
- signalwire_agents/core/skill_manager.py +89 -23
- signalwire_agents/core/swaig_function.py +13 -1
- signalwire_agents/core/swml_handler.py +37 -13
- signalwire_agents/core/swml_service.py +37 -28
- signalwire_agents/search/__init__.py +131 -0
- signalwire_agents/search/document_processor.py +764 -0
- signalwire_agents/search/index_builder.py +534 -0
- signalwire_agents/search/query_processor.py +371 -0
- signalwire_agents/search/search_engine.py +383 -0
- signalwire_agents/search/search_service.py +251 -0
- signalwire_agents/skills/datasphere/__init__.py +12 -0
- signalwire_agents/skills/datasphere/skill.py +229 -0
- signalwire_agents/skills/datasphere_serverless/__init__.py +1 -0
- signalwire_agents/skills/datasphere_serverless/skill.py +156 -0
- signalwire_agents/skills/datetime/skill.py +9 -5
- signalwire_agents/skills/joke/__init__.py +1 -0
- signalwire_agents/skills/joke/skill.py +88 -0
- signalwire_agents/skills/math/skill.py +9 -6
- signalwire_agents/skills/native_vector_search/__init__.py +1 -0
- signalwire_agents/skills/native_vector_search/skill.py +352 -0
- signalwire_agents/skills/registry.py +10 -4
- signalwire_agents/skills/web_search/skill.py +57 -21
- signalwire_agents/skills/wikipedia/__init__.py +9 -0
- signalwire_agents/skills/wikipedia/skill.py +180 -0
- signalwire_agents/utils/__init__.py +14 -0
- signalwire_agents/utils/schema_utils.py +111 -44
- signalwire_agents-0.1.12.dist-info/METADATA +863 -0
- signalwire_agents-0.1.12.dist-info/RECORD +67 -0
- {signalwire_agents-0.1.10.dist-info → signalwire_agents-0.1.12.dist-info}/WHEEL +1 -1
- signalwire_agents-0.1.12.dist-info/entry_points.txt +3 -0
- signalwire_agents-0.1.10.dist-info/METADATA +0 -319
- signalwire_agents-0.1.10.dist-info/RECORD +0 -44
- {signalwire_agents-0.1.10.data → signalwire_agents-0.1.12.data}/data/schema.json +0 -0
- {signalwire_agents-0.1.10.dist-info → signalwire_agents-0.1.12.dist-info}/licenses/LICENSE +0 -0
- {signalwire_agents-0.1.10.dist-info → signalwire_agents-0.1.12.dist-info}/top_level.txt +0 -0
signalwire_agents/search/index_builder.py (new file)
@@ -0,0 +1,534 @@
"""
Copyright (c) 2025 SignalWire

This file is part of the SignalWire AI Agents SDK.

Licensed under the MIT License.
See LICENSE file in the project root for full license information.
"""

import os
import sqlite3
import json
import hashlib
import logging
from datetime import datetime
from pathlib import Path
from typing import List, Optional, Dict, Any
import fnmatch

try:
    import numpy as np
except ImportError:
    np = None

try:
    from sentence_transformers import SentenceTransformer
except ImportError:
    SentenceTransformer = None

from .document_processor import DocumentProcessor
from .query_processor import preprocess_document_content

logger = logging.getLogger(__name__)

class IndexBuilder:
    """Build searchable indexes from document directories"""

    def __init__(self, model_name: str = 'sentence-transformers/all-mpnet-base-v2',
                 chunking_strategy: str = 'sentence',
                 max_sentences_per_chunk: int = 50,
                 chunk_size: int = 50,
                 chunk_overlap: int = 10,
                 split_newlines: Optional[int] = None,
                 verbose: bool = False):
        self.model_name = model_name
        self.chunking_strategy = chunking_strategy
        self.max_sentences_per_chunk = max_sentences_per_chunk
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.split_newlines = split_newlines
        self.verbose = verbose
        self.model = None
        self.doc_processor = DocumentProcessor(
            chunking_strategy=chunking_strategy,
            max_sentences_per_chunk=max_sentences_per_chunk,
            chunk_size=chunk_size,
            overlap_size=chunk_overlap,
            split_newlines=split_newlines
        )

    def _load_model(self):
        """Load embedding model (lazy loading)"""
        if self.model is None:
            if not SentenceTransformer:
                raise ImportError("sentence-transformers is required for embedding generation. Install with: pip install sentence-transformers")

            if self.verbose:
                print(f"Loading embedding model: {self.model_name}")

            try:
                self.model = SentenceTransformer(self.model_name)
            except Exception as e:
                logger.error(f"Failed to load model '{self.model_name}': {e}")
                raise

    def build_index_from_sources(self, sources: List[Path], output_file: str,
                                 file_types: List[str], exclude_patterns: Optional[List[str]] = None,
                                 languages: List[str] = None, tags: Optional[List[str]] = None):
        """
        Build complete search index from multiple sources (files and directories)

        Args:
            sources: List of Path objects (files and/or directories)
            output_file: Output .swsearch file path
            file_types: List of file extensions to include for directories
            exclude_patterns: Glob patterns to exclude
            languages: List of languages to support
            tags: Global tags to add to all chunks
        """

        # Discover files from all sources
        files = self._discover_files_from_sources(sources, file_types, exclude_patterns)
        if self.verbose:
            print(f"Found {len(files)} files to process")

        if not files:
            print("No files found to process. Check your sources, file types and exclude patterns.")
            return

        # Process documents
        chunks = []
        for file_path in files:
            try:
                # For individual files, use the file's parent as the base directory
                # For files from directories, use the original source directory
                base_dir = self._get_base_directory_for_file(file_path, sources)
                file_chunks = self._process_file(file_path, base_dir, tags)
                chunks.extend(file_chunks)
                if self.verbose:
                    print(f"Processed {file_path}: {len(file_chunks)} chunks")
            except Exception as e:
                logger.error(f"Error processing {file_path}: {e}")
                if self.verbose:
                    print(f"Error processing {file_path}: {e}")

        if not chunks:
            print("No chunks created from documents. Check file contents and processing.")
            return

        if self.verbose:
            print(f"Created {len(chunks)} total chunks")

        # Generate embeddings
        self._load_model()
        if self.verbose:
            print("Generating embeddings...")

        for i, chunk in enumerate(chunks):
            try:
                # Preprocess content for better search
                processed = preprocess_document_content(
                    chunk['content'],
                    language=chunk.get('language', 'en')
                )

                chunk['processed_content'] = processed['enhanced_text']
                chunk['keywords'] = processed.get('keywords', [])

                # Generate embedding (suppress progress bar)
                embedding = self.model.encode(processed['enhanced_text'], show_progress_bar=False)
                chunk['embedding'] = embedding.tobytes()

                if self.verbose and (i + 1) % 50 == 0:
                    progress_pct = ((i + 1) / len(chunks)) * 100
                    print(f"Generated embeddings: {i + 1}/{len(chunks)} chunks ({progress_pct:.1f}%)")

            except Exception as e:
                logger.error(f"Error processing chunk {i}: {e}")
                # Use original content as fallback
                chunk['processed_content'] = chunk['content']
                chunk['keywords'] = []
                # Create zero embedding as fallback
                if np:
                    embedding = np.zeros(768, dtype=np.float32)
                    chunk['embedding'] = embedding.tobytes()
                else:
                    chunk['embedding'] = b''

        # Create SQLite database
        sources_info = [str(s) for s in sources]
        self._create_database(output_file, chunks, languages or ['en'], sources_info, file_types)

        if self.verbose:
            print(f"Index created: {output_file}")
            print(f"Total chunks: {len(chunks)}")

    def build_index(self, source_dir: str, output_file: str,
                    file_types: List[str], exclude_patterns: Optional[List[str]] = None,
                    languages: List[str] = None, tags: Optional[List[str]] = None):
        """
        Build complete search index from a single directory (legacy method)

        Args:
            source_dir: Directory to scan for documents
            output_file: Output .swsearch file path
            file_types: List of file extensions to include
            exclude_patterns: Glob patterns to exclude
            languages: List of languages to support
            tags: Global tags to add to all chunks
        """

        # Convert to new multi-source method
        sources = [Path(source_dir)]
        self.build_index_from_sources(sources, output_file, file_types, exclude_patterns, languages, tags)

    def _get_base_directory_for_file(self, file_path: Path, sources: List[Path]) -> str:
        """
        Determine the appropriate base directory for a file to calculate relative paths

        Args:
            file_path: The file being processed
            sources: List of original source paths

        Returns:
            Base directory path as string
        """

        # Check if this file was specified directly as a source
        if file_path in sources:
            # For individual files, use the parent directory
            return str(file_path.parent)

        # Check if this file is within any of the source directories
        for source in sources:
            if source.is_dir():
                try:
                    # Check if file_path is relative to this source directory
                    file_path.relative_to(source)
                    return str(source)
                except ValueError:
                    # file_path is not relative to this source
                    continue

        # Fallback: use the file's parent directory
        return str(file_path.parent)

    def _discover_files_from_sources(self, sources: List[Path], file_types: List[str],
                                     exclude_patterns: Optional[List[str]] = None) -> List[Path]:
        """
        Discover files from multiple sources (files and directories)

        Args:
            sources: List of Path objects (files and/or directories)
            file_types: List of file extensions to include for directories
            exclude_patterns: Glob patterns to exclude

        Returns:
            List of file paths to process
        """

        files = []
        supported_extensions = set(ft.lstrip('.').lower() for ft in file_types)

        for source in sources:
            if source.is_file():
                # Individual file - check if it's supported
                file_ext = source.suffix.lstrip('.').lower()
                if file_ext in supported_extensions or not file_ext:  # Allow extensionless files
                    # Check exclusions
                    if self._is_file_excluded(source, exclude_patterns):
                        if self.verbose:
                            print(f"Excluded file: {source}")
                        continue

                    files.append(source)
                    if self.verbose:
                        print(f"Added individual file: {source}")
                else:
                    if self.verbose:
                        print(f"Skipped unsupported file type: {source} (extension: {file_ext})")

            elif source.is_dir():
                # Directory - use existing discovery logic
                dir_files = self._discover_files(str(source), file_types, exclude_patterns)
                files.extend(dir_files)
                if self.verbose:
                    print(f"Added {len(dir_files)} files from directory: {source}")
            else:
                if self.verbose:
                    print(f"Skipped non-existent or invalid source: {source}")

        # Remove duplicates while preserving order
        seen = set()
        unique_files = []
        for file_path in files:
            if file_path not in seen:
                seen.add(file_path)
                unique_files.append(file_path)

        return unique_files

    def _is_file_excluded(self, file_path: Path, exclude_patterns: Optional[List[str]] = None) -> bool:
        """
        Check if a file should be excluded based on exclude patterns

        Args:
            file_path: Path to check
            exclude_patterns: List of glob patterns to exclude

        Returns:
            True if file should be excluded
        """

        if not exclude_patterns:
            return False

        import fnmatch

        file_str = str(file_path)
        for pattern in exclude_patterns:
            if fnmatch.fnmatch(file_str, pattern):
                return True

        return False

    def _discover_files(self, source_dir: str, file_types: List[str],
                        exclude_patterns: Optional[List[str]] = None) -> List[Path]:
        """Discover files to index"""
        files = []
        source_path = Path(source_dir)

        if not source_path.exists():
            raise FileNotFoundError(f"Source directory does not exist: {source_dir}")

        for file_type in file_types:
            # Clean up file type (remove leading dots)
            clean_type = file_type.lstrip('.')
            pattern = f"**/*.{clean_type}"

            for file_path in source_path.glob(pattern):
                # Skip directories
                if not file_path.is_file():
                    continue

                # Check exclusions
                if exclude_patterns:
                    excluded = False
                    for pattern in exclude_patterns:
                        if fnmatch.fnmatch(str(file_path), pattern):
                            excluded = True
                            break
                    if excluded:
                        if self.verbose:
                            print(f"Excluded: {file_path}")
                        continue

                files.append(file_path)

        return files

    def _process_file(self, file_path: Path, source_dir: str,
                      global_tags: Optional[List[str]] = None) -> List[Dict[str, Any]]:
        """Process single file into chunks"""
        try:
            # Try to read as text first
            try:
                content = file_path.read_text(encoding='utf-8')
            except UnicodeDecodeError:
                if self.verbose:
                    print(f"Skipping binary file: {file_path}")
                return []

            relative_path = str(file_path.relative_to(source_dir))

            # Create chunks using document processor - pass content directly, not file path
            chunks = self.doc_processor.create_chunks(
                content=content,  # Pass the actual content, not the file path
                filename=relative_path,
                file_type=file_path.suffix.lstrip('.')
            )

            # Add global tags
            if global_tags:
                for chunk in chunks:
                    existing_tags = chunk.get('tags', [])
                    if isinstance(existing_tags, str):
                        existing_tags = [existing_tags]
                    chunk['tags'] = existing_tags + global_tags

            return chunks

        except Exception as e:
            logger.error(f"Error processing file {file_path}: {e}")
            return []

    def _create_database(self, output_file: str, chunks: List[Dict[str, Any]],
                         languages: List[str], sources_info: List[str], file_types: List[str]):
        """Create SQLite database with all data"""

        # Remove existing file
        if os.path.exists(output_file):
            os.remove(output_file)

        conn = sqlite3.connect(output_file)
        cursor = conn.cursor()

        try:
            # Create schema
            cursor.execute('''
                CREATE TABLE chunks (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    content TEXT NOT NULL,
                    processed_content TEXT NOT NULL,
                    keywords TEXT,
                    language TEXT DEFAULT 'en',
                    embedding BLOB NOT NULL,
                    filename TEXT NOT NULL,
                    section TEXT,
                    start_line INTEGER,
                    end_line INTEGER,
                    tags TEXT,
                    metadata TEXT,
                    chunk_hash TEXT UNIQUE,
                    created_at TEXT DEFAULT CURRENT_TIMESTAMP
                )
            ''')

            cursor.execute('''
                CREATE VIRTUAL TABLE chunks_fts USING fts5(
                    processed_content,
                    keywords,
                    content='chunks',
                    content_rowid='id'
                )
            ''')

            cursor.execute('''
                CREATE TABLE synonyms (
                    word TEXT,
                    pos_tag TEXT,
                    synonyms TEXT,
                    language TEXT DEFAULT 'en',
                    PRIMARY KEY (word, pos_tag, language)
                )
            ''')

            cursor.execute('''
                CREATE TABLE config (
                    key TEXT PRIMARY KEY,
                    value TEXT
                )
            ''')

            # Create indexes for performance
            cursor.execute('CREATE INDEX idx_chunks_filename ON chunks(filename)')
            cursor.execute('CREATE INDEX idx_chunks_language ON chunks(language)')
            cursor.execute('CREATE INDEX idx_chunks_tags ON chunks(tags)')

            # Insert config
            embedding_dimensions = 768  # Default for all-mpnet-base-v2
            if chunks and chunks[0].get('embedding'):
                try:
                    if np:
                        embedding_array = np.frombuffer(chunks[0]['embedding'], dtype=np.float32)
                        embedding_dimensions = len(embedding_array)
                except:
                    pass

            config_data = {
                'embedding_model': self.model_name,
                'embedding_dimensions': str(embedding_dimensions),
                'chunk_size': str(self.chunk_size),
                'chunk_overlap': str(self.chunk_overlap),
                'preprocessing_version': '1.0',
                'languages': json.dumps(languages),
                'created_at': datetime.now().isoformat(),
                'sources': json.dumps(sources_info),  # Store list of sources instead of single directory
                'file_types': json.dumps(file_types)
            }

            for key, value in config_data.items():
                cursor.execute('INSERT INTO config (key, value) VALUES (?, ?)', (key, value))

            # Insert chunks
            for chunk in chunks:
                # Create hash for deduplication - include filename, section, and line numbers for uniqueness
                hash_content = f"{chunk['filename']}:{chunk.get('section', '')}:{chunk.get('start_line', 0)}:{chunk.get('end_line', 0)}:{chunk['content']}"
                chunk_hash = hashlib.sha256(hash_content.encode()).hexdigest()[:16]

                # Prepare data
                keywords_json = json.dumps(chunk.get('keywords', []))
                tags_json = json.dumps(chunk.get('tags', []))
                metadata_json = json.dumps(chunk.get('metadata', {}))

                cursor.execute('''
                    INSERT OR IGNORE INTO chunks (
                        content, processed_content, keywords, language, embedding,
                        filename, section, start_line, end_line, tags, metadata, chunk_hash
                    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                ''', (
                    chunk['content'],
                    chunk.get('processed_content', chunk['content']),
                    keywords_json,
                    chunk.get('language', 'en'),
                    chunk.get('embedding', b''),
                    chunk['filename'],
                    chunk.get('section'),
                    chunk.get('start_line'),
                    chunk.get('end_line'),
                    tags_json,
                    metadata_json,
                    chunk_hash
                ))

            conn.commit()

        except Exception as e:
            conn.rollback()
            raise e
        finally:
            conn.close()

    def validate_index(self, index_file: str) -> Dict[str, Any]:
        """Validate an existing search index"""
        if not os.path.exists(index_file):
            return {"valid": False, "error": "Index file does not exist"}

        try:
            conn = sqlite3.connect(index_file)
            cursor = conn.cursor()

            # Check schema
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
            tables = [row[0] for row in cursor.fetchall()]

            required_tables = ['chunks', 'chunks_fts', 'synonyms', 'config']
            missing_tables = [t for t in required_tables if t not in tables]

            if missing_tables:
                return {"valid": False, "error": f"Missing tables: {missing_tables}"}

            # Get config
            cursor.execute("SELECT key, value FROM config")
            config = dict(cursor.fetchall())

            # Get chunk count
            cursor.execute("SELECT COUNT(*) FROM chunks")
            chunk_count = cursor.fetchone()[0]

            # Get file count
            cursor.execute("SELECT COUNT(DISTINCT filename) FROM chunks")
            file_count = cursor.fetchone()[0]

            conn.close()

            return {
                "valid": True,
                "chunk_count": chunk_count,
                "file_count": file_count,
                "config": config
            }

        except Exception as e:
            return {"valid": False, "error": str(e)}