signalwire-agents 0.1.46__py3-none-any.whl → 0.1.48__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- signalwire_agents/__init__.py +1 -1
- signalwire_agents/cli/build_search.py +522 -13
- signalwire_agents/core/agent_base.py +29 -37
- signalwire_agents/core/mixins/ai_config_mixin.py +32 -87
- signalwire_agents/core/swaig_function.py +2 -2
- signalwire_agents/search/__init__.py +7 -1
- signalwire_agents/search/document_processor.py +105 -1
- signalwire_agents/search/index_builder.py +113 -14
- signalwire_agents/search/migration.py +418 -0
- signalwire_agents/search/models.py +30 -0
- signalwire_agents/search/pgvector_backend.py +236 -13
- signalwire_agents/search/query_processor.py +87 -9
- signalwire_agents/search/search_engine.py +835 -31
- signalwire_agents/search/search_service.py +56 -6
- signalwire_agents/skills/native_vector_search/skill.py +208 -33
- signalwire_agents/skills/weather_api/skill.py +2 -2
- {signalwire_agents-0.1.46.dist-info → signalwire_agents-0.1.48.dist-info}/METADATA +12 -7
- {signalwire_agents-0.1.46.dist-info → signalwire_agents-0.1.48.dist-info}/RECORD +22 -20
- {signalwire_agents-0.1.46.dist-info → signalwire_agents-0.1.48.dist-info}/WHEEL +0 -0
- {signalwire_agents-0.1.46.dist-info → signalwire_agents-0.1.48.dist-info}/entry_points.txt +0 -0
- {signalwire_agents-0.1.46.dist-info → signalwire_agents-0.1.48.dist-info}/licenses/LICENSE +0 -0
- {signalwire_agents-0.1.46.dist-info → signalwire_agents-0.1.48.dist-info}/top_level.txt +0 -0
signalwire_agents/__init__.py CHANGED
```diff
@@ -18,7 +18,7 @@ A package for building AI agents using SignalWire's AI and SWML capabilities.
 from .core.logging_config import configure_logging
 configure_logging()
 
-__version__ = "0.1.46"
+__version__ = "0.1.48"
 
 # Import core classes for easier access
 from .core.agent_base import AgentBase
```
signalwire_agents/cli/build_search.py CHANGED
```diff
@@ -10,6 +10,9 @@ See LICENSE file in the project root for full license information.
 import argparse
 import sys
 from pathlib import Path
+from datetime import datetime
+
+from signalwire_agents.search.models import MODEL_ALIASES, DEFAULT_MODEL, resolve_model_alias
 
 def main():
     """Main entry point for the build-search command"""
```
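Those three names come from `signalwire_agents/search/models.py`, a module new in this release (+30 lines) whose body isn't shown in this diff. A minimal sketch of what it plausibly contains: the alias names, default, and dimension counts are taken from the CLI help text below, while the concrete mapping (especially for `large`) is an assumption:

```python
# Hypothetical sketch of signalwire_agents/search/models.py.
# Alias names and dimensions come from the CLI help text in this diff;
# the exact mapping, notably for "large", is an assumption.
MODEL_ALIASES = {
    "mini": "sentence-transformers/all-MiniLM-L6-v2",    # 384 dims, ~5x faster
    "base": "sentence-transformers/all-mpnet-base-v2",   # 768 dims, previous default
    "large": "sentence-transformers/all-mpnet-base-v2",  # currently same as base
}

DEFAULT_MODEL = MODEL_ALIASES["mini"]

def resolve_model_alias(name: str) -> str:
    """Map a short alias (mini/base/large) to a full model name;
    full model names pass through unchanged."""
    return MODEL_ALIASES.get(name, name)
```

`main()` calls `resolve_model_alias(args.model)` unconditionally, so both aliases and full model names are accepted wherever a `--model` flag appears.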
```diff
@@ -66,6 +69,35 @@ Examples:
   sw-search ./docs \\
       --chunking-strategy qa
 
+  # Model selection examples (performance vs quality tradeoff)
+  sw-search ./docs --model mini    # Fastest (~5x faster), 384 dims, good for most use cases
+  sw-search ./docs --model base    # Balanced speed/quality, 768 dims (previous default)
+  sw-search ./docs --model large   # Best quality (same as base currently)
+  # Or use full model names:
+  sw-search ./docs --model sentence-transformers/all-MiniLM-L6-v2
+  sw-search ./docs --model sentence-transformers/all-mpnet-base-v2
+
+  # JSON-based chunking (pre-chunked content)
+  sw-search ./api_chunks.json \\
+      --chunking-strategy json \\
+      --file-types json
+
+  # Export chunks to JSON for review (single file)
+  sw-search ./docs \\
+      --output-format json \\
+      --output all_chunks.json
+
+  # Export chunks to JSON (one file per source)
+  sw-search ./docs \\
+      --output-format json \\
+      --output-dir ./chunks/
+
+  # Build index from exported JSON chunks
+  sw-search ./chunks/ \\
+      --chunking-strategy json \\
+      --file-types json \\
+      --output final.swsearch
+
   # Full configuration example
   sw-search ./docs ./examples README.md \\
       --output ./knowledge.swsearch \\
```
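The `--chunking-strategy json` examples above take pre-chunked input. The input schema isn't spelled out in this diff, but the round trip shown (export with `--output-format json`, then rebuild with `--chunking-strategy json`) implies it matches what the exporter writes: a top-level `chunks` list plus a `metadata` block. A sketch of producing such a file, with the per-chunk field names being assumptions:

```python
# Sketch of a pre-chunked input file for --chunking-strategy json.
# The {"chunks": [...], "metadata": {...}} envelope mirrors the JSON
# export mode later in this file; per-chunk fields are assumptions.
import json
from datetime import datetime

pre_chunked = {
    "chunks": [
        {
            "content": "Agents are created by subclassing AgentBase...",
            "section": "Creating an agent",
            "tags": ["api"],
        },
    ],
    "metadata": {
        "total_chunks": 1,
        "chunking_strategy": "json",
        "processing_date": datetime.now().isoformat(),
    },
}

with open("api_chunks.json", "w", encoding="utf-8") as f:
    json.dump(pre_chunked, f, indent=2, ensure_ascii=False)
```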
```diff
@@ -90,6 +122,12 @@ Examples:
   sw-search remote http://localhost:8001 "how to create an agent" --index-name docs
   sw-search remote localhost:8001 "API reference" --index-name docs --count 3 --verbose
 
+  # Migrate between backends
+  sw-search migrate ./docs.swsearch --to-pgvector \\
+      --connection-string "postgresql://user:pass@localhost/db" \\
+      --collection-name docs_collection
+  sw-search migrate --info ./docs.swsearch
+
   # PostgreSQL pgvector backend
   sw-search ./docs \\
       --backend pgvector \\
@@ -121,6 +159,18 @@ Examples:
         help='Output .swsearch file (default: sources.swsearch) or collection name for pgvector'
     )
 
+    parser.add_argument(
+        '--output-dir',
+        help='Output directory for results (creates one file per source file when used with --output-format json, or auto-names index files)'
+    )
+
+    parser.add_argument(
+        '--output-format',
+        choices=['index', 'json'],
+        default='index',
+        help='Output format: index (create search index) or json (export chunks as JSON) (default: index)'
+    )
+
     parser.add_argument(
         '--backend',
         choices=['sqlite', 'pgvector'],
@@ -141,7 +191,7 @@ Examples:
 
     parser.add_argument(
         '--chunking-strategy',
-        choices=['sentence', 'sliding', 'paragraph', 'page', 'semantic', 'topic', 'qa'],
+        choices=['sentence', 'sliding', 'paragraph', 'page', 'semantic', 'topic', 'qa', 'json'],
         default='sentence',
         help='Chunking strategy to use (default: sentence)'
     )
@@ -192,8 +242,8 @@ Examples:
 
     parser.add_argument(
         '--model',
-        default='sentence-transformers/all-mpnet-base-v2',
-        help='Sentence transformer model name (default: sentence-transformers/all-mpnet-base-v2)'
+        default=DEFAULT_MODEL,
+        help=f'Sentence transformer model name or alias (mini/base/large). Default: mini ({DEFAULT_MODEL})'
     )
 
     parser.add_argument(
@@ -236,6 +286,9 @@ Examples:
 
     args = parser.parse_args()
 
+    # Resolve model aliases
+    args.model = resolve_model_alias(args.model)
+
     # Validate sources
     valid_sources = []
     for source in args.sources:
@@ -254,8 +307,35 @@ Examples:
             print("Error: --connection-string is required for pgvector backend")
             sys.exit(1)
 
-    # Default output filename
-    if not args.output:
+    # Validate output options
+    if args.output and args.output_dir:
+        print("Error: Cannot specify both --output and --output-dir")
+        sys.exit(1)
+
+    # Handle JSON output format differently
+    if args.output_format == 'json':
+        # JSON export doesn't use backend
+        if args.backend != 'sqlite':
+            print("Warning: --backend is ignored when using --output-format json")
+
+        # Determine output location
+        if args.output_dir:
+            # Multiple files mode
+            output_path = Path(args.output_dir)
+            if not output_path.exists():
+                output_path.mkdir(parents=True, exist_ok=True)
+        elif args.output:
+            # Single file mode
+            output_path = Path(args.output)
+            if not output_path.suffix:
+                output_path = output_path.with_suffix('.json')
+        else:
+            # Default to single file
+            output_path = Path('chunks.json')
+        args.output = str(output_path)
+
+    # Default output filename (for index format)
+    if args.output_format == 'index' and not args.output and not args.output_dir:
         if args.backend == 'sqlite':
             if len(valid_sources) == 1:
                 # Single source - use its name
```
```diff
@@ -272,8 +352,25 @@ Examples:
         else:
             args.output = "documents"
 
-    # Ensure output has .swsearch extension
-    if args.backend == 'sqlite' and not args.output.endswith('.swsearch'):
+    # Handle --output-dir for index format
+    if args.output_format == 'index' and args.output_dir:
+        # Auto-generate output filename in the directory
+        if len(valid_sources) == 1:
+            source_name = valid_sources[0].stem if valid_sources[0].is_file() else valid_sources[0].name
+        else:
+            source_name = "combined"
+
+        output_dir = Path(args.output_dir)
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        if args.backend == 'sqlite':
+            args.output = str(output_dir / f"{source_name}.swsearch")
+        else:
+            # For pgvector, still use the name as collection
+            args.output = source_name
+
+    # Ensure output has .swsearch extension for sqlite (but not for JSON format)
+    if args.output_format == 'index' and args.backend == 'sqlite' and args.output and not args.output.endswith('.swsearch'):
         args.output += '.swsearch'
 
     # Parse lists
```
```diff
@@ -320,6 +417,103 @@ Examples:
         print()
 
     try:
+        # Handle JSON export mode
+        if args.output_format == 'json':
+            # Import what we need for chunking
+            from signalwire_agents.search.index_builder import IndexBuilder
+            import json
+
+            builder = IndexBuilder(
+                chunking_strategy=args.chunking_strategy,
+                max_sentences_per_chunk=args.max_sentences_per_chunk,
+                chunk_size=args.chunk_size,
+                chunk_overlap=args.overlap_size,
+                split_newlines=args.split_newlines,
+                index_nlp_backend=args.index_nlp_backend,
+                verbose=args.verbose,
+                semantic_threshold=args.semantic_threshold,
+                topic_threshold=args.topic_threshold
+            )
+
+            # Process files and export chunks
+            all_chunks = []
+            chunk_files_created = []
+
+            # Discover files from sources
+            files = builder._discover_files_from_sources(valid_sources, file_types, exclude_patterns)
+
+            if args.verbose:
+                print(f"Processing {len(files)} files...")
+
+            for file_path in files:
+                try:
+                    # Determine base directory for relative paths
+                    base_dir = builder._get_base_directory_for_file(file_path, valid_sources)
+
+                    # Process file into chunks
+                    chunks = builder._process_file(file_path, base_dir, tags)
+
+                    if args.output_dir:
+                        # Create individual JSON file
+                        relative_path = file_path.relative_to(base_dir) if base_dir else file_path.name
+                        json_filename = relative_path.with_suffix('.json')
+                        json_path = Path(args.output_dir) / json_filename
+
+                        # Create subdirectories if needed
+                        json_path.parent.mkdir(parents=True, exist_ok=True)
+
+                        # Save chunks to JSON
+                        chunk_data = {
+                            "chunks": chunks,
+                            "metadata": {
+                                "source_file": str(relative_path),
+                                "total_chunks": len(chunks),
+                                "chunking_strategy": args.chunking_strategy,
+                                "processing_date": datetime.now().isoformat()
+                            }
+                        }
+
+                        with open(json_path, 'w', encoding='utf-8') as f:
+                            json.dump(chunk_data, f, indent=2, ensure_ascii=False)
+
+                        chunk_files_created.append(json_path)
+                        if args.verbose:
+                            print(f"  Created: {json_path} ({len(chunks)} chunks)")
+                    else:
+                        # Accumulate all chunks for single file output
+                        all_chunks.extend(chunks)
+
+                except Exception as e:
+                    print(f"Error processing {file_path}: {e}")
+                    if args.verbose:
+                        import traceback
+                        traceback.print_exc()
+
+            # Handle single file output
+            if not args.output_dir:
+                output_data = {
+                    "chunks": all_chunks,
+                    "metadata": {
+                        "total_chunks": len(all_chunks),
+                        "total_files": len(files),
+                        "chunking_strategy": args.chunking_strategy,
+                        "processing_date": datetime.now().isoformat()
+                    }
+                }
+
+                with open(args.output, 'w', encoding='utf-8') as f:
+                    json.dump(output_data, f, indent=2, ensure_ascii=False)
+
+                print(f"✓ Exported {len(all_chunks)} chunks to {args.output}")
+            else:
+                print(f"✓ Created {len(chunk_files_created)} JSON files in {args.output_dir}")
+                total_chunks = sum(len(json.load(open(f))['chunks']) for f in chunk_files_created)
+                print(f"  Total chunks: {total_chunks}")
+
+            # Exit early for JSON format
+            return
+
+        # Regular index building mode
         # Create index builder - import only when actually needed
         from signalwire_agents.search.index_builder import IndexBuilder
         builder = IndexBuilder(
```
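The export above writes a stable envelope: `chunks` plus `metadata` carrying `total_chunks`, `chunking_strategy`, a `processing_date`, and (in per-file mode) `source_file`. A small consumer sketch for reviewing an export; the per-chunk `content` field is an assumption, since the diff never shows what `_process_file` puts inside each chunk:

```python
import json

# Review a file produced by:
#   sw-search ./docs --output-format json --output all_chunks.json
with open("all_chunks.json", encoding="utf-8") as f:
    data = json.load(f)

meta = data["metadata"]
print(f"{meta['total_chunks']} chunks via {meta['chunking_strategy']} "
      f"on {meta['processing_date']}")

for i, chunk in enumerate(data["chunks"][:5]):
    # 'content' is an assumed field name; adjust to the actual chunk schema.
    preview = str(chunk.get("content", chunk))[:80]
    print(f"[{i}] {preview}")
```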
```diff
@@ -365,7 +559,13 @@ Examples:
         sys.exit(1)
 
     if args.backend == 'sqlite':
-        print(f"\n✓ Search index created successfully: {args.output}")
+        # Check if the index was actually created
+        import os
+        if os.path.exists(args.output):
+            print(f"\n✓ Search index created successfully: {args.output}")
+        else:
+            print(f"\n✗ Search index creation failed - no files were processed")
+            sys.exit(1)
     else:
         print(f"\n✓ Search collection created successfully: {args.output}")
         print(f"  Connection: {args.connection_string}")
@@ -422,21 +622,41 @@ def search_command():
     """Search within an existing search index"""
     parser = argparse.ArgumentParser(description='Search within a .swsearch index file or pgvector collection')
     parser.add_argument('index_source', help='Path to .swsearch file or collection name for pgvector')
-    parser.add_argument('query', help='Search query')
+    parser.add_argument('query', nargs='?', help='Search query (optional if using --shell)')
     parser.add_argument('--backend', choices=['sqlite', 'pgvector'], default='sqlite',
                         help='Storage backend (default: sqlite)')
     parser.add_argument('--connection-string', help='PostgreSQL connection string for pgvector backend')
+    parser.add_argument('--shell', action='store_true',
+                        help='Interactive shell mode - load once and search multiple times')
     parser.add_argument('--count', type=int, default=5, help='Number of results to return (default: 5)')
     parser.add_argument('--distance-threshold', type=float, default=0.0, help='Minimum similarity score (default: 0.0)')
     parser.add_argument('--tags', help='Comma-separated tags to filter by')
     parser.add_argument('--query-nlp-backend', choices=['nltk', 'spacy'], default='nltk',
                         help='NLP backend for query processing: nltk (fast, default) or spacy (better quality, slower)')
+    parser.add_argument('--keyword-weight', type=float, default=None,
+                        help='Manual keyword weight (0.0-1.0). Overrides automatic weight detection.')
     parser.add_argument('--verbose', action='store_true', help='Show detailed information')
     parser.add_argument('--json', action='store_true', help='Output results as JSON')
     parser.add_argument('--no-content', action='store_true', help='Hide content in results (show only metadata)')
+    parser.add_argument('--model', help='Override embedding model for query (mini/base/large or full model name)')
 
     args = parser.parse_args()
 
+    # Validate arguments
+    if not args.shell and not args.query:
+        print("Error: Query is required unless using --shell mode")
+        sys.exit(1)
+
+    # Resolve model aliases
+    if args.model and args.model in MODEL_ALIASES:
+        args.model = MODEL_ALIASES[args.model]
+
+    # Validate keyword weight if provided
+    if args.keyword_weight is not None:
+        if args.keyword_weight < 0.0 or args.keyword_weight > 1.0:
+            print("Error: --keyword-weight must be between 0.0 and 1.0")
+            sys.exit(1)
+
     # Validate backend configuration
     if args.backend == 'pgvector' and not args.connection_string:
         print("Error: --connection-string is required for pgvector backend")
```
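`--keyword-weight` is clamped to [0.0, 1.0] here and handed through to `engine.search()`; the actual blending lives in `search_engine.py` (+835 lines in this release, not shown). A conventional convex combination is a reasonable mental model for the flag, though the real formula may differ — this is purely an assumption:

```python
def blended_score(vector_score: float, keyword_score: float,
                  keyword_weight: float) -> float:
    """Assumed hybrid ranking: 0.0 means pure vector similarity, 1.0 pure
    keyword match. The real formula is in search_engine.py and may differ."""
    if not 0.0 <= keyword_weight <= 1.0:  # mirrors the CLI validation above
        raise ValueError("keyword_weight must be between 0.0 and 1.0")
    return keyword_weight * keyword_score + (1.0 - keyword_weight) * vector_score

# blended_score(0.82, 0.40, 0.25) -> 0.715: mostly vector, a little keyword
```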
```diff
@@ -464,21 +684,167 @@ def search_command():
         print(f"Connecting to pgvector collection: {args.index_source}")
 
     if args.backend == 'sqlite':
-        engine = SearchEngine(backend='sqlite', index_path=args.index_source)
+        # Pass the model from the index or override if specified
+        model = args.model if args.model else None
+        engine = SearchEngine(backend='sqlite', index_path=args.index_source, model=model)
     else:
+        # Pass the model override if specified
+        model = args.model if args.model else None
         engine = SearchEngine(backend='pgvector', connection_string=args.connection_string,
-                              collection_name=args.index_source)
+                              collection_name=args.index_source, model=model)
 
     # Get index stats
     stats = engine.get_stats()
+
+    # Get the model from index config if not overridden
+    model_to_use = args.model
+    if not model_to_use and 'config' in stats:
+        # SQLite uses 'embedding_model', pgvector uses 'model_name'
+        model_to_use = stats['config'].get('embedding_model') or stats['config'].get('model_name')
+
+    # Shell mode implementation
+    if args.shell:
+        import time
+        print(f"Search Shell - Index: {args.index_source}")
+        print(f"Backend: {args.backend}")
+        print(f"Index contains {stats['total_chunks']} chunks from {stats['total_files']} files")
+        if model_to_use:
+            print(f"Model: {model_to_use}")
+        print("Type 'exit' or 'quit' to leave, 'help' for options")
+        print("-" * 60)
+
+        while True:
+            try:
+                query = input("\nsearch> ").strip()
+
+                if not query:
+                    continue
+
+                if query.lower() in ['exit', 'quit', 'q']:
+                    print("Goodbye!")
+                    break
+
+                if query.lower() == 'help':
+                    print("\nShell commands:")
+                    print("  help           - Show this help")
+                    print("  exit/quit/q    - Exit shell")
+                    print("  count=N        - Set result count (current: {})".format(args.count))
+                    print("  tags=tag1,tag2 - Set tag filter (current: {})".format(args.tags or 'none'))
+                    print("  verbose        - Toggle verbose output")
+                    print("\nOr type any search query...")
+                    continue
+
+                # Handle shell commands
+                if query.startswith('count='):
+                    try:
+                        args.count = int(query.split('=')[1])
+                        print(f"Result count set to: {args.count}")
+                    except:
+                        print("Invalid count value")
+                    continue
+
+                if query.startswith('tags='):
+                    tag_str = query.split('=', 1)[1]
+                    args.tags = tag_str if tag_str else None
+                    tags = [tag.strip() for tag in args.tags.split(',')] if args.tags else None
+                    print(f"Tags filter set to: {tags or 'none'}")
+                    continue
+
+                if query == 'verbose':
+                    args.verbose = not args.verbose
+                    print(f"Verbose output: {'on' if args.verbose else 'off'}")
+                    continue
+
+                # Perform search with timing
+                start_time = time.time()
+
+                # Preprocess query
+                enhanced = preprocess_query(
+                    query,
+                    vector=True,
+                    query_nlp_backend=args.query_nlp_backend,
+                    model_name=model_to_use,
+                    preserve_original=True,
+                    max_synonyms=2
+                )
+
+                # Parse tags
+                tags = [tag.strip() for tag in args.tags.split(',')] if args.tags else None
+
+                # Perform search
+                results = engine.search(
+                    query_vector=enhanced.get('vector'),
+                    enhanced_text=enhanced.get('enhanced_text', query),
+                    count=args.count,
+                    distance_threshold=args.distance_threshold,
+                    tags=tags,
+                    keyword_weight=args.keyword_weight,
+                    original_query=query
+                )
+
+                search_time = time.time() - start_time
+
+                # Display results
+                if not results:
+                    print(f"\nNo results found for '{query}' ({search_time:.3f}s)")
+                else:
+                    print(f"\nFound {len(results)} result(s) for '{query}' ({search_time:.3f}s):")
+                    if enhanced.get('enhanced_text') != query and args.verbose:
+                        print(f"Enhanced query: '{enhanced.get('enhanced_text')}'")
+                    print("=" * 60)
+
+                    for i, result in enumerate(results):
+                        print(f"\n[{i+1}] Score: {result['score']:.4f}")
+
+                        # Show metadata
+                        metadata = result['metadata']
+                        print(f"File: {metadata.get('filename', 'Unknown')}")
+                        if metadata.get('section'):
+                            print(f"Section: {metadata['section']}")
+
+                        # Show content unless suppressed
+                        if not args.no_content:
+                            content = result['content']
+                            if len(content) > 300 and not args.verbose:
+                                content = content[:300] + "..."
+                            print(f"\n{content}")
+
+                        if i < len(results) - 1:
+                            print("-" * 40)
+
+            except KeyboardInterrupt:
+                print("\nUse 'exit' to quit")
+            except EOFError:
+                print("\nGoodbye!")
+                break
+            except Exception as e:
+                print(f"\nError: {e}")
+                if args.verbose:
+                    import traceback
+                    traceback.print_exc()
+
+        return  # Exit after shell mode
+
+    # Normal single query mode
     if args.verbose:
         print(f"Index contains {stats['total_chunks']} chunks from {stats['total_files']} files")
         print(f"Searching for: '{args.query}'")
         print(f"Query NLP Backend: {args.query_nlp_backend}")
+        if args.model:
+            print(f"Override model: {args.model}")
+        elif model_to_use:
+            print(f"Using index model: {model_to_use}")
         print()
 
     # Preprocess query
-    enhanced = preprocess_query(
+    enhanced = preprocess_query(
+        args.query,
+        vector=True,  # Both backends need vector for similarity search
+        query_nlp_backend=args.query_nlp_backend,
+        model_name=model_to_use,
+        preserve_original=True,  # Keep original query terms
+        max_synonyms=2  # Reduce synonym expansion
+    )
 
     # Parse tags if provided
     tags = [tag.strip() for tag in args.tags.split(',')] if args.tags else None
@@ -489,7 +855,9 @@ def search_command():
         enhanced_text=enhanced.get('enhanced_text', args.query),
         count=args.count,
         distance_threshold=args.distance_threshold,
-        tags=tags
+        tags=tags,
+        keyword_weight=args.keyword_weight,
+        original_query=args.query  # Pass original for exact match boosting
     )
 
     if args.json:
```
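The CLI's single-query path is a two-step flow: `preprocess_query()` builds the vector and enhanced text, then `engine.search()` ranks against the index. The same flow can be driven from Python; every call below appears in this diff, but the import locations are assumptions inferred from the package layout (`search_engine.py`, `query_processor.py`):

```python
# Sketch of the CLI's single-query flow, driven programmatically.
# Import paths are assumptions based on the package layout.
from signalwire_agents.search.search_engine import SearchEngine
from signalwire_agents.search.query_processor import preprocess_query

query = "how to create an agent"
engine = SearchEngine(backend='sqlite', index_path='docs.swsearch')

enhanced = preprocess_query(
    query,
    vector=True,                 # both backends need a vector
    query_nlp_backend='nltk',
    preserve_original=True,      # keep original query terms
    max_synonyms=2,              # limit synonym expansion
)

results = engine.search(
    query_vector=enhanced.get('vector'),
    enhanced_text=enhanced.get('enhanced_text', query),
    count=5,
    keyword_weight=None,         # None = automatic weight detection
    original_query=query,        # enables exact-match boosting
)

for r in results:
    print(f"{r['score']:.4f}  {r['metadata'].get('filename', 'Unknown')}")
```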
```diff
@@ -558,6 +926,142 @@ def search_command():
             traceback.print_exc()
         sys.exit(1)
 
+def migrate_command():
+    """Migrate search indexes between backends"""
+    parser = argparse.ArgumentParser(
+        description='Migrate search indexes between SQLite and pgvector backends',
+        epilog="""
+Examples:
+  # Migrate SQLite to pgvector
+  sw-search migrate ./docs.swsearch \\
+      --to-pgvector \\
+      --connection-string "postgresql://user:pass@localhost/db" \\
+      --collection-name docs_collection
+
+  # Migrate with overwrite
+  sw-search migrate ./docs.swsearch \\
+      --to-pgvector \\
+      --connection-string "postgresql://user:pass@localhost/db" \\
+      --collection-name docs_collection \\
+      --overwrite
+
+  # Get index information
+  sw-search migrate --info ./docs.swsearch
+        """,
+        formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+
+    # Source argument (optional if using --info)
+    parser.add_argument('source', nargs='?', help='Source index file or collection')
+
+    # Migration direction
+    migration_group = parser.add_mutually_exclusive_group()
+    migration_group.add_argument('--to-pgvector', action='store_true',
+                                 help='Migrate SQLite index to pgvector')
+    migration_group.add_argument('--to-sqlite', action='store_true',
+                                 help='Migrate pgvector collection to SQLite (not yet implemented)')
+    migration_group.add_argument('--info', action='store_true',
+                                 help='Show information about an index')
+
+    # pgvector options
+    parser.add_argument('--connection-string',
+                        help='PostgreSQL connection string for pgvector')
+    parser.add_argument('--collection-name',
+                        help='Collection name for pgvector')
+    parser.add_argument('--overwrite', action='store_true',
+                        help='Overwrite existing collection')
+
+    # SQLite options
+    parser.add_argument('--output',
+                        help='Output .swsearch file path (for --to-sqlite)')
+
+    # Common options
+    parser.add_argument('--batch-size', type=int, default=100,
+                        help='Number of chunks to process at once (default: 100)')
+    parser.add_argument('--verbose', action='store_true',
+                        help='Show detailed progress')
+
+    args = parser.parse_args()
+
+    # Handle --info flag
+    if args.info:
+        if not args.source:
+            print("Error: Source index required with --info")
+            sys.exit(1)
+
+        try:
+            from signalwire_agents.search.migration import SearchIndexMigrator
+            migrator = SearchIndexMigrator(verbose=args.verbose)
+            info = migrator.get_index_info(args.source)
+
+            print(f"Index Information: {args.source}")
+            print(f"  Type: {info['type']}")
+            if info['type'] == 'sqlite':
+                print(f"  Total chunks: {info['total_chunks']}")
+                print(f"  Total files: {info['total_files']}")
+                print(f"  Model: {info['config'].get('embedding_model', 'Unknown')}")
+                print(f"  Dimensions: {info['config'].get('embedding_dimensions', 'Unknown')}")
+                print(f"  Created: {info['config'].get('created_at', 'Unknown')}")
+                if args.verbose:
+                    print("\n  Full configuration:")
+                    for key, value in info['config'].items():
+                        print(f"    {key}: {value}")
+            else:
+                print("  Unable to determine index type")
+        except Exception as e:
+            print(f"Error getting index info: {e}")
+            sys.exit(1)
+        return
+
+    # Validate arguments for migration
+    if not args.source:
+        print("Error: Source index required for migration")
+        sys.exit(1)
+
+    if not args.to_pgvector and not args.to_sqlite:
+        print("Error: Must specify migration direction (--to-pgvector or --to-sqlite)")
+        sys.exit(1)
+
+    try:
+        from signalwire_agents.search.migration import SearchIndexMigrator
+        migrator = SearchIndexMigrator(verbose=args.verbose)
+
+        if args.to_pgvector:
+            # Validate pgvector arguments
+            if not args.connection_string:
+                print("Error: --connection-string required for pgvector migration")
+                sys.exit(1)
+            if not args.collection_name:
+                print("Error: --collection-name required for pgvector migration")
+                sys.exit(1)
+
+            # Perform migration
+            print(f"Migrating {args.source} to pgvector collection '{args.collection_name}'...")
+            stats = migrator.migrate_sqlite_to_pgvector(
+                sqlite_path=args.source,
+                connection_string=args.connection_string,
+                collection_name=args.collection_name,
+                overwrite=args.overwrite,
+                batch_size=args.batch_size
+            )
+
+            print(f"\n✓ Migration completed successfully!")
+            print(f"  Chunks migrated: {stats['chunks_migrated']}")
+            print(f"  Errors: {stats['errors']}")
+
+        elif args.to_sqlite:
+            print("Error: pgvector to SQLite migration not yet implemented")
+            print("This feature is planned for future development")
+            sys.exit(1)
+
+    except Exception as e:
+        print(f"\nError during migration: {e}")
+        if args.verbose:
+            import traceback
+            traceback.print_exc()
+        sys.exit(1)
+
+
 def remote_command():
     """Search via remote API endpoint"""
     parser = argparse.ArgumentParser(description='Search via remote API endpoint')
```
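`migrate_command()` above is a thin wrapper over `SearchIndexMigrator` from the new `signalwire_agents/search/migration.py` (+418 lines). The two methods it calls can be used directly; this sketch sticks to the calls and result keys visible in the hunk, with an illustrative connection string:

```python
from signalwire_agents.search.migration import SearchIndexMigrator

migrator = SearchIndexMigrator(verbose=True)

# Inspect an index first (the same call the --info flag uses)
info = migrator.get_index_info("docs.swsearch")
print(info['type'], info.get('total_chunks'), info.get('total_files'))

# Copy the SQLite index into a pgvector collection
stats = migrator.migrate_sqlite_to_pgvector(
    sqlite_path="docs.swsearch",
    connection_string="postgresql://user:pass@localhost/db",  # illustrative
    collection_name="docs_collection",
    overwrite=False,
    batch_size=100,
)
print(f"migrated {stats['chunks_migrated']} chunks, {stats['errors']} errors")
```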
```diff
@@ -833,6 +1337,11 @@ Examples:
         sys.argv.pop(1)
         remote_command()
         return
+    elif sys.argv[1] == 'migrate':
+        # Remove 'migrate' from argv and call migrate_command
+        sys.argv.pop(1)
+        migrate_command()
+        return
 
     # Regular build command
     main()
```