signalwire-agents 0.1.6__py3-none-any.whl → 1.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (140)
  1. signalwire_agents/__init__.py +130 -4
  2. signalwire_agents/agent_server.py +438 -32
  3. signalwire_agents/agents/bedrock.py +296 -0
  4. signalwire_agents/cli/__init__.py +18 -0
  5. signalwire_agents/cli/build_search.py +1367 -0
  6. signalwire_agents/cli/config.py +80 -0
  7. signalwire_agents/cli/core/__init__.py +10 -0
  8. signalwire_agents/cli/core/agent_loader.py +470 -0
  9. signalwire_agents/cli/core/argparse_helpers.py +179 -0
  10. signalwire_agents/cli/core/dynamic_config.py +71 -0
  11. signalwire_agents/cli/core/service_loader.py +303 -0
  12. signalwire_agents/cli/execution/__init__.py +10 -0
  13. signalwire_agents/cli/execution/datamap_exec.py +446 -0
  14. signalwire_agents/cli/execution/webhook_exec.py +134 -0
  15. signalwire_agents/cli/init_project.py +1225 -0
  16. signalwire_agents/cli/output/__init__.py +10 -0
  17. signalwire_agents/cli/output/output_formatter.py +255 -0
  18. signalwire_agents/cli/output/swml_dump.py +186 -0
  19. signalwire_agents/cli/simulation/__init__.py +10 -0
  20. signalwire_agents/cli/simulation/data_generation.py +374 -0
  21. signalwire_agents/cli/simulation/data_overrides.py +200 -0
  22. signalwire_agents/cli/simulation/mock_env.py +282 -0
  23. signalwire_agents/cli/swaig_test_wrapper.py +52 -0
  24. signalwire_agents/cli/test_swaig.py +809 -0
  25. signalwire_agents/cli/types.py +81 -0
  26. signalwire_agents/core/__init__.py +2 -2
  27. signalwire_agents/core/agent/__init__.py +12 -0
  28. signalwire_agents/core/agent/config/__init__.py +12 -0
  29. signalwire_agents/core/agent/deployment/__init__.py +9 -0
  30. signalwire_agents/core/agent/deployment/handlers/__init__.py +9 -0
  31. signalwire_agents/core/agent/prompt/__init__.py +14 -0
  32. signalwire_agents/core/agent/prompt/manager.py +306 -0
  33. signalwire_agents/core/agent/routing/__init__.py +9 -0
  34. signalwire_agents/core/agent/security/__init__.py +9 -0
  35. signalwire_agents/core/agent/swml/__init__.py +9 -0
  36. signalwire_agents/core/agent/tools/__init__.py +15 -0
  37. signalwire_agents/core/agent/tools/decorator.py +97 -0
  38. signalwire_agents/core/agent/tools/registry.py +210 -0
  39. signalwire_agents/core/agent_base.py +959 -2166
  40. signalwire_agents/core/auth_handler.py +233 -0
  41. signalwire_agents/core/config_loader.py +259 -0
  42. signalwire_agents/core/contexts.py +707 -0
  43. signalwire_agents/core/data_map.py +487 -0
  44. signalwire_agents/core/function_result.py +1150 -1
  45. signalwire_agents/core/logging_config.py +376 -0
  46. signalwire_agents/core/mixins/__init__.py +28 -0
  47. signalwire_agents/core/mixins/ai_config_mixin.py +442 -0
  48. signalwire_agents/core/mixins/auth_mixin.py +287 -0
  49. signalwire_agents/core/mixins/prompt_mixin.py +358 -0
  50. signalwire_agents/core/mixins/serverless_mixin.py +368 -0
  51. signalwire_agents/core/mixins/skill_mixin.py +55 -0
  52. signalwire_agents/core/mixins/state_mixin.py +153 -0
  53. signalwire_agents/core/mixins/tool_mixin.py +230 -0
  54. signalwire_agents/core/mixins/web_mixin.py +1134 -0
  55. signalwire_agents/core/security/session_manager.py +174 -86
  56. signalwire_agents/core/security_config.py +333 -0
  57. signalwire_agents/core/skill_base.py +200 -0
  58. signalwire_agents/core/skill_manager.py +244 -0
  59. signalwire_agents/core/swaig_function.py +33 -9
  60. signalwire_agents/core/swml_builder.py +212 -12
  61. signalwire_agents/core/swml_handler.py +43 -13
  62. signalwire_agents/core/swml_renderer.py +123 -297
  63. signalwire_agents/core/swml_service.py +277 -260
  64. signalwire_agents/prefabs/concierge.py +6 -2
  65. signalwire_agents/prefabs/info_gatherer.py +149 -33
  66. signalwire_agents/prefabs/receptionist.py +14 -22
  67. signalwire_agents/prefabs/survey.py +6 -2
  68. signalwire_agents/schema.json +9218 -5489
  69. signalwire_agents/search/__init__.py +137 -0
  70. signalwire_agents/search/document_processor.py +1223 -0
  71. signalwire_agents/search/index_builder.py +804 -0
  72. signalwire_agents/search/migration.py +418 -0
  73. signalwire_agents/search/models.py +30 -0
  74. signalwire_agents/search/pgvector_backend.py +752 -0
  75. signalwire_agents/search/query_processor.py +502 -0
  76. signalwire_agents/search/search_engine.py +1264 -0
  77. signalwire_agents/search/search_service.py +574 -0
  78. signalwire_agents/skills/README.md +452 -0
  79. signalwire_agents/skills/__init__.py +23 -0
  80. signalwire_agents/skills/api_ninjas_trivia/README.md +215 -0
  81. signalwire_agents/skills/api_ninjas_trivia/__init__.py +12 -0
  82. signalwire_agents/skills/api_ninjas_trivia/skill.py +237 -0
  83. signalwire_agents/skills/datasphere/README.md +210 -0
  84. signalwire_agents/skills/datasphere/__init__.py +12 -0
  85. signalwire_agents/skills/datasphere/skill.py +310 -0
  86. signalwire_agents/skills/datasphere_serverless/README.md +258 -0
  87. signalwire_agents/skills/datasphere_serverless/__init__.py +10 -0
  88. signalwire_agents/skills/datasphere_serverless/skill.py +237 -0
  89. signalwire_agents/skills/datetime/README.md +132 -0
  90. signalwire_agents/skills/datetime/__init__.py +10 -0
  91. signalwire_agents/skills/datetime/skill.py +126 -0
  92. signalwire_agents/skills/joke/README.md +149 -0
  93. signalwire_agents/skills/joke/__init__.py +10 -0
  94. signalwire_agents/skills/joke/skill.py +109 -0
  95. signalwire_agents/skills/math/README.md +161 -0
  96. signalwire_agents/skills/math/__init__.py +10 -0
  97. signalwire_agents/skills/math/skill.py +105 -0
  98. signalwire_agents/skills/mcp_gateway/README.md +230 -0
  99. signalwire_agents/skills/mcp_gateway/__init__.py +10 -0
  100. signalwire_agents/skills/mcp_gateway/skill.py +421 -0
  101. signalwire_agents/skills/native_vector_search/README.md +210 -0
  102. signalwire_agents/skills/native_vector_search/__init__.py +10 -0
  103. signalwire_agents/skills/native_vector_search/skill.py +820 -0
  104. signalwire_agents/skills/play_background_file/README.md +218 -0
  105. signalwire_agents/skills/play_background_file/__init__.py +12 -0
  106. signalwire_agents/skills/play_background_file/skill.py +242 -0
  107. signalwire_agents/skills/registry.py +459 -0
  108. signalwire_agents/skills/spider/README.md +236 -0
  109. signalwire_agents/skills/spider/__init__.py +13 -0
  110. signalwire_agents/skills/spider/skill.py +598 -0
  111. signalwire_agents/skills/swml_transfer/README.md +395 -0
  112. signalwire_agents/skills/swml_transfer/__init__.py +10 -0
  113. signalwire_agents/skills/swml_transfer/skill.py +359 -0
  114. signalwire_agents/skills/weather_api/README.md +178 -0
  115. signalwire_agents/skills/weather_api/__init__.py +12 -0
  116. signalwire_agents/skills/weather_api/skill.py +191 -0
  117. signalwire_agents/skills/web_search/README.md +163 -0
  118. signalwire_agents/skills/web_search/__init__.py +10 -0
  119. signalwire_agents/skills/web_search/skill.py +739 -0
  120. signalwire_agents/skills/wikipedia_search/README.md +228 -0
  121. signalwire_agents/{core/state → skills/wikipedia_search}/__init__.py +5 -4
  122. signalwire_agents/skills/wikipedia_search/skill.py +210 -0
  123. signalwire_agents/utils/__init__.py +14 -0
  124. signalwire_agents/utils/schema_utils.py +111 -44
  125. signalwire_agents/web/__init__.py +17 -0
  126. signalwire_agents/web/web_service.py +559 -0
  127. signalwire_agents-1.0.7.data/data/share/man/man1/sw-agent-init.1 +307 -0
  128. signalwire_agents-1.0.7.data/data/share/man/man1/sw-search.1 +483 -0
  129. signalwire_agents-1.0.7.data/data/share/man/man1/swaig-test.1 +308 -0
  130. signalwire_agents-1.0.7.dist-info/METADATA +992 -0
  131. signalwire_agents-1.0.7.dist-info/RECORD +142 -0
  132. {signalwire_agents-0.1.6.dist-info → signalwire_agents-1.0.7.dist-info}/WHEEL +1 -1
  133. signalwire_agents-1.0.7.dist-info/entry_points.txt +4 -0
  134. signalwire_agents/core/state/file_state_manager.py +0 -219
  135. signalwire_agents/core/state/state_manager.py +0 -101
  136. signalwire_agents-0.1.6.data/data/schema.json +0 -5611
  137. signalwire_agents-0.1.6.dist-info/METADATA +0 -199
  138. signalwire_agents-0.1.6.dist-info/RECORD +0 -34
  139. {signalwire_agents-0.1.6.dist-info → signalwire_agents-1.0.7.dist-info}/licenses/LICENSE +0 -0
  140. {signalwire_agents-0.1.6.dist-info → signalwire_agents-1.0.7.dist-info}/top_level.txt +0 -0
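The single file expanded below is the new signalwire_agents/cli/build_search.py (+1367 lines), which implements the sw-search command. As a quick orientation, here is a minimal sketch of driving the same index build programmatically; the import paths and keyword arguments are copied from the calls this file makes (IndexBuilder(...) and build_index_from_sources(...)) and are illustrative only, not a verified public API.

    from pathlib import Path
    from signalwire_agents.search.models import DEFAULT_MODEL
    from signalwire_agents.search.index_builder import IndexBuilder

    # Mirrors the CLI defaults: sentence chunking, NLTK preprocessing, sqlite backend
    builder = IndexBuilder(
        model_name=DEFAULT_MODEL,          # the "mini" alias target
        chunking_strategy="sentence",
        max_sentences_per_chunk=5,
        chunk_size=50,
        chunk_overlap=10,
        index_nlp_backend="nltk",
        backend="sqlite",
        verbose=True,
    )

    # Roughly equivalent to: sw-search ./docs --output docs.swsearch --tags documentation
    builder.build_index_from_sources(
        sources=[Path("./docs")],
        output_file="docs.swsearch",
        file_types=["md", "txt", "rst"],
        exclude_patterns=None,
        languages=["en"],
        tags=["documentation"],
        overwrite=False,
    )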
@@ -0,0 +1,1367 @@
1
+ """
2
+ Copyright (c) 2025 SignalWire
3
+
4
+ This file is part of the SignalWire AI Agents SDK.
5
+
6
+ Licensed under the MIT License.
7
+ See LICENSE file in the project root for full license information.
8
+ """
9
+
10
+ import argparse
11
+ import sys
12
+ from pathlib import Path
13
+ from datetime import datetime
14
+
15
+ from signalwire_agents.search.models import MODEL_ALIASES, DEFAULT_MODEL, resolve_model_alias
16
+
17
+ def main():
18
+ """Main entry point for the build-search command"""
19
+ parser = argparse.ArgumentParser(
20
+ description='Build local search index from documents',
21
+ formatter_class=argparse.RawDescriptionHelpFormatter,
22
+ epilog="""
23
+ Examples:
24
+ # Basic usage with directory (defaults to sentence chunking with 5 sentences per chunk)
25
+ sw-search ./docs
26
+
27
+ # Multiple directories
28
+ sw-search ./docs ./examples --file-types md,txt,py
29
+
30
+ # Individual files
31
+ sw-search README.md ./docs/guide.md ./src/main.py
32
+
33
+ # Mixed sources (directories and files)
34
+ sw-search ./docs README.md ./examples specific_file.txt --file-types md,txt,py
35
+
36
+ # Sentence-based chunking with custom parameters
37
+ sw-search ./docs \\
38
+ --chunking-strategy sentence \\
39
+ --max-sentences-per-chunk 10 \\
40
+ --split-newlines 2
41
+
42
+ # Sliding window chunking
43
+ sw-search ./docs \\
44
+ --chunking-strategy sliding \\
45
+ --chunk-size 100 \\
46
+ --overlap-size 20
47
+
48
+ # Paragraph-based chunking
49
+ sw-search ./docs \\
50
+ --chunking-strategy paragraph \\
51
+ --file-types md,txt,rst
52
+
53
+ # Page-based chunking (good for PDFs)
54
+ sw-search ./docs \\
55
+ --chunking-strategy page \\
56
+ --file-types pdf
57
+
58
+ # Semantic chunking (groups semantically similar sentences)
59
+ sw-search ./docs \\
60
+ --chunking-strategy semantic \\
61
+ --semantic-threshold 0.6
62
+
63
+ # Topic-based chunking (groups by topic changes)
64
+ sw-search ./docs \\
65
+ --chunking-strategy topic \\
66
+ --topic-threshold 0.2
67
+
68
+ # QA-optimized chunking (optimized for question-answering)
69
+ sw-search ./docs \\
70
+ --chunking-strategy qa
71
+
72
+ # Markdown-aware chunking (preserves headers, detects code blocks, adds tags)
73
+ sw-search ./docs \\
74
+ --chunking-strategy markdown \\
75
+ --file-types md
76
+ # This strategy:
77
+ # - Chunks at header boundaries (h1, h2, h3...)
78
+ # - Detects code blocks and extracts language (python, bash, etc)
79
+ # - Adds "code" tags to chunks with code for better search
80
+ # - Preserves section hierarchy in metadata
81
+
82
+ # Model selection examples (performance vs quality tradeoff)
83
+ sw-search ./docs --model mini # Fastest (~5x faster), 384 dims, good for most use cases
84
+ sw-search ./docs --model base # Balanced speed/quality, 768 dims (previous default)
85
+ sw-search ./docs --model large # Best quality (same as base currently)
86
+ # Or use full model names:
87
+ sw-search ./docs --model sentence-transformers/all-MiniLM-L6-v2
88
+ sw-search ./docs --model sentence-transformers/all-mpnet-base-v2
89
+
90
+ # JSON-based chunking (pre-chunked content)
91
+ sw-search ./api_chunks.json \\
92
+ --chunking-strategy json \\
93
+ --file-types json
94
+
95
+ # Export chunks to JSON for review (single file)
96
+ sw-search ./docs \\
97
+ --output-format json \\
98
+ --output all_chunks.json
99
+
100
+ # Export chunks to JSON (one file per source)
101
+ sw-search ./docs \\
102
+ --output-format json \\
103
+ --output-dir ./chunks/
104
+
105
+ # Build index from exported JSON chunks
106
+ sw-search ./chunks/ \\
107
+ --chunking-strategy json \\
108
+ --file-types json \\
109
+ --output final.swsearch
110
+
111
+ # Full configuration example
112
+ sw-search ./docs ./examples README.md \\
113
+ --output ./knowledge.swsearch \\
114
+ --chunking-strategy sentence \\
115
+ --max-sentences-per-chunk 8 \\
116
+ --file-types md,txt,rst,py \\
117
+ --exclude "**/test/**,**/__pycache__/**" \\
118
+ --languages en,es,fr \\
119
+ --model sentence-transformers/all-mpnet-base-v2 \\
120
+ --tags documentation,api \\
121
+ --verbose
122
+
123
+ # Validate an existing index
124
+ sw-search validate ./docs.swsearch
125
+
126
+ # Search within an index
127
+ sw-search search ./docs.swsearch "how to create an agent"
128
+ sw-search search ./docs.swsearch "API reference" --count 3 --verbose
129
+ sw-search search ./docs.swsearch "configuration" --tags documentation --json
130
+
131
+ # Search via remote API
132
+ sw-search remote http://localhost:8001 "how to create an agent" --index-name docs
133
+ sw-search remote localhost:8001 "API reference" --index-name docs --count 3 --verbose
134
+
135
+ # Migrate between backends
136
+ sw-search migrate ./docs.swsearch --to-pgvector \\
137
+ --connection-string "postgresql://user:pass@localhost/db" \\
138
+ --collection-name docs_collection
139
+ sw-search migrate --info ./docs.swsearch
140
+
141
+ # PostgreSQL pgvector backend (direct build to PostgreSQL)
142
+ sw-search ./docs \\
143
+ --backend pgvector \\
144
+ --connection-string "postgresql://user:pass@localhost:5432/knowledge" \\
145
+ --output docs_collection
146
+
147
+ # pgvector with markdown strategy (best for documentation with code examples)
148
+ sw-search ./docs \\
149
+ --backend pgvector \\
150
+ --connection-string "postgresql://user:pass@localhost:5432/knowledge" \\
151
+ --output docs_collection \\
152
+ --chunking-strategy markdown
153
+
154
+ # Overwrite existing pgvector collection
155
+ sw-search ./docs \\
156
+ --backend pgvector \\
157
+ --connection-string "postgresql://user:pass@localhost:5432/knowledge" \\
158
+ --output docs_collection \\
159
+ --overwrite
160
+
161
+ # Search in pgvector collection
162
+ sw-search search docs_collection "how to create an agent" \\
163
+ --backend pgvector \\
164
+ --connection-string "postgresql://user:pass@localhost/knowledge"
165
+ """
166
+ )
167
+
168
+ parser.add_argument(
169
+ 'sources',
170
+ nargs='+',
171
+ help='Source files and/or directories to index'
172
+ )
173
+
174
+ parser.add_argument(
175
+ '--output',
176
+ help='Output .swsearch file (default: sources.swsearch) or collection name for pgvector'
177
+ )
178
+
179
+ parser.add_argument(
180
+ '--output-dir',
181
+ help='Output directory for results (creates one file per source file when used with --output-format json, or auto-names index files)'
182
+ )
183
+
184
+ parser.add_argument(
185
+ '--output-format',
186
+ choices=['index', 'json'],
187
+ default='index',
188
+ help='Output format: index (create search index) or json (export chunks as JSON) (default: index)'
189
+ )
190
+
191
+ parser.add_argument(
192
+ '--backend',
193
+ choices=['sqlite', 'pgvector'],
194
+ default='sqlite',
195
+ help='Storage backend to use (default: sqlite)'
196
+ )
197
+
198
+ parser.add_argument(
199
+ '--connection-string',
200
+ help='PostgreSQL connection string for pgvector backend'
201
+ )
202
+
203
+ parser.add_argument(
204
+ '--overwrite',
205
+ action='store_true',
206
+ help='Overwrite existing collection (pgvector backend only)'
207
+ )
208
+
209
+ parser.add_argument(
210
+ '--chunking-strategy',
211
+ choices=['sentence', 'sliding', 'paragraph', 'page', 'semantic', 'topic', 'qa', 'json', 'markdown'],
212
+ default='sentence',
213
+ help='Chunking strategy to use (default: sentence). Use "markdown" for documentation with code blocks.'
214
+ )
215
+
216
+ parser.add_argument(
217
+ '--max-sentences-per-chunk',
218
+ type=int,
219
+ default=5,
220
+ help='Maximum sentences per chunk for sentence strategy (default: 5)'
221
+ )
222
+
223
+ parser.add_argument(
224
+ '--chunk-size',
225
+ type=int,
226
+ default=50,
227
+ help='Chunk size in words for sliding window strategy (default: 50)'
228
+ )
229
+
230
+ parser.add_argument(
231
+ '--overlap-size',
232
+ type=int,
233
+ default=10,
234
+ help='Overlap size in words for sliding window strategy (default: 10)'
235
+ )
236
+
237
+ parser.add_argument(
238
+ '--split-newlines',
239
+ type=int,
240
+ help='Split on multiple newlines (for sentence strategy)'
241
+ )
242
+
243
+ parser.add_argument(
244
+ '--file-types',
245
+ default='md,txt,rst',
246
+ help='Comma-separated file extensions to include for directories (default: md,txt,rst)'
247
+ )
248
+
249
+ parser.add_argument(
250
+ '--exclude',
251
+ help='Comma-separated glob patterns to exclude (e.g., "**/test/**,**/__pycache__/**")'
252
+ )
253
+
254
+ parser.add_argument(
255
+ '--languages',
256
+ default='en',
257
+ help='Comma-separated language codes (default: en)'
258
+ )
259
+
260
+ parser.add_argument(
261
+ '--model',
262
+ default=DEFAULT_MODEL,
263
+ help=f'Sentence transformer model name or alias (mini/base/large). Default: mini ({DEFAULT_MODEL})'
264
+ )
265
+
266
+ parser.add_argument(
267
+ '--tags',
268
+ help='Comma-separated tags to add to all chunks'
269
+ )
270
+
271
+ parser.add_argument(
272
+ '--index-nlp-backend',
273
+ choices=['nltk', 'spacy'],
274
+ default='nltk',
275
+ help='NLP backend for document processing: nltk (fast, default) or spacy (better quality, slower)'
276
+ )
277
+
278
+ parser.add_argument(
279
+ '--verbose',
280
+ action='store_true',
281
+ help='Enable verbose output'
282
+ )
283
+
284
+ parser.add_argument(
285
+ '--validate',
286
+ action='store_true',
287
+ help='Validate the created index after building'
288
+ )
289
+
290
+ parser.add_argument(
291
+ '--semantic-threshold',
292
+ type=float,
293
+ default=0.5,
294
+ help='Similarity threshold for semantic chunking (default: 0.5)'
295
+ )
296
+
297
+ parser.add_argument(
298
+ '--topic-threshold',
299
+ type=float,
300
+ default=0.3,
301
+ help='Similarity threshold for topic chunking (default: 0.3)'
302
+ )
303
+
304
+ args = parser.parse_args()
305
+
306
+ # Resolve model aliases
307
+ args.model = resolve_model_alias(args.model)
308
+
309
+ # Validate sources
310
+ valid_sources = []
311
+ for source in args.sources:
312
+ source_path = Path(source)
313
+ if not source_path.exists():
314
+ print(f"Warning: Source does not exist, skipping: {source}")
315
+ continue
316
+ valid_sources.append(source_path)
317
+
318
+ if not valid_sources:
319
+ print("Error: No valid sources found")
320
+ sys.exit(1)
321
+
322
+ # Validate backend configuration
323
+ if args.backend == 'pgvector' and not args.connection_string:
324
+ print("Error: --connection-string is required for pgvector backend")
325
+ sys.exit(1)
326
+
327
+ # Validate output options
328
+ if args.output and args.output_dir:
329
+ print("Error: Cannot specify both --output and --output-dir")
330
+ sys.exit(1)
331
+
332
+ # Handle JSON output format differently
333
+ if args.output_format == 'json':
334
+ # JSON export doesn't use backend
335
+ if args.backend != 'sqlite':
336
+ print("Warning: --backend is ignored when using --output-format json")
337
+
338
+ # Determine output location
339
+ if args.output_dir:
340
+ # Multiple files mode
341
+ output_path = Path(args.output_dir)
342
+ if not output_path.exists():
343
+ output_path.mkdir(parents=True, exist_ok=True)
344
+ elif args.output:
345
+ # Single file mode
346
+ output_path = Path(args.output)
347
+ if not output_path.suffix:
348
+ output_path = output_path.with_suffix('.json')
349
+ else:
350
+ # Default to single file
351
+ output_path = Path('chunks.json')
352
+ args.output = str(output_path)
353
+
354
+ # Default output filename (for index format)
355
+ if args.output_format == 'index' and not args.output and not args.output_dir:
356
+ if args.backend == 'sqlite':
357
+ if len(valid_sources) == 1:
358
+ # Single source - use its name
359
+ source_name = valid_sources[0].stem if valid_sources[0].is_file() else valid_sources[0].name
360
+ args.output = f"{source_name}.swsearch"
361
+ else:
362
+ # Multiple sources - use generic name
363
+ args.output = "sources.swsearch"
364
+ else:
365
+ # For pgvector, use a default collection name
366
+ if len(valid_sources) == 1:
367
+ source_name = valid_sources[0].stem if valid_sources[0].is_file() else valid_sources[0].name
368
+ args.output = source_name
369
+ else:
370
+ args.output = "documents"
371
+
372
+ # Handle --output-dir for index format
373
+ if args.output_format == 'index' and args.output_dir:
374
+ # Auto-generate output filename in the directory
375
+ if len(valid_sources) == 1:
376
+ source_name = valid_sources[0].stem if valid_sources[0].is_file() else valid_sources[0].name
377
+ else:
378
+ source_name = "combined"
379
+
380
+ output_dir = Path(args.output_dir)
381
+ output_dir.mkdir(parents=True, exist_ok=True)
382
+
383
+ if args.backend == 'sqlite':
384
+ args.output = str(output_dir / f"{source_name}.swsearch")
385
+ else:
386
+ # For pgvector, still use the name as collection
387
+ args.output = source_name
388
+
389
+ # Ensure output has .swsearch extension for sqlite (but not for JSON format)
390
+ if args.output_format == 'index' and args.backend == 'sqlite' and args.output and not args.output.endswith('.swsearch'):
391
+ args.output += '.swsearch'
392
+
393
+ # Parse lists
394
+ file_types = [ft.strip() for ft in args.file_types.split(',')]
395
+ exclude_patterns = [p.strip() for p in args.exclude.split(',')] if args.exclude else None
396
+ languages = [lang.strip() for lang in args.languages.split(',')]
397
+ tags = [tag.strip() for tag in args.tags.split(',')] if args.tags else None
398
+
399
+ if args.verbose:
400
+ print(f"Building search index:")
401
+ print(f" Backend: {args.backend}")
402
+ print(f" Sources: {[str(s) for s in valid_sources]}")
403
+ if args.backend == 'sqlite':
404
+ print(f" Output file: {args.output}")
405
+ else:
406
+ print(f" Collection name: {args.output}")
407
+ print(f" Connection: {args.connection_string}")
408
+ print(f" File types (for directories): {file_types}")
409
+ print(f" Exclude patterns: {exclude_patterns}")
410
+ print(f" Languages: {languages}")
411
+ print(f" Model: {args.model}")
412
+ print(f" Chunking strategy: {args.chunking_strategy}")
413
+ print(f" Index NLP backend: {args.index_nlp_backend}")
414
+
415
+ if args.chunking_strategy == 'sentence':
416
+ print(f" Max sentences per chunk: {args.max_sentences_per_chunk}")
417
+ if args.split_newlines:
418
+ print(f" Split on newlines: {args.split_newlines}")
419
+ elif args.chunking_strategy == 'sliding':
420
+ print(f" Chunk size (words): {args.chunk_size}")
421
+ print(f" Overlap size (words): {args.overlap_size}")
422
+ elif args.chunking_strategy == 'paragraph':
423
+ print(f" Chunking by paragraphs (double newlines)")
424
+ elif args.chunking_strategy == 'page':
425
+ print(f" Chunking by pages")
426
+ elif args.chunking_strategy == 'semantic':
427
+ print(f" Semantic chunking (similarity threshold: {args.semantic_threshold})")
428
+ elif args.chunking_strategy == 'topic':
429
+ print(f" Topic-based chunking (similarity threshold: {args.topic_threshold})")
430
+ elif args.chunking_strategy == 'qa':
431
+ print(f" QA-optimized chunking")
432
+
433
+ print(f" Tags: {tags}")
434
+ print()
435
+
436
+ try:
437
+ # Handle JSON export mode
438
+ if args.output_format == 'json':
439
+ # Import what we need for chunking
440
+ from signalwire_agents.search.index_builder import IndexBuilder
441
+ import json
442
+
443
+ builder = IndexBuilder(
444
+ chunking_strategy=args.chunking_strategy,
445
+ max_sentences_per_chunk=args.max_sentences_per_chunk,
446
+ chunk_size=args.chunk_size,
447
+ chunk_overlap=args.overlap_size,
448
+ split_newlines=args.split_newlines,
449
+ index_nlp_backend=args.index_nlp_backend,
450
+ verbose=args.verbose,
451
+ semantic_threshold=args.semantic_threshold,
452
+ topic_threshold=args.topic_threshold
453
+ )
454
+
455
+ # Process files and export chunks
456
+ all_chunks = []
457
+ chunk_files_created = []
458
+
459
+ # Discover files from sources
460
+ files = builder._discover_files_from_sources(valid_sources, file_types, exclude_patterns)
461
+
462
+ if args.verbose:
463
+ print(f"Processing {len(files)} files...")
464
+
465
+ for file_path in files:
466
+ try:
467
+ # Determine base directory for relative paths
468
+ base_dir = builder._get_base_directory_for_file(file_path, valid_sources)
469
+
470
+ # Process file into chunks
471
+ chunks = builder._process_file(file_path, base_dir, tags)
472
+
473
+ if args.output_dir:
474
+ # Create individual JSON file
475
+ relative_path = file_path.relative_to(base_dir) if base_dir else file_path.name
476
+ json_filename = relative_path.with_suffix('.json')
477
+ json_path = Path(args.output_dir) / json_filename
478
+
479
+ # Create subdirectories if needed
480
+ json_path.parent.mkdir(parents=True, exist_ok=True)
481
+
482
+ # Save chunks to JSON
483
+ chunk_data = {
484
+ "chunks": chunks,
485
+ "metadata": {
486
+ "source_file": str(relative_path),
487
+ "total_chunks": len(chunks),
488
+ "chunking_strategy": args.chunking_strategy,
489
+ "processing_date": datetime.now().isoformat()
490
+ }
491
+ }
492
+
493
+ with open(json_path, 'w', encoding='utf-8') as f:
494
+ json.dump(chunk_data, f, indent=2, ensure_ascii=False)
495
+
496
+ chunk_files_created.append(json_path)
497
+ if args.verbose:
498
+ print(f" Created: {json_path} ({len(chunks)} chunks)")
499
+ else:
500
+ # Accumulate all chunks for single file output
501
+ all_chunks.extend(chunks)
502
+
503
+ except Exception as e:
504
+ print(f"Error processing {file_path}: {e}")
505
+ if args.verbose:
506
+ import traceback
507
+ traceback.print_exc()
508
+
509
+ # Handle single file output
510
+ if not args.output_dir:
511
+ output_data = {
512
+ "chunks": all_chunks,
513
+ "metadata": {
514
+ "total_chunks": len(all_chunks),
515
+ "total_files": len(files),
516
+ "chunking_strategy": args.chunking_strategy,
517
+ "processing_date": datetime.now().isoformat()
518
+ }
519
+ }
520
+
521
+ with open(args.output, 'w', encoding='utf-8') as f:
522
+ json.dump(output_data, f, indent=2, ensure_ascii=False)
523
+
524
+ print(f"✓ Exported {len(all_chunks)} chunks to {args.output}")
525
+ else:
526
+ print(f"✓ Created {len(chunk_files_created)} JSON files in {args.output_dir}")
527
+ total_chunks = sum(len(json.load(open(f))['chunks']) for f in chunk_files_created)
528
+ print(f" Total chunks: {total_chunks}")
529
+
530
+ # Exit early for JSON format
531
+ return
532
+
533
+ # Regular index building mode
534
+ # Create index builder - import only when actually needed
535
+ from signalwire_agents.search.index_builder import IndexBuilder
536
+ builder = IndexBuilder(
537
+ model_name=args.model,
538
+ chunking_strategy=args.chunking_strategy,
539
+ max_sentences_per_chunk=args.max_sentences_per_chunk,
540
+ chunk_size=args.chunk_size,
541
+ chunk_overlap=args.overlap_size,
542
+ split_newlines=args.split_newlines,
543
+ index_nlp_backend=args.index_nlp_backend,
544
+ verbose=args.verbose,
545
+ semantic_threshold=args.semantic_threshold,
546
+ topic_threshold=args.topic_threshold,
547
+ backend=args.backend,
548
+ connection_string=args.connection_string
549
+ )
550
+
551
+ # Build index with multiple sources
552
+ builder.build_index_from_sources(
553
+ sources=valid_sources,
554
+ output_file=args.output,
555
+ file_types=file_types,
556
+ exclude_patterns=exclude_patterns,
557
+ languages=languages,
558
+ tags=tags,
559
+ overwrite=args.overwrite if args.backend == 'pgvector' else False
560
+ )
561
+
562
+ # Validate if requested
563
+ if args.validate:
564
+ if args.verbose:
565
+ print("\nValidating index...")
566
+
567
+ validation = builder.validate_index(args.output)
568
+ if validation['valid']:
569
+ print(f"✓ Index validation successful:")
570
+ print(f" Chunks: {validation['chunk_count']}")
571
+ print(f" Files: {validation['file_count']}")
572
+ if args.verbose:
573
+ print(f" Config: {validation['config']}")
574
+ else:
575
+ print(f"✗ Index validation failed: {validation['error']}")
576
+ sys.exit(1)
577
+
578
+ if args.backend == 'sqlite':
579
+ # Check if the index was actually created
580
+ import os
581
+ if os.path.exists(args.output):
582
+ print(f"\n✓ Search index created successfully: {args.output}")
583
+ else:
584
+ print(f"\n✗ Search index creation failed - no files were processed")
585
+ sys.exit(1)
586
+ else:
587
+ print(f"\n✓ Search collection created successfully: {args.output}")
588
+ print(f" Connection: {args.connection_string}")
589
+
590
+ except KeyboardInterrupt:
591
+ print("\n\nBuild interrupted by user")
592
+ sys.exit(1)
593
+ except Exception as e:
594
+ print(f"\nError building index: {e}")
595
+ if args.verbose:
596
+ import traceback
597
+ traceback.print_exc()
598
+ sys.exit(1)
599
+
600
+ def validate_command():
601
+ """Validate an existing search index"""
602
+ parser = argparse.ArgumentParser(description='Validate a search index file')
603
+ parser.add_argument('index_file', help='Path to .swsearch file to validate')
604
+ parser.add_argument('--verbose', action='store_true', help='Show detailed information')
605
+
606
+ args = parser.parse_args()
607
+
608
+ if not Path(args.index_file).exists():
609
+ print(f"Error: Index file does not exist: {args.index_file}")
610
+ sys.exit(1)
611
+
612
+ try:
613
+ from signalwire_agents.search.index_builder import IndexBuilder
614
+ builder = IndexBuilder()
615
+
616
+ validation = builder.validate_index(args.index_file)
617
+
618
+ if validation['valid']:
619
+ print(f"✓ Index is valid: {args.index_file}")
620
+ print(f" Chunks: {validation['chunk_count']}")
621
+ print(f" Files: {validation['file_count']}")
622
+
623
+ if args.verbose and 'config' in validation:
624
+ print("\nConfiguration:")
625
+ for key, value in validation['config'].items():
626
+ print(f" {key}: {value}")
627
+ else:
628
+ print(f"✗ Index validation failed: {validation['error']}")
629
+ sys.exit(1)
630
+
631
+ except Exception as e:
632
+ print(f"Error validating index: {e}")
633
+ if args.verbose:
634
+ import traceback
635
+ traceback.print_exc()
636
+ sys.exit(1)
637
+
638
+ def search_command():
639
+ """Search within an existing search index"""
640
+ parser = argparse.ArgumentParser(description='Search within a .swsearch index file or pgvector collection')
641
+ parser.add_argument('index_source', help='Path to .swsearch file or collection name for pgvector')
642
+ parser.add_argument('query', nargs='?', help='Search query (optional if using --shell)')
643
+ parser.add_argument('--backend', choices=['sqlite', 'pgvector'], default='sqlite',
644
+ help='Storage backend (default: sqlite)')
645
+ parser.add_argument('--connection-string', help='PostgreSQL connection string for pgvector backend')
646
+ parser.add_argument('--shell', action='store_true',
647
+ help='Interactive shell mode - load once and search multiple times')
648
+ parser.add_argument('--count', type=int, default=5, help='Number of results to return (default: 5)')
649
+ parser.add_argument('--distance-threshold', type=float, default=0.0, help='Minimum similarity score (default: 0.0)')
650
+ parser.add_argument('--tags', help='Comma-separated tags to filter by')
651
+ parser.add_argument('--query-nlp-backend', choices=['nltk', 'spacy'], default='nltk',
652
+ help='NLP backend for query processing: nltk (fast, default) or spacy (better quality, slower)')
653
+ parser.add_argument('--keyword-weight', type=float, default=None,
654
+ help='Manual keyword weight (0.0-1.0). Overrides automatic weight detection.')
655
+ parser.add_argument('--verbose', action='store_true', help='Show detailed information')
656
+ parser.add_argument('--json', action='store_true', help='Output results as JSON')
657
+ parser.add_argument('--no-content', action='store_true', help='Hide content in results (show only metadata)')
658
+ parser.add_argument('--model', help='Override embedding model for query (mini/base/large or full model name)')
659
+
660
+ args = parser.parse_args()
661
+
662
+ # Validate arguments
663
+ if not args.shell and not args.query:
664
+ print("Error: Query is required unless using --shell mode")
665
+ sys.exit(1)
666
+
667
+ # Resolve model aliases
668
+ if args.model and args.model in MODEL_ALIASES:
669
+ args.model = MODEL_ALIASES[args.model]
670
+
671
+ # Validate keyword weight if provided
672
+ if args.keyword_weight is not None:
673
+ if args.keyword_weight < 0.0 or args.keyword_weight > 1.0:
674
+ print("Error: --keyword-weight must be between 0.0 and 1.0")
675
+ sys.exit(1)
676
+
677
+ # Validate backend configuration
678
+ if args.backend == 'pgvector' and not args.connection_string:
679
+ print("Error: --connection-string is required for pgvector backend")
680
+ sys.exit(1)
681
+
682
+ if args.backend == 'sqlite' and not Path(args.index_source).exists():
683
+ print(f"Error: Index file does not exist: {args.index_source}")
684
+ sys.exit(1)
685
+
686
+ try:
687
+ # Import search dependencies
688
+ try:
689
+ from signalwire_agents.search.search_engine import SearchEngine
690
+ from signalwire_agents.search.query_processor import preprocess_query
691
+ except ImportError as e:
692
+ print(f"Error: Search functionality not available. Install with: pip install signalwire-agents[search]")
693
+ print(f"Details: {e}")
694
+ sys.exit(1)
695
+
696
+ # Load search engine
697
+ if args.verbose:
698
+ if args.backend == 'sqlite':
699
+ print(f"Loading search index: {args.index_source}")
700
+ else:
701
+ print(f"Connecting to pgvector collection: {args.index_source}")
702
+
703
+ if args.backend == 'sqlite':
704
+ # Pass the model from the index or override if specified
705
+ model = args.model if args.model else None
706
+ engine = SearchEngine(backend='sqlite', index_path=args.index_source, model=model)
707
+ else:
708
+ # Pass the model override if specified
709
+ model = args.model if args.model else None
710
+ engine = SearchEngine(backend='pgvector', connection_string=args.connection_string,
711
+ collection_name=args.index_source, model=model)
712
+
713
+ # Get index stats
714
+ stats = engine.get_stats()
715
+
716
+ # Get the model from index config if not overridden
717
+ model_to_use = args.model
718
+ if not model_to_use and 'config' in stats:
719
+ # SQLite uses 'embedding_model', pgvector uses 'model_name'
720
+ model_to_use = stats['config'].get('embedding_model') or stats['config'].get('model_name')
721
+
722
+ # Shell mode implementation
723
+ if args.shell:
724
+ import time
725
+ print(f"Search Shell - Index: {args.index_source}")
726
+ print(f"Backend: {args.backend}")
727
+ print(f"Index contains {stats['total_chunks']} chunks from {stats['total_files']} files")
728
+ if model_to_use:
729
+ print(f"Model: {model_to_use}")
730
+ print("Type 'exit' or 'quit' to leave, 'help' for options")
731
+ print("-" * 60)
732
+
733
+ while True:
734
+ try:
735
+ query = input("\nsearch> ").strip()
736
+
737
+ if not query:
738
+ continue
739
+
740
+ if query.lower() in ['exit', 'quit', 'q']:
741
+ print("Goodbye!")
742
+ break
743
+
744
+ if query.lower() == 'help':
745
+ print("\nShell commands:")
746
+ print(" help - Show this help")
747
+ print(" exit/quit/q - Exit shell")
748
+ print(" count=N - Set result count (current: {})".format(args.count))
749
+ print(" tags=tag1,tag2 - Set tag filter (current: {})".format(args.tags or 'none'))
750
+ print(" verbose - Toggle verbose output")
751
+ print("\nOr type any search query...")
752
+ continue
753
+
754
+ # Handle shell commands
755
+ if query.startswith('count='):
756
+ try:
757
+ args.count = int(query.split('=')[1])
758
+ print(f"Result count set to: {args.count}")
759
+ except:
760
+ print("Invalid count value")
761
+ continue
762
+
763
+ if query.startswith('tags='):
764
+ tag_str = query.split('=', 1)[1]
765
+ args.tags = tag_str if tag_str else None
766
+ tags = [tag.strip() for tag in args.tags.split(',')] if args.tags else None
767
+ print(f"Tags filter set to: {tags or 'none'}")
768
+ continue
769
+
770
+ if query == 'verbose':
771
+ args.verbose = not args.verbose
772
+ print(f"Verbose output: {'on' if args.verbose else 'off'}")
773
+ continue
774
+
775
+ # Perform search with timing
776
+ start_time = time.time()
777
+
778
+ # Preprocess query
779
+ enhanced = preprocess_query(
780
+ query,
781
+ vector=True,
782
+ query_nlp_backend=args.query_nlp_backend,
783
+ model_name=model_to_use,
784
+ preserve_original=True,
785
+ max_synonyms=2
786
+ )
787
+
788
+ # Parse tags
789
+ tags = [tag.strip() for tag in args.tags.split(',')] if args.tags else None
790
+
791
+ # Perform search
792
+ results = engine.search(
793
+ query_vector=enhanced.get('vector'),
794
+ enhanced_text=enhanced.get('enhanced_text', query),
795
+ count=args.count,
796
+ similarity_threshold=args.distance_threshold,
797
+ tags=tags,
798
+ keyword_weight=args.keyword_weight,
799
+ original_query=query
800
+ )
801
+
802
+ search_time = time.time() - start_time
803
+
804
+ # Display results
805
+ if not results:
806
+ print(f"\nNo results found for '{query}' ({search_time:.3f}s)")
807
+ else:
808
+ print(f"\nFound {len(results)} result(s) for '{query}' ({search_time:.3f}s):")
809
+ if enhanced.get('enhanced_text') != query and args.verbose:
810
+ print(f"Enhanced query: '{enhanced.get('enhanced_text')}'")
811
+ print("=" * 60)
812
+
813
+ for i, result in enumerate(results):
814
+ print(f"\n[{i+1}] Score: {result['score']:.4f}")
815
+
816
+ # Show metadata
817
+ metadata = result['metadata']
818
+ print(f"File: {metadata.get('filename', 'Unknown')}")
819
+ if metadata.get('section'):
820
+ print(f"Section: {metadata['section']}")
821
+
822
+ # Show content unless suppressed
823
+ if not args.no_content:
824
+ content = result['content']
825
+ if len(content) > 300 and not args.verbose:
826
+ content = content[:300] + "..."
827
+ print(f"\n{content}")
828
+
829
+ if i < len(results) - 1:
830
+ print("-" * 40)
831
+
832
+ except KeyboardInterrupt:
833
+ print("\nUse 'exit' to quit")
834
+ except EOFError:
835
+ print("\nGoodbye!")
836
+ break
837
+ except Exception as e:
838
+ print(f"\nError: {e}")
839
+ if args.verbose:
840
+ import traceback
841
+ traceback.print_exc()
842
+
843
+ return # Exit after shell mode
844
+
845
+ # Normal single query mode
846
+ if args.verbose:
847
+ print(f"Index contains {stats['total_chunks']} chunks from {stats['total_files']} files")
848
+ print(f"Searching for: '{args.query}'")
849
+ print(f"Query NLP Backend: {args.query_nlp_backend}")
850
+ if args.model:
851
+ print(f"Override model: {args.model}")
852
+ elif model_to_use:
853
+ print(f"Using index model: {model_to_use}")
854
+ print()
855
+
856
+ # Preprocess query
857
+ enhanced = preprocess_query(
858
+ args.query,
859
+ vector=True, # Both backends need vector for similarity search
860
+ query_nlp_backend=args.query_nlp_backend,
861
+ model_name=model_to_use,
862
+ preserve_original=True, # Keep original query terms
863
+ max_synonyms=2 # Reduce synonym expansion
864
+ )
865
+
866
+ # Parse tags if provided
867
+ tags = [tag.strip() for tag in args.tags.split(',')] if args.tags else None
868
+
869
+ # Perform search
870
+ results = engine.search(
871
+ query_vector=enhanced.get('vector'),
872
+ enhanced_text=enhanced.get('enhanced_text', args.query),
873
+ count=args.count,
874
+ similarity_threshold=args.distance_threshold,
875
+ tags=tags,
876
+ keyword_weight=args.keyword_weight,
877
+ original_query=args.query # Pass original for exact match boosting
878
+ )
879
+
880
+ if args.json:
881
+ # Output as JSON
882
+ import json
883
+ output = {
884
+ 'query': args.query,
885
+ 'enhanced_query': enhanced.get('enhanced_text', args.query),
886
+ 'count': len(results),
887
+ 'results': []
888
+ }
889
+
890
+ for i, result in enumerate(results):
891
+ result_data = {
892
+ 'rank': i + 1,
893
+ 'score': result['score'],
894
+ 'metadata': result['metadata']
895
+ }
896
+ if not args.no_content:
897
+ result_data['content'] = result['content']
898
+ output['results'].append(result_data)
899
+
900
+ print(json.dumps(output, indent=2))
901
+ else:
902
+ # Human-readable output
903
+ if not results:
904
+ print(f"No results found for '{args.query}'")
905
+ if tags:
906
+ print(f"(searched with tags: {tags})")
907
+ sys.exit(0)
908
+
909
+ print(f"Found {len(results)} result(s) for '{args.query}':")
910
+ if enhanced.get('enhanced_text') != args.query:
911
+ print(f"Enhanced query: '{enhanced.get('enhanced_text')}'")
912
+ if tags:
913
+ print(f"Filtered by tags: {tags}")
914
+ print("=" * 80)
915
+
916
+ for i, result in enumerate(results):
917
+ print(f"\n[{i+1}] Score: {result['score']:.4f}")
918
+
919
+ # Show metadata
920
+ metadata = result['metadata']
921
+ print(f"File: {metadata.get('filename', 'Unknown')}")
922
+ if metadata.get('section'):
923
+ print(f"Section: {metadata['section']}")
924
+ if metadata.get('line_start'):
925
+ print(f"Lines: {metadata['line_start']}-{metadata.get('line_end', metadata['line_start'])}")
926
+ if metadata.get('tags'):
927
+ print(f"Tags: {', '.join(metadata['tags'])}")
928
+
929
+ # Show content unless suppressed
930
+ if not args.no_content:
931
+ content = result['content']
932
+ if len(content) > 500 and not args.verbose:
933
+ content = content[:500] + "..."
934
+ print(f"\nContent:\n{content}")
935
+
936
+ if i < len(results) - 1:
937
+ print("-" * 80)
938
+
939
+ except Exception as e:
940
+ print(f"Error searching index: {e}")
941
+ if args.verbose:
942
+ import traceback
943
+ traceback.print_exc()
944
+ sys.exit(1)
945
+
946
+ def migrate_command():
947
+ """Migrate search indexes between backends"""
948
+ parser = argparse.ArgumentParser(
949
+ description='Migrate search indexes between SQLite and pgvector backends',
950
+ epilog="""
951
+ Examples:
952
+ # Migrate SQLite to pgvector
953
+ sw-search migrate ./docs.swsearch \\
954
+ --to-pgvector \\
955
+ --connection-string "postgresql://user:pass@localhost/db" \\
956
+ --collection-name docs_collection
957
+
958
+ # Migrate with overwrite
959
+ sw-search migrate ./docs.swsearch \\
960
+ --to-pgvector \\
961
+ --connection-string "postgresql://user:pass@localhost/db" \\
962
+ --collection-name docs_collection \\
963
+ --overwrite
964
+
965
+ # Get index information
966
+ sw-search migrate --info ./docs.swsearch
967
+ """,
968
+ formatter_class=argparse.RawDescriptionHelpFormatter
969
+ )
970
+
971
+ # Source argument (optional if using --info)
972
+ parser.add_argument('source', nargs='?', help='Source index file or collection')
973
+
974
+ # Migration direction
975
+ migration_group = parser.add_mutually_exclusive_group()
976
+ migration_group.add_argument('--to-pgvector', action='store_true',
977
+ help='Migrate SQLite index to pgvector')
978
+ migration_group.add_argument('--to-sqlite', action='store_true',
979
+ help='Migrate pgvector collection to SQLite (not yet implemented)')
980
+ migration_group.add_argument('--info', action='store_true',
981
+ help='Show information about an index')
982
+
983
+ # pgvector options
984
+ parser.add_argument('--connection-string',
985
+ help='PostgreSQL connection string for pgvector')
986
+ parser.add_argument('--collection-name',
987
+ help='Collection name for pgvector')
988
+ parser.add_argument('--overwrite', action='store_true',
989
+ help='Overwrite existing collection')
990
+
991
+ # SQLite options
992
+ parser.add_argument('--output',
993
+ help='Output .swsearch file path (for --to-sqlite)')
994
+
995
+ # Common options
996
+ parser.add_argument('--batch-size', type=int, default=100,
997
+ help='Number of chunks to process at once (default: 100)')
998
+ parser.add_argument('--verbose', action='store_true',
999
+ help='Show detailed progress')
1000
+
1001
+ args = parser.parse_args()
1002
+
1003
+ # Handle --info flag
1004
+ if args.info:
1005
+ if not args.source:
1006
+ print("Error: Source index required with --info")
1007
+ sys.exit(1)
1008
+
1009
+ try:
1010
+ from signalwire_agents.search.migration import SearchIndexMigrator
1011
+ migrator = SearchIndexMigrator(verbose=args.verbose)
1012
+ info = migrator.get_index_info(args.source)
1013
+
1014
+ print(f"Index Information: {args.source}")
1015
+ print(f" Type: {info['type']}")
1016
+ if info['type'] == 'sqlite':
1017
+ print(f" Total chunks: {info['total_chunks']}")
1018
+ print(f" Total files: {info['total_files']}")
1019
+ print(f" Model: {info['config'].get('embedding_model', 'Unknown')}")
1020
+ print(f" Dimensions: {info['config'].get('embedding_dimensions', 'Unknown')}")
1021
+ print(f" Created: {info['config'].get('created_at', 'Unknown')}")
1022
+ if args.verbose:
1023
+ print("\n Full configuration:")
1024
+ for key, value in info['config'].items():
1025
+ print(f" {key}: {value}")
1026
+ else:
1027
+ print(" Unable to determine index type")
1028
+ except Exception as e:
1029
+ print(f"Error getting index info: {e}")
1030
+ sys.exit(1)
1031
+ return
1032
+
1033
+ # Validate arguments for migration
1034
+ if not args.source:
1035
+ print("Error: Source index required for migration")
1036
+ sys.exit(1)
1037
+
1038
+ if not args.to_pgvector and not args.to_sqlite:
1039
+ print("Error: Must specify migration direction (--to-pgvector or --to-sqlite)")
1040
+ sys.exit(1)
1041
+
1042
+ try:
1043
+ from signalwire_agents.search.migration import SearchIndexMigrator
1044
+ migrator = SearchIndexMigrator(verbose=args.verbose)
1045
+
1046
+ if args.to_pgvector:
1047
+ # Validate pgvector arguments
1048
+ if not args.connection_string:
1049
+ print("Error: --connection-string required for pgvector migration")
1050
+ sys.exit(1)
1051
+ if not args.collection_name:
1052
+ print("Error: --collection-name required for pgvector migration")
1053
+ sys.exit(1)
1054
+
1055
+ # Perform migration
1056
+ print(f"Migrating {args.source} to pgvector collection '{args.collection_name}'...")
1057
+ stats = migrator.migrate_sqlite_to_pgvector(
1058
+ sqlite_path=args.source,
1059
+ connection_string=args.connection_string,
1060
+ collection_name=args.collection_name,
1061
+ overwrite=args.overwrite,
1062
+ batch_size=args.batch_size
1063
+ )
1064
+
1065
+ print(f"\n✓ Migration completed successfully!")
1066
+ print(f" Chunks migrated: {stats['chunks_migrated']}")
1067
+ print(f" Errors: {stats['errors']}")
1068
+
1069
+ elif args.to_sqlite:
1070
+ print("Error: pgvector to SQLite migration not yet implemented")
1071
+ print("This feature is planned for future development")
1072
+ sys.exit(1)
1073
+
1074
+ except Exception as e:
1075
+ print(f"\nError during migration: {e}")
1076
+ if args.verbose:
1077
+ import traceback
1078
+ traceback.print_exc()
1079
+ sys.exit(1)
1080
+
1081
+
1082
+ def remote_command():
1083
+ """Search via remote API endpoint"""
1084
+ parser = argparse.ArgumentParser(description='Search via remote API endpoint')
1085
+ parser.add_argument('endpoint', help='Remote API endpoint URL (e.g., http://localhost:8001)')
1086
+ parser.add_argument('query', help='Search query')
1087
+ parser.add_argument('--index-name', required=True, help='Name of the index to search')
1088
+ parser.add_argument('--count', type=int, default=5, help='Number of results to return (default: 5)')
1089
+ parser.add_argument('--distance-threshold', type=float, default=0.0, help='Minimum similarity score (default: 0.0)')
1090
+ parser.add_argument('--tags', help='Comma-separated tags to filter by')
1091
+ parser.add_argument('--verbose', action='store_true', help='Show detailed information')
1092
+ parser.add_argument('--json', action='store_true', help='Output results as JSON')
1093
+ parser.add_argument('--no-content', action='store_true', help='Hide content in results (show only metadata)')
1094
+ parser.add_argument('--timeout', type=int, default=30, help='Request timeout in seconds (default: 30)')
1095
+
1096
+ args = parser.parse_args()
1097
+
1098
+ # Ensure endpoint starts with http:// or https://
1099
+ endpoint = args.endpoint
1100
+ if not endpoint.startswith(('http://', 'https://')):
1101
+ endpoint = f"http://{endpoint}"
1102
+
1103
+ # Ensure endpoint ends with /search
1104
+ if not endpoint.endswith('/search'):
1105
+ if endpoint.endswith('/'):
1106
+ endpoint += 'search'
1107
+ else:
1108
+ endpoint += '/search'
1109
+
1110
+ try:
1111
+ import requests
1112
+ except ImportError:
1113
+ print("Error: requests library not available. Install with: pip install requests")
1114
+ sys.exit(1)
1115
+
1116
+ # Prepare request payload
1117
+ payload = {
1118
+ 'query': args.query,
1119
+ 'index_name': args.index_name,
1120
+ 'count': args.count,
1121
+ 'similarity_threshold': args.distance_threshold
1122
+ }
1123
+
1124
+ if args.tags:
1125
+ payload['tags'] = [tag.strip() for tag in args.tags.split(',')]
1126
+
1127
+ if args.verbose:
1128
+ print(f"Searching remote endpoint: {endpoint}")
1129
+ print(f"Payload: {payload}")
1130
+ print()
1131
+
1132
+ try:
1133
+ # Make the API request
1134
+ response = requests.post(
1135
+ endpoint,
1136
+ json=payload,
1137
+ headers={'Content-Type': 'application/json'},
1138
+ timeout=args.timeout
1139
+ )
1140
+
1141
+ if response.status_code == 200:
1142
+ result = response.json()
1143
+
1144
+ if args.json:
1145
+ # Output raw JSON response
1146
+ import json
1147
+ print(json.dumps(result, indent=2))
1148
+ else:
1149
+ # Human-readable output
1150
+ results = result.get('results', [])
1151
+ if not results:
1152
+ print(f"No results found for '{args.query}' in index '{args.index_name}'")
1153
+ sys.exit(0)
1154
+
1155
+ print(f"Found {len(results)} result(s) for '{args.query}' in index '{args.index_name}':")
1156
+ if result.get('enhanced_query') and result.get('enhanced_query') != args.query:
1157
+ print(f"Enhanced query: '{result.get('enhanced_query')}'")
1158
+ print("=" * 80)
1159
+
1160
+ for i, search_result in enumerate(results):
1161
+ print(f"\n[{i+1}] Score: {search_result.get('score', 0):.4f}")
1162
+
1163
+ # Show metadata
1164
+ metadata = search_result.get('metadata', {})
1165
+ print(f"File: {metadata.get('filename', 'Unknown')}")
1166
+ if metadata.get('section'):
1167
+ print(f"Section: {metadata['section']}")
1168
+ if metadata.get('line_start'):
1169
+ print(f"Lines: {metadata['line_start']}-{metadata.get('line_end', metadata['line_start'])}")
1170
+ if metadata.get('tags'):
1171
+ print(f"Tags: {', '.join(metadata['tags'])}")
1172
+
1173
+ # Show content unless suppressed
1174
+ if not args.no_content and 'content' in search_result:
1175
+ content = search_result['content']
1176
+ if len(content) > 500 and not args.verbose:
1177
+ content = content[:500] + "..."
1178
+ print(f"\nContent:\n{content}")
1179
+
1180
+ if i < len(results) - 1:
1181
+ print("-" * 80)
1182
+
1183
+ elif response.status_code == 404:
1184
+ try:
1185
+ error_detail = response.json()
1186
+ error_msg = error_detail.get('detail', 'Index not found')
1187
+ except:
1188
+ error_msg = 'Index not found'
1189
+ print(f"Error: {error_msg}")
1190
+ sys.exit(1)
1191
+ else:
1192
+ try:
1193
+ error_detail = response.json()
1194
+ error_msg = error_detail.get('detail', f'HTTP {response.status_code}')
1195
+ except:
1196
+ error_msg = f'HTTP {response.status_code}: {response.text}'
1197
+ print(f"Error: {error_msg}")
1198
+ sys.exit(1)
1199
+
1200
+ except requests.ConnectionError:
1201
+ print(f"Error: Could not connect to {endpoint}")
1202
+ print("Make sure the search server is running")
1203
+ sys.exit(1)
1204
+ except requests.Timeout:
1205
+ print(f"Error: Request timed out after {args.timeout} seconds")
1206
+ sys.exit(1)
1207
+ except requests.RequestException as e:
1208
+ print(f"Error making request: {e}")
1209
+ sys.exit(1)
1210
+ except Exception as e:
1211
+ print(f"Error: {e}")
1212
+ if args.verbose:
1213
+ import traceback
1214
+ traceback.print_exc()
1215
+ sys.exit(1)
1216
+
1217
+ def console_entry_point():
1218
+ """Console script entry point for pip installation"""
1219
+ import sys
1220
+
1221
+ # Fast help check - show help without importing heavy modules
1222
+ if len(sys.argv) > 1 and sys.argv[1] in ['--help', '-h']:
1223
+ print("""usage: sw-search [-h] [--output OUTPUT] [--chunking-strategy {sentence,sliding,paragraph,page,semantic,topic,qa}]
1224
+ [--max-sentences-per-chunk MAX_SENTENCES_PER_CHUNK] [--chunk-size CHUNK_SIZE]
1225
+ [--overlap-size OVERLAP_SIZE] [--split-newlines SPLIT_NEWLINES] [--file-types FILE_TYPES]
1226
+ [--exclude EXCLUDE] [--languages LANGUAGES] [--model MODEL] [--tags TAGS]
1227
+ [--index-nlp-backend {nltk,spacy}] [--verbose] [--validate]
1228
+ [--semantic-threshold SEMANTIC_THRESHOLD] [--topic-threshold TOPIC_THRESHOLD]
1229
+ sources [sources ...]
1230
+
1231
+ Build local search index from documents
1232
+
1233
+ positional arguments:
1234
+ sources Source files and/or directories to index
1235
+
1236
+ options:
1237
+ -h, --help show this help message and exit
1238
+ --output OUTPUT Output .swsearch file (default: sources.swsearch)
1239
+ --chunking-strategy {sentence,sliding,paragraph,page,semantic,topic,qa,json,markdown}
1240
+ Chunking strategy to use (default: sentence)
1241
+ --max-sentences-per-chunk MAX_SENTENCES_PER_CHUNK
1242
+ Maximum sentences per chunk for sentence strategy (default: 5)
1243
+ --chunk-size CHUNK_SIZE
1244
+ Chunk size in words for sliding window strategy (default: 50)
1245
+ --overlap-size OVERLAP_SIZE
1246
+ Overlap size in words for sliding window strategy (default: 10)
1247
+ --split-newlines SPLIT_NEWLINES
1248
+ Split on multiple newlines (for sentence strategy)
1249
+ --file-types FILE_TYPES
1250
+ Comma-separated file extensions to include for directories (default: md,txt,rst)
1251
+ --exclude EXCLUDE Comma-separated glob patterns to exclude (e.g., "**/test/**,**/__pycache__/**")
1252
+ --languages LANGUAGES
1253
+ Comma-separated language codes (default: en)
1254
+ --model MODEL Sentence transformer model name or alias (default: mini)
1255
+ --tags TAGS Comma-separated tags to add to all chunks
1256
+ --index-nlp-backend {nltk,spacy}
1257
+ NLP backend for document processing: nltk (fast, default) or spacy (better quality, slower)
1258
+ --verbose Enable verbose output
1259
+ --validate Validate the created index after building
1260
+ --semantic-threshold SEMANTIC_THRESHOLD
1261
+ Similarity threshold for semantic chunking (default: 0.5)
1262
+ --topic-threshold TOPIC_THRESHOLD
1263
+ Similarity threshold for topic chunking (default: 0.3)
1264
+
1265
+ Examples:
1266
+ # Basic usage with directory (defaults to sentence chunking with 5 sentences per chunk)
1267
+ sw-search ./docs
1268
+
1269
+ # Multiple directories
1270
+ sw-search ./docs ./examples --file-types md,txt,py
1271
+
1272
+ # Individual files
1273
+ sw-search README.md ./docs/guide.md ./src/main.py
1274
+
1275
+ # Mixed sources (directories and files)
1276
+ sw-search ./docs README.md ./examples specific_file.txt --file-types md,txt,py
1277
+
1278
+ # Sentence-based chunking with custom parameters
1279
+ sw-search ./docs \\
1280
+ --chunking-strategy sentence \\
1281
+ --max-sentences-per-chunk 10 \\
1282
+ --split-newlines 2
1283
+
1284
+ # Sliding window chunking
1285
+ sw-search ./docs \\
1286
+ --chunking-strategy sliding \\
1287
+ --chunk-size 100 \\
1288
+ --overlap-size 20
1289
+
1290
+ # Paragraph-based chunking
1291
+ sw-search ./docs \\
1292
+ --chunking-strategy paragraph \\
1293
+ --file-types md,txt,rst
1294
+
1295
+ # Page-based chunking (good for PDFs)
1296
+ sw-search ./docs \\
1297
+ --chunking-strategy page \\
1298
+ --file-types pdf
1299
+
1300
+ # Semantic chunking (groups semantically similar sentences)
1301
+ sw-search ./docs \\
1302
+ --chunking-strategy semantic \\
1303
+ --semantic-threshold 0.6
1304
+
1305
+ # Topic-based chunking (groups by topic changes)
1306
+ sw-search ./docs \\
1307
+ --chunking-strategy topic \\
1308
+ --topic-threshold 0.2
1309
+
1310
+ # QA-optimized chunking (optimized for question-answering)
1311
+ sw-search ./docs \\
1312
+ --chunking-strategy qa
1313
+
1314
+ # Full configuration example
1315
+ sw-search ./docs ./examples README.md \\
1316
+ --output ./knowledge.swsearch \\
1317
+ --chunking-strategy sentence \\
1318
+ --max-sentences-per-chunk 8 \\
1319
+ --file-types md,txt,rst,py \\
1320
+ --exclude "**/test/**,**/__pycache__/**" \\
1321
+ --languages en,es,fr \\
1322
+ --model sentence-transformers/all-mpnet-base-v2 \\
1323
+ --tags documentation,api \\
1324
+ --verbose
1325
+
1326
+ # Validate an existing index
1327
+ sw-search validate ./docs.swsearch
1328
+
1329
+ # Search within an index
1330
+ sw-search search ./docs.swsearch "how to create an agent"
1331
+ sw-search search ./docs.swsearch "API reference" --count 3 --verbose
1332
+ sw-search search ./docs.swsearch "configuration" --tags documentation --json
1333
+
1334
+ # Search via remote API
1335
+ sw-search remote http://localhost:8001 "how to create an agent" --index-name docs
1336
+ sw-search remote localhost:8001 "API reference" --index-name docs --count 3 --verbose
1337
+ """)
1338
+ return
1339
+
1340
+ # Check for subcommands
1341
+ if len(sys.argv) > 1:
1342
+ if sys.argv[1] == 'validate':
1343
+ # Remove 'validate' from argv and call validate_command
1344
+ sys.argv.pop(1)
1345
+ validate_command()
1346
+ return
1347
+ elif sys.argv[1] == 'search':
1348
+ # Remove 'search' from argv and call search_command
1349
+ sys.argv.pop(1)
1350
+ search_command()
1351
+ return
1352
+ elif sys.argv[1] == 'remote':
1353
+ # Remove 'remote' from argv and call remote_command
1354
+ sys.argv.pop(1)
1355
+ remote_command()
1356
+ return
1357
+ elif sys.argv[1] == 'migrate':
1358
+ # Remove 'migrate' from argv and call migrate_command
1359
+ sys.argv.pop(1)
1360
+ migrate_command()
1361
+ return
1362
+
1363
+ # Regular build command
1364
+ main()
1365
+
1366
+ if __name__ == '__main__':
1367
+ main()
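For the query side, the same file's search_command wires together SearchEngine and preprocess_query; below is a minimal programmatic sketch of that path, using only the calls and keyword arguments visible above and assuming the optional search extra (pip install signalwire-agents[search]) is installed.

    from signalwire_agents.search.search_engine import SearchEngine
    from signalwire_agents.search.query_processor import preprocess_query

    query = "how to create an agent"

    # Load a local .swsearch index with the sqlite backend, as search_command does
    engine = SearchEngine(backend="sqlite", index_path="docs.swsearch", model=None)

    # Expand the query and compute its embedding with the same options the CLI passes
    enhanced = preprocess_query(
        query,
        vector=True,
        query_nlp_backend="nltk",
        model_name=None,        # the CLI passes the model read from the index config here
        preserve_original=True,
        max_synonyms=2,
    )

    # Hybrid vector/keyword search with the same keyword arguments as the CLI call
    results = engine.search(
        query_vector=enhanced.get("vector"),
        enhanced_text=enhanced.get("enhanced_text", query),
        count=5,
        similarity_threshold=0.0,
        tags=None,
        keyword_weight=None,
        original_query=query,
    )

    for result in results:
        print(f"{result['score']:.4f}  {result['metadata'].get('filename', 'Unknown')}")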