signalwire-agents 0.1.13__py3-none-any.whl → 1.0.17.dev4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. signalwire_agents/__init__.py +99 -15
  2. signalwire_agents/agent_server.py +248 -60
  3. signalwire_agents/agents/bedrock.py +296 -0
  4. signalwire_agents/cli/__init__.py +9 -0
  5. signalwire_agents/cli/build_search.py +951 -41
  6. signalwire_agents/cli/config.py +80 -0
  7. signalwire_agents/cli/core/__init__.py +10 -0
  8. signalwire_agents/cli/core/agent_loader.py +470 -0
  9. signalwire_agents/cli/core/argparse_helpers.py +179 -0
  10. signalwire_agents/cli/core/dynamic_config.py +71 -0
  11. signalwire_agents/cli/core/service_loader.py +303 -0
  12. signalwire_agents/cli/dokku.py +2320 -0
  13. signalwire_agents/cli/execution/__init__.py +10 -0
  14. signalwire_agents/cli/execution/datamap_exec.py +446 -0
  15. signalwire_agents/cli/execution/webhook_exec.py +134 -0
  16. signalwire_agents/cli/init_project.py +2636 -0
  17. signalwire_agents/cli/output/__init__.py +10 -0
  18. signalwire_agents/cli/output/output_formatter.py +255 -0
  19. signalwire_agents/cli/output/swml_dump.py +186 -0
  20. signalwire_agents/cli/simulation/__init__.py +10 -0
  21. signalwire_agents/cli/simulation/data_generation.py +374 -0
  22. signalwire_agents/cli/simulation/data_overrides.py +200 -0
  23. signalwire_agents/cli/simulation/mock_env.py +282 -0
  24. signalwire_agents/cli/swaig_test_wrapper.py +52 -0
  25. signalwire_agents/cli/test_swaig.py +566 -2366
  26. signalwire_agents/cli/types.py +81 -0
  27. signalwire_agents/core/__init__.py +2 -2
  28. signalwire_agents/core/agent/__init__.py +12 -0
  29. signalwire_agents/core/agent/config/__init__.py +12 -0
  30. signalwire_agents/core/agent/deployment/__init__.py +9 -0
  31. signalwire_agents/core/agent/deployment/handlers/__init__.py +9 -0
  32. signalwire_agents/core/agent/prompt/__init__.py +14 -0
  33. signalwire_agents/core/agent/prompt/manager.py +306 -0
  34. signalwire_agents/core/agent/routing/__init__.py +9 -0
  35. signalwire_agents/core/agent/security/__init__.py +9 -0
  36. signalwire_agents/core/agent/swml/__init__.py +9 -0
  37. signalwire_agents/core/agent/tools/__init__.py +15 -0
  38. signalwire_agents/core/agent/tools/decorator.py +97 -0
  39. signalwire_agents/core/agent/tools/registry.py +210 -0
  40. signalwire_agents/core/agent_base.py +845 -2916
  41. signalwire_agents/core/auth_handler.py +233 -0
  42. signalwire_agents/core/config_loader.py +259 -0
  43. signalwire_agents/core/contexts.py +418 -0
  44. signalwire_agents/core/data_map.py +3 -15
  45. signalwire_agents/core/function_result.py +116 -44
  46. signalwire_agents/core/logging_config.py +162 -18
  47. signalwire_agents/core/mixins/__init__.py +28 -0
  48. signalwire_agents/core/mixins/ai_config_mixin.py +442 -0
  49. signalwire_agents/core/mixins/auth_mixin.py +280 -0
  50. signalwire_agents/core/mixins/prompt_mixin.py +358 -0
  51. signalwire_agents/core/mixins/serverless_mixin.py +460 -0
  52. signalwire_agents/core/mixins/skill_mixin.py +55 -0
  53. signalwire_agents/core/mixins/state_mixin.py +153 -0
  54. signalwire_agents/core/mixins/tool_mixin.py +230 -0
  55. signalwire_agents/core/mixins/web_mixin.py +1142 -0
  56. signalwire_agents/core/security_config.py +333 -0
  57. signalwire_agents/core/skill_base.py +84 -1
  58. signalwire_agents/core/skill_manager.py +62 -20
  59. signalwire_agents/core/swaig_function.py +18 -5
  60. signalwire_agents/core/swml_builder.py +207 -11
  61. signalwire_agents/core/swml_handler.py +27 -21
  62. signalwire_agents/core/swml_renderer.py +123 -312
  63. signalwire_agents/core/swml_service.py +171 -203
  64. signalwire_agents/mcp_gateway/__init__.py +29 -0
  65. signalwire_agents/mcp_gateway/gateway_service.py +564 -0
  66. signalwire_agents/mcp_gateway/mcp_manager.py +513 -0
  67. signalwire_agents/mcp_gateway/session_manager.py +218 -0
  68. signalwire_agents/prefabs/concierge.py +0 -3
  69. signalwire_agents/prefabs/faq_bot.py +0 -3
  70. signalwire_agents/prefabs/info_gatherer.py +0 -3
  71. signalwire_agents/prefabs/receptionist.py +0 -3
  72. signalwire_agents/prefabs/survey.py +0 -3
  73. signalwire_agents/schema.json +9218 -5489
  74. signalwire_agents/search/__init__.py +7 -1
  75. signalwire_agents/search/document_processor.py +490 -31
  76. signalwire_agents/search/index_builder.py +307 -37
  77. signalwire_agents/search/migration.py +418 -0
  78. signalwire_agents/search/models.py +30 -0
  79. signalwire_agents/search/pgvector_backend.py +748 -0
  80. signalwire_agents/search/query_processor.py +162 -31
  81. signalwire_agents/search/search_engine.py +916 -35
  82. signalwire_agents/search/search_service.py +376 -53
  83. signalwire_agents/skills/README.md +452 -0
  84. signalwire_agents/skills/__init__.py +14 -2
  85. signalwire_agents/skills/api_ninjas_trivia/README.md +215 -0
  86. signalwire_agents/skills/api_ninjas_trivia/__init__.py +12 -0
  87. signalwire_agents/skills/api_ninjas_trivia/skill.py +237 -0
  88. signalwire_agents/skills/datasphere/README.md +210 -0
  89. signalwire_agents/skills/datasphere/skill.py +84 -3
  90. signalwire_agents/skills/datasphere_serverless/README.md +258 -0
  91. signalwire_agents/skills/datasphere_serverless/__init__.py +9 -0
  92. signalwire_agents/skills/datasphere_serverless/skill.py +82 -1
  93. signalwire_agents/skills/datetime/README.md +132 -0
  94. signalwire_agents/skills/datetime/__init__.py +9 -0
  95. signalwire_agents/skills/datetime/skill.py +20 -7
  96. signalwire_agents/skills/joke/README.md +149 -0
  97. signalwire_agents/skills/joke/__init__.py +9 -0
  98. signalwire_agents/skills/joke/skill.py +21 -0
  99. signalwire_agents/skills/math/README.md +161 -0
  100. signalwire_agents/skills/math/__init__.py +9 -0
  101. signalwire_agents/skills/math/skill.py +18 -4
  102. signalwire_agents/skills/mcp_gateway/README.md +230 -0
  103. signalwire_agents/skills/mcp_gateway/__init__.py +10 -0
  104. signalwire_agents/skills/mcp_gateway/skill.py +421 -0
  105. signalwire_agents/skills/native_vector_search/README.md +210 -0
  106. signalwire_agents/skills/native_vector_search/__init__.py +9 -0
  107. signalwire_agents/skills/native_vector_search/skill.py +569 -101
  108. signalwire_agents/skills/play_background_file/README.md +218 -0
  109. signalwire_agents/skills/play_background_file/__init__.py +12 -0
  110. signalwire_agents/skills/play_background_file/skill.py +242 -0
  111. signalwire_agents/skills/registry.py +395 -40
  112. signalwire_agents/skills/spider/README.md +236 -0
  113. signalwire_agents/skills/spider/__init__.py +13 -0
  114. signalwire_agents/skills/spider/skill.py +598 -0
  115. signalwire_agents/skills/swml_transfer/README.md +395 -0
  116. signalwire_agents/skills/swml_transfer/__init__.py +10 -0
  117. signalwire_agents/skills/swml_transfer/skill.py +359 -0
  118. signalwire_agents/skills/weather_api/README.md +178 -0
  119. signalwire_agents/skills/weather_api/__init__.py +12 -0
  120. signalwire_agents/skills/weather_api/skill.py +191 -0
  121. signalwire_agents/skills/web_search/README.md +163 -0
  122. signalwire_agents/skills/web_search/__init__.py +9 -0
  123. signalwire_agents/skills/web_search/skill.py +586 -112
  124. signalwire_agents/skills/wikipedia_search/README.md +228 -0
  125. signalwire_agents/{core/state → skills/wikipedia_search}/__init__.py +5 -4
  126. signalwire_agents/skills/{wikipedia → wikipedia_search}/skill.py +33 -3
  127. signalwire_agents/web/__init__.py +17 -0
  128. signalwire_agents/web/web_service.py +559 -0
  129. signalwire_agents-1.0.17.dev4.data/data/share/man/man1/sw-agent-init.1 +400 -0
  130. signalwire_agents-1.0.17.dev4.data/data/share/man/man1/sw-search.1 +483 -0
  131. signalwire_agents-1.0.17.dev4.data/data/share/man/man1/swaig-test.1 +308 -0
  132. {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/METADATA +347 -215
  133. signalwire_agents-1.0.17.dev4.dist-info/RECORD +147 -0
  134. signalwire_agents-1.0.17.dev4.dist-info/entry_points.txt +6 -0
  135. signalwire_agents/core/state/file_state_manager.py +0 -219
  136. signalwire_agents/core/state/state_manager.py +0 -101
  137. signalwire_agents/skills/wikipedia/__init__.py +0 -9
  138. signalwire_agents-0.1.13.data/data/schema.json +0 -5611
  139. signalwire_agents-0.1.13.dist-info/RECORD +0 -67
  140. signalwire_agents-0.1.13.dist-info/entry_points.txt +0 -3
  141. {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/WHEEL +0 -0
  142. {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/licenses/LICENSE +0 -0
  143. {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/top_level.txt +0 -0
@@ -10,7 +10,9 @@ See LICENSE file in the project root for full license information.
10
10
  import argparse
11
11
  import sys
12
12
  from pathlib import Path
13
- from ..search.index_builder import IndexBuilder
13
+ from datetime import datetime
14
+
15
+ from signalwire_agents.search.models import MODEL_ALIASES, DEFAULT_MODEL, resolve_model_alias
14
16
 
15
17
  def main():
16
18
  """Main entry point for the build-search command"""
@@ -19,7 +21,7 @@ def main():
19
21
  formatter_class=argparse.RawDescriptionHelpFormatter,
20
22
  epilog="""
21
23
  Examples:
22
- # Basic usage with directory (defaults to sentence chunking with 50 sentences per chunk)
24
+ # Basic usage with directory (defaults to sentence chunking with 5 sentences per chunk)
23
25
  sw-search ./docs
24
26
 
25
27
  # Multiple directories
@@ -34,7 +36,7 @@ Examples:
34
36
  # Sentence-based chunking with custom parameters
35
37
  sw-search ./docs \\
36
38
  --chunking-strategy sentence \\
37
- --max-sentences-per-chunk 30 \\
39
+ --max-sentences-per-chunk 10 \\
38
40
  --split-newlines 2
39
41
 
40
42
  # Sliding window chunking
@@ -53,11 +55,64 @@ Examples:
53
55
  --chunking-strategy page \\
54
56
  --file-types pdf
55
57
 
58
+ # Semantic chunking (groups semantically similar sentences)
59
+ sw-search ./docs \\
60
+ --chunking-strategy semantic \\
61
+ --semantic-threshold 0.6
62
+
63
+ # Topic-based chunking (groups by topic changes)
64
+ sw-search ./docs \\
65
+ --chunking-strategy topic \\
66
+ --topic-threshold 0.2
67
+
68
+ # QA-optimized chunking (optimized for question-answering)
69
+ sw-search ./docs \\
70
+ --chunking-strategy qa
71
+
72
+ # Markdown-aware chunking (preserves headers, detects code blocks, adds tags)
73
+ sw-search ./docs \\
74
+ --chunking-strategy markdown \\
75
+ --file-types md
76
+ # This strategy:
77
+ # - Chunks at header boundaries (h1, h2, h3...)
78
+ # - Detects code blocks and extracts language (python, bash, etc)
79
+ # - Adds "code" tags to chunks with code for better search
80
+ # - Preserves section hierarchy in metadata
81
+
82
+ # Model selection examples (performance vs quality tradeoff)
83
+ sw-search ./docs --model mini # Fastest (~5x faster), 384 dims, good for most use cases
84
+ sw-search ./docs --model base # Balanced speed/quality, 768 dims (previous default)
85
+ sw-search ./docs --model large # Best quality (same as base currently)
86
+ # Or use full model names:
87
+ sw-search ./docs --model sentence-transformers/all-MiniLM-L6-v2
88
+ sw-search ./docs --model sentence-transformers/all-mpnet-base-v2
89
+
90
+ # JSON-based chunking (pre-chunked content)
91
+ sw-search ./api_chunks.json \
92
+ --chunking-strategy json \
93
+ --file-types json
94
+
95
+ # Export chunks to JSON for review (single file)
96
+ sw-search ./docs \\
97
+ --output-format json \\
98
+ --output all_chunks.json
99
+
100
+ # Export chunks to JSON (one file per source)
101
+ sw-search ./docs \\
102
+ --output-format json \\
103
+ --output-dir ./chunks/
104
+
105
+ # Build index from exported JSON chunks
106
+ sw-search ./chunks/ \\
107
+ --chunking-strategy json \\
108
+ --file-types json \\
109
+ --output final.swsearch
110
+
56
111
  # Full configuration example
57
112
  sw-search ./docs ./examples README.md \\
58
113
  --output ./knowledge.swsearch \\
59
114
  --chunking-strategy sentence \\
60
- --max-sentences-per-chunk 50 \\
115
+ --max-sentences-per-chunk 8 \\
61
116
  --file-types md,txt,rst,py \\
62
117
  --exclude "**/test/**,**/__pycache__/**" \\
63
118
  --languages en,es,fr \\
@@ -72,6 +127,41 @@ Examples:
72
127
  sw-search search ./docs.swsearch "how to create an agent"
73
128
  sw-search search ./docs.swsearch "API reference" --count 3 --verbose
74
129
  sw-search search ./docs.swsearch "configuration" --tags documentation --json
130
+
131
+ # Search via remote API
132
+ sw-search remote http://localhost:8001 "how to create an agent" --index-name docs
133
+ sw-search remote localhost:8001 "API reference" --index-name docs --count 3 --verbose
134
+
135
+ # Migrate between backends
136
+ sw-search migrate ./docs.swsearch --to-pgvector \\
137
+ --connection-string "postgresql://user:pass@localhost/db" \\
138
+ --collection-name docs_collection
139
+ sw-search migrate --info ./docs.swsearch
140
+
141
+ # PostgreSQL pgvector backend (direct build to PostgreSQL)
142
+ sw-search ./docs \\
143
+ --backend pgvector \\
144
+ --connection-string "postgresql://user:pass@localhost:5432/knowledge" \\
145
+ --output docs_collection
146
+
147
+ # pgvector with markdown strategy (best for documentation with code examples)
148
+ sw-search ./docs \\
149
+ --backend pgvector \\
150
+ --connection-string "postgresql://user:pass@localhost:5432/knowledge" \\
151
+ --output docs_collection \\
152
+ --chunking-strategy markdown
153
+
154
+ # Overwrite existing pgvector collection
155
+ sw-search ./docs \\
156
+ --backend pgvector \\
157
+ --connection-string "postgresql://user:pass@localhost:5432/knowledge" \\
158
+ --output docs_collection \\
159
+ --overwrite
160
+
161
+ # Search in pgvector collection
162
+ sw-search search docs_collection "how to create an agent" \\
163
+ --backend pgvector \\
164
+ --connection-string "postgresql://user:pass@localhost/knowledge"
75
165
  """
76
166
  )
77
167
 
@@ -83,21 +173,51 @@ Examples:
83
173
 
84
174
  parser.add_argument(
85
175
  '--output',
86
- help='Output .swsearch file (default: sources.swsearch)'
176
+ help='Output .swsearch file (default: sources.swsearch) or collection name for pgvector'
177
+ )
178
+
179
+ parser.add_argument(
180
+ '--output-dir',
181
+ help='Output directory for results (creates one file per source file when used with --output-format json, or auto-names index files)'
182
+ )
183
+
184
+ parser.add_argument(
185
+ '--output-format',
186
+ choices=['index', 'json'],
187
+ default='index',
188
+ help='Output format: index (create search index) or json (export chunks as JSON) (default: index)'
189
+ )
190
+
191
+ parser.add_argument(
192
+ '--backend',
193
+ choices=['sqlite', 'pgvector'],
194
+ default='sqlite',
195
+ help='Storage backend to use (default: sqlite)'
196
+ )
197
+
198
+ parser.add_argument(
199
+ '--connection-string',
200
+ help='PostgreSQL connection string for pgvector backend'
201
+ )
202
+
203
+ parser.add_argument(
204
+ '--overwrite',
205
+ action='store_true',
206
+ help='Overwrite existing collection (pgvector backend only)'
87
207
  )
88
208
 
89
209
  parser.add_argument(
90
210
  '--chunking-strategy',
91
- choices=['sentence', 'sliding', 'paragraph', 'page'],
211
+ choices=['sentence', 'sliding', 'paragraph', 'page', 'semantic', 'topic', 'qa', 'json', 'markdown'],
92
212
  default='sentence',
93
- help='Chunking strategy to use (default: sentence)'
213
+ help='Chunking strategy to use (default: sentence). Use "markdown" for documentation with code blocks.'
94
214
  )
95
215
 
96
216
  parser.add_argument(
97
217
  '--max-sentences-per-chunk',
98
218
  type=int,
99
- default=50,
100
- help='Maximum sentences per chunk for sentence strategy (default: 50)'
219
+ default=5,
220
+ help='Maximum sentences per chunk for sentence strategy (default: 5)'
101
221
  )
102
222
 
103
223
  parser.add_argument(
@@ -117,7 +237,7 @@ Examples:
117
237
  parser.add_argument(
118
238
  '--split-newlines',
119
239
  type=int,
120
- help='Split on multiple newlines for sentence strategy (optional)'
240
+ help='Split on multiple newlines (for sentence strategy)'
121
241
  )
122
242
 
123
243
  parser.add_argument(
@@ -139,8 +259,8 @@ Examples:
139
259
 
140
260
  parser.add_argument(
141
261
  '--model',
142
- default='sentence-transformers/all-mpnet-base-v2',
143
- help='Sentence transformer model name (default: sentence-transformers/all-mpnet-base-v2)'
262
+ default=DEFAULT_MODEL,
263
+ help=f'Sentence transformer model name or alias (mini/base/large). Default: mini ({DEFAULT_MODEL})'
144
264
  )
145
265
 
146
266
  parser.add_argument(
@@ -148,6 +268,13 @@ Examples:
148
268
  help='Comma-separated tags to add to all chunks'
149
269
  )
150
270
 
271
+ parser.add_argument(
272
+ '--index-nlp-backend',
273
+ choices=['nltk', 'spacy'],
274
+ default='nltk',
275
+ help='NLP backend for document processing: nltk (fast, default) or spacy (better quality, slower)'
276
+ )
277
+
151
278
  parser.add_argument(
152
279
  '--verbose',
153
280
  action='store_true',
@@ -160,8 +287,25 @@ Examples:
160
287
  help='Validate the created index after building'
161
288
  )
162
289
 
290
+ parser.add_argument(
291
+ '--semantic-threshold',
292
+ type=float,
293
+ default=0.5,
294
+ help='Similarity threshold for semantic chunking (default: 0.5)'
295
+ )
296
+
297
+ parser.add_argument(
298
+ '--topic-threshold',
299
+ type=float,
300
+ default=0.3,
301
+ help='Similarity threshold for topic chunking (default: 0.3)'
302
+ )
303
+
163
304
  args = parser.parse_args()
164
305
 
306
+ # Resolve model aliases
307
+ args.model = resolve_model_alias(args.model)
308
+
165
309
  # Validate sources
166
310
  valid_sources = []
167
311
  for source in args.sources:
@@ -175,18 +319,75 @@ Examples:
175
319
  print("Error: No valid sources found")
176
320
  sys.exit(1)
177
321
 
178
- # Default output filename
179
- if not args.output:
322
+ # Validate backend configuration
323
+ if args.backend == 'pgvector' and not args.connection_string:
324
+ print("Error: --connection-string is required for pgvector backend")
325
+ sys.exit(1)
326
+
327
+ # Validate output options
328
+ if args.output and args.output_dir:
329
+ print("Error: Cannot specify both --output and --output-dir")
330
+ sys.exit(1)
331
+
332
+ # Handle JSON output format differently
333
+ if args.output_format == 'json':
334
+ # JSON export doesn't use backend
335
+ if args.backend != 'sqlite':
336
+ print("Warning: --backend is ignored when using --output-format json")
337
+
338
+ # Determine output location
339
+ if args.output_dir:
340
+ # Multiple files mode
341
+ output_path = Path(args.output_dir)
342
+ if not output_path.exists():
343
+ output_path.mkdir(parents=True, exist_ok=True)
344
+ elif args.output:
345
+ # Single file mode
346
+ output_path = Path(args.output)
347
+ if not output_path.suffix:
348
+ output_path = output_path.with_suffix('.json')
349
+ else:
350
+ # Default to single file
351
+ output_path = Path('chunks.json')
352
+ args.output = str(output_path)
353
+
354
+ # Default output filename (for index format)
355
+ if args.output_format == 'index' and not args.output and not args.output_dir:
356
+ if args.backend == 'sqlite':
357
+ if len(valid_sources) == 1:
358
+ # Single source - use its name
359
+ source_name = valid_sources[0].stem if valid_sources[0].is_file() else valid_sources[0].name
360
+ args.output = f"{source_name}.swsearch"
361
+ else:
362
+ # Multiple sources - use generic name
363
+ args.output = "sources.swsearch"
364
+ else:
365
+ # For pgvector, use a default collection name
366
+ if len(valid_sources) == 1:
367
+ source_name = valid_sources[0].stem if valid_sources[0].is_file() else valid_sources[0].name
368
+ args.output = source_name
369
+ else:
370
+ args.output = "documents"
371
+
372
+ # Handle --output-dir for index format
373
+ if args.output_format == 'index' and args.output_dir:
374
+ # Auto-generate output filename in the directory
180
375
  if len(valid_sources) == 1:
181
- # Single source - use its name
182
376
  source_name = valid_sources[0].stem if valid_sources[0].is_file() else valid_sources[0].name
183
- args.output = f"{source_name}.swsearch"
184
377
  else:
185
- # Multiple sources - use generic name
186
- args.output = "sources.swsearch"
378
+ source_name = "combined"
379
+
380
+ output_dir = Path(args.output_dir)
381
+ output_dir.mkdir(parents=True, exist_ok=True)
382
+
383
+ if args.backend == 'sqlite':
384
+ args.output = str(output_dir / f"{source_name}.swsearch")
385
+ else:
386
+ # For pgvector, still use the name as collection
387
+ args.output = source_name
187
388
 
188
- # Ensure output has .swsearch extension
189
- if not args.output.endswith('.swsearch'):
389
+ # Ensure output has .swsearch extension for sqlite (but not for JSON format)
390
+ if args.output_format == 'index' and args.backend == 'sqlite' and args.output and not args.output.endswith('.swsearch'):
190
391
  args.output += '.swsearch'
191
392
 
192
393
  # Parse lists
@@ -197,13 +398,19 @@ Examples:
197
398
 
198
399
  if args.verbose:
199
400
  print(f"Building search index:")
401
+ print(f" Backend: {args.backend}")
200
402
  print(f" Sources: {[str(s) for s in valid_sources]}")
201
- print(f" Output: {args.output}")
403
+ if args.backend == 'sqlite':
404
+ print(f" Output file: {args.output}")
405
+ else:
406
+ print(f" Collection name: {args.output}")
407
+ print(f" Connection: {args.connection_string}")
202
408
  print(f" File types (for directories): {file_types}")
203
409
  print(f" Exclude patterns: {exclude_patterns}")
204
410
  print(f" Languages: {languages}")
205
411
  print(f" Model: {args.model}")
206
412
  print(f" Chunking strategy: {args.chunking_strategy}")
413
+ print(f" Index NLP backend: {args.index_nlp_backend}")
207
414
 
208
415
  if args.chunking_strategy == 'sentence':
209
416
  print(f" Max sentences per chunk: {args.max_sentences_per_chunk}")
@@ -216,12 +423,116 @@ Examples:
216
423
  print(f" Chunking by paragraphs (double newlines)")
217
424
  elif args.chunking_strategy == 'page':
218
425
  print(f" Chunking by pages")
426
+ elif args.chunking_strategy == 'semantic':
427
+ print(f" Semantic chunking (similarity threshold: {args.semantic_threshold})")
428
+ elif args.chunking_strategy == 'topic':
429
+ print(f" Topic-based chunking (similarity threshold: {args.topic_threshold})")
430
+ elif args.chunking_strategy == 'qa':
431
+ print(f" QA-optimized chunking")
219
432
 
220
433
  print(f" Tags: {tags}")
221
434
  print()
222
435
 
223
436
  try:
224
- # Create index builder
437
+ # Handle JSON export mode
438
+ if args.output_format == 'json':
439
+ # Import what we need for chunking
440
+ from signalwire_agents.search.index_builder import IndexBuilder
441
+ import json
442
+
443
+ builder = IndexBuilder(
444
+ chunking_strategy=args.chunking_strategy,
445
+ max_sentences_per_chunk=args.max_sentences_per_chunk,
446
+ chunk_size=args.chunk_size,
447
+ chunk_overlap=args.overlap_size,
448
+ split_newlines=args.split_newlines,
449
+ index_nlp_backend=args.index_nlp_backend,
450
+ verbose=args.verbose,
451
+ semantic_threshold=args.semantic_threshold,
452
+ topic_threshold=args.topic_threshold
453
+ )
454
+
455
+ # Process files and export chunks
456
+ all_chunks = []
457
+ chunk_files_created = []
458
+
459
+ # Discover files from sources
460
+ files = builder._discover_files_from_sources(valid_sources, file_types, exclude_patterns)
461
+
462
+ if args.verbose:
463
+ print(f"Processing {len(files)} files...")
464
+
465
+ for file_path in files:
466
+ try:
467
+ # Determine base directory for relative paths
468
+ base_dir = builder._get_base_directory_for_file(file_path, valid_sources)
469
+
470
+ # Process file into chunks
471
+ chunks = builder._process_file(file_path, base_dir, tags)
472
+
473
+ if args.output_dir:
474
+ # Create individual JSON file
475
+ relative_path = file_path.relative_to(base_dir) if base_dir else file_path.name
476
+ json_filename = relative_path.with_suffix('.json')
477
+ json_path = Path(args.output_dir) / json_filename
478
+
479
+ # Create subdirectories if needed
480
+ json_path.parent.mkdir(parents=True, exist_ok=True)
481
+
482
+ # Save chunks to JSON
483
+ chunk_data = {
484
+ "chunks": chunks,
485
+ "metadata": {
486
+ "source_file": str(relative_path),
487
+ "total_chunks": len(chunks),
488
+ "chunking_strategy": args.chunking_strategy,
489
+ "processing_date": datetime.now().isoformat()
490
+ }
491
+ }
492
+
493
+ with open(json_path, 'w', encoding='utf-8') as f:
494
+ json.dump(chunk_data, f, indent=2, ensure_ascii=False)
495
+
496
+ chunk_files_created.append(json_path)
497
+ if args.verbose:
498
+ print(f" Created: {json_path} ({len(chunks)} chunks)")
499
+ else:
500
+ # Accumulate all chunks for single file output
501
+ all_chunks.extend(chunks)
502
+
503
+ except Exception as e:
504
+ print(f"Error processing {file_path}: {e}")
505
+ if args.verbose:
506
+ import traceback
507
+ traceback.print_exc()
508
+
509
+ # Handle single file output
510
+ if not args.output_dir:
511
+ output_data = {
512
+ "chunks": all_chunks,
513
+ "metadata": {
514
+ "total_chunks": len(all_chunks),
515
+ "total_files": len(files),
516
+ "chunking_strategy": args.chunking_strategy,
517
+ "processing_date": datetime.now().isoformat()
518
+ }
519
+ }
520
+
521
+ with open(args.output, 'w', encoding='utf-8') as f:
522
+ json.dump(output_data, f, indent=2, ensure_ascii=False)
523
+
524
+ print(f"✓ Exported {len(all_chunks)} chunks to {args.output}")
525
+ else:
526
+ print(f"✓ Created {len(chunk_files_created)} JSON files in {args.output_dir}")
527
+ total_chunks = sum(len(json.load(open(f))['chunks']) for f in chunk_files_created)
528
+ print(f" Total chunks: {total_chunks}")
529
+
530
+ # Exit early for JSON format
531
+ return
532
+
533
+ # Regular index building mode
534
+ # Create index builder - import only when actually needed
535
+ from signalwire_agents.search.index_builder import IndexBuilder
225
536
  builder = IndexBuilder(
226
537
  model_name=args.model,
227
538
  chunking_strategy=args.chunking_strategy,
@@ -229,7 +540,12 @@ Examples:
229
540
  chunk_size=args.chunk_size,
230
541
  chunk_overlap=args.overlap_size,
231
542
  split_newlines=args.split_newlines,
232
- verbose=args.verbose
543
+ index_nlp_backend=args.index_nlp_backend,
544
+ verbose=args.verbose,
545
+ semantic_threshold=args.semantic_threshold,
546
+ topic_threshold=args.topic_threshold,
547
+ backend=args.backend,
548
+ connection_string=args.connection_string
233
549
  )
234
550
 
235
551
  # Build index with multiple sources
@@ -239,7 +555,8 @@ Examples:
239
555
  file_types=file_types,
240
556
  exclude_patterns=exclude_patterns,
241
557
  languages=languages,
242
- tags=tags
558
+ tags=tags,
559
+ overwrite=args.overwrite if args.backend == 'pgvector' else False
243
560
  )
244
561
 
245
562
  # Validate if requested
@@ -258,7 +575,17 @@ Examples:
258
575
  print(f"✗ Index validation failed: {validation['error']}")
259
576
  sys.exit(1)
260
577
 
261
- print(f"\n✓ Search index created successfully: {args.output}")
578
+ if args.backend == 'sqlite':
579
+ # Check if the index was actually created
580
+ import os
581
+ if os.path.exists(args.output):
582
+ print(f"\n✓ Search index created successfully: {args.output}")
583
+ else:
584
+ print(f"\n✗ Search index creation failed - no files were processed")
585
+ sys.exit(1)
586
+ else:
587
+ print(f"\n✓ Search collection created successfully: {args.output}")
588
+ print(f" Connection: {args.connection_string}")
262
589
 
263
590
  except KeyboardInterrupt:
264
591
  print("\n\nBuild interrupted by user")
@@ -283,7 +610,7 @@ def validate_command():
283
610
  sys.exit(1)
284
611
 
285
612
  try:
286
- from ..search.index_builder import IndexBuilder
613
+ from signalwire_agents.search.index_builder import IndexBuilder
287
614
  builder = IndexBuilder()
288
615
 
289
616
  validation = builder.validate_index(args.index_file)
@@ -310,29 +637,57 @@ def validate_command():
310
637
 
311
638
  def search_command():
312
639
  """Search within an existing search index"""
313
- parser = argparse.ArgumentParser(description='Search within a .swsearch index file')
314
- parser.add_argument('index_file', help='Path to .swsearch file to search')
315
- parser.add_argument('query', help='Search query')
640
+ parser = argparse.ArgumentParser(description='Search within a .swsearch index file or pgvector collection')
641
+ parser.add_argument('index_source', help='Path to .swsearch file or collection name for pgvector')
642
+ parser.add_argument('query', nargs='?', help='Search query (optional if using --shell)')
643
+ parser.add_argument('--backend', choices=['sqlite', 'pgvector'], default='sqlite',
644
+ help='Storage backend (default: sqlite)')
645
+ parser.add_argument('--connection-string', help='PostgreSQL connection string for pgvector backend')
646
+ parser.add_argument('--shell', action='store_true',
647
+ help='Interactive shell mode - load once and search multiple times')
316
648
  parser.add_argument('--count', type=int, default=5, help='Number of results to return (default: 5)')
317
649
  parser.add_argument('--distance-threshold', type=float, default=0.0, help='Minimum similarity score (default: 0.0)')
318
650
  parser.add_argument('--tags', help='Comma-separated tags to filter by')
319
- parser.add_argument('--nlp-backend', choices=['nltk', 'spacy'], default='nltk',
320
- help='NLP backend to use: nltk (fast, default) or spacy (better quality, requires model download)')
651
+ parser.add_argument('--query-nlp-backend', choices=['nltk', 'spacy'], default='nltk',
652
+ help='NLP backend for query processing: nltk (fast, default) or spacy (better quality, slower)')
653
+ parser.add_argument('--keyword-weight', type=float, default=None,
654
+ help='Manual keyword weight (0.0-1.0). Overrides automatic weight detection.')
321
655
  parser.add_argument('--verbose', action='store_true', help='Show detailed information')
322
656
  parser.add_argument('--json', action='store_true', help='Output results as JSON')
323
657
  parser.add_argument('--no-content', action='store_true', help='Hide content in results (show only metadata)')
658
+ parser.add_argument('--model', help='Override embedding model for query (mini/base/large or full model name)')
324
659
 
325
660
  args = parser.parse_args()
326
661
 
327
- if not Path(args.index_file).exists():
328
- print(f"Error: Index file does not exist: {args.index_file}")
662
+ # Validate arguments
663
+ if not args.shell and not args.query:
664
+ print("Error: Query is required unless using --shell mode")
665
+ sys.exit(1)
666
+
667
+ # Resolve model aliases
668
+ if args.model and args.model in MODEL_ALIASES:
669
+ args.model = MODEL_ALIASES[args.model]
670
+
671
+ # Validate keyword weight if provided
672
+ if args.keyword_weight is not None:
673
+ if args.keyword_weight < 0.0 or args.keyword_weight > 1.0:
674
+ print("Error: --keyword-weight must be between 0.0 and 1.0")
675
+ sys.exit(1)
676
+
677
+ # Validate backend configuration
678
+ if args.backend == 'pgvector' and not args.connection_string:
679
+ print("Error: --connection-string is required for pgvector backend")
680
+ sys.exit(1)
681
+
682
+ if args.backend == 'sqlite' and not Path(args.index_source).exists():
683
+ print(f"Error: Index file does not exist: {args.index_source}")
329
684
  sys.exit(1)
330
685
 
331
686
  try:
332
687
  # Import search dependencies
333
688
  try:
334
- from ..search.search_engine import SearchEngine
335
- from ..search.query_processor import preprocess_query
689
+ from signalwire_agents.search.search_engine import SearchEngine
690
+ from signalwire_agents.search.query_processor import preprocess_query
336
691
  except ImportError as e:
337
692
  print(f"Error: Search functionality not available. Install with: pip install signalwire-agents[search]")
338
693
  print(f"Details: {e}")
@@ -340,20 +695,173 @@ def search_command():
340
695
 
341
696
  # Load search engine
342
697
  if args.verbose:
343
- print(f"Loading search index: {args.index_file}")
698
+ if args.backend == 'sqlite':
699
+ print(f"Loading search index: {args.index_source}")
700
+ else:
701
+ print(f"Connecting to pgvector collection: {args.index_source}")
344
702
 
345
- engine = SearchEngine(args.index_file)
703
+ if args.backend == 'sqlite':
704
+ # Pass the model from the index or override if specified
705
+ model = args.model if args.model else None
706
+ engine = SearchEngine(backend='sqlite', index_path=args.index_source, model=model)
707
+ else:
708
+ # Pass the model override if specified
709
+ model = args.model if args.model else None
710
+ engine = SearchEngine(backend='pgvector', connection_string=args.connection_string,
711
+ collection_name=args.index_source, model=model)
346
712
 
347
713
  # Get index stats
348
714
  stats = engine.get_stats()
715
+
716
+ # Get the model from index config if not overridden
717
+ model_to_use = args.model
718
+ if not model_to_use and 'config' in stats:
719
+ # SQLite uses 'embedding_model', pgvector uses 'model_name'
720
+ model_to_use = stats['config'].get('embedding_model') or stats['config'].get('model_name')
721
+
722
+ # Shell mode implementation
723
+ if args.shell:
724
+ import time
725
+ print(f"Search Shell - Index: {args.index_source}")
726
+ print(f"Backend: {args.backend}")
727
+ print(f"Index contains {stats['total_chunks']} chunks from {stats['total_files']} files")
728
+ if model_to_use:
729
+ print(f"Model: {model_to_use}")
730
+ print("Type 'exit' or 'quit' to leave, 'help' for options")
731
+ print("-" * 60)
732
+
733
+ while True:
734
+ try:
735
+ query = input("\nsearch> ").strip()
736
+
737
+ if not query:
738
+ continue
739
+
740
+ if query.lower() in ['exit', 'quit', 'q']:
741
+ print("Goodbye!")
742
+ break
743
+
744
+ if query.lower() == 'help':
745
+ print("\nShell commands:")
746
+ print(" help - Show this help")
747
+ print(" exit/quit/q - Exit shell")
748
+ print(" count=N - Set result count (current: {})".format(args.count))
749
+ print(" tags=tag1,tag2 - Set tag filter (current: {})".format(args.tags or 'none'))
750
+ print(" verbose - Toggle verbose output")
751
+ print("\nOr type any search query...")
752
+ continue
753
+
754
+ # Handle shell commands
755
+ if query.startswith('count='):
756
+ try:
757
+ args.count = int(query.split('=')[1])
758
+ print(f"Result count set to: {args.count}")
759
+ except:
760
+ print("Invalid count value")
761
+ continue
762
+
763
+ if query.startswith('tags='):
764
+ tag_str = query.split('=', 1)[1]
765
+ args.tags = tag_str if tag_str else None
766
+ tags = [tag.strip() for tag in args.tags.split(',')] if args.tags else None
767
+ print(f"Tags filter set to: {tags or 'none'}")
768
+ continue
769
+
770
+ if query == 'verbose':
771
+ args.verbose = not args.verbose
772
+ print(f"Verbose output: {'on' if args.verbose else 'off'}")
773
+ continue
774
+
775
+ # Perform search with timing
776
+ start_time = time.time()
777
+
778
+ # Preprocess query
779
+ enhanced = preprocess_query(
780
+ query,
781
+ vector=True,
782
+ query_nlp_backend=args.query_nlp_backend,
783
+ model_name=model_to_use,
784
+ preserve_original=True,
785
+ max_synonyms=2
786
+ )
787
+
788
+ # Parse tags
789
+ tags = [tag.strip() for tag in args.tags.split(',')] if args.tags else None
790
+
791
+ # Perform search
792
+ results = engine.search(
793
+ query_vector=enhanced.get('vector'),
794
+ enhanced_text=enhanced.get('enhanced_text', query),
795
+ count=args.count,
796
+ similarity_threshold=args.similarity_threshold,
797
+ tags=tags,
798
+ keyword_weight=args.keyword_weight,
799
+ original_query=query
800
+ )
801
+
802
+ search_time = time.time() - start_time
803
+
804
+ # Display results
805
+ if not results:
806
+ print(f"\nNo results found for '{query}' ({search_time:.3f}s)")
807
+ else:
808
+ print(f"\nFound {len(results)} result(s) for '{query}' ({search_time:.3f}s):")
809
+ if enhanced.get('enhanced_text') != query and args.verbose:
810
+ print(f"Enhanced query: '{enhanced.get('enhanced_text')}'")
811
+ print("=" * 60)
812
+
813
+ for i, result in enumerate(results):
814
+ print(f"\n[{i+1}] Score: {result['score']:.4f}")
815
+
816
+ # Show metadata
817
+ metadata = result['metadata']
818
+ print(f"File: {metadata.get('filename', 'Unknown')}")
819
+ if metadata.get('section'):
820
+ print(f"Section: {metadata['section']}")
821
+
822
+ # Show content unless suppressed
823
+ if not args.no_content:
824
+ content = result['content']
825
+ if len(content) > 300 and not args.verbose:
826
+ content = content[:300] + "..."
827
+ print(f"\n{content}")
828
+
829
+ if i < len(results) - 1:
830
+ print("-" * 40)
831
+
832
+ except KeyboardInterrupt:
833
+ print("\nUse 'exit' to quit")
834
+ except EOFError:
835
+ print("\nGoodbye!")
836
+ break
837
+ except Exception as e:
838
+ print(f"\nError: {e}")
839
+ if args.verbose:
840
+ import traceback
841
+ traceback.print_exc()
842
+
843
+ return # Exit after shell mode
844
+
845
+ # Normal single query mode
349
846
  if args.verbose:
350
847
  print(f"Index contains {stats['total_chunks']} chunks from {stats['total_files']} files")
351
848
  print(f"Searching for: '{args.query}'")
352
- print(f"NLP Backend: {args.nlp_backend}")
849
+ print(f"Query NLP Backend: {args.query_nlp_backend}")
850
+ if args.model:
851
+ print(f"Override model: {args.model}")
852
+ elif model_to_use:
853
+ print(f"Using index model: {model_to_use}")
353
854
  print()
354
855
 
355
856
  # Preprocess query
356
- enhanced = preprocess_query(args.query, vector=True, nlp_backend=args.nlp_backend)
857
+ enhanced = preprocess_query(
858
+ args.query,
859
+ vector=True, # Both backends need vector for similarity search
860
+ query_nlp_backend=args.query_nlp_backend,
861
+ model_name=model_to_use,
862
+ preserve_original=True, # Keep original query terms
863
+ max_synonyms=2 # Reduce synonym expansion
864
+ )
357
865
 
358
866
  # Parse tags if provided
359
867
  tags = [tag.strip() for tag in args.tags.split(',')] if args.tags else None
@@ -363,8 +871,10 @@ def search_command():
363
871
  query_vector=enhanced.get('vector'),
364
872
  enhanced_text=enhanced.get('enhanced_text', args.query),
365
873
  count=args.count,
366
- distance_threshold=args.distance_threshold,
367
- tags=tags
874
+ similarity_threshold=args.similarity_threshold,
875
+ tags=tags,
876
+ keyword_weight=args.keyword_weight,
877
+ original_query=args.query # Pass original for exact match boosting
368
878
  )
369
879
 
370
880
  if args.json:
@@ -433,10 +943,400 @@ def search_command():
433
943
  traceback.print_exc()
434
944
  sys.exit(1)
435
945
 
946
def migrate_command():
    """Migrate search indexes between backends.

    Parses ``sys.argv`` and runs one of three modes:

    * ``--info``: print the type, chunk/file counts and configuration that
      ``SearchIndexMigrator.get_index_info`` reports for the source index.
    * ``--to-pgvector``: copy a SQLite index into a pgvector collection via
      ``SearchIndexMigrator.migrate_sqlite_to_pgvector``.
    * ``--to-sqlite``: accepted by the parser but not implemented; it always
      reports an error.

    Every argument check runs before the migration module is imported, so a
    bad command line is reported as such even when the search dependencies
    cannot be imported.

    Returns:
        None. Calls ``sys.exit(1)`` on invalid arguments, or when the info
        lookup or the migration raises.
    """
    parser = argparse.ArgumentParser(
        description='Migrate search indexes between SQLite and pgvector backends',
        epilog="""
Examples:
  # Migrate SQLite to pgvector
  sw-search migrate ./docs.swsearch \\
    --to-pgvector \\
    --connection-string "postgresql://user:pass@localhost/db" \\
    --collection-name docs_collection

  # Migrate with overwrite
  sw-search migrate ./docs.swsearch \\
    --to-pgvector \\
    --connection-string "postgresql://user:pass@localhost/db" \\
    --collection-name docs_collection \\
    --overwrite

  # Get index information
  sw-search migrate --info ./docs.swsearch
""",
        formatter_class=argparse.RawDescriptionHelpFormatter
    )

    # Source argument (optional at the parser level; checked per mode below)
    parser.add_argument('source', nargs='?', help='Source index file or collection')

    # Migration direction
    migration_group = parser.add_mutually_exclusive_group()
    migration_group.add_argument('--to-pgvector', action='store_true',
                                 help='Migrate SQLite index to pgvector')
    migration_group.add_argument('--to-sqlite', action='store_true',
                                 help='Migrate pgvector collection to SQLite (not yet implemented)')
    migration_group.add_argument('--info', action='store_true',
                                 help='Show information about an index')

    # pgvector options
    parser.add_argument('--connection-string',
                        help='PostgreSQL connection string for pgvector')
    parser.add_argument('--collection-name',
                        help='Collection name for pgvector')
    parser.add_argument('--overwrite', action='store_true',
                        help='Overwrite existing collection')

    # SQLite options
    parser.add_argument('--output',
                        help='Output .swsearch file path (for --to-sqlite)')

    # Common options
    parser.add_argument('--batch-size', type=int, default=100,
                        help='Number of chunks to process at once (default: 100)')
    parser.add_argument('--verbose', action='store_true',
                        help='Show detailed progress')

    args = parser.parse_args()

    # Handle --info flag
    if args.info:
        if not args.source:
            print("Error: Source index required with --info")
            sys.exit(1)

        try:
            from signalwire_agents.search.migration import SearchIndexMigrator
            migrator = SearchIndexMigrator(verbose=args.verbose)
            info = migrator.get_index_info(args.source)

            print(f"Index Information: {args.source}")
            print(f" Type: {info['type']}")
            if info['type'] == 'sqlite':
                config = info['config']
                print(f" Total chunks: {info['total_chunks']}")
                print(f" Total files: {info['total_files']}")
                print(f" Model: {config.get('embedding_model', 'Unknown')}")
                print(f" Dimensions: {config.get('embedding_dimensions', 'Unknown')}")
                print(f" Created: {config.get('created_at', 'Unknown')}")
                if args.verbose:
                    print("\n Full configuration:")
                    for key, value in config.items():
                        print(f" {key}: {value}")
            else:
                print(" Unable to determine index type")
        except Exception as e:
            # Top-level CLI boundary: report the failure and exit non-zero.
            print(f"Error getting index info: {e}")
            sys.exit(1)
        return

    # Validate the whole command line before importing the migration module,
    # so argument mistakes are never masked by an import failure.
    if not args.source:
        print("Error: Source index required for migration")
        sys.exit(1)

    if not args.to_pgvector and not args.to_sqlite:
        print("Error: Must specify migration direction (--to-pgvector or --to-sqlite)")
        sys.exit(1)

    if args.to_sqlite:
        print("Error: pgvector to SQLite migration not yet implemented")
        print("This feature is planned for future development")
        sys.exit(1)

    # Only --to-pgvector remains from here on.
    if not args.connection_string:
        print("Error: --connection-string required for pgvector migration")
        sys.exit(1)
    if not args.collection_name:
        print("Error: --collection-name required for pgvector migration")
        sys.exit(1)
    if args.batch_size < 1:
        print("Error: --batch-size must be a positive integer")
        sys.exit(1)

    try:
        from signalwire_agents.search.migration import SearchIndexMigrator
        migrator = SearchIndexMigrator(verbose=args.verbose)

        # Perform migration
        print(f"Migrating {args.source} to pgvector collection '{args.collection_name}'...")
        stats = migrator.migrate_sqlite_to_pgvector(
            sqlite_path=args.source,
            connection_string=args.connection_string,
            collection_name=args.collection_name,
            overwrite=args.overwrite,
            batch_size=args.batch_size
        )

        print("\n✓ Migration completed successfully!")
        print(f" Chunks migrated: {stats['chunks_migrated']}")
        print(f" Errors: {stats['errors']}")

    except Exception as e:
        # Top-level CLI boundary: report, optionally show the traceback, exit.
        print(f"\nError during migration: {e}")
        if args.verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)
1080
+
1081
+
1082
def remote_command():
    """Search via remote API endpoint.

    Parses ``sys.argv``, normalises the endpoint to an ``http(s)://.../search``
    URL, POSTs a JSON payload (``query``, ``index_name``, ``count``,
    ``similarity_threshold`` and optional ``tags``) and prints the response
    either as raw JSON (``--json``) or as a human-readable result list.

    ``--similarity-threshold`` is the primary name of the score cut-off;
    ``--distance-threshold`` is kept as an alias so existing invocations keep
    working. Both store into ``args.similarity_threshold``, which is the
    attribute the payload reads.

    Returns:
        None. Calls ``sys.exit(0)`` when the server returns no results and
        ``sys.exit(1)`` on a missing ``requests`` package, a non-200
        response, or any request error.
    """
    parser = argparse.ArgumentParser(description='Search via remote API endpoint')
    parser.add_argument('endpoint', help='Remote API endpoint URL (e.g., http://localhost:8001)')
    parser.add_argument('query', help='Search query')
    parser.add_argument('--index-name', required=True, help='Name of the index to search')
    parser.add_argument('--count', type=int, default=5, help='Number of results to return (default: 5)')
    # Both spellings share one dest so the payload below always finds the value.
    parser.add_argument('--similarity-threshold', '--distance-threshold',
                        dest='similarity_threshold', type=float, default=0.0,
                        help='Minimum similarity score (default: 0.0)')
    parser.add_argument('--tags', help='Comma-separated tags to filter by')
    parser.add_argument('--verbose', action='store_true', help='Show detailed information')
    parser.add_argument('--json', action='store_true', help='Output results as JSON')
    parser.add_argument('--no-content', action='store_true', help='Hide content in results (show only metadata)')
    parser.add_argument('--timeout', type=int, default=30, help='Request timeout in seconds (default: 30)')

    args = parser.parse_args()

    # Ensure endpoint starts with http:// or https://
    endpoint = args.endpoint
    if not endpoint.startswith(('http://', 'https://')):
        endpoint = f"http://{endpoint}"

    # Ensure endpoint ends with /search
    if not endpoint.endswith('/search'):
        endpoint += 'search' if endpoint.endswith('/') else '/search'

    # Prepare request payload
    payload = {
        'query': args.query,
        'index_name': args.index_name,
        'count': args.count,
        'similarity_threshold': args.similarity_threshold
    }

    if args.tags:
        payload['tags'] = [tag.strip() for tag in args.tags.split(',')]

    if args.verbose:
        print(f"Searching remote endpoint: {endpoint}")
        print(f"Payload: {payload}")
        print()

    # The HTTP client is an optional dependency; import it only once the
    # command line has been fully resolved.
    try:
        import requests
    except ImportError:
        print("Error: requests library not available. Install with: pip install requests")
        sys.exit(1)

    try:
        # Make the API request
        response = requests.post(
            endpoint,
            json=payload,
            headers={'Content-Type': 'application/json'},
            timeout=args.timeout
        )

        if response.status_code == 200:
            result = response.json()

            if args.json:
                # Output raw JSON response
                import json
                print(json.dumps(result, indent=2))
            else:
                # Human-readable output
                results = result.get('results', [])
                if not results:
                    print(f"No results found for '{args.query}' in index '{args.index_name}'")
                    sys.exit(0)

                print(f"Found {len(results)} result(s) for '{args.query}' in index '{args.index_name}':")
                enhanced_query = result.get('enhanced_query')
                if enhanced_query and enhanced_query != args.query:
                    print(f"Enhanced query: '{enhanced_query}'")
                print("=" * 80)

                total = len(results)
                for position, search_result in enumerate(results, start=1):
                    print(f"\n[{position}] Score: {search_result.get('score', 0):.4f}")

                    # Show metadata
                    metadata = search_result.get('metadata', {})
                    print(f"File: {metadata.get('filename', 'Unknown')}")
                    if metadata.get('section'):
                        print(f"Section: {metadata['section']}")
                    if metadata.get('line_start'):
                        print(f"Lines: {metadata['line_start']}-{metadata.get('line_end', metadata['line_start'])}")
                    if metadata.get('tags'):
                        print(f"Tags: {', '.join(metadata['tags'])}")

                    # Show content unless suppressed; truncate unless verbose
                    if not args.no_content and 'content' in search_result:
                        content = search_result['content']
                        if len(content) > 500 and not args.verbose:
                            content = content[:500] + "..."
                        print(f"\nContent:\n{content}")

                    # Separator between results, but not after the last one
                    if position < total:
                        print("-" * 80)

        elif response.status_code == 404:
            try:
                error_msg = response.json().get('detail', 'Index not found')
            except (ValueError, AttributeError):
                # Body is not JSON, or is JSON but not an object.
                error_msg = 'Index not found'
            print(f"Error: {error_msg}")
            sys.exit(1)
        else:
            try:
                error_msg = response.json().get('detail', f'HTTP {response.status_code}')
            except (ValueError, AttributeError):
                # Body is not JSON, or is JSON but not an object.
                error_msg = f'HTTP {response.status_code}: {response.text}'
            print(f"Error: {error_msg}")
            sys.exit(1)

    except requests.ConnectionError:
        print(f"Error: Could not connect to {endpoint}")
        print("Make sure the search server is running")
        sys.exit(1)
    except requests.Timeout:
        print(f"Error: Request timed out after {args.timeout} seconds")
        sys.exit(1)
    except requests.RequestException as e:
        print(f"Error making request: {e}")
        sys.exit(1)
    except Exception as e:
        # Top-level CLI boundary: report, optionally show the traceback, exit.
        print(f"Error: {e}")
        if args.verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)
1216
+
436
1217
  def console_entry_point():
437
1218
  """Console script entry point for pip installation"""
438
1219
  import sys
439
1220
 
1221
+ # Fast help check - show help without importing heavy modules
1222
+ if len(sys.argv) > 1 and sys.argv[1] in ['--help', '-h']:
1223
+ print("""usage: sw-search [-h] [--output OUTPUT] [--chunking-strategy {sentence,sliding,paragraph,page,semantic,topic,qa}]
1224
+ [--max-sentences-per-chunk MAX_SENTENCES_PER_CHUNK] [--chunk-size CHUNK_SIZE]
1225
+ [--overlap-size OVERLAP_SIZE] [--split-newlines SPLIT_NEWLINES] [--file-types FILE_TYPES]
1226
+ [--exclude EXCLUDE] [--languages LANGUAGES] [--model MODEL] [--tags TAGS]
1227
+ [--index-nlp-backend {nltk,spacy}] [--verbose] [--validate]
1228
+ [--semantic-threshold SEMANTIC_THRESHOLD] [--topic-threshold TOPIC_THRESHOLD]
1229
+ sources [sources ...]
1230
+
1231
+ Build local search index from documents
1232
+
1233
+ positional arguments:
1234
+ sources Source files and/or directories to index
1235
+
1236
+ options:
1237
+ -h, --help show this help message and exit
1238
+ --output OUTPUT Output .swsearch file (default: sources.swsearch)
1239
+ --chunking-strategy {sentence,sliding,paragraph,page,semantic,topic,qa}
1240
+ Chunking strategy to use (default: sentence)
1241
+ --max-sentences-per-chunk MAX_SENTENCES_PER_CHUNK
1242
+ Maximum sentences per chunk for sentence strategy (default: 5)
1243
+ --chunk-size CHUNK_SIZE
1244
+ Chunk size in words for sliding window strategy (default: 50)
1245
+ --overlap-size OVERLAP_SIZE
1246
+ Overlap size in words for sliding window strategy (default: 10)
1247
+ --split-newlines SPLIT_NEWLINES
1248
+ Split on multiple newlines (for sentence strategy)
1249
+ --file-types FILE_TYPES
1250
+ Comma-separated file extensions to include for directories (default: md,txt,rst)
1251
+ --exclude EXCLUDE Comma-separated glob patterns to exclude (e.g., "**/test/**,**/__pycache__/**")
1252
+ --languages LANGUAGES
1253
+ Comma-separated language codes (default: en)
1254
+ --model MODEL Sentence transformer model name (default: sentence-transformers/all-mpnet-base-v2)
1255
+ --tags TAGS Comma-separated tags to add to all chunks
1256
+ --index-nlp-backend {nltk,spacy}
1257
+ NLP backend for document processing: nltk (fast, default) or spacy (better quality, slower)
1258
+ --verbose Enable verbose output
1259
+ --validate Validate the created index after building
1260
+ --semantic-threshold SEMANTIC_THRESHOLD
1261
+ Similarity threshold for semantic chunking (default: 0.5)
1262
+ --topic-threshold TOPIC_THRESHOLD
1263
+ Similarity threshold for topic chunking (default: 0.3)
1264
+
1265
+ Examples:
1266
+ # Basic usage with directory (defaults to sentence chunking with 5 sentences per chunk)
1267
+ sw-search ./docs
1268
+
1269
+ # Multiple directories
1270
+ sw-search ./docs ./examples --file-types md,txt,py
1271
+
1272
+ # Individual files
1273
+ sw-search README.md ./docs/guide.md ./src/main.py
1274
+
1275
+ # Mixed sources (directories and files)
1276
+ sw-search ./docs README.md ./examples specific_file.txt --file-types md,txt,py
1277
+
1278
+ # Sentence-based chunking with custom parameters
1279
+ sw-search ./docs \\
1280
+ --chunking-strategy sentence \\
1281
+ --max-sentences-per-chunk 10 \\
1282
+ --split-newlines 2
1283
+
1284
+ # Sliding window chunking
1285
+ sw-search ./docs \\
1286
+ --chunking-strategy sliding \\
1287
+ --chunk-size 100 \\
1288
+ --overlap-size 20
1289
+
1290
+ # Paragraph-based chunking
1291
+ sw-search ./docs \\
1292
+ --chunking-strategy paragraph \\
1293
+ --file-types md,txt,rst
1294
+
1295
+ # Page-based chunking (good for PDFs)
1296
+ sw-search ./docs \\
1297
+ --chunking-strategy page \\
1298
+ --file-types pdf
1299
+
1300
+ # Semantic chunking (groups semantically similar sentences)
1301
+ sw-search ./docs \\
1302
+ --chunking-strategy semantic \\
1303
+ --semantic-threshold 0.6
1304
+
1305
+ # Topic-based chunking (groups by topic changes)
1306
+ sw-search ./docs \\
1307
+ --chunking-strategy topic \\
1308
+ --topic-threshold 0.2
1309
+
1310
+ # QA-optimized chunking (optimized for question-answering)
1311
+ sw-search ./docs \\
1312
+ --chunking-strategy qa
1313
+
1314
+ # Full configuration example
1315
+ sw-search ./docs ./examples README.md \\
1316
+ --output ./knowledge.swsearch \\
1317
+ --chunking-strategy sentence \\
1318
+ --max-sentences-per-chunk 8 \\
1319
+ --file-types md,txt,rst,py \\
1320
+ --exclude "**/test/**,**/__pycache__/**" \\
1321
+ --languages en,es,fr \\
1322
+ --model sentence-transformers/all-mpnet-base-v2 \\
1323
+ --tags documentation,api \\
1324
+ --verbose
1325
+
1326
+ # Validate an existing index
1327
+ sw-search validate ./docs.swsearch
1328
+
1329
+ # Search within an index
1330
+ sw-search search ./docs.swsearch "how to create an agent"
1331
+ sw-search search ./docs.swsearch "API reference" --count 3 --verbose
1332
+ sw-search search ./docs.swsearch "configuration" --tags documentation --json
1333
+
1334
+ # Search via remote API
1335
+ sw-search remote http://localhost:8001 "how to create an agent" --index-name docs
1336
+ sw-search remote localhost:8001 "API reference" --index-name docs --count 3 --verbose
1337
+ """)
1338
+ return
1339
+
440
1340
  # Check for subcommands
441
1341
  if len(sys.argv) > 1:
442
1342
  if sys.argv[1] == 'validate':
@@ -449,6 +1349,16 @@ def console_entry_point():
449
1349
  sys.argv.pop(1)
450
1350
  search_command()
451
1351
  return
1352
+ elif sys.argv[1] == 'remote':
1353
+ # Remove 'remote' from argv and call remote_command
1354
+ sys.argv.pop(1)
1355
+ remote_command()
1356
+ return
1357
+ elif sys.argv[1] == 'migrate':
1358
+ # Remove 'migrate' from argv and call migrate_command
1359
+ sys.argv.pop(1)
1360
+ migrate_command()
1361
+ return
452
1362
 
453
1363
  # Regular build command
454
1364
  main()