signalwire-agents 0.1.6__py3-none-any.whl → 1.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- signalwire_agents/__init__.py +130 -4
- signalwire_agents/agent_server.py +438 -32
- signalwire_agents/agents/bedrock.py +296 -0
- signalwire_agents/cli/__init__.py +18 -0
- signalwire_agents/cli/build_search.py +1367 -0
- signalwire_agents/cli/config.py +80 -0
- signalwire_agents/cli/core/__init__.py +10 -0
- signalwire_agents/cli/core/agent_loader.py +470 -0
- signalwire_agents/cli/core/argparse_helpers.py +179 -0
- signalwire_agents/cli/core/dynamic_config.py +71 -0
- signalwire_agents/cli/core/service_loader.py +303 -0
- signalwire_agents/cli/execution/__init__.py +10 -0
- signalwire_agents/cli/execution/datamap_exec.py +446 -0
- signalwire_agents/cli/execution/webhook_exec.py +134 -0
- signalwire_agents/cli/init_project.py +1225 -0
- signalwire_agents/cli/output/__init__.py +10 -0
- signalwire_agents/cli/output/output_formatter.py +255 -0
- signalwire_agents/cli/output/swml_dump.py +186 -0
- signalwire_agents/cli/simulation/__init__.py +10 -0
- signalwire_agents/cli/simulation/data_generation.py +374 -0
- signalwire_agents/cli/simulation/data_overrides.py +200 -0
- signalwire_agents/cli/simulation/mock_env.py +282 -0
- signalwire_agents/cli/swaig_test_wrapper.py +52 -0
- signalwire_agents/cli/test_swaig.py +809 -0
- signalwire_agents/cli/types.py +81 -0
- signalwire_agents/core/__init__.py +2 -2
- signalwire_agents/core/agent/__init__.py +12 -0
- signalwire_agents/core/agent/config/__init__.py +12 -0
- signalwire_agents/core/agent/deployment/__init__.py +9 -0
- signalwire_agents/core/agent/deployment/handlers/__init__.py +9 -0
- signalwire_agents/core/agent/prompt/__init__.py +14 -0
- signalwire_agents/core/agent/prompt/manager.py +306 -0
- signalwire_agents/core/agent/routing/__init__.py +9 -0
- signalwire_agents/core/agent/security/__init__.py +9 -0
- signalwire_agents/core/agent/swml/__init__.py +9 -0
- signalwire_agents/core/agent/tools/__init__.py +15 -0
- signalwire_agents/core/agent/tools/decorator.py +97 -0
- signalwire_agents/core/agent/tools/registry.py +210 -0
- signalwire_agents/core/agent_base.py +959 -2166
- signalwire_agents/core/auth_handler.py +233 -0
- signalwire_agents/core/config_loader.py +259 -0
- signalwire_agents/core/contexts.py +707 -0
- signalwire_agents/core/data_map.py +487 -0
- signalwire_agents/core/function_result.py +1150 -1
- signalwire_agents/core/logging_config.py +376 -0
- signalwire_agents/core/mixins/__init__.py +28 -0
- signalwire_agents/core/mixins/ai_config_mixin.py +442 -0
- signalwire_agents/core/mixins/auth_mixin.py +287 -0
- signalwire_agents/core/mixins/prompt_mixin.py +358 -0
- signalwire_agents/core/mixins/serverless_mixin.py +368 -0
- signalwire_agents/core/mixins/skill_mixin.py +55 -0
- signalwire_agents/core/mixins/state_mixin.py +153 -0
- signalwire_agents/core/mixins/tool_mixin.py +230 -0
- signalwire_agents/core/mixins/web_mixin.py +1134 -0
- signalwire_agents/core/security/session_manager.py +174 -86
- signalwire_agents/core/security_config.py +333 -0
- signalwire_agents/core/skill_base.py +200 -0
- signalwire_agents/core/skill_manager.py +244 -0
- signalwire_agents/core/swaig_function.py +33 -9
- signalwire_agents/core/swml_builder.py +212 -12
- signalwire_agents/core/swml_handler.py +43 -13
- signalwire_agents/core/swml_renderer.py +123 -297
- signalwire_agents/core/swml_service.py +277 -260
- signalwire_agents/prefabs/concierge.py +6 -2
- signalwire_agents/prefabs/info_gatherer.py +149 -33
- signalwire_agents/prefabs/receptionist.py +14 -22
- signalwire_agents/prefabs/survey.py +6 -2
- signalwire_agents/schema.json +9218 -5489
- signalwire_agents/search/__init__.py +137 -0
- signalwire_agents/search/document_processor.py +1223 -0
- signalwire_agents/search/index_builder.py +804 -0
- signalwire_agents/search/migration.py +418 -0
- signalwire_agents/search/models.py +30 -0
- signalwire_agents/search/pgvector_backend.py +752 -0
- signalwire_agents/search/query_processor.py +502 -0
- signalwire_agents/search/search_engine.py +1264 -0
- signalwire_agents/search/search_service.py +574 -0
- signalwire_agents/skills/README.md +452 -0
- signalwire_agents/skills/__init__.py +23 -0
- signalwire_agents/skills/api_ninjas_trivia/README.md +215 -0
- signalwire_agents/skills/api_ninjas_trivia/__init__.py +12 -0
- signalwire_agents/skills/api_ninjas_trivia/skill.py +237 -0
- signalwire_agents/skills/datasphere/README.md +210 -0
- signalwire_agents/skills/datasphere/__init__.py +12 -0
- signalwire_agents/skills/datasphere/skill.py +310 -0
- signalwire_agents/skills/datasphere_serverless/README.md +258 -0
- signalwire_agents/skills/datasphere_serverless/__init__.py +10 -0
- signalwire_agents/skills/datasphere_serverless/skill.py +237 -0
- signalwire_agents/skills/datetime/README.md +132 -0
- signalwire_agents/skills/datetime/__init__.py +10 -0
- signalwire_agents/skills/datetime/skill.py +126 -0
- signalwire_agents/skills/joke/README.md +149 -0
- signalwire_agents/skills/joke/__init__.py +10 -0
- signalwire_agents/skills/joke/skill.py +109 -0
- signalwire_agents/skills/math/README.md +161 -0
- signalwire_agents/skills/math/__init__.py +10 -0
- signalwire_agents/skills/math/skill.py +105 -0
- signalwire_agents/skills/mcp_gateway/README.md +230 -0
- signalwire_agents/skills/mcp_gateway/__init__.py +10 -0
- signalwire_agents/skills/mcp_gateway/skill.py +421 -0
- signalwire_agents/skills/native_vector_search/README.md +210 -0
- signalwire_agents/skills/native_vector_search/__init__.py +10 -0
- signalwire_agents/skills/native_vector_search/skill.py +820 -0
- signalwire_agents/skills/play_background_file/README.md +218 -0
- signalwire_agents/skills/play_background_file/__init__.py +12 -0
- signalwire_agents/skills/play_background_file/skill.py +242 -0
- signalwire_agents/skills/registry.py +459 -0
- signalwire_agents/skills/spider/README.md +236 -0
- signalwire_agents/skills/spider/__init__.py +13 -0
- signalwire_agents/skills/spider/skill.py +598 -0
- signalwire_agents/skills/swml_transfer/README.md +395 -0
- signalwire_agents/skills/swml_transfer/__init__.py +10 -0
- signalwire_agents/skills/swml_transfer/skill.py +359 -0
- signalwire_agents/skills/weather_api/README.md +178 -0
- signalwire_agents/skills/weather_api/__init__.py +12 -0
- signalwire_agents/skills/weather_api/skill.py +191 -0
- signalwire_agents/skills/web_search/README.md +163 -0
- signalwire_agents/skills/web_search/__init__.py +10 -0
- signalwire_agents/skills/web_search/skill.py +739 -0
- signalwire_agents/skills/wikipedia_search/README.md +228 -0
- signalwire_agents/{core/state → skills/wikipedia_search}/__init__.py +5 -4
- signalwire_agents/skills/wikipedia_search/skill.py +210 -0
- signalwire_agents/utils/__init__.py +14 -0
- signalwire_agents/utils/schema_utils.py +111 -44
- signalwire_agents/web/__init__.py +17 -0
- signalwire_agents/web/web_service.py +559 -0
- signalwire_agents-1.0.7.data/data/share/man/man1/sw-agent-init.1 +307 -0
- signalwire_agents-1.0.7.data/data/share/man/man1/sw-search.1 +483 -0
- signalwire_agents-1.0.7.data/data/share/man/man1/swaig-test.1 +308 -0
- signalwire_agents-1.0.7.dist-info/METADATA +992 -0
- signalwire_agents-1.0.7.dist-info/RECORD +142 -0
- {signalwire_agents-0.1.6.dist-info → signalwire_agents-1.0.7.dist-info}/WHEEL +1 -1
- signalwire_agents-1.0.7.dist-info/entry_points.txt +4 -0
- signalwire_agents/core/state/file_state_manager.py +0 -219
- signalwire_agents/core/state/state_manager.py +0 -101
- signalwire_agents-0.1.6.data/data/schema.json +0 -5611
- signalwire_agents-0.1.6.dist-info/METADATA +0 -199
- signalwire_agents-0.1.6.dist-info/RECORD +0 -34
- {signalwire_agents-0.1.6.dist-info → signalwire_agents-1.0.7.dist-info}/licenses/LICENSE +0 -0
- {signalwire_agents-0.1.6.dist-info → signalwire_agents-1.0.7.dist-info}/top_level.txt +0 -0
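
The bulk of the new functionality lands in signalwire_agents/cli/build_search.py (+1367 lines), whose diff follows. The same build that the sw-search command performs can be driven programmatically through IndexBuilder. The sketch below is inferred from the CLI wiring in the diff, so treat the exact constructor and build_index_from_sources() signatures as assumptions rather than documented API:

    # Sketch: rough programmatic equivalent of `sw-search ./docs --output docs.swsearch`.
    # Keyword arguments mirror the CLI flags wired up in build_search.py below;
    # the exact signatures are assumptions based on this diff.
    from pathlib import Path
    from signalwire_agents.search.index_builder import IndexBuilder

    builder = IndexBuilder(
        chunking_strategy="sentence",      # CLI default
        max_sentences_per_chunk=5,         # CLI default
        verbose=True,
    )
    builder.build_index_from_sources(
        sources=[Path("./docs")],          # directories and/or individual files
        output_file="docs.swsearch",
        file_types=["md", "txt", "rst"],   # CLI default for directories
        exclude_patterns=None,
        languages=["en"],
        tags=None,
    )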
signalwire_agents/cli/build_search.py
@@ -0,0 +1,1367 @@
"""
Copyright (c) 2025 SignalWire

This file is part of the SignalWire AI Agents SDK.

Licensed under the MIT License.
See LICENSE file in the project root for full license information.
"""

import argparse
import sys
from pathlib import Path
from datetime import datetime

from signalwire_agents.search.models import MODEL_ALIASES, DEFAULT_MODEL, resolve_model_alias

def main():
    """Main entry point for the build-search command"""
    parser = argparse.ArgumentParser(
        description='Build local search index from documents',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Basic usage with directory (defaults to sentence chunking with 5 sentences per chunk)
  sw-search ./docs

  # Multiple directories
  sw-search ./docs ./examples --file-types md,txt,py

  # Individual files
  sw-search README.md ./docs/guide.md ./src/main.py

  # Mixed sources (directories and files)
  sw-search ./docs README.md ./examples specific_file.txt --file-types md,txt,py

  # Sentence-based chunking with custom parameters
  sw-search ./docs \\
    --chunking-strategy sentence \\
    --max-sentences-per-chunk 10 \\
    --split-newlines 2

  # Sliding window chunking
  sw-search ./docs \\
    --chunking-strategy sliding \\
    --chunk-size 100 \\
    --overlap-size 20

  # Paragraph-based chunking
  sw-search ./docs \\
    --chunking-strategy paragraph \\
    --file-types md,txt,rst

  # Page-based chunking (good for PDFs)
  sw-search ./docs \\
    --chunking-strategy page \\
    --file-types pdf

  # Semantic chunking (groups semantically similar sentences)
  sw-search ./docs \\
    --chunking-strategy semantic \\
    --semantic-threshold 0.6

  # Topic-based chunking (groups by topic changes)
  sw-search ./docs \\
    --chunking-strategy topic \\
    --topic-threshold 0.2

  # QA-optimized chunking (optimized for question-answering)
  sw-search ./docs \\
    --chunking-strategy qa

  # Markdown-aware chunking (preserves headers, detects code blocks, adds tags)
  sw-search ./docs \\
    --chunking-strategy markdown \\
    --file-types md
  # This strategy:
  # - Chunks at header boundaries (h1, h2, h3...)
  # - Detects code blocks and extracts language (python, bash, etc)
  # - Adds "code" tags to chunks with code for better search
  # - Preserves section hierarchy in metadata

  # Model selection examples (performance vs quality tradeoff)
  sw-search ./docs --model mini   # Fastest (~5x faster), 384 dims, good for most use cases
  sw-search ./docs --model base   # Balanced speed/quality, 768 dims (previous default)
  sw-search ./docs --model large  # Best quality (same as base currently)
  # Or use full model names:
  sw-search ./docs --model sentence-transformers/all-MiniLM-L6-v2
  sw-search ./docs --model sentence-transformers/all-mpnet-base-v2

  # JSON-based chunking (pre-chunked content)
  sw-search ./api_chunks.json \\
    --chunking-strategy json \\
    --file-types json

  # Export chunks to JSON for review (single file)
  sw-search ./docs \\
    --output-format json \\
    --output all_chunks.json

  # Export chunks to JSON (one file per source)
  sw-search ./docs \\
    --output-format json \\
    --output-dir ./chunks/

  # Build index from exported JSON chunks
  sw-search ./chunks/ \\
    --chunking-strategy json \\
    --file-types json \\
    --output final.swsearch

  # Full configuration example
  sw-search ./docs ./examples README.md \\
    --output ./knowledge.swsearch \\
    --chunking-strategy sentence \\
    --max-sentences-per-chunk 8 \\
    --file-types md,txt,rst,py \\
    --exclude "**/test/**,**/__pycache__/**" \\
    --languages en,es,fr \\
    --model sentence-transformers/all-mpnet-base-v2 \\
    --tags documentation,api \\
    --verbose

  # Validate an existing index
  sw-search validate ./docs.swsearch

  # Search within an index
  sw-search search ./docs.swsearch "how to create an agent"
  sw-search search ./docs.swsearch "API reference" --count 3 --verbose
  sw-search search ./docs.swsearch "configuration" --tags documentation --json

  # Search via remote API
  sw-search remote http://localhost:8001 "how to create an agent" --index-name docs
  sw-search remote localhost:8001 "API reference" --index-name docs --count 3 --verbose

  # Migrate between backends
  sw-search migrate ./docs.swsearch --to-pgvector \\
    --connection-string "postgresql://user:pass@localhost/db" \\
    --collection-name docs_collection
  sw-search migrate --info ./docs.swsearch

  # PostgreSQL pgvector backend (direct build to PostgreSQL)
  sw-search ./docs \\
    --backend pgvector \\
    --connection-string "postgresql://user:pass@localhost:5432/knowledge" \\
    --output docs_collection

  # pgvector with markdown strategy (best for documentation with code examples)
  sw-search ./docs \\
    --backend pgvector \\
    --connection-string "postgresql://user:pass@localhost:5432/knowledge" \\
    --output docs_collection \\
    --chunking-strategy markdown

  # Overwrite existing pgvector collection
  sw-search ./docs \\
    --backend pgvector \\
    --connection-string "postgresql://user:pass@localhost:5432/knowledge" \\
    --output docs_collection \\
    --overwrite

  # Search in pgvector collection
  sw-search search docs_collection "how to create an agent" \\
    --backend pgvector \\
    --connection-string "postgresql://user:pass@localhost/knowledge"
"""
    )

    parser.add_argument(
        'sources',
        nargs='+',
        help='Source files and/or directories to index'
    )

    parser.add_argument(
        '--output',
        help='Output .swsearch file (default: sources.swsearch) or collection name for pgvector'
    )

    parser.add_argument(
        '--output-dir',
        help='Output directory for results (creates one file per source file when used with --output-format json, or auto-names index files)'
    )

    parser.add_argument(
        '--output-format',
        choices=['index', 'json'],
        default='index',
        help='Output format: index (create search index) or json (export chunks as JSON) (default: index)'
    )

    parser.add_argument(
        '--backend',
        choices=['sqlite', 'pgvector'],
        default='sqlite',
        help='Storage backend to use (default: sqlite)'
    )

    parser.add_argument(
        '--connection-string',
        help='PostgreSQL connection string for pgvector backend'
    )

    parser.add_argument(
        '--overwrite',
        action='store_true',
        help='Overwrite existing collection (pgvector backend only)'
    )

    parser.add_argument(
        '--chunking-strategy',
        choices=['sentence', 'sliding', 'paragraph', 'page', 'semantic', 'topic', 'qa', 'json', 'markdown'],
        default='sentence',
        help='Chunking strategy to use (default: sentence). Use "markdown" for documentation with code blocks.'
    )

    parser.add_argument(
        '--max-sentences-per-chunk',
        type=int,
        default=5,
        help='Maximum sentences per chunk for sentence strategy (default: 5)'
    )

    parser.add_argument(
        '--chunk-size',
        type=int,
        default=50,
        help='Chunk size in words for sliding window strategy (default: 50)'
    )

    parser.add_argument(
        '--overlap-size',
        type=int,
        default=10,
        help='Overlap size in words for sliding window strategy (default: 10)'
    )

    parser.add_argument(
        '--split-newlines',
        type=int,
        help='Split on multiple newlines (for sentence strategy)'
    )

    parser.add_argument(
        '--file-types',
        default='md,txt,rst',
        help='Comma-separated file extensions to include for directories (default: md,txt,rst)'
    )

    parser.add_argument(
        '--exclude',
        help='Comma-separated glob patterns to exclude (e.g., "**/test/**,**/__pycache__/**")'
    )

    parser.add_argument(
        '--languages',
        default='en',
        help='Comma-separated language codes (default: en)'
    )

    parser.add_argument(
        '--model',
        default=DEFAULT_MODEL,
        help=f'Sentence transformer model name or alias (mini/base/large). Default: mini ({DEFAULT_MODEL})'
    )

    parser.add_argument(
        '--tags',
        help='Comma-separated tags to add to all chunks'
    )

    parser.add_argument(
        '--index-nlp-backend',
        choices=['nltk', 'spacy'],
        default='nltk',
        help='NLP backend for document processing: nltk (fast, default) or spacy (better quality, slower)'
    )

    parser.add_argument(
        '--verbose',
        action='store_true',
        help='Enable verbose output'
    )

    parser.add_argument(
        '--validate',
        action='store_true',
        help='Validate the created index after building'
    )

    parser.add_argument(
        '--semantic-threshold',
        type=float,
        default=0.5,
        help='Similarity threshold for semantic chunking (default: 0.5)'
    )

    parser.add_argument(
        '--topic-threshold',
        type=float,
        default=0.3,
        help='Similarity threshold for topic chunking (default: 0.3)'
    )

    args = parser.parse_args()

    # Resolve model aliases
    args.model = resolve_model_alias(args.model)

    # Validate sources
    valid_sources = []
    for source in args.sources:
        source_path = Path(source)
        if not source_path.exists():
            print(f"Warning: Source does not exist, skipping: {source}")
            continue
        valid_sources.append(source_path)

    if not valid_sources:
        print("Error: No valid sources found")
        sys.exit(1)

    # Validate backend configuration
    if args.backend == 'pgvector' and not args.connection_string:
        print("Error: --connection-string is required for pgvector backend")
        sys.exit(1)

    # Validate output options
    if args.output and args.output_dir:
        print("Error: Cannot specify both --output and --output-dir")
        sys.exit(1)

    # Handle JSON output format differently
    if args.output_format == 'json':
        # JSON export doesn't use backend
        if args.backend != 'sqlite':
            print("Warning: --backend is ignored when using --output-format json")

        # Determine output location
        if args.output_dir:
            # Multiple files mode
            output_path = Path(args.output_dir)
            if not output_path.exists():
                output_path.mkdir(parents=True, exist_ok=True)
        elif args.output:
            # Single file mode
            output_path = Path(args.output)
            if not output_path.suffix:
                output_path = output_path.with_suffix('.json')
        else:
            # Default to single file
            output_path = Path('chunks.json')
        args.output = str(output_path)

    # Default output filename (for index format)
    if args.output_format == 'index' and not args.output and not args.output_dir:
        if args.backend == 'sqlite':
            if len(valid_sources) == 1:
                # Single source - use its name
                source_name = valid_sources[0].stem if valid_sources[0].is_file() else valid_sources[0].name
                args.output = f"{source_name}.swsearch"
            else:
                # Multiple sources - use generic name
                args.output = "sources.swsearch"
        else:
            # For pgvector, use a default collection name
            if len(valid_sources) == 1:
                source_name = valid_sources[0].stem if valid_sources[0].is_file() else valid_sources[0].name
                args.output = source_name
            else:
                args.output = "documents"

    # Handle --output-dir for index format
    if args.output_format == 'index' and args.output_dir:
        # Auto-generate output filename in the directory
        if len(valid_sources) == 1:
            source_name = valid_sources[0].stem if valid_sources[0].is_file() else valid_sources[0].name
        else:
            source_name = "combined"

        output_dir = Path(args.output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        if args.backend == 'sqlite':
            args.output = str(output_dir / f"{source_name}.swsearch")
        else:
            # For pgvector, still use the name as collection
            args.output = source_name

    # Ensure output has .swsearch extension for sqlite (but not for JSON format)
    if args.output_format == 'index' and args.backend == 'sqlite' and args.output and not args.output.endswith('.swsearch'):
        args.output += '.swsearch'

    # Parse lists
    file_types = [ft.strip() for ft in args.file_types.split(',')]
    exclude_patterns = [p.strip() for p in args.exclude.split(',')] if args.exclude else None
    languages = [lang.strip() for lang in args.languages.split(',')]
    tags = [tag.strip() for tag in args.tags.split(',')] if args.tags else None

    if args.verbose:
        print(f"Building search index:")
        print(f"  Backend: {args.backend}")
        print(f"  Sources: {[str(s) for s in valid_sources]}")
        if args.backend == 'sqlite':
            print(f"  Output file: {args.output}")
        else:
            print(f"  Collection name: {args.output}")
            print(f"  Connection: {args.connection_string}")
        print(f"  File types (for directories): {file_types}")
        print(f"  Exclude patterns: {exclude_patterns}")
        print(f"  Languages: {languages}")
        print(f"  Model: {args.model}")
        print(f"  Chunking strategy: {args.chunking_strategy}")
        print(f"  Index NLP backend: {args.index_nlp_backend}")

        if args.chunking_strategy == 'sentence':
            print(f"  Max sentences per chunk: {args.max_sentences_per_chunk}")
            if args.split_newlines:
                print(f"  Split on newlines: {args.split_newlines}")
        elif args.chunking_strategy == 'sliding':
            print(f"  Chunk size (words): {args.chunk_size}")
            print(f"  Overlap size (words): {args.overlap_size}")
        elif args.chunking_strategy == 'paragraph':
            print(f"  Chunking by paragraphs (double newlines)")
        elif args.chunking_strategy == 'page':
            print(f"  Chunking by pages")
        elif args.chunking_strategy == 'semantic':
            print(f"  Semantic chunking (similarity threshold: {args.semantic_threshold})")
        elif args.chunking_strategy == 'topic':
            print(f"  Topic-based chunking (similarity threshold: {args.topic_threshold})")
        elif args.chunking_strategy == 'qa':
            print(f"  QA-optimized chunking")

        print(f"  Tags: {tags}")
        print()

    try:
        # Handle JSON export mode
        if args.output_format == 'json':
            # Import what we need for chunking
            from signalwire_agents.search.index_builder import IndexBuilder
            import json

            builder = IndexBuilder(
                chunking_strategy=args.chunking_strategy,
                max_sentences_per_chunk=args.max_sentences_per_chunk,
                chunk_size=args.chunk_size,
                chunk_overlap=args.overlap_size,
                split_newlines=args.split_newlines,
                index_nlp_backend=args.index_nlp_backend,
                verbose=args.verbose,
                semantic_threshold=args.semantic_threshold,
                topic_threshold=args.topic_threshold
            )

            # Process files and export chunks
            all_chunks = []
            chunk_files_created = []

            # Discover files from sources
            files = builder._discover_files_from_sources(valid_sources, file_types, exclude_patterns)

            if args.verbose:
                print(f"Processing {len(files)} files...")

            for file_path in files:
                try:
                    # Determine base directory for relative paths
                    base_dir = builder._get_base_directory_for_file(file_path, valid_sources)

                    # Process file into chunks
                    chunks = builder._process_file(file_path, base_dir, tags)

                    if args.output_dir:
                        # Create individual JSON file
                        relative_path = file_path.relative_to(base_dir) if base_dir else file_path.name
                        json_filename = relative_path.with_suffix('.json')
                        json_path = Path(args.output_dir) / json_filename

                        # Create subdirectories if needed
                        json_path.parent.mkdir(parents=True, exist_ok=True)

                        # Save chunks to JSON
                        chunk_data = {
                            "chunks": chunks,
                            "metadata": {
                                "source_file": str(relative_path),
                                "total_chunks": len(chunks),
                                "chunking_strategy": args.chunking_strategy,
                                "processing_date": datetime.now().isoformat()
                            }
                        }

                        with open(json_path, 'w', encoding='utf-8') as f:
                            json.dump(chunk_data, f, indent=2, ensure_ascii=False)

                        chunk_files_created.append(json_path)
                        if args.verbose:
                            print(f"  Created: {json_path} ({len(chunks)} chunks)")
                    else:
                        # Accumulate all chunks for single file output
                        all_chunks.extend(chunks)

                except Exception as e:
                    print(f"Error processing {file_path}: {e}")
                    if args.verbose:
                        import traceback
                        traceback.print_exc()

            # Handle single file output
            if not args.output_dir:
                output_data = {
                    "chunks": all_chunks,
                    "metadata": {
                        "total_chunks": len(all_chunks),
                        "total_files": len(files),
                        "chunking_strategy": args.chunking_strategy,
                        "processing_date": datetime.now().isoformat()
                    }
                }

                with open(args.output, 'w', encoding='utf-8') as f:
                    json.dump(output_data, f, indent=2, ensure_ascii=False)

                print(f"✓ Exported {len(all_chunks)} chunks to {args.output}")
            else:
                print(f"✓ Created {len(chunk_files_created)} JSON files in {args.output_dir}")
                total_chunks = sum(len(json.load(open(f))['chunks']) for f in chunk_files_created)
                print(f"  Total chunks: {total_chunks}")

            # Exit early for JSON format
            return

        # Regular index building mode
        # Create index builder - import only when actually needed
        from signalwire_agents.search.index_builder import IndexBuilder
        builder = IndexBuilder(
            model_name=args.model,
            chunking_strategy=args.chunking_strategy,
            max_sentences_per_chunk=args.max_sentences_per_chunk,
            chunk_size=args.chunk_size,
            chunk_overlap=args.overlap_size,
            split_newlines=args.split_newlines,
            index_nlp_backend=args.index_nlp_backend,
            verbose=args.verbose,
            semantic_threshold=args.semantic_threshold,
            topic_threshold=args.topic_threshold,
            backend=args.backend,
            connection_string=args.connection_string
        )

        # Build index with multiple sources
        builder.build_index_from_sources(
            sources=valid_sources,
            output_file=args.output,
            file_types=file_types,
            exclude_patterns=exclude_patterns,
            languages=languages,
            tags=tags,
            overwrite=args.overwrite if args.backend == 'pgvector' else False
        )

        # Validate if requested
        if args.validate:
            if args.verbose:
                print("\nValidating index...")

            validation = builder.validate_index(args.output)
            if validation['valid']:
                print(f"✓ Index validation successful:")
                print(f"  Chunks: {validation['chunk_count']}")
                print(f"  Files: {validation['file_count']}")
                if args.verbose:
                    print(f"  Config: {validation['config']}")
            else:
                print(f"✗ Index validation failed: {validation['error']}")
                sys.exit(1)

        if args.backend == 'sqlite':
            # Check if the index was actually created
            import os
            if os.path.exists(args.output):
                print(f"\n✓ Search index created successfully: {args.output}")
            else:
                print(f"\n✗ Search index creation failed - no files were processed")
                sys.exit(1)
        else:
            print(f"\n✓ Search collection created successfully: {args.output}")
            print(f"  Connection: {args.connection_string}")

    except KeyboardInterrupt:
        print("\n\nBuild interrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"\nError building index: {e}")
        if args.verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)

def validate_command():
    """Validate an existing search index"""
    parser = argparse.ArgumentParser(description='Validate a search index file')
    parser.add_argument('index_file', help='Path to .swsearch file to validate')
    parser.add_argument('--verbose', action='store_true', help='Show detailed information')

    args = parser.parse_args()

    if not Path(args.index_file).exists():
        print(f"Error: Index file does not exist: {args.index_file}")
        sys.exit(1)

    try:
        from signalwire_agents.search.index_builder import IndexBuilder
        builder = IndexBuilder()

        validation = builder.validate_index(args.index_file)

        if validation['valid']:
            print(f"✓ Index is valid: {args.index_file}")
            print(f"  Chunks: {validation['chunk_count']}")
            print(f"  Files: {validation['file_count']}")

            if args.verbose and 'config' in validation:
                print("\nConfiguration:")
                for key, value in validation['config'].items():
                    print(f"  {key}: {value}")
        else:
            print(f"✗ Index validation failed: {validation['error']}")
            sys.exit(1)

    except Exception as e:
        print(f"Error validating index: {e}")
        if args.verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)

def search_command():
    """Search within an existing search index"""
    parser = argparse.ArgumentParser(description='Search within a .swsearch index file or pgvector collection')
    parser.add_argument('index_source', help='Path to .swsearch file or collection name for pgvector')
    parser.add_argument('query', nargs='?', help='Search query (optional if using --shell)')
    parser.add_argument('--backend', choices=['sqlite', 'pgvector'], default='sqlite',
                        help='Storage backend (default: sqlite)')
    parser.add_argument('--connection-string', help='PostgreSQL connection string for pgvector backend')
    parser.add_argument('--shell', action='store_true',
                        help='Interactive shell mode - load once and search multiple times')
    parser.add_argument('--count', type=int, default=5, help='Number of results to return (default: 5)')
    parser.add_argument('--distance-threshold', type=float, default=0.0, help='Minimum similarity score (default: 0.0)')
    parser.add_argument('--tags', help='Comma-separated tags to filter by')
    parser.add_argument('--query-nlp-backend', choices=['nltk', 'spacy'], default='nltk',
                        help='NLP backend for query processing: nltk (fast, default) or spacy (better quality, slower)')
    parser.add_argument('--keyword-weight', type=float, default=None,
                        help='Manual keyword weight (0.0-1.0). Overrides automatic weight detection.')
    parser.add_argument('--verbose', action='store_true', help='Show detailed information')
    parser.add_argument('--json', action='store_true', help='Output results as JSON')
    parser.add_argument('--no-content', action='store_true', help='Hide content in results (show only metadata)')
    parser.add_argument('--model', help='Override embedding model for query (mini/base/large or full model name)')

    args = parser.parse_args()

    # Validate arguments
    if not args.shell and not args.query:
        print("Error: Query is required unless using --shell mode")
        sys.exit(1)

    # Resolve model aliases
    if args.model and args.model in MODEL_ALIASES:
        args.model = MODEL_ALIASES[args.model]

    # Validate keyword weight if provided
    if args.keyword_weight is not None:
        if args.keyword_weight < 0.0 or args.keyword_weight > 1.0:
            print("Error: --keyword-weight must be between 0.0 and 1.0")
            sys.exit(1)

    # Validate backend configuration
    if args.backend == 'pgvector' and not args.connection_string:
        print("Error: --connection-string is required for pgvector backend")
        sys.exit(1)

    if args.backend == 'sqlite' and not Path(args.index_source).exists():
        print(f"Error: Index file does not exist: {args.index_source}")
        sys.exit(1)

    try:
        # Import search dependencies
        try:
            from signalwire_agents.search.search_engine import SearchEngine
            from signalwire_agents.search.query_processor import preprocess_query
        except ImportError as e:
            print(f"Error: Search functionality not available. Install with: pip install signalwire-agents[search]")
            print(f"Details: {e}")
            sys.exit(1)

        # Load search engine
        if args.verbose:
            if args.backend == 'sqlite':
                print(f"Loading search index: {args.index_source}")
            else:
                print(f"Connecting to pgvector collection: {args.index_source}")

        if args.backend == 'sqlite':
            # Pass the model from the index or override if specified
            model = args.model if args.model else None
            engine = SearchEngine(backend='sqlite', index_path=args.index_source, model=model)
        else:
            # Pass the model override if specified
            model = args.model if args.model else None
            engine = SearchEngine(backend='pgvector', connection_string=args.connection_string,
                                  collection_name=args.index_source, model=model)

        # Get index stats
        stats = engine.get_stats()

        # Get the model from index config if not overridden
        model_to_use = args.model
        if not model_to_use and 'config' in stats:
            # SQLite uses 'embedding_model', pgvector uses 'model_name'
            model_to_use = stats['config'].get('embedding_model') or stats['config'].get('model_name')

        # Shell mode implementation
        if args.shell:
            import time
            print(f"Search Shell - Index: {args.index_source}")
            print(f"Backend: {args.backend}")
            print(f"Index contains {stats['total_chunks']} chunks from {stats['total_files']} files")
            if model_to_use:
                print(f"Model: {model_to_use}")
            print("Type 'exit' or 'quit' to leave, 'help' for options")
            print("-" * 60)

            while True:
                try:
                    query = input("\nsearch> ").strip()

                    if not query:
                        continue

                    if query.lower() in ['exit', 'quit', 'q']:
                        print("Goodbye!")
                        break

                    if query.lower() == 'help':
                        print("\nShell commands:")
                        print("  help - Show this help")
                        print("  exit/quit/q - Exit shell")
                        print("  count=N - Set result count (current: {})".format(args.count))
                        print("  tags=tag1,tag2 - Set tag filter (current: {})".format(args.tags or 'none'))
                        print("  verbose - Toggle verbose output")
                        print("\nOr type any search query...")
                        continue

                    # Handle shell commands
                    if query.startswith('count='):
                        try:
                            args.count = int(query.split('=')[1])
                            print(f"Result count set to: {args.count}")
                        except ValueError:
                            print("Invalid count value")
                        continue

                    if query.startswith('tags='):
                        tag_str = query.split('=', 1)[1]
                        args.tags = tag_str if tag_str else None
                        tags = [tag.strip() for tag in args.tags.split(',')] if args.tags else None
                        print(f"Tags filter set to: {tags or 'none'}")
                        continue

                    if query == 'verbose':
                        args.verbose = not args.verbose
                        print(f"Verbose output: {'on' if args.verbose else 'off'}")
                        continue

                    # Perform search with timing
                    start_time = time.time()

                    # Preprocess query
                    enhanced = preprocess_query(
                        query,
                        vector=True,
                        query_nlp_backend=args.query_nlp_backend,
                        model_name=model_to_use,
                        preserve_original=True,
                        max_synonyms=2
                    )

                    # Parse tags
                    tags = [tag.strip() for tag in args.tags.split(',')] if args.tags else None

                    # Perform search
                    results = engine.search(
                        query_vector=enhanced.get('vector'),
                        enhanced_text=enhanced.get('enhanced_text', query),
                        count=args.count,
                        similarity_threshold=args.distance_threshold,
                        tags=tags,
                        keyword_weight=args.keyword_weight,
                        original_query=query
                    )

                    search_time = time.time() - start_time

                    # Display results
                    if not results:
                        print(f"\nNo results found for '{query}' ({search_time:.3f}s)")
                    else:
                        print(f"\nFound {len(results)} result(s) for '{query}' ({search_time:.3f}s):")
                        if enhanced.get('enhanced_text') != query and args.verbose:
                            print(f"Enhanced query: '{enhanced.get('enhanced_text')}'")
                        print("=" * 60)

                        for i, result in enumerate(results):
                            print(f"\n[{i+1}] Score: {result['score']:.4f}")

                            # Show metadata
                            metadata = result['metadata']
                            print(f"File: {metadata.get('filename', 'Unknown')}")
                            if metadata.get('section'):
                                print(f"Section: {metadata['section']}")

                            # Show content unless suppressed
                            if not args.no_content:
                                content = result['content']
                                if len(content) > 300 and not args.verbose:
                                    content = content[:300] + "..."
                                print(f"\n{content}")

                            if i < len(results) - 1:
                                print("-" * 40)

                except KeyboardInterrupt:
                    print("\nUse 'exit' to quit")
                except EOFError:
                    print("\nGoodbye!")
                    break
                except Exception as e:
                    print(f"\nError: {e}")
                    if args.verbose:
                        import traceback
                        traceback.print_exc()

            return  # Exit after shell mode

        # Normal single query mode
        if args.verbose:
            print(f"Index contains {stats['total_chunks']} chunks from {stats['total_files']} files")
            print(f"Searching for: '{args.query}'")
            print(f"Query NLP Backend: {args.query_nlp_backend}")
            if args.model:
                print(f"Override model: {args.model}")
            elif model_to_use:
                print(f"Using index model: {model_to_use}")
            print()

        # Preprocess query
        enhanced = preprocess_query(
            args.query,
            vector=True,  # Both backends need vector for similarity search
            query_nlp_backend=args.query_nlp_backend,
            model_name=model_to_use,
            preserve_original=True,  # Keep original query terms
            max_synonyms=2  # Reduce synonym expansion
        )

        # Parse tags if provided
        tags = [tag.strip() for tag in args.tags.split(',')] if args.tags else None

        # Perform search
        results = engine.search(
            query_vector=enhanced.get('vector'),
            enhanced_text=enhanced.get('enhanced_text', args.query),
            count=args.count,
            similarity_threshold=args.distance_threshold,
            tags=tags,
            keyword_weight=args.keyword_weight,
            original_query=args.query  # Pass original for exact match boosting
        )

        if args.json:
            # Output as JSON
            import json
            output = {
                'query': args.query,
                'enhanced_query': enhanced.get('enhanced_text', args.query),
                'count': len(results),
                'results': []
            }

            for i, result in enumerate(results):
                result_data = {
                    'rank': i + 1,
                    'score': result['score'],
                    'metadata': result['metadata']
                }
                if not args.no_content:
                    result_data['content'] = result['content']
                output['results'].append(result_data)

            print(json.dumps(output, indent=2))
        else:
            # Human-readable output
            if not results:
                print(f"No results found for '{args.query}'")
                if tags:
                    print(f"(searched with tags: {tags})")
                sys.exit(0)

            print(f"Found {len(results)} result(s) for '{args.query}':")
            if enhanced.get('enhanced_text') != args.query:
                print(f"Enhanced query: '{enhanced.get('enhanced_text')}'")
            if tags:
                print(f"Filtered by tags: {tags}")
            print("=" * 80)

            for i, result in enumerate(results):
                print(f"\n[{i+1}] Score: {result['score']:.4f}")

                # Show metadata
                metadata = result['metadata']
                print(f"File: {metadata.get('filename', 'Unknown')}")
                if metadata.get('section'):
                    print(f"Section: {metadata['section']}")
                if metadata.get('line_start'):
                    print(f"Lines: {metadata['line_start']}-{metadata.get('line_end', metadata['line_start'])}")
                if metadata.get('tags'):
                    print(f"Tags: {', '.join(metadata['tags'])}")

                # Show content unless suppressed
                if not args.no_content:
                    content = result['content']
                    if len(content) > 500 and not args.verbose:
                        content = content[:500] + "..."
                    print(f"\nContent:\n{content}")

                if i < len(results) - 1:
                    print("-" * 80)

    except Exception as e:
        print(f"Error searching index: {e}")
        if args.verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)

def migrate_command():
    """Migrate search indexes between backends"""
    parser = argparse.ArgumentParser(
        description='Migrate search indexes between SQLite and pgvector backends',
        epilog="""
Examples:
  # Migrate SQLite to pgvector
  sw-search migrate ./docs.swsearch \\
    --to-pgvector \\
    --connection-string "postgresql://user:pass@localhost/db" \\
    --collection-name docs_collection

  # Migrate with overwrite
  sw-search migrate ./docs.swsearch \\
    --to-pgvector \\
    --connection-string "postgresql://user:pass@localhost/db" \\
    --collection-name docs_collection \\
    --overwrite

  # Get index information
  sw-search migrate --info ./docs.swsearch
""",
        formatter_class=argparse.RawDescriptionHelpFormatter
    )

    # Source argument (optional if using --info)
    parser.add_argument('source', nargs='?', help='Source index file or collection')

    # Migration direction
    migration_group = parser.add_mutually_exclusive_group()
    migration_group.add_argument('--to-pgvector', action='store_true',
                                 help='Migrate SQLite index to pgvector')
    migration_group.add_argument('--to-sqlite', action='store_true',
                                 help='Migrate pgvector collection to SQLite (not yet implemented)')
    migration_group.add_argument('--info', action='store_true',
                                 help='Show information about an index')

    # pgvector options
    parser.add_argument('--connection-string',
                        help='PostgreSQL connection string for pgvector')
    parser.add_argument('--collection-name',
                        help='Collection name for pgvector')
    parser.add_argument('--overwrite', action='store_true',
                        help='Overwrite existing collection')

    # SQLite options
    parser.add_argument('--output',
                        help='Output .swsearch file path (for --to-sqlite)')

    # Common options
    parser.add_argument('--batch-size', type=int, default=100,
                        help='Number of chunks to process at once (default: 100)')
    parser.add_argument('--verbose', action='store_true',
                        help='Show detailed progress')

    args = parser.parse_args()

    # Handle --info flag
    if args.info:
        if not args.source:
            print("Error: Source index required with --info")
            sys.exit(1)

        try:
            from signalwire_agents.search.migration import SearchIndexMigrator
            migrator = SearchIndexMigrator(verbose=args.verbose)
            info = migrator.get_index_info(args.source)

            print(f"Index Information: {args.source}")
            print(f"  Type: {info['type']}")
            if info['type'] == 'sqlite':
                print(f"  Total chunks: {info['total_chunks']}")
                print(f"  Total files: {info['total_files']}")
                print(f"  Model: {info['config'].get('embedding_model', 'Unknown')}")
                print(f"  Dimensions: {info['config'].get('embedding_dimensions', 'Unknown')}")
                print(f"  Created: {info['config'].get('created_at', 'Unknown')}")
                if args.verbose:
                    print("\n  Full configuration:")
                    for key, value in info['config'].items():
                        print(f"    {key}: {value}")
            else:
                print("  Unable to determine index type")
        except Exception as e:
            print(f"Error getting index info: {e}")
            sys.exit(1)
        return

    # Validate arguments for migration
    if not args.source:
        print("Error: Source index required for migration")
        sys.exit(1)

    if not args.to_pgvector and not args.to_sqlite:
        print("Error: Must specify migration direction (--to-pgvector or --to-sqlite)")
        sys.exit(1)

    try:
        from signalwire_agents.search.migration import SearchIndexMigrator
        migrator = SearchIndexMigrator(verbose=args.verbose)

        if args.to_pgvector:
            # Validate pgvector arguments
            if not args.connection_string:
                print("Error: --connection-string required for pgvector migration")
                sys.exit(1)
            if not args.collection_name:
                print("Error: --collection-name required for pgvector migration")
                sys.exit(1)

            # Perform migration
            print(f"Migrating {args.source} to pgvector collection '{args.collection_name}'...")
            stats = migrator.migrate_sqlite_to_pgvector(
                sqlite_path=args.source,
                connection_string=args.connection_string,
                collection_name=args.collection_name,
                overwrite=args.overwrite,
                batch_size=args.batch_size
            )

            print(f"\n✓ Migration completed successfully!")
            print(f"  Chunks migrated: {stats['chunks_migrated']}")
            print(f"  Errors: {stats['errors']}")

        elif args.to_sqlite:
            print("Error: pgvector to SQLite migration not yet implemented")
            print("This feature is planned for future development")
            sys.exit(1)

    except Exception as e:
        print(f"\nError during migration: {e}")
        if args.verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)


def remote_command():
    """Search via remote API endpoint"""
    parser = argparse.ArgumentParser(description='Search via remote API endpoint')
    parser.add_argument('endpoint', help='Remote API endpoint URL (e.g., http://localhost:8001)')
    parser.add_argument('query', help='Search query')
    parser.add_argument('--index-name', required=True, help='Name of the index to search')
    parser.add_argument('--count', type=int, default=5, help='Number of results to return (default: 5)')
    parser.add_argument('--distance-threshold', type=float, default=0.0, help='Minimum similarity score (default: 0.0)')
    parser.add_argument('--tags', help='Comma-separated tags to filter by')
    parser.add_argument('--verbose', action='store_true', help='Show detailed information')
    parser.add_argument('--json', action='store_true', help='Output results as JSON')
    parser.add_argument('--no-content', action='store_true', help='Hide content in results (show only metadata)')
    parser.add_argument('--timeout', type=int, default=30, help='Request timeout in seconds (default: 30)')

    args = parser.parse_args()

    # Ensure endpoint starts with http:// or https://
    endpoint = args.endpoint
    if not endpoint.startswith(('http://', 'https://')):
        endpoint = f"http://{endpoint}"

    # Ensure endpoint ends with /search
    if not endpoint.endswith('/search'):
        if endpoint.endswith('/'):
            endpoint += 'search'
        else:
            endpoint += '/search'

    try:
        import requests
    except ImportError:
        print("Error: requests library not available. Install with: pip install requests")
        sys.exit(1)

    # Prepare request payload
    payload = {
        'query': args.query,
        'index_name': args.index_name,
        'count': args.count,
        'similarity_threshold': args.distance_threshold
1122
|
+
}
|
|
1123
|
+
|
|
1124
|
+
if args.tags:
|
|
1125
|
+
payload['tags'] = [tag.strip() for tag in args.tags.split(',')]
|
|
1126
|
+
|
|
1127
|
+
if args.verbose:
|
|
1128
|
+
print(f"Searching remote endpoint: {endpoint}")
|
|
1129
|
+
print(f"Payload: {payload}")
|
|
1130
|
+
print()
|
|
1131
|
+
|
|
1132
|
+
try:
|
|
1133
|
+
# Make the API request
|
|
1134
|
+
response = requests.post(
|
|
1135
|
+
endpoint,
|
|
1136
|
+
json=payload,
|
|
1137
|
+
headers={'Content-Type': 'application/json'},
|
|
1138
|
+
timeout=args.timeout
|
|
1139
|
+
)
|
|
1140
|
+
|
|
1141
|
+
if response.status_code == 200:
|
|
1142
|
+
result = response.json()
|
|
1143
|
+
|
|
1144
|
+
if args.json:
|
|
1145
|
+
# Output raw JSON response
|
|
1146
|
+
import json
|
|
1147
|
+
print(json.dumps(result, indent=2))
+            else:
+                # Human-readable output
+                results = result.get('results', [])
+                if not results:
+                    print(f"No results found for '{args.query}' in index '{args.index_name}'")
+                    sys.exit(0)
+
+                print(f"Found {len(results)} result(s) for '{args.query}' in index '{args.index_name}':")
+                if result.get('enhanced_query') and result.get('enhanced_query') != args.query:
+                    print(f"Enhanced query: '{result.get('enhanced_query')}'")
+                print("=" * 80)
+
+                for i, search_result in enumerate(results):
+                    print(f"\n[{i+1}] Score: {search_result.get('score', 0):.4f}")
+
+                    # Show metadata
+                    metadata = search_result.get('metadata', {})
+                    print(f"File: {metadata.get('filename', 'Unknown')}")
+                    if metadata.get('section'):
+                        print(f"Section: {metadata['section']}")
+                    if metadata.get('line_start'):
+                        print(f"Lines: {metadata['line_start']}-{metadata.get('line_end', metadata['line_start'])}")
+                    if metadata.get('tags'):
+                        print(f"Tags: {', '.join(metadata['tags'])}")
+
+                    # Show content unless suppressed
+                    if not args.no_content and 'content' in search_result:
+                        content = search_result['content']
+                        if len(content) > 500 and not args.verbose:
+                            content = content[:500] + "..."
+                        print(f"\nContent:\n{content}")
+
+                    if i < len(results) - 1:
+                        print("-" * 80)
+
+        elif response.status_code == 404:
+            try:
+                error_detail = response.json()
+                error_msg = error_detail.get('detail', 'Index not found')
+            except ValueError:
+                error_msg = 'Index not found'
+            print(f"Error: {error_msg}")
+            sys.exit(1)
+        else:
+            try:
+                error_detail = response.json()
+                error_msg = error_detail.get('detail', f'HTTP {response.status_code}')
+            except ValueError:
+                error_msg = f'HTTP {response.status_code}: {response.text}'
+            print(f"Error: {error_msg}")
+            sys.exit(1)
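+
+    # ConnectionError and Timeout are subclasses of RequestException, so the
+    # specific handlers below run before the generic one and give tailored messages.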
+    except requests.ConnectionError:
+        print(f"Error: Could not connect to {endpoint}")
+        print("Make sure the search server is running")
+        sys.exit(1)
+    except requests.Timeout:
+        print(f"Error: Request timed out after {args.timeout} seconds")
+        sys.exit(1)
+    except requests.RequestException as e:
+        print(f"Error making request: {e}")
+        sys.exit(1)
+    except Exception as e:
+        print(f"Error: {e}")
+        if args.verbose:
+            import traceback
+            traceback.print_exc()
+        sys.exit(1)
+
+def console_entry_point():
+    """Console script entry point for pip installation"""
+    import sys
+
+    # Fast help check - show help without importing heavy modules
+    if len(sys.argv) > 1 and sys.argv[1] in ['--help', '-h']:
+        print("""usage: sw-search [-h] [--output OUTPUT] [--chunking-strategy {sentence,sliding,paragraph,page,semantic,topic,qa}]
+                 [--max-sentences-per-chunk MAX_SENTENCES_PER_CHUNK] [--chunk-size CHUNK_SIZE]
+                 [--overlap-size OVERLAP_SIZE] [--split-newlines SPLIT_NEWLINES] [--file-types FILE_TYPES]
+                 [--exclude EXCLUDE] [--languages LANGUAGES] [--model MODEL] [--tags TAGS]
+                 [--index-nlp-backend {nltk,spacy}] [--verbose] [--validate]
+                 [--semantic-threshold SEMANTIC_THRESHOLD] [--topic-threshold TOPIC_THRESHOLD]
+                 sources [sources ...]
+
+Build local search index from documents
+
+positional arguments:
+  sources               Source files and/or directories to index
+
+options:
+  -h, --help            show this help message and exit
+  --output OUTPUT       Output .swsearch file (default: sources.swsearch)
+  --chunking-strategy {sentence,sliding,paragraph,page,semantic,topic,qa}
+                        Chunking strategy to use (default: sentence)
+  --max-sentences-per-chunk MAX_SENTENCES_PER_CHUNK
+                        Maximum sentences per chunk for sentence strategy (default: 5)
+  --chunk-size CHUNK_SIZE
+                        Chunk size in words for sliding window strategy (default: 50)
+  --overlap-size OVERLAP_SIZE
+                        Overlap size in words for sliding window strategy (default: 10)
+  --split-newlines SPLIT_NEWLINES
+                        Split on multiple newlines (for sentence strategy)
+  --file-types FILE_TYPES
+                        Comma-separated file extensions to include for directories (default: md,txt,rst)
+  --exclude EXCLUDE     Comma-separated glob patterns to exclude (e.g., "**/test/**,**/__pycache__/**")
+  --languages LANGUAGES
+                        Comma-separated language codes (default: en)
+  --model MODEL         Sentence transformer model name (default: sentence-transformers/all-mpnet-base-v2)
+  --tags TAGS           Comma-separated tags to add to all chunks
+  --index-nlp-backend {nltk,spacy}
+                        NLP backend for document processing: nltk (fast, default) or spacy (better quality, slower)
+  --verbose             Enable verbose output
+  --validate            Validate the created index after building
+  --semantic-threshold SEMANTIC_THRESHOLD
+                        Similarity threshold for semantic chunking (default: 0.5)
+  --topic-threshold TOPIC_THRESHOLD
+                        Similarity threshold for topic chunking (default: 0.3)
+
+Examples:
+  # Basic usage with directory (defaults to sentence chunking with 5 sentences per chunk)
+  sw-search ./docs
+
+  # Multiple directories
+  sw-search ./docs ./examples --file-types md,txt,py
+
+  # Individual files
+  sw-search README.md ./docs/guide.md ./src/main.py
+
+  # Mixed sources (directories and files)
+  sw-search ./docs README.md ./examples specific_file.txt --file-types md,txt,py
+
+  # Sentence-based chunking with custom parameters
+  sw-search ./docs \\
+      --chunking-strategy sentence \\
+      --max-sentences-per-chunk 10 \\
+      --split-newlines 2
+
+  # Sliding window chunking
+  sw-search ./docs \\
+      --chunking-strategy sliding \\
+      --chunk-size 100 \\
+      --overlap-size 20
+
+  # Paragraph-based chunking
+  sw-search ./docs \\
+      --chunking-strategy paragraph \\
+      --file-types md,txt,rst
+
+  # Page-based chunking (good for PDFs)
+  sw-search ./docs \\
+      --chunking-strategy page \\
+      --file-types pdf
+
+  # Semantic chunking (groups semantically similar sentences)
+  sw-search ./docs \\
+      --chunking-strategy semantic \\
+      --semantic-threshold 0.6
+
+  # Topic-based chunking (groups by topic changes)
+  sw-search ./docs \\
+      --chunking-strategy topic \\
+      --topic-threshold 0.2
+
+  # QA-optimized chunking (optimized for question-answering)
+  sw-search ./docs \\
+      --chunking-strategy qa
+
+  # Full configuration example
+  sw-search ./docs ./examples README.md \\
+      --output ./knowledge.swsearch \\
+      --chunking-strategy sentence \\
+      --max-sentences-per-chunk 8 \\
+      --file-types md,txt,rst,py \\
+      --exclude "**/test/**,**/__pycache__/**" \\
+      --languages en,es,fr \\
+      --model sentence-transformers/all-mpnet-base-v2 \\
+      --tags documentation,api \\
+      --verbose
+
+  # Validate an existing index
+  sw-search validate ./docs.swsearch
+
+  # Search within an index
+  sw-search search ./docs.swsearch "how to create an agent"
+  sw-search search ./docs.swsearch "API reference" --count 3 --verbose
+  sw-search search ./docs.swsearch "configuration" --tags documentation --json
+
+  # Search via remote API
+  sw-search remote http://localhost:8001 "how to create an agent" --index-name docs
+  sw-search remote localhost:8001 "API reference" --index-name docs --count 3 --verbose
+""")
+        return
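+
+    # Each subcommand below pops its own name from sys.argv so the downstream
+    # parser only sees the arguments that follow it.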
+    # Check for subcommands
+    if len(sys.argv) > 1:
+        if sys.argv[1] == 'validate':
+            # Remove 'validate' from argv and call validate_command
+            sys.argv.pop(1)
+            validate_command()
+            return
+        elif sys.argv[1] == 'search':
+            # Remove 'search' from argv and call search_command
+            sys.argv.pop(1)
+            search_command()
+            return
+        elif sys.argv[1] == 'remote':
+            # Remove 'remote' from argv and call remote_command
+            sys.argv.pop(1)
+            remote_command()
+            return
+        elif sys.argv[1] == 'migrate':
+            # Remove 'migrate' from argv and call migrate_command
+            sys.argv.pop(1)
+            migrate_command()
+            return
+
+    # Regular build command
+    main()
+
+if __name__ == '__main__':
+    main()
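+
+# Example invocations via the installed console script (illustrative):
+#   sw-search ./docs --output docs.swsearch
+#   sw-search remote localhost:8001 "getting started" --index-name docs --count 3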