signalwire-agents 0.1.13__py3-none-any.whl → 1.0.17.dev4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- signalwire_agents/__init__.py +99 -15
- signalwire_agents/agent_server.py +248 -60
- signalwire_agents/agents/bedrock.py +296 -0
- signalwire_agents/cli/__init__.py +9 -0
- signalwire_agents/cli/build_search.py +951 -41
- signalwire_agents/cli/config.py +80 -0
- signalwire_agents/cli/core/__init__.py +10 -0
- signalwire_agents/cli/core/agent_loader.py +470 -0
- signalwire_agents/cli/core/argparse_helpers.py +179 -0
- signalwire_agents/cli/core/dynamic_config.py +71 -0
- signalwire_agents/cli/core/service_loader.py +303 -0
- signalwire_agents/cli/dokku.py +2320 -0
- signalwire_agents/cli/execution/__init__.py +10 -0
- signalwire_agents/cli/execution/datamap_exec.py +446 -0
- signalwire_agents/cli/execution/webhook_exec.py +134 -0
- signalwire_agents/cli/init_project.py +2636 -0
- signalwire_agents/cli/output/__init__.py +10 -0
- signalwire_agents/cli/output/output_formatter.py +255 -0
- signalwire_agents/cli/output/swml_dump.py +186 -0
- signalwire_agents/cli/simulation/__init__.py +10 -0
- signalwire_agents/cli/simulation/data_generation.py +374 -0
- signalwire_agents/cli/simulation/data_overrides.py +200 -0
- signalwire_agents/cli/simulation/mock_env.py +282 -0
- signalwire_agents/cli/swaig_test_wrapper.py +52 -0
- signalwire_agents/cli/test_swaig.py +566 -2366
- signalwire_agents/cli/types.py +81 -0
- signalwire_agents/core/__init__.py +2 -2
- signalwire_agents/core/agent/__init__.py +12 -0
- signalwire_agents/core/agent/config/__init__.py +12 -0
- signalwire_agents/core/agent/deployment/__init__.py +9 -0
- signalwire_agents/core/agent/deployment/handlers/__init__.py +9 -0
- signalwire_agents/core/agent/prompt/__init__.py +14 -0
- signalwire_agents/core/agent/prompt/manager.py +306 -0
- signalwire_agents/core/agent/routing/__init__.py +9 -0
- signalwire_agents/core/agent/security/__init__.py +9 -0
- signalwire_agents/core/agent/swml/__init__.py +9 -0
- signalwire_agents/core/agent/tools/__init__.py +15 -0
- signalwire_agents/core/agent/tools/decorator.py +97 -0
- signalwire_agents/core/agent/tools/registry.py +210 -0
- signalwire_agents/core/agent_base.py +845 -2916
- signalwire_agents/core/auth_handler.py +233 -0
- signalwire_agents/core/config_loader.py +259 -0
- signalwire_agents/core/contexts.py +418 -0
- signalwire_agents/core/data_map.py +3 -15
- signalwire_agents/core/function_result.py +116 -44
- signalwire_agents/core/logging_config.py +162 -18
- signalwire_agents/core/mixins/__init__.py +28 -0
- signalwire_agents/core/mixins/ai_config_mixin.py +442 -0
- signalwire_agents/core/mixins/auth_mixin.py +280 -0
- signalwire_agents/core/mixins/prompt_mixin.py +358 -0
- signalwire_agents/core/mixins/serverless_mixin.py +460 -0
- signalwire_agents/core/mixins/skill_mixin.py +55 -0
- signalwire_agents/core/mixins/state_mixin.py +153 -0
- signalwire_agents/core/mixins/tool_mixin.py +230 -0
- signalwire_agents/core/mixins/web_mixin.py +1142 -0
- signalwire_agents/core/security_config.py +333 -0
- signalwire_agents/core/skill_base.py +84 -1
- signalwire_agents/core/skill_manager.py +62 -20
- signalwire_agents/core/swaig_function.py +18 -5
- signalwire_agents/core/swml_builder.py +207 -11
- signalwire_agents/core/swml_handler.py +27 -21
- signalwire_agents/core/swml_renderer.py +123 -312
- signalwire_agents/core/swml_service.py +171 -203
- signalwire_agents/mcp_gateway/__init__.py +29 -0
- signalwire_agents/mcp_gateway/gateway_service.py +564 -0
- signalwire_agents/mcp_gateway/mcp_manager.py +513 -0
- signalwire_agents/mcp_gateway/session_manager.py +218 -0
- signalwire_agents/prefabs/concierge.py +0 -3
- signalwire_agents/prefabs/faq_bot.py +0 -3
- signalwire_agents/prefabs/info_gatherer.py +0 -3
- signalwire_agents/prefabs/receptionist.py +0 -3
- signalwire_agents/prefabs/survey.py +0 -3
- signalwire_agents/schema.json +9218 -5489
- signalwire_agents/search/__init__.py +7 -1
- signalwire_agents/search/document_processor.py +490 -31
- signalwire_agents/search/index_builder.py +307 -37
- signalwire_agents/search/migration.py +418 -0
- signalwire_agents/search/models.py +30 -0
- signalwire_agents/search/pgvector_backend.py +748 -0
- signalwire_agents/search/query_processor.py +162 -31
- signalwire_agents/search/search_engine.py +916 -35
- signalwire_agents/search/search_service.py +376 -53
- signalwire_agents/skills/README.md +452 -0
- signalwire_agents/skills/__init__.py +14 -2
- signalwire_agents/skills/api_ninjas_trivia/README.md +215 -0
- signalwire_agents/skills/api_ninjas_trivia/__init__.py +12 -0
- signalwire_agents/skills/api_ninjas_trivia/skill.py +237 -0
- signalwire_agents/skills/datasphere/README.md +210 -0
- signalwire_agents/skills/datasphere/skill.py +84 -3
- signalwire_agents/skills/datasphere_serverless/README.md +258 -0
- signalwire_agents/skills/datasphere_serverless/__init__.py +9 -0
- signalwire_agents/skills/datasphere_serverless/skill.py +82 -1
- signalwire_agents/skills/datetime/README.md +132 -0
- signalwire_agents/skills/datetime/__init__.py +9 -0
- signalwire_agents/skills/datetime/skill.py +20 -7
- signalwire_agents/skills/joke/README.md +149 -0
- signalwire_agents/skills/joke/__init__.py +9 -0
- signalwire_agents/skills/joke/skill.py +21 -0
- signalwire_agents/skills/math/README.md +161 -0
- signalwire_agents/skills/math/__init__.py +9 -0
- signalwire_agents/skills/math/skill.py +18 -4
- signalwire_agents/skills/mcp_gateway/README.md +230 -0
- signalwire_agents/skills/mcp_gateway/__init__.py +10 -0
- signalwire_agents/skills/mcp_gateway/skill.py +421 -0
- signalwire_agents/skills/native_vector_search/README.md +210 -0
- signalwire_agents/skills/native_vector_search/__init__.py +9 -0
- signalwire_agents/skills/native_vector_search/skill.py +569 -101
- signalwire_agents/skills/play_background_file/README.md +218 -0
- signalwire_agents/skills/play_background_file/__init__.py +12 -0
- signalwire_agents/skills/play_background_file/skill.py +242 -0
- signalwire_agents/skills/registry.py +395 -40
- signalwire_agents/skills/spider/README.md +236 -0
- signalwire_agents/skills/spider/__init__.py +13 -0
- signalwire_agents/skills/spider/skill.py +598 -0
- signalwire_agents/skills/swml_transfer/README.md +395 -0
- signalwire_agents/skills/swml_transfer/__init__.py +10 -0
- signalwire_agents/skills/swml_transfer/skill.py +359 -0
- signalwire_agents/skills/weather_api/README.md +178 -0
- signalwire_agents/skills/weather_api/__init__.py +12 -0
- signalwire_agents/skills/weather_api/skill.py +191 -0
- signalwire_agents/skills/web_search/README.md +163 -0
- signalwire_agents/skills/web_search/__init__.py +9 -0
- signalwire_agents/skills/web_search/skill.py +586 -112
- signalwire_agents/skills/wikipedia_search/README.md +228 -0
- signalwire_agents/{core/state → skills/wikipedia_search}/__init__.py +5 -4
- signalwire_agents/skills/{wikipedia → wikipedia_search}/skill.py +33 -3
- signalwire_agents/web/__init__.py +17 -0
- signalwire_agents/web/web_service.py +559 -0
- signalwire_agents-1.0.17.dev4.data/data/share/man/man1/sw-agent-init.1 +400 -0
- signalwire_agents-1.0.17.dev4.data/data/share/man/man1/sw-search.1 +483 -0
- signalwire_agents-1.0.17.dev4.data/data/share/man/man1/swaig-test.1 +308 -0
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/METADATA +347 -215
- signalwire_agents-1.0.17.dev4.dist-info/RECORD +147 -0
- signalwire_agents-1.0.17.dev4.dist-info/entry_points.txt +6 -0
- signalwire_agents/core/state/file_state_manager.py +0 -219
- signalwire_agents/core/state/state_manager.py +0 -101
- signalwire_agents/skills/wikipedia/__init__.py +0 -9
- signalwire_agents-0.1.13.data/data/schema.json +0 -5611
- signalwire_agents-0.1.13.dist-info/RECORD +0 -67
- signalwire_agents-0.1.13.dist-info/entry_points.txt +0 -3
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/WHEEL +0 -0
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/licenses/LICENSE +0 -0
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/top_level.txt +0 -0
|
@@ -10,7 +10,9 @@ See LICENSE file in the project root for full license information.
|
|
|
10
10
|
import argparse
|
|
11
11
|
import sys
|
|
12
12
|
from pathlib import Path
|
|
13
|
-
from
|
|
13
|
+
from datetime import datetime
|
|
14
|
+
|
|
15
|
+
from signalwire_agents.search.models import MODEL_ALIASES, DEFAULT_MODEL, resolve_model_alias
|
|
14
16
|
|
|
15
17
|
def main():
|
|
16
18
|
"""Main entry point for the build-search command"""
|
|
@@ -19,7 +21,7 @@ def main():
|
|
|
19
21
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
20
22
|
epilog="""
|
|
21
23
|
Examples:
|
|
22
|
-
# Basic usage with directory (defaults to sentence chunking with
|
|
24
|
+
# Basic usage with directory (defaults to sentence chunking with 5 sentences per chunk)
|
|
23
25
|
sw-search ./docs
|
|
24
26
|
|
|
25
27
|
# Multiple directories
|
|
@@ -34,7 +36,7 @@ Examples:
|
|
|
34
36
|
# Sentence-based chunking with custom parameters
|
|
35
37
|
sw-search ./docs \\
|
|
36
38
|
--chunking-strategy sentence \\
|
|
37
|
-
--max-sentences-per-chunk
|
|
39
|
+
--max-sentences-per-chunk 10 \\
|
|
38
40
|
--split-newlines 2
|
|
39
41
|
|
|
40
42
|
# Sliding window chunking
|
|
@@ -53,11 +55,64 @@ Examples:
|
|
|
53
55
|
--chunking-strategy page \\
|
|
54
56
|
--file-types pdf
|
|
55
57
|
|
|
58
|
+
# Semantic chunking (groups semantically similar sentences)
|
|
59
|
+
sw-search ./docs \\
|
|
60
|
+
--chunking-strategy semantic \\
|
|
61
|
+
--semantic-threshold 0.6
|
|
62
|
+
|
|
63
|
+
# Topic-based chunking (groups by topic changes)
|
|
64
|
+
sw-search ./docs \\
|
|
65
|
+
--chunking-strategy topic \\
|
|
66
|
+
--topic-threshold 0.2
|
|
67
|
+
|
|
68
|
+
# QA-optimized chunking (optimized for question-answering)
|
|
69
|
+
sw-search ./docs \\
|
|
70
|
+
--chunking-strategy qa
|
|
71
|
+
|
|
72
|
+
# Markdown-aware chunking (preserves headers, detects code blocks, adds tags)
|
|
73
|
+
sw-search ./docs \\
|
|
74
|
+
--chunking-strategy markdown \\
|
|
75
|
+
--file-types md
|
|
76
|
+
# This strategy:
|
|
77
|
+
# - Chunks at header boundaries (h1, h2, h3...)
|
|
78
|
+
# - Detects code blocks and extracts language (python, bash, etc)
|
|
79
|
+
# - Adds "code" tags to chunks with code for better search
|
|
80
|
+
# - Preserves section hierarchy in metadata
|
|
81
|
+
|
|
82
|
+
# Model selection examples (performance vs quality tradeoff)
|
|
83
|
+
sw-search ./docs --model mini # Fastest (~5x faster), 384 dims, good for most use cases
|
|
84
|
+
sw-search ./docs --model base # Balanced speed/quality, 768 dims (previous default)
|
|
85
|
+
sw-search ./docs --model large # Best quality (same as base currently)
|
|
86
|
+
# Or use full model names:
|
|
87
|
+
sw-search ./docs --model sentence-transformers/all-MiniLM-L6-v2
|
|
88
|
+
sw-search ./docs --model sentence-transformers/all-mpnet-base-v2
|
|
89
|
+
|
|
90
|
+
# JSON-based chunking (pre-chunked content)
|
|
91
|
+
sw-search ./api_chunks.json \
|
|
92
|
+
--chunking-strategy json \
|
|
93
|
+
--file-types json
|
|
94
|
+
|
|
95
|
+
# Export chunks to JSON for review (single file)
|
|
96
|
+
sw-search ./docs \\
|
|
97
|
+
--output-format json \\
|
|
98
|
+
--output all_chunks.json
|
|
99
|
+
|
|
100
|
+
# Export chunks to JSON (one file per source)
|
|
101
|
+
sw-search ./docs \\
|
|
102
|
+
--output-format json \\
|
|
103
|
+
--output-dir ./chunks/
|
|
104
|
+
|
|
105
|
+
# Build index from exported JSON chunks
|
|
106
|
+
sw-search ./chunks/ \\
|
|
107
|
+
--chunking-strategy json \\
|
|
108
|
+
--file-types json \\
|
|
109
|
+
--output final.swsearch
|
|
110
|
+
|
|
56
111
|
# Full configuration example
|
|
57
112
|
sw-search ./docs ./examples README.md \\
|
|
58
113
|
--output ./knowledge.swsearch \\
|
|
59
114
|
--chunking-strategy sentence \\
|
|
60
|
-
--max-sentences-per-chunk
|
|
115
|
+
--max-sentences-per-chunk 8 \\
|
|
61
116
|
--file-types md,txt,rst,py \\
|
|
62
117
|
--exclude "**/test/**,**/__pycache__/**" \\
|
|
63
118
|
--languages en,es,fr \\
|
|
@@ -72,6 +127,41 @@ Examples:
|
|
|
72
127
|
sw-search search ./docs.swsearch "how to create an agent"
|
|
73
128
|
sw-search search ./docs.swsearch "API reference" --count 3 --verbose
|
|
74
129
|
sw-search search ./docs.swsearch "configuration" --tags documentation --json
|
|
130
|
+
|
|
131
|
+
# Search via remote API
|
|
132
|
+
sw-search remote http://localhost:8001 "how to create an agent" --index-name docs
|
|
133
|
+
sw-search remote localhost:8001 "API reference" --index-name docs --count 3 --verbose
|
|
134
|
+
|
|
135
|
+
# Migrate between backends
|
|
136
|
+
sw-search migrate ./docs.swsearch --to-pgvector \\
|
|
137
|
+
--connection-string "postgresql://user:pass@localhost/db" \\
|
|
138
|
+
--collection-name docs_collection
|
|
139
|
+
sw-search migrate --info ./docs.swsearch
|
|
140
|
+
|
|
141
|
+
# PostgreSQL pgvector backend (direct build to PostgreSQL)
|
|
142
|
+
sw-search ./docs \\
|
|
143
|
+
--backend pgvector \\
|
|
144
|
+
--connection-string "postgresql://user:pass@localhost:5432/knowledge" \\
|
|
145
|
+
--output docs_collection
|
|
146
|
+
|
|
147
|
+
# pgvector with markdown strategy (best for documentation with code examples)
|
|
148
|
+
sw-search ./docs \\
|
|
149
|
+
--backend pgvector \\
|
|
150
|
+
--connection-string "postgresql://user:pass@localhost:5432/knowledge" \\
|
|
151
|
+
--output docs_collection \\
|
|
152
|
+
--chunking-strategy markdown
|
|
153
|
+
|
|
154
|
+
# Overwrite existing pgvector collection
|
|
155
|
+
sw-search ./docs \\
|
|
156
|
+
--backend pgvector \\
|
|
157
|
+
--connection-string "postgresql://user:pass@localhost:5432/knowledge" \\
|
|
158
|
+
--output docs_collection \\
|
|
159
|
+
--overwrite
|
|
160
|
+
|
|
161
|
+
# Search in pgvector collection
|
|
162
|
+
sw-search search docs_collection "how to create an agent" \\
|
|
163
|
+
--backend pgvector \\
|
|
164
|
+
--connection-string "postgresql://user:pass@localhost/knowledge"
|
|
75
165
|
"""
|
|
76
166
|
)
|
|
77
167
|
|
|
@@ -83,21 +173,51 @@ Examples:
|
|
|
83
173
|
|
|
84
174
|
parser.add_argument(
|
|
85
175
|
'--output',
|
|
86
|
-
help='Output .swsearch file (default: sources.swsearch)'
|
|
176
|
+
help='Output .swsearch file (default: sources.swsearch) or collection name for pgvector'
|
|
177
|
+
)
|
|
178
|
+
|
|
179
|
+
parser.add_argument(
|
|
180
|
+
'--output-dir',
|
|
181
|
+
help='Output directory for results (creates one file per source file when used with --output-format json, or auto-names index files)'
|
|
182
|
+
)
|
|
183
|
+
|
|
184
|
+
parser.add_argument(
|
|
185
|
+
'--output-format',
|
|
186
|
+
choices=['index', 'json'],
|
|
187
|
+
default='index',
|
|
188
|
+
help='Output format: index (create search index) or json (export chunks as JSON) (default: index)'
|
|
189
|
+
)
|
|
190
|
+
|
|
191
|
+
parser.add_argument(
|
|
192
|
+
'--backend',
|
|
193
|
+
choices=['sqlite', 'pgvector'],
|
|
194
|
+
default='sqlite',
|
|
195
|
+
help='Storage backend to use (default: sqlite)'
|
|
196
|
+
)
|
|
197
|
+
|
|
198
|
+
parser.add_argument(
|
|
199
|
+
'--connection-string',
|
|
200
|
+
help='PostgreSQL connection string for pgvector backend'
|
|
201
|
+
)
|
|
202
|
+
|
|
203
|
+
parser.add_argument(
|
|
204
|
+
'--overwrite',
|
|
205
|
+
action='store_true',
|
|
206
|
+
help='Overwrite existing collection (pgvector backend only)'
|
|
87
207
|
)
|
|
88
208
|
|
|
89
209
|
parser.add_argument(
|
|
90
210
|
'--chunking-strategy',
|
|
91
|
-
choices=['sentence', 'sliding', 'paragraph', 'page'],
|
|
211
|
+
choices=['sentence', 'sliding', 'paragraph', 'page', 'semantic', 'topic', 'qa', 'json', 'markdown'],
|
|
92
212
|
default='sentence',
|
|
93
|
-
help='Chunking strategy to use (default: sentence)'
|
|
213
|
+
help='Chunking strategy to use (default: sentence). Use "markdown" for documentation with code blocks.'
|
|
94
214
|
)
|
|
95
215
|
|
|
96
216
|
parser.add_argument(
|
|
97
217
|
'--max-sentences-per-chunk',
|
|
98
218
|
type=int,
|
|
99
|
-
default=
|
|
100
|
-
help='Maximum sentences per chunk for sentence strategy (default:
|
|
219
|
+
default=5,
|
|
220
|
+
help='Maximum sentences per chunk for sentence strategy (default: 5)'
|
|
101
221
|
)
|
|
102
222
|
|
|
103
223
|
parser.add_argument(
|
|
@@ -117,7 +237,7 @@ Examples:
|
|
|
117
237
|
parser.add_argument(
|
|
118
238
|
'--split-newlines',
|
|
119
239
|
type=int,
|
|
120
|
-
help='Split on multiple newlines for sentence strategy
|
|
240
|
+
help='Split on multiple newlines (for sentence strategy)'
|
|
121
241
|
)
|
|
122
242
|
|
|
123
243
|
parser.add_argument(
|
|
@@ -139,8 +259,8 @@ Examples:
|
|
|
139
259
|
|
|
140
260
|
parser.add_argument(
|
|
141
261
|
'--model',
|
|
142
|
-
default=
|
|
143
|
-
help='Sentence transformer model name (
|
|
262
|
+
default=DEFAULT_MODEL,
|
|
263
|
+
help=f'Sentence transformer model name or alias (mini/base/large). Default: mini ({DEFAULT_MODEL})'
|
|
144
264
|
)
|
|
145
265
|
|
|
146
266
|
parser.add_argument(
|
|
@@ -148,6 +268,13 @@ Examples:
|
|
|
148
268
|
help='Comma-separated tags to add to all chunks'
|
|
149
269
|
)
|
|
150
270
|
|
|
271
|
+
parser.add_argument(
|
|
272
|
+
'--index-nlp-backend',
|
|
273
|
+
choices=['nltk', 'spacy'],
|
|
274
|
+
default='nltk',
|
|
275
|
+
help='NLP backend for document processing: nltk (fast, default) or spacy (better quality, slower)'
|
|
276
|
+
)
|
|
277
|
+
|
|
151
278
|
parser.add_argument(
|
|
152
279
|
'--verbose',
|
|
153
280
|
action='store_true',
|
|
@@ -160,8 +287,25 @@ Examples:
|
|
|
160
287
|
help='Validate the created index after building'
|
|
161
288
|
)
|
|
162
289
|
|
|
290
|
+
parser.add_argument(
|
|
291
|
+
'--semantic-threshold',
|
|
292
|
+
type=float,
|
|
293
|
+
default=0.5,
|
|
294
|
+
help='Similarity threshold for semantic chunking (default: 0.5)'
|
|
295
|
+
)
|
|
296
|
+
|
|
297
|
+
parser.add_argument(
|
|
298
|
+
'--topic-threshold',
|
|
299
|
+
type=float,
|
|
300
|
+
default=0.3,
|
|
301
|
+
help='Similarity threshold for topic chunking (default: 0.3)'
|
|
302
|
+
)
|
|
303
|
+
|
|
163
304
|
args = parser.parse_args()
|
|
164
305
|
|
|
306
|
+
# Resolve model aliases
|
|
307
|
+
args.model = resolve_model_alias(args.model)
|
|
308
|
+
|
|
165
309
|
# Validate sources
|
|
166
310
|
valid_sources = []
|
|
167
311
|
for source in args.sources:
|
|
@@ -175,18 +319,75 @@ Examples:
|
|
|
175
319
|
print("Error: No valid sources found")
|
|
176
320
|
sys.exit(1)
|
|
177
321
|
|
|
178
|
-
#
|
|
179
|
-
if not args.
|
|
322
|
+
# Validate backend configuration
|
|
323
|
+
if args.backend == 'pgvector' and not args.connection_string:
|
|
324
|
+
print("Error: --connection-string is required for pgvector backend")
|
|
325
|
+
sys.exit(1)
|
|
326
|
+
|
|
327
|
+
# Validate output options
|
|
328
|
+
if args.output and args.output_dir:
|
|
329
|
+
print("Error: Cannot specify both --output and --output-dir")
|
|
330
|
+
sys.exit(1)
|
|
331
|
+
|
|
332
|
+
# Handle JSON output format differently
|
|
333
|
+
if args.output_format == 'json':
|
|
334
|
+
# JSON export doesn't use backend
|
|
335
|
+
if args.backend != 'sqlite':
|
|
336
|
+
print("Warning: --backend is ignored when using --output-format json")
|
|
337
|
+
|
|
338
|
+
# Determine output location
|
|
339
|
+
if args.output_dir:
|
|
340
|
+
# Multiple files mode
|
|
341
|
+
output_path = Path(args.output_dir)
|
|
342
|
+
if not output_path.exists():
|
|
343
|
+
output_path.mkdir(parents=True, exist_ok=True)
|
|
344
|
+
elif args.output:
|
|
345
|
+
# Single file mode
|
|
346
|
+
output_path = Path(args.output)
|
|
347
|
+
if not output_path.suffix:
|
|
348
|
+
output_path = output_path.with_suffix('.json')
|
|
349
|
+
else:
|
|
350
|
+
# Default to single file
|
|
351
|
+
output_path = Path('chunks.json')
|
|
352
|
+
args.output = str(output_path)
|
|
353
|
+
|
|
354
|
+
# Default output filename (for index format)
|
|
355
|
+
if args.output_format == 'index' and not args.output and not args.output_dir:
|
|
356
|
+
if args.backend == 'sqlite':
|
|
357
|
+
if len(valid_sources) == 1:
|
|
358
|
+
# Single source - use its name
|
|
359
|
+
source_name = valid_sources[0].stem if valid_sources[0].is_file() else valid_sources[0].name
|
|
360
|
+
args.output = f"{source_name}.swsearch"
|
|
361
|
+
else:
|
|
362
|
+
# Multiple sources - use generic name
|
|
363
|
+
args.output = "sources.swsearch"
|
|
364
|
+
else:
|
|
365
|
+
# For pgvector, use a default collection name
|
|
366
|
+
if len(valid_sources) == 1:
|
|
367
|
+
source_name = valid_sources[0].stem if valid_sources[0].is_file() else valid_sources[0].name
|
|
368
|
+
args.output = source_name
|
|
369
|
+
else:
|
|
370
|
+
args.output = "documents"
|
|
371
|
+
|
|
372
|
+
# Handle --output-dir for index format
|
|
373
|
+
if args.output_format == 'index' and args.output_dir:
|
|
374
|
+
# Auto-generate output filename in the directory
|
|
180
375
|
if len(valid_sources) == 1:
|
|
181
|
-
# Single source - use its name
|
|
182
376
|
source_name = valid_sources[0].stem if valid_sources[0].is_file() else valid_sources[0].name
|
|
183
|
-
args.output = f"{source_name}.swsearch"
|
|
184
377
|
else:
|
|
185
|
-
|
|
186
|
-
|
|
378
|
+
source_name = "combined"
|
|
379
|
+
|
|
380
|
+
output_dir = Path(args.output_dir)
|
|
381
|
+
output_dir.mkdir(parents=True, exist_ok=True)
|
|
382
|
+
|
|
383
|
+
if args.backend == 'sqlite':
|
|
384
|
+
args.output = str(output_dir / f"{source_name}.swsearch")
|
|
385
|
+
else:
|
|
386
|
+
# For pgvector, still use the name as collection
|
|
387
|
+
args.output = source_name
|
|
187
388
|
|
|
188
|
-
# Ensure output has .swsearch extension
|
|
189
|
-
if not args.output.endswith('.swsearch'):
|
|
389
|
+
# Ensure output has .swsearch extension for sqlite (but not for JSON format)
|
|
390
|
+
if args.output_format == 'index' and args.backend == 'sqlite' and args.output and not args.output.endswith('.swsearch'):
|
|
190
391
|
args.output += '.swsearch'
|
|
191
392
|
|
|
192
393
|
# Parse lists
|
|
@@ -197,13 +398,19 @@ Examples:
|
|
|
197
398
|
|
|
198
399
|
if args.verbose:
|
|
199
400
|
print(f"Building search index:")
|
|
401
|
+
print(f" Backend: {args.backend}")
|
|
200
402
|
print(f" Sources: {[str(s) for s in valid_sources]}")
|
|
201
|
-
|
|
403
|
+
if args.backend == 'sqlite':
|
|
404
|
+
print(f" Output file: {args.output}")
|
|
405
|
+
else:
|
|
406
|
+
print(f" Collection name: {args.output}")
|
|
407
|
+
print(f" Connection: {args.connection_string}")
|
|
202
408
|
print(f" File types (for directories): {file_types}")
|
|
203
409
|
print(f" Exclude patterns: {exclude_patterns}")
|
|
204
410
|
print(f" Languages: {languages}")
|
|
205
411
|
print(f" Model: {args.model}")
|
|
206
412
|
print(f" Chunking strategy: {args.chunking_strategy}")
|
|
413
|
+
print(f" Index NLP backend: {args.index_nlp_backend}")
|
|
207
414
|
|
|
208
415
|
if args.chunking_strategy == 'sentence':
|
|
209
416
|
print(f" Max sentences per chunk: {args.max_sentences_per_chunk}")
|
|
@@ -216,12 +423,116 @@ Examples:
|
|
|
216
423
|
print(f" Chunking by paragraphs (double newlines)")
|
|
217
424
|
elif args.chunking_strategy == 'page':
|
|
218
425
|
print(f" Chunking by pages")
|
|
426
|
+
elif args.chunking_strategy == 'semantic':
|
|
427
|
+
print(f" Semantic chunking (similarity threshold: {args.semantic_threshold})")
|
|
428
|
+
elif args.chunking_strategy == 'topic':
|
|
429
|
+
print(f" Topic-based chunking (similarity threshold: {args.topic_threshold})")
|
|
430
|
+
elif args.chunking_strategy == 'qa':
|
|
431
|
+
print(f" QA-optimized chunking")
|
|
219
432
|
|
|
220
433
|
print(f" Tags: {tags}")
|
|
221
434
|
print()
|
|
222
435
|
|
|
223
436
|
try:
|
|
224
|
-
#
|
|
437
|
+
# Handle JSON export mode
|
|
438
|
+
if args.output_format == 'json':
|
|
439
|
+
# Import what we need for chunking
|
|
440
|
+
from signalwire_agents.search.index_builder import IndexBuilder
|
|
441
|
+
import json
|
|
442
|
+
|
|
443
|
+
builder = IndexBuilder(
|
|
444
|
+
chunking_strategy=args.chunking_strategy,
|
|
445
|
+
max_sentences_per_chunk=args.max_sentences_per_chunk,
|
|
446
|
+
chunk_size=args.chunk_size,
|
|
447
|
+
chunk_overlap=args.overlap_size,
|
|
448
|
+
split_newlines=args.split_newlines,
|
|
449
|
+
index_nlp_backend=args.index_nlp_backend,
|
|
450
|
+
verbose=args.verbose,
|
|
451
|
+
semantic_threshold=args.semantic_threshold,
|
|
452
|
+
topic_threshold=args.topic_threshold
|
|
453
|
+
)
|
|
454
|
+
|
|
455
|
+
# Process files and export chunks
|
|
456
|
+
all_chunks = []
|
|
457
|
+
chunk_files_created = []
|
|
458
|
+
|
|
459
|
+
# Discover files from sources
|
|
460
|
+
files = builder._discover_files_from_sources(valid_sources, file_types, exclude_patterns)
|
|
461
|
+
|
|
462
|
+
if args.verbose:
|
|
463
|
+
print(f"Processing {len(files)} files...")
|
|
464
|
+
|
|
465
|
+
for file_path in files:
|
|
466
|
+
try:
|
|
467
|
+
# Determine base directory for relative paths
|
|
468
|
+
base_dir = builder._get_base_directory_for_file(file_path, valid_sources)
|
|
469
|
+
|
|
470
|
+
# Process file into chunks
|
|
471
|
+
chunks = builder._process_file(file_path, base_dir, tags)
|
|
472
|
+
|
|
473
|
+
if args.output_dir:
|
|
474
|
+
# Create individual JSON file
|
|
475
|
+
relative_path = file_path.relative_to(base_dir) if base_dir else file_path.name
|
|
476
|
+
json_filename = relative_path.with_suffix('.json')
|
|
477
|
+
json_path = Path(args.output_dir) / json_filename
|
|
478
|
+
|
|
479
|
+
# Create subdirectories if needed
|
|
480
|
+
json_path.parent.mkdir(parents=True, exist_ok=True)
|
|
481
|
+
|
|
482
|
+
# Save chunks to JSON
|
|
483
|
+
chunk_data = {
|
|
484
|
+
"chunks": chunks,
|
|
485
|
+
"metadata": {
|
|
486
|
+
"source_file": str(relative_path),
|
|
487
|
+
"total_chunks": len(chunks),
|
|
488
|
+
"chunking_strategy": args.chunking_strategy,
|
|
489
|
+
"processing_date": datetime.now().isoformat()
|
|
490
|
+
}
|
|
491
|
+
}
|
|
492
|
+
|
|
493
|
+
with open(json_path, 'w', encoding='utf-8') as f:
|
|
494
|
+
json.dump(chunk_data, f, indent=2, ensure_ascii=False)
|
|
495
|
+
|
|
496
|
+
chunk_files_created.append(json_path)
|
|
497
|
+
if args.verbose:
|
|
498
|
+
print(f" Created: {json_path} ({len(chunks)} chunks)")
|
|
499
|
+
else:
|
|
500
|
+
# Accumulate all chunks for single file output
|
|
501
|
+
all_chunks.extend(chunks)
|
|
502
|
+
|
|
503
|
+
except Exception as e:
|
|
504
|
+
print(f"Error processing {file_path}: {e}")
|
|
505
|
+
if args.verbose:
|
|
506
|
+
import traceback
|
|
507
|
+
traceback.print_exc()
|
|
508
|
+
|
|
509
|
+
# Handle single file output
|
|
510
|
+
if not args.output_dir:
|
|
511
|
+
output_data = {
|
|
512
|
+
"chunks": all_chunks,
|
|
513
|
+
"metadata": {
|
|
514
|
+
"total_chunks": len(all_chunks),
|
|
515
|
+
"total_files": len(files),
|
|
516
|
+
"chunking_strategy": args.chunking_strategy,
|
|
517
|
+
"processing_date": datetime.now().isoformat()
|
|
518
|
+
}
|
|
519
|
+
}
|
|
520
|
+
|
|
521
|
+
with open(args.output, 'w', encoding='utf-8') as f:
|
|
522
|
+
json.dump(output_data, f, indent=2, ensure_ascii=False)
|
|
523
|
+
|
|
524
|
+
print(f"✓ Exported {len(all_chunks)} chunks to {args.output}")
|
|
525
|
+
else:
|
|
526
|
+
print(f"✓ Created {len(chunk_files_created)} JSON files in {args.output_dir}")
|
|
527
|
+
total_chunks = sum(len(json.load(open(f))['chunks']) for f in chunk_files_created)
|
|
528
|
+
print(f" Total chunks: {total_chunks}")
|
|
529
|
+
|
|
530
|
+
# Exit early for JSON format
|
|
531
|
+
return
|
|
532
|
+
|
|
533
|
+
# Regular index building mode
|
|
534
|
+
# Create index builder - import only when actually needed
|
|
535
|
+
from signalwire_agents.search.index_builder import IndexBuilder
|
|
225
536
|
builder = IndexBuilder(
|
|
226
537
|
model_name=args.model,
|
|
227
538
|
chunking_strategy=args.chunking_strategy,
|
|
@@ -229,7 +540,12 @@ Examples:
|
|
|
229
540
|
chunk_size=args.chunk_size,
|
|
230
541
|
chunk_overlap=args.overlap_size,
|
|
231
542
|
split_newlines=args.split_newlines,
|
|
232
|
-
|
|
543
|
+
index_nlp_backend=args.index_nlp_backend,
|
|
544
|
+
verbose=args.verbose,
|
|
545
|
+
semantic_threshold=args.semantic_threshold,
|
|
546
|
+
topic_threshold=args.topic_threshold,
|
|
547
|
+
backend=args.backend,
|
|
548
|
+
connection_string=args.connection_string
|
|
233
549
|
)
|
|
234
550
|
|
|
235
551
|
# Build index with multiple sources
|
|
@@ -239,7 +555,8 @@ Examples:
|
|
|
239
555
|
file_types=file_types,
|
|
240
556
|
exclude_patterns=exclude_patterns,
|
|
241
557
|
languages=languages,
|
|
242
|
-
tags=tags
|
|
558
|
+
tags=tags,
|
|
559
|
+
overwrite=args.overwrite if args.backend == 'pgvector' else False
|
|
243
560
|
)
|
|
244
561
|
|
|
245
562
|
# Validate if requested
|
|
@@ -258,7 +575,17 @@ Examples:
|
|
|
258
575
|
print(f"✗ Index validation failed: {validation['error']}")
|
|
259
576
|
sys.exit(1)
|
|
260
577
|
|
|
261
|
-
|
|
578
|
+
if args.backend == 'sqlite':
|
|
579
|
+
# Check if the index was actually created
|
|
580
|
+
import os
|
|
581
|
+
if os.path.exists(args.output):
|
|
582
|
+
print(f"\n✓ Search index created successfully: {args.output}")
|
|
583
|
+
else:
|
|
584
|
+
print(f"\n✗ Search index creation failed - no files were processed")
|
|
585
|
+
sys.exit(1)
|
|
586
|
+
else:
|
|
587
|
+
print(f"\n✓ Search collection created successfully: {args.output}")
|
|
588
|
+
print(f" Connection: {args.connection_string}")
|
|
262
589
|
|
|
263
590
|
except KeyboardInterrupt:
|
|
264
591
|
print("\n\nBuild interrupted by user")
|
|
@@ -283,7 +610,7 @@ def validate_command():
|
|
|
283
610
|
sys.exit(1)
|
|
284
611
|
|
|
285
612
|
try:
|
|
286
|
-
from
|
|
613
|
+
from signalwire_agents.search.index_builder import IndexBuilder
|
|
287
614
|
builder = IndexBuilder()
|
|
288
615
|
|
|
289
616
|
validation = builder.validate_index(args.index_file)
|
|
@@ -310,29 +637,57 @@ def validate_command():
|
|
|
310
637
|
|
|
311
638
|
def search_command():
|
|
312
639
|
"""Search within an existing search index"""
|
|
313
|
-
parser = argparse.ArgumentParser(description='Search within a .swsearch index file')
|
|
314
|
-
parser.add_argument('
|
|
315
|
-
parser.add_argument('query', help='Search query')
|
|
640
|
+
parser = argparse.ArgumentParser(description='Search within a .swsearch index file or pgvector collection')
|
|
641
|
+
parser.add_argument('index_source', help='Path to .swsearch file or collection name for pgvector')
|
|
642
|
+
parser.add_argument('query', nargs='?', help='Search query (optional if using --shell)')
|
|
643
|
+
parser.add_argument('--backend', choices=['sqlite', 'pgvector'], default='sqlite',
|
|
644
|
+
help='Storage backend (default: sqlite)')
|
|
645
|
+
parser.add_argument('--connection-string', help='PostgreSQL connection string for pgvector backend')
|
|
646
|
+
parser.add_argument('--shell', action='store_true',
|
|
647
|
+
help='Interactive shell mode - load once and search multiple times')
|
|
316
648
|
parser.add_argument('--count', type=int, default=5, help='Number of results to return (default: 5)')
|
|
317
649
|
parser.add_argument('--distance-threshold', type=float, default=0.0, help='Minimum similarity score (default: 0.0)')
|
|
318
650
|
parser.add_argument('--tags', help='Comma-separated tags to filter by')
|
|
319
|
-
parser.add_argument('--nlp-backend', choices=['nltk', 'spacy'], default='nltk',
|
|
320
|
-
help='NLP backend
|
|
651
|
+
parser.add_argument('--query-nlp-backend', choices=['nltk', 'spacy'], default='nltk',
|
|
652
|
+
help='NLP backend for query processing: nltk (fast, default) or spacy (better quality, slower)')
|
|
653
|
+
parser.add_argument('--keyword-weight', type=float, default=None,
|
|
654
|
+
help='Manual keyword weight (0.0-1.0). Overrides automatic weight detection.')
|
|
321
655
|
parser.add_argument('--verbose', action='store_true', help='Show detailed information')
|
|
322
656
|
parser.add_argument('--json', action='store_true', help='Output results as JSON')
|
|
323
657
|
parser.add_argument('--no-content', action='store_true', help='Hide content in results (show only metadata)')
|
|
658
|
+
parser.add_argument('--model', help='Override embedding model for query (mini/base/large or full model name)')
|
|
324
659
|
|
|
325
660
|
args = parser.parse_args()
|
|
326
661
|
|
|
327
|
-
|
|
328
|
-
|
|
662
|
+
# Validate arguments
|
|
663
|
+
if not args.shell and not args.query:
|
|
664
|
+
print("Error: Query is required unless using --shell mode")
|
|
665
|
+
sys.exit(1)
|
|
666
|
+
|
|
667
|
+
# Resolve model aliases
|
|
668
|
+
if args.model and args.model in MODEL_ALIASES:
|
|
669
|
+
args.model = MODEL_ALIASES[args.model]
|
|
670
|
+
|
|
671
|
+
# Validate keyword weight if provided
|
|
672
|
+
if args.keyword_weight is not None:
|
|
673
|
+
if args.keyword_weight < 0.0 or args.keyword_weight > 1.0:
|
|
674
|
+
print("Error: --keyword-weight must be between 0.0 and 1.0")
|
|
675
|
+
sys.exit(1)
|
|
676
|
+
|
|
677
|
+
# Validate backend configuration
|
|
678
|
+
if args.backend == 'pgvector' and not args.connection_string:
|
|
679
|
+
print("Error: --connection-string is required for pgvector backend")
|
|
680
|
+
sys.exit(1)
|
|
681
|
+
|
|
682
|
+
if args.backend == 'sqlite' and not Path(args.index_source).exists():
|
|
683
|
+
print(f"Error: Index file does not exist: {args.index_source}")
|
|
329
684
|
sys.exit(1)
|
|
330
685
|
|
|
331
686
|
try:
|
|
332
687
|
# Import search dependencies
|
|
333
688
|
try:
|
|
334
|
-
from
|
|
335
|
-
from
|
|
689
|
+
from signalwire_agents.search.search_engine import SearchEngine
|
|
690
|
+
from signalwire_agents.search.query_processor import preprocess_query
|
|
336
691
|
except ImportError as e:
|
|
337
692
|
print(f"Error: Search functionality not available. Install with: pip install signalwire-agents[search]")
|
|
338
693
|
print(f"Details: {e}")
|
|
@@ -340,20 +695,173 @@ def search_command():
|
|
|
340
695
|
|
|
341
696
|
# Load search engine
|
|
342
697
|
if args.verbose:
|
|
343
|
-
|
|
698
|
+
if args.backend == 'sqlite':
|
|
699
|
+
print(f"Loading search index: {args.index_source}")
|
|
700
|
+
else:
|
|
701
|
+
print(f"Connecting to pgvector collection: {args.index_source}")
|
|
344
702
|
|
|
345
|
-
|
|
703
|
+
if args.backend == 'sqlite':
|
|
704
|
+
# Pass the model from the index or override if specified
|
|
705
|
+
model = args.model if args.model else None
|
|
706
|
+
engine = SearchEngine(backend='sqlite', index_path=args.index_source, model=model)
|
|
707
|
+
else:
|
|
708
|
+
# Pass the model override if specified
|
|
709
|
+
model = args.model if args.model else None
|
|
710
|
+
engine = SearchEngine(backend='pgvector', connection_string=args.connection_string,
|
|
711
|
+
collection_name=args.index_source, model=model)
|
|
346
712
|
|
|
347
713
|
# Get index stats
|
|
348
714
|
stats = engine.get_stats()
|
|
715
|
+
|
|
716
|
+
# Get the model from index config if not overridden
|
|
717
|
+
model_to_use = args.model
|
|
718
|
+
if not model_to_use and 'config' in stats:
|
|
719
|
+
# SQLite uses 'embedding_model', pgvector uses 'model_name'
|
|
720
|
+
model_to_use = stats['config'].get('embedding_model') or stats['config'].get('model_name')
|
|
721
|
+
|
|
722
|
+
# Shell mode implementation
|
|
723
|
+
if args.shell:
|
|
724
|
+
import time
|
|
725
|
+
print(f"Search Shell - Index: {args.index_source}")
|
|
726
|
+
print(f"Backend: {args.backend}")
|
|
727
|
+
print(f"Index contains {stats['total_chunks']} chunks from {stats['total_files']} files")
|
|
728
|
+
if model_to_use:
|
|
729
|
+
print(f"Model: {model_to_use}")
|
|
730
|
+
print("Type 'exit' or 'quit' to leave, 'help' for options")
|
|
731
|
+
print("-" * 60)
|
|
732
|
+
|
|
733
|
+
while True:
|
|
734
|
+
try:
|
|
735
|
+
query = input("\nsearch> ").strip()
|
|
736
|
+
|
|
737
|
+
if not query:
|
|
738
|
+
continue
|
|
739
|
+
|
|
740
|
+
if query.lower() in ['exit', 'quit', 'q']:
|
|
741
|
+
print("Goodbye!")
|
|
742
|
+
break
|
|
743
|
+
|
|
744
|
+
if query.lower() == 'help':
|
|
745
|
+
print("\nShell commands:")
|
|
746
|
+
print(" help - Show this help")
|
|
747
|
+
print(" exit/quit/q - Exit shell")
|
|
748
|
+
print(" count=N - Set result count (current: {})".format(args.count))
|
|
749
|
+
print(" tags=tag1,tag2 - Set tag filter (current: {})".format(args.tags or 'none'))
|
|
750
|
+
print(" verbose - Toggle verbose output")
|
|
751
|
+
print("\nOr type any search query...")
|
|
752
|
+
continue
|
|
753
|
+
|
|
754
|
+
# Handle shell commands
|
|
755
|
+
if query.startswith('count='):
|
|
756
|
+
try:
|
|
757
|
+
args.count = int(query.split('=')[1])
|
|
758
|
+
print(f"Result count set to: {args.count}")
|
|
759
|
+
except:
|
|
760
|
+
print("Invalid count value")
|
|
761
|
+
continue
|
|
762
|
+
|
|
763
|
+
if query.startswith('tags='):
|
|
764
|
+
tag_str = query.split('=', 1)[1]
|
|
765
|
+
args.tags = tag_str if tag_str else None
|
|
766
|
+
tags = [tag.strip() for tag in args.tags.split(',')] if args.tags else None
|
|
767
|
+
print(f"Tags filter set to: {tags or 'none'}")
|
|
768
|
+
continue
|
|
769
|
+
|
|
770
|
+
if query == 'verbose':
|
|
771
|
+
args.verbose = not args.verbose
|
|
772
|
+
print(f"Verbose output: {'on' if args.verbose else 'off'}")
|
|
773
|
+
continue
|
|
774
|
+
|
|
775
|
+
# Perform search with timing
|
|
776
|
+
start_time = time.time()
|
|
777
|
+
|
|
778
|
+
# Preprocess query
|
|
779
|
+
enhanced = preprocess_query(
|
|
780
|
+
query,
|
|
781
|
+
vector=True,
|
|
782
|
+
query_nlp_backend=args.query_nlp_backend,
|
|
783
|
+
model_name=model_to_use,
|
|
784
|
+
preserve_original=True,
|
|
785
|
+
max_synonyms=2
|
|
786
|
+
)
|
|
787
|
+
|
|
788
|
+
# Parse tags
|
|
789
|
+
tags = [tag.strip() for tag in args.tags.split(',')] if args.tags else None
|
|
790
|
+
|
|
791
|
+
# Perform search
|
|
792
|
+
results = engine.search(
|
|
793
|
+
query_vector=enhanced.get('vector'),
|
|
794
|
+
enhanced_text=enhanced.get('enhanced_text', query),
|
|
795
|
+
count=args.count,
|
|
796
|
+
similarity_threshold=args.similarity_threshold,
|
|
797
|
+
tags=tags,
|
|
798
|
+
keyword_weight=args.keyword_weight,
|
|
799
|
+
original_query=query
|
|
800
|
+
)
|
|
801
|
+
|
|
802
|
+
search_time = time.time() - start_time
|
|
803
|
+
|
|
804
|
+
# Display results
|
|
805
|
+
if not results:
|
|
806
|
+
print(f"\nNo results found for '{query}' ({search_time:.3f}s)")
|
|
807
|
+
else:
|
|
808
|
+
print(f"\nFound {len(results)} result(s) for '{query}' ({search_time:.3f}s):")
|
|
809
|
+
if enhanced.get('enhanced_text') != query and args.verbose:
|
|
810
|
+
print(f"Enhanced query: '{enhanced.get('enhanced_text')}'")
|
|
811
|
+
print("=" * 60)
|
|
812
|
+
|
|
813
|
+
for i, result in enumerate(results):
|
|
814
|
+
print(f"\n[{i+1}] Score: {result['score']:.4f}")
|
|
815
|
+
|
|
816
|
+
# Show metadata
|
|
817
|
+
metadata = result['metadata']
|
|
818
|
+
print(f"File: {metadata.get('filename', 'Unknown')}")
|
|
819
|
+
if metadata.get('section'):
|
|
820
|
+
print(f"Section: {metadata['section']}")
|
|
821
|
+
|
|
822
|
+
# Show content unless suppressed
|
|
823
|
+
if not args.no_content:
|
|
824
|
+
content = result['content']
|
|
825
|
+
if len(content) > 300 and not args.verbose:
|
|
826
|
+
content = content[:300] + "..."
|
|
827
|
+
print(f"\n{content}")
|
|
828
|
+
|
|
829
|
+
if i < len(results) - 1:
|
|
830
|
+
print("-" * 40)
|
|
831
|
+
|
|
832
|
+
except KeyboardInterrupt:
|
|
833
|
+
print("\nUse 'exit' to quit")
|
|
834
|
+
except EOFError:
|
|
835
|
+
print("\nGoodbye!")
|
|
836
|
+
break
|
|
837
|
+
except Exception as e:
|
|
838
|
+
print(f"\nError: {e}")
|
|
839
|
+
if args.verbose:
|
|
840
|
+
import traceback
|
|
841
|
+
traceback.print_exc()
|
|
842
|
+
|
|
843
|
+
return # Exit after shell mode
|
|
844
|
+
|
|
845
|
+
# Normal single query mode
|
|
349
846
|
if args.verbose:
|
|
350
847
|
print(f"Index contains {stats['total_chunks']} chunks from {stats['total_files']} files")
|
|
351
848
|
print(f"Searching for: '{args.query}'")
|
|
352
|
-
print(f"NLP Backend: {args.
|
|
849
|
+
print(f"Query NLP Backend: {args.query_nlp_backend}")
|
|
850
|
+
if args.model:
|
|
851
|
+
print(f"Override model: {args.model}")
|
|
852
|
+
elif model_to_use:
|
|
853
|
+
print(f"Using index model: {model_to_use}")
|
|
353
854
|
print()
|
|
354
855
|
|
|
355
856
|
# Preprocess query
|
|
356
|
-
enhanced = preprocess_query(
|
|
857
|
+
enhanced = preprocess_query(
|
|
858
|
+
args.query,
|
|
859
|
+
vector=True, # Both backends need vector for similarity search
|
|
860
|
+
query_nlp_backend=args.query_nlp_backend,
|
|
861
|
+
model_name=model_to_use,
|
|
862
|
+
preserve_original=True, # Keep original query terms
|
|
863
|
+
max_synonyms=2 # Reduce synonym expansion
|
|
864
|
+
)
|
|
357
865
|
|
|
358
866
|
# Parse tags if provided
|
|
359
867
|
tags = [tag.strip() for tag in args.tags.split(',')] if args.tags else None
|
|
@@ -363,8 +871,10 @@ def search_command():
|
|
|
363
871
|
query_vector=enhanced.get('vector'),
|
|
364
872
|
enhanced_text=enhanced.get('enhanced_text', args.query),
|
|
365
873
|
count=args.count,
|
|
366
|
-
|
|
367
|
-
tags=tags
|
|
874
|
+
similarity_threshold=args.similarity_threshold,
|
|
875
|
+
tags=tags,
|
|
876
|
+
keyword_weight=args.keyword_weight,
|
|
877
|
+
original_query=args.query # Pass original for exact match boosting
|
|
368
878
|
)
|
|
369
879
|
|
|
370
880
|
if args.json:
|
|
@@ -433,10 +943,400 @@ def search_command():
|
|
|
433
943
|
traceback.print_exc()
|
|
434
944
|
sys.exit(1)
|
|
435
945
|
|
|
946
|
+
def _build_migrate_parser():
    """Create the argument parser for the ``sw-search migrate`` subcommand."""
    parser = argparse.ArgumentParser(
        description='Migrate search indexes between SQLite and pgvector backends',
        epilog="""
Examples:
  # Migrate SQLite to pgvector
  sw-search migrate ./docs.swsearch \\
    --to-pgvector \\
    --connection-string "postgresql://user:pass@localhost/db" \\
    --collection-name docs_collection

  # Migrate with overwrite
  sw-search migrate ./docs.swsearch \\
    --to-pgvector \\
    --connection-string "postgresql://user:pass@localhost/db" \\
    --collection-name docs_collection \\
    --overwrite

  # Get index information
  sw-search migrate --info ./docs.swsearch
""",
        formatter_class=argparse.RawDescriptionHelpFormatter
    )

    # Source is declared optional so the missing-source case can be reported
    # with a mode-specific message instead of argparse's generic one.
    parser.add_argument('source', nargs='?', help='Source index file or collection')

    # Exactly one mode may be selected at a time.
    migration_group = parser.add_mutually_exclusive_group()
    migration_group.add_argument('--to-pgvector', action='store_true',
                                 help='Migrate SQLite index to pgvector')
    migration_group.add_argument('--to-sqlite', action='store_true',
                                 help='Migrate pgvector collection to SQLite (not yet implemented)')
    migration_group.add_argument('--info', action='store_true',
                                 help='Show information about an index')

    # pgvector options
    parser.add_argument('--connection-string',
                        help='PostgreSQL connection string for pgvector')
    parser.add_argument('--collection-name',
                        help='Collection name for pgvector')
    parser.add_argument('--overwrite', action='store_true',
                        help='Overwrite existing collection')

    # SQLite options
    parser.add_argument('--output',
                        help='Output .swsearch file path (for --to-sqlite)')

    # Common options
    parser.add_argument('--batch-size', type=int, default=100,
                        help='Number of chunks to process at once (default: 100)')
    parser.add_argument('--verbose', action='store_true',
                        help='Show detailed progress')
    return parser


def _print_index_info(info, source, verbose):
    """Print the summary returned by ``SearchIndexMigrator.get_index_info``."""
    print(f"Index Information: {source}")
    print(f"  Type: {info['type']}")
    if info['type'] != 'sqlite':
        print("  Unable to determine index type")
        return

    config = info['config']
    print(f"  Total chunks: {info['total_chunks']}")
    print(f"  Total files: {info['total_files']}")
    print(f"  Model: {config.get('embedding_model', 'Unknown')}")
    print(f"  Dimensions: {config.get('embedding_dimensions', 'Unknown')}")
    print(f"  Created: {config.get('created_at', 'Unknown')}")
    if verbose:
        print("\n  Full configuration:")
        for key, value in config.items():
            print(f"    {key}: {value}")


def migrate_command():
    """Migrate search indexes between backends.

    Parses ``sys.argv`` and either prints information about an index
    (``--info``) or migrates a SQLite ``.swsearch`` index into a pgvector
    collection (``--to-pgvector``).  ``--to-sqlite`` is accepted but not yet
    implemented.

    All argument validation runs before the optional search dependencies are
    imported, so a missing flag is reported as such even when the search
    extras are not installed.

    Exits the process with status 1 on invalid arguments or on any failure
    while reading or migrating the index.
    """
    args = _build_migrate_parser().parse_args()

    # Handle --info flag
    if args.info:
        if not args.source:
            print("Error: Source index required with --info")
            sys.exit(1)

        try:
            from signalwire_agents.search.migration import SearchIndexMigrator
            migrator = SearchIndexMigrator(verbose=args.verbose)
            _print_index_info(migrator.get_index_info(args.source), args.source, args.verbose)
        except Exception as e:
            print(f"Error getting index info: {e}")
            sys.exit(1)
        return

    # Validate arguments for migration
    if not args.source:
        print("Error: Source index required for migration")
        sys.exit(1)

    if not args.to_pgvector and not args.to_sqlite:
        print("Error: Must specify migration direction (--to-pgvector or --to-sqlite)")
        sys.exit(1)

    if args.to_sqlite:
        print("Error: pgvector to SQLite migration not yet implemented")
        print("This feature is planned for future development")
        sys.exit(1)

    # From here on the direction is --to-pgvector.
    if not args.connection_string:
        print("Error: --connection-string required for pgvector migration")
        sys.exit(1)
    if not args.collection_name:
        print("Error: --collection-name required for pgvector migration")
        sys.exit(1)
    if args.batch_size < 1:
        # A batch of zero or fewer chunks can never make progress.
        print("Error: --batch-size must be a positive integer")
        sys.exit(1)

    try:
        from signalwire_agents.search.migration import SearchIndexMigrator
        migrator = SearchIndexMigrator(verbose=args.verbose)

        # Perform migration
        print(f"Migrating {args.source} to pgvector collection '{args.collection_name}'...")
        stats = migrator.migrate_sqlite_to_pgvector(
            sqlite_path=args.source,
            connection_string=args.connection_string,
            collection_name=args.collection_name,
            overwrite=args.overwrite,
            batch_size=args.batch_size
        )

        print("\n✓ Migration completed successfully!")
        print(f"  Chunks migrated: {stats['chunks_migrated']}")
        print(f"  Errors: {stats['errors']}")

    except Exception as e:
        print(f"\nError during migration: {e}")
        if args.verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)
|
|
1080
|
+
|
|
1081
|
+
|
|
1082
|
+
def _normalize_search_endpoint(endpoint):
    """Return *endpoint* with an http(s) scheme and a trailing ``/search`` path."""
    if not endpoint.startswith(('http://', 'https://')):
        endpoint = f"http://{endpoint}"
    if not endpoint.endswith('/search'):
        endpoint += 'search' if endpoint.endswith('/') else '/search'
    return endpoint


def _remote_error_message(response, default, fallback):
    """Extract the ``detail`` field from an error response body.

    Returns *default* when the body is a JSON object without ``detail`` and
    *fallback* when the body is not a JSON object at all.
    """
    try:
        return response.json().get('detail', default)
    except (ValueError, AttributeError):
        # ValueError: body is not valid JSON; AttributeError: JSON but not an object.
        return fallback


def remote_command():
    """Search via remote API endpoint.

    Parses ``sys.argv``, POSTs the query to ``<endpoint>/search`` and prints
    the results either as raw JSON (``--json``) or in human-readable form.

    Exits the process with status 1 when ``requests`` is missing, the request
    fails, or the server answers with a non-200 status; exits with status 0
    when the server returns no results in human-readable mode.
    """
    parser = argparse.ArgumentParser(description='Search via remote API endpoint')
    parser.add_argument('endpoint', help='Remote API endpoint URL (e.g., http://localhost:8001)')
    parser.add_argument('query', help='Search query')
    parser.add_argument('--index-name', required=True, help='Name of the index to search')
    parser.add_argument('--count', type=int, default=5, help='Number of results to return (default: 5)')
    parser.add_argument('--distance-threshold', type=float, default=0.0, help='Minimum similarity score (default: 0.0)')
    parser.add_argument('--tags', help='Comma-separated tags to filter by')
    parser.add_argument('--verbose', action='store_true', help='Show detailed information')
    parser.add_argument('--json', action='store_true', help='Output results as JSON')
    parser.add_argument('--no-content', action='store_true', help='Hide content in results (show only metadata)')
    parser.add_argument('--timeout', type=int, default=30, help='Request timeout in seconds (default: 30)')

    args = parser.parse_args()

    endpoint = _normalize_search_endpoint(args.endpoint)

    try:
        import requests
    except ImportError:
        print("Error: requests library not available. Install with: pip install requests")
        sys.exit(1)

    # Prepare request payload.  The CLI flag is --distance-threshold, so the
    # parsed attribute is ``distance_threshold``; the payload key is left as
    # ``similarity_threshold``.
    payload = {
        'query': args.query,
        'index_name': args.index_name,
        'count': args.count,
        'similarity_threshold': args.distance_threshold
    }

    if args.tags:
        payload['tags'] = [tag.strip() for tag in args.tags.split(',')]

    if args.verbose:
        print(f"Searching remote endpoint: {endpoint}")
        print(f"Payload: {payload}")
        print()

    try:
        # Make the API request
        response = requests.post(
            endpoint,
            json=payload,
            headers={'Content-Type': 'application/json'},
            timeout=args.timeout
        )

        if response.status_code == 404:
            error_msg = _remote_error_message(response, 'Index not found', 'Index not found')
            print(f"Error: {error_msg}")
            sys.exit(1)

        if response.status_code != 200:
            error_msg = _remote_error_message(
                response,
                f'HTTP {response.status_code}',
                f'HTTP {response.status_code}: {response.text}'
            )
            print(f"Error: {error_msg}")
            sys.exit(1)

        result = response.json()

        if args.json:
            # Output raw JSON response
            import json
            print(json.dumps(result, indent=2))
        else:
            # Human-readable output
            results = result.get('results', [])
            if not results:
                print(f"No results found for '{args.query}' in index '{args.index_name}'")
                sys.exit(0)

            print(f"Found {len(results)} result(s) for '{args.query}' in index '{args.index_name}':")
            if result.get('enhanced_query') and result.get('enhanced_query') != args.query:
                print(f"Enhanced query: '{result.get('enhanced_query')}'")
            print("=" * 80)

            for i, search_result in enumerate(results):
                print(f"\n[{i+1}] Score: {search_result.get('score', 0):.4f}")

                # Show metadata
                metadata = search_result.get('metadata', {})
                print(f"File: {metadata.get('filename', 'Unknown')}")
                if metadata.get('section'):
                    print(f"Section: {metadata['section']}")
                if metadata.get('line_start'):
                    print(f"Lines: {metadata['line_start']}-{metadata.get('line_end', metadata['line_start'])}")
                if metadata.get('tags'):
                    print(f"Tags: {', '.join(metadata['tags'])}")

                # Show content unless suppressed
                if not args.no_content and 'content' in search_result:
                    content = search_result['content']
                    if len(content) > 500 and not args.verbose:
                        content = content[:500] + "..."
                    print(f"\nContent:\n{content}")

                if i < len(results) - 1:
                    print("-" * 80)

    except requests.ConnectionError:
        print(f"Error: Could not connect to {endpoint}")
        print("Make sure the search server is running")
        sys.exit(1)
    except requests.Timeout:
        print(f"Error: Request timed out after {args.timeout} seconds")
        sys.exit(1)
    except requests.RequestException as e:
        print(f"Error making request: {e}")
        sys.exit(1)
    except Exception as e:
        print(f"Error: {e}")
        if args.verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)
|
|
1216
|
+
|
|
436
1217
|
def console_entry_point():
|
|
437
1218
|
"""Console script entry point for pip installation"""
|
|
438
1219
|
import sys
|
|
439
1220
|
|
|
1221
|
+
# Fast help check - show help without importing heavy modules
|
|
1222
|
+
if len(sys.argv) > 1 and sys.argv[1] in ['--help', '-h']:
|
|
1223
|
+
print("""usage: sw-search [-h] [--output OUTPUT] [--chunking-strategy {sentence,sliding,paragraph,page,semantic,topic,qa}]
|
|
1224
|
+
[--max-sentences-per-chunk MAX_SENTENCES_PER_CHUNK] [--chunk-size CHUNK_SIZE]
|
|
1225
|
+
[--overlap-size OVERLAP_SIZE] [--split-newlines SPLIT_NEWLINES] [--file-types FILE_TYPES]
|
|
1226
|
+
[--exclude EXCLUDE] [--languages LANGUAGES] [--model MODEL] [--tags TAGS]
|
|
1227
|
+
[--index-nlp-backend {nltk,spacy}] [--verbose] [--validate]
|
|
1228
|
+
[--semantic-threshold SEMANTIC_THRESHOLD] [--topic-threshold TOPIC_THRESHOLD]
|
|
1229
|
+
sources [sources ...]
|
|
1230
|
+
|
|
1231
|
+
Build local search index from documents
|
|
1232
|
+
|
|
1233
|
+
positional arguments:
|
|
1234
|
+
sources Source files and/or directories to index
|
|
1235
|
+
|
|
1236
|
+
options:
|
|
1237
|
+
-h, --help show this help message and exit
|
|
1238
|
+
--output OUTPUT Output .swsearch file (default: sources.swsearch)
|
|
1239
|
+
--chunking-strategy {sentence,sliding,paragraph,page,semantic,topic,qa}
|
|
1240
|
+
Chunking strategy to use (default: sentence)
|
|
1241
|
+
--max-sentences-per-chunk MAX_SENTENCES_PER_CHUNK
|
|
1242
|
+
Maximum sentences per chunk for sentence strategy (default: 5)
|
|
1243
|
+
--chunk-size CHUNK_SIZE
|
|
1244
|
+
Chunk size in words for sliding window strategy (default: 50)
|
|
1245
|
+
--overlap-size OVERLAP_SIZE
|
|
1246
|
+
Overlap size in words for sliding window strategy (default: 10)
|
|
1247
|
+
--split-newlines SPLIT_NEWLINES
|
|
1248
|
+
Split on multiple newlines (for sentence strategy)
|
|
1249
|
+
--file-types FILE_TYPES
|
|
1250
|
+
Comma-separated file extensions to include for directories (default: md,txt,rst)
|
|
1251
|
+
--exclude EXCLUDE Comma-separated glob patterns to exclude (e.g., "**/test/**,**/__pycache__/**")
|
|
1252
|
+
--languages LANGUAGES
|
|
1253
|
+
Comma-separated language codes (default: en)
|
|
1254
|
+
--model MODEL Sentence transformer model name (default: sentence-transformers/all-mpnet-base-v2)
|
|
1255
|
+
--tags TAGS Comma-separated tags to add to all chunks
|
|
1256
|
+
--index-nlp-backend {nltk,spacy}
|
|
1257
|
+
NLP backend for document processing: nltk (fast, default) or spacy (better quality, slower)
|
|
1258
|
+
--verbose Enable verbose output
|
|
1259
|
+
--validate Validate the created index after building
|
|
1260
|
+
--semantic-threshold SEMANTIC_THRESHOLD
|
|
1261
|
+
Similarity threshold for semantic chunking (default: 0.5)
|
|
1262
|
+
--topic-threshold TOPIC_THRESHOLD
|
|
1263
|
+
Similarity threshold for topic chunking (default: 0.3)
|
|
1264
|
+
|
|
1265
|
+
Examples:
|
|
1266
|
+
# Basic usage with directory (defaults to sentence chunking with 5 sentences per chunk)
|
|
1267
|
+
sw-search ./docs
|
|
1268
|
+
|
|
1269
|
+
# Multiple directories
|
|
1270
|
+
sw-search ./docs ./examples --file-types md,txt,py
|
|
1271
|
+
|
|
1272
|
+
# Individual files
|
|
1273
|
+
sw-search README.md ./docs/guide.md ./src/main.py
|
|
1274
|
+
|
|
1275
|
+
# Mixed sources (directories and files)
|
|
1276
|
+
sw-search ./docs README.md ./examples specific_file.txt --file-types md,txt,py
|
|
1277
|
+
|
|
1278
|
+
# Sentence-based chunking with custom parameters
|
|
1279
|
+
sw-search ./docs \\
|
|
1280
|
+
--chunking-strategy sentence \\
|
|
1281
|
+
--max-sentences-per-chunk 10 \\
|
|
1282
|
+
--split-newlines 2
|
|
1283
|
+
|
|
1284
|
+
# Sliding window chunking
|
|
1285
|
+
sw-search ./docs \\
|
|
1286
|
+
--chunking-strategy sliding \\
|
|
1287
|
+
--chunk-size 100 \\
|
|
1288
|
+
--overlap-size 20
|
|
1289
|
+
|
|
1290
|
+
# Paragraph-based chunking
|
|
1291
|
+
sw-search ./docs \\
|
|
1292
|
+
--chunking-strategy paragraph \\
|
|
1293
|
+
--file-types md,txt,rst
|
|
1294
|
+
|
|
1295
|
+
# Page-based chunking (good for PDFs)
|
|
1296
|
+
sw-search ./docs \\
|
|
1297
|
+
--chunking-strategy page \\
|
|
1298
|
+
--file-types pdf
|
|
1299
|
+
|
|
1300
|
+
# Semantic chunking (groups semantically similar sentences)
|
|
1301
|
+
sw-search ./docs \\
|
|
1302
|
+
--chunking-strategy semantic \\
|
|
1303
|
+
--semantic-threshold 0.6
|
|
1304
|
+
|
|
1305
|
+
# Topic-based chunking (groups by topic changes)
|
|
1306
|
+
sw-search ./docs \\
|
|
1307
|
+
--chunking-strategy topic \\
|
|
1308
|
+
--topic-threshold 0.2
|
|
1309
|
+
|
|
1310
|
+
# QA-optimized chunking (optimized for question-answering)
|
|
1311
|
+
sw-search ./docs \\
|
|
1312
|
+
--chunking-strategy qa
|
|
1313
|
+
|
|
1314
|
+
# Full configuration example
|
|
1315
|
+
sw-search ./docs ./examples README.md \\
|
|
1316
|
+
--output ./knowledge.swsearch \\
|
|
1317
|
+
--chunking-strategy sentence \\
|
|
1318
|
+
--max-sentences-per-chunk 8 \\
|
|
1319
|
+
--file-types md,txt,rst,py \\
|
|
1320
|
+
--exclude "**/test/**,**/__pycache__/**" \\
|
|
1321
|
+
--languages en,es,fr \\
|
|
1322
|
+
--model sentence-transformers/all-mpnet-base-v2 \\
|
|
1323
|
+
--tags documentation,api \\
|
|
1324
|
+
--verbose
|
|
1325
|
+
|
|
1326
|
+
# Validate an existing index
|
|
1327
|
+
sw-search validate ./docs.swsearch
|
|
1328
|
+
|
|
1329
|
+
# Search within an index
|
|
1330
|
+
sw-search search ./docs.swsearch "how to create an agent"
|
|
1331
|
+
sw-search search ./docs.swsearch "API reference" --count 3 --verbose
|
|
1332
|
+
sw-search search ./docs.swsearch "configuration" --tags documentation --json
|
|
1333
|
+
|
|
1334
|
+
# Search via remote API
|
|
1335
|
+
sw-search remote http://localhost:8001 "how to create an agent" --index-name docs
|
|
1336
|
+
sw-search remote localhost:8001 "API reference" --index-name docs --count 3 --verbose
|
|
1337
|
+
""")
|
|
1338
|
+
return
|
|
1339
|
+
|
|
440
1340
|
# Check for subcommands
|
|
441
1341
|
if len(sys.argv) > 1:
|
|
442
1342
|
if sys.argv[1] == 'validate':
|
|
@@ -449,6 +1349,16 @@ def console_entry_point():
|
|
|
449
1349
|
sys.argv.pop(1)
|
|
450
1350
|
search_command()
|
|
451
1351
|
return
|
|
1352
|
+
elif sys.argv[1] == 'remote':
|
|
1353
|
+
# Remove 'remote' from argv and call remote_command
|
|
1354
|
+
sys.argv.pop(1)
|
|
1355
|
+
remote_command()
|
|
1356
|
+
return
|
|
1357
|
+
elif sys.argv[1] == 'migrate':
|
|
1358
|
+
# Remove 'migrate' from argv and call migrate_command
|
|
1359
|
+
sys.argv.pop(1)
|
|
1360
|
+
migrate_command()
|
|
1361
|
+
return
|
|
452
1362
|
|
|
453
1363
|
# Regular build command
|
|
454
1364
|
main()
|