tos-vectors-embed-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,11 @@
1
+ # Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2
+ # Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # This file has been modified by Beijing Volcano Engine Technology Co., Ltd. on 2026-02-12
6
+ #
7
+ # Original file was released under Apache License 2.0, with the full license text
8
+ # available at http://www.apache.org/licenses/LICENSE-2.0.
9
+ #
10
+ # This modified file is released under the same license.
11
+ """TOS Vectors Embed CLI package."""
@@ -0,0 +1,14 @@
1
+ # Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2
+ # Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # This file has been modified by Beijing Volcano Engine Technology Co., Ltd. on 2026-02-12
6
+ #
7
+ # Original file was released under Apache License 2.0, with the full license text
8
+ # available at http://www.apache.org/licenses/LICENSE-2.0.
9
+ #
10
+ # This modified file is released under the same license.
11
+ """Version information for TOS Vectors CLI."""
12
+
13
+ __version__ = "0.1.0"
14
+ __version_info__ = tuple(int(i) for i in __version__.split('.'))
tos_vectors/cli.py ADDED
@@ -0,0 +1,62 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3
+ # Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
4
+ # SPDX-License-Identifier: Apache-2.0
5
+ #
6
+ # This file has been modified by Beijing Volcano Engine Technology Co., Ltd. on 2026-02-12
7
+ #
8
+ # Original file was released under Apache License 2.0, with the full license text
9
+ # available at http://www.apache.org/licenses/LICENSE-2.0.
10
+ #
11
+ # This modified file is released under the same license.
12
+ """Main CLI entry point for TOS Vectors."""
13
+
14
+ import click
15
+ from rich.console import Console
16
+ from rich.traceback import install
17
+ from tos_vectors.commands.embed_put import embed_put
18
+ from tos_vectors.commands.embed_query import embed_query
19
+
20
+ # Install rich traceback handler
21
+ install(show_locals=True)
22
+ console = Console()
23
+
24
+
25
@click.group()
@click.version_option(version="0.1.0")
@click.option('--account-id', help='Volcengine account id to use')
@click.option('--region', default='cn-beijing', help='TOS region name')
@click.option('--debug', is_flag=True, help='Enable debug mode with detailed logging')
@click.pass_context
def cli(ctx, account_id, region, debug):
    """TOS Vectors Embed CLI - Standalone tool for vector embedding operations with TOS and Ark."""
    ctx.ensure_object(dict)
    # Stash global options on the click context so every subcommand can
    # read them via ctx.obj.
    ctx.obj.update(
        account_id=account_id,
        region=region,
        console=console,
        debug=debug,
    )

    if debug:
        # Echo the effective global settings so users can confirm what
        # account/region the subcommand will run against.
        for line in (
            "[dim] Debug mode enabled[/dim]",
            f"[dim] Volcengine Account ID: {account_id}[/dim]",
            f"[dim] TOS Region: {region}[/dim]",
        ):
            console.print(line)
43
+
44
+
45
# Register commands as subcommands: `put` and `query` on the CLI map to
# the embed_put / embed_query command implementations.
cli.add_command(embed_put, name='put')
cli.add_command(embed_query, name='query')
48
+
49
+
50
def main():
    """Main entry point.

    Runs the click group, translating Ctrl-C into the conventional
    SIGINT exit status instead of a silent success. Any other exception
    is summarized on the console and re-raised so the rich traceback
    handler installed at module import can render full context.
    """
    try:
        cli()
    except KeyboardInterrupt:
        console.print("\n[yellow]Operation cancelled by user[/yellow]")
        # Bug fix: previously this handler returned normally, so a
        # cancelled run exited with status 0 (success). Exit with
        # 128 + SIGINT(2) = 130, the conventional interrupted status.
        raise SystemExit(130)
    except Exception as e:
        console.print(f"[red]Error: {str(e)}[/red]")
        raise
59
+
60
+
61
# Allow running this module directly (e.g. `python -m tos_vectors.cli`).
if __name__ == '__main__':
    main()
@@ -0,0 +1,11 @@
1
+ # Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2
+ # Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # This file has been modified by Beijing Volcano Engine Technology Co., Ltd. on 2026-02-12
6
+ #
7
+ # Original file was released under Apache License 2.0, with the full license text
8
+ # available at http://www.apache.org/licenses/LICENSE-2.0.
9
+ #
10
+ # This modified file is released under the same license.
11
+ """Commands for TOS Vectors CLI."""
@@ -0,0 +1,389 @@
1
+ # Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2
+ # Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # This file has been modified by Beijing Volcano Engine Technology Co., Ltd. on 2026-02-12
6
+ #
7
+ # Original file was released under Apache License 2.0, with the full license text
8
+ # available at http://www.apache.org/licenses/LICENSE-2.0.
9
+ #
10
+ # This modified file is released under the same license.
11
+ """Command implementation for embedding and storing vectors."""
12
+
13
+ import json
14
+ import click
15
+ from rich.progress import Progress, SpinnerColumn, TextColumn
16
+
17
+ from tos_vectors.core.services import ArkService, TOSVectorService
18
+ from tos_vectors.core.unified_processor import UnifiedProcessor
19
+ from tos_vectors.utils.config import get_region
20
+ from tos_vectors.utils.models import (
21
+ get_model_info,
22
+ validate_user_parameters,
23
+ prepare_processing_input,
24
+ determine_content_type
25
+ )
26
+ from tos_vectors.core.streaming_batch_orchestrator import (
27
+ StreamingBatchOrchestrator
28
+ )
29
+
30
+
31
def _create_progress_context(console):
    """Build a transient spinner-style progress display bound to *console*."""
    columns = (
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
    )
    # transient=True clears the progress display once the context exits,
    # leaving the console clean for the final output.
    return Progress(*columns, console=console, transient=True)
39
+
40
+
41
+ def _validate_inputs(text_value, text, image, video, model, key, filename_as_key):
42
+ """Validate input parameters."""
43
+ inputs_provided = sum(bool(x) for x in [
44
+ text_value, text, image, video
45
+ ])
46
+
47
+ if inputs_provided == 0:
48
+ raise click.ClickException(
49
+ "At least one input must be provided: --text-value, --text, "
50
+ "--image, or --video"
51
+ )
52
+
53
+ # Check mutual exclusivity of key parameters
54
+ if key and filename_as_key:
55
+ raise click.ClickException(
56
+ "Cannot use both --key and --filename-as-key. Choose one."
57
+ )
58
+
59
+ # --filename-as-key not allowed with --text-value
60
+ if filename_as_key and text_value:
61
+ raise click.ClickException(
62
+ "--filename-as-key is not supported with --text-value "
63
+ "(no file or object to extract name from)"
64
+ )
65
+
66
+ # Special case: Allow multimodal input for supported models
67
+ is_multimodal_input = model.supports_multimodal_input() and sum(
68
+ 1 for inp in [text_value, text, image, video]
69
+ if inp is not None
70
+ ) >= 2
71
+
72
+ if inputs_provided > 1 and not is_multimodal_input:
73
+ raise click.ClickException(
74
+ "Only one input type can be specified at a time, "
75
+ "except for multimodal input with supported models"
76
+ )
77
+
78
+ return is_multimodal_input
79
+
80
+
81
@click.command()
@click.option('--vector-bucket-name', required=True, help='TOS vector bucket name')
@click.option('--index-name', required=True, help='Vector index name')
@click.option('--model-id', required=True, help='Ark embedding model ID')
@click.option('--text-value', help='Direct text input to embed')
@click.option('--text', help='Text file path (local file or TOS URI)')
@click.option('--image', help='Image file path (local file or TOS URI)')
@click.option('--video', help='Video file path (local file)')
@click.option('--key', help='Custom vector key (auto-generated UUID if not provided)')
@click.option('--key-prefix', help='Prefix to prepend to all vector keys')
@click.option('--filename-as-key', is_flag=True, help='Use filename as vector key')
@click.option('--metadata', help='Additional metadata associated with the vector; provided as JSON string')
@click.option('--ark-inference-params', help='JSON string with model-specific parameters matching Ark API format')
@click.option('--max-workers', default=4, type=int, help='Maximum parallel workers for batch processing (default: 4)')
@click.option('--batch-size', type=click.IntRange(1, 500), default=500, help='Vectors per TOS Vector put_vectors call (1-500, default: 500)')
@click.option('--output', type=click.Choice(['json', 'table']), default='json', help='Output format')
@click.option('--region', help='TOS region name (effective in TOS path mode)')
@click.pass_context
def embed_put(ctx, vector_bucket_name, index_name, model_id, text_value, text, image, video, ark_inference_params, key, key_prefix, filename_as_key, metadata, max_workers, batch_size, output, region):
    """Unified embed and store vectors command."""

    # Global options shared via the click context (set by the `cli` group).
    console = ctx.obj['console']
    account_id = ctx.obj.get('account_id')
    debug = ctx.obj.get('debug', False)
    # Command-level --region overrides the group-level one when given.
    region = get_region(region or ctx.obj.get('region'))

    # Load model properties once at start
    model = get_model_info(model_id)
    if not model:
        raise click.ClickException(f"Unsupported model: {model_id}")

    # Parse parameters
    user_ark_params = {}
    if ark_inference_params:
        try:
            user_ark_params = json.loads(ark_inference_params)
        except json.JSONDecodeError:
            raise click.ClickException(
                "Invalid JSON in --ark-inference-params parameter"
            )

    metadata_dict = {}
    if metadata:
        try:
            metadata_dict = json.loads(metadata)
        except json.JSONDecodeError:
            raise click.ClickException("Invalid JSON in --metadata parameter")

    # Early validation of user parameters before any processing:
    # reject user-supplied Ark params that would collide with the keys
    # the system fills in for the detected content type.
    if user_ark_params:
        try:
            content_type = determine_content_type(
                text_value,
                text,
                image,
                video
            )
            system_keys = model.get_system_keys(content_type)
            # Dummy values for validation
            system_payload = {k: None for k in system_keys}

            # Validate using utility function
            validate_user_parameters(system_payload, user_ark_params)
        except ValueError as e:
            raise click.ClickException(str(e))

    # Validate inputs
    is_multimodal = _validate_inputs(
        text_value,
        text,
        image,
        video,
        model,
        key,
        filename_as_key
    )

    try:
        # Initialize services
        ark_service = ArkService(region=region, debug=debug, console=console)
        tos_vector_service = TOSVectorService(
            region=region, account_id=account_id,
            debug=debug, console=console
        )

        # Create unified processor
        processor = UnifiedProcessor(ark_service, tos_vector_service, region=region)

        # Fetch index dimensions once at the top level so downstream
        # processing never re-queries the index per file.
        try:
            index_info = tos_vector_service.get_index(
                vector_bucket_name, index_name
            )
            index_dimensions = index_info.get("dimension")
            if not index_dimensions:
                raise click.ClickException(
                    f"Could not determine dimensions for index {index_name}"
                )
        except Exception as e:
            raise click.ClickException(
                f"Failed to get index information: {str(e)}"
            )

        # Prepare processing input
        processing_input = prepare_processing_input(
            text_value,
            text,
            image,
            video,
            is_multimodal,
            metadata_dict,
            key,
            filename_as_key,
            key_prefix
        )

        # Check for wildcard patterns (streaming batch processing).
        # Only file-backed text/image/video inputs can carry a glob.
        eligible = (
            processing_input.content_type in ["text", "image", "video"]
            and "file_path" in processing_input.data
        )

        if eligible:
            file_path = processing_input.data["file_path"]
            if '*' in file_path or '?' in file_path:
                # Glob pattern: hand off to the streaming batch path and
                # skip the single-input flow entirely.
                _process_streaming_batch(
                    file_path,
                    processing_input.content_type,
                    vector_bucket_name,
                    index_name,
                    model,
                    metadata_dict,
                    user_ark_params,
                    processor,
                    console,
                    output,
                    max_workers,
                    batch_size,
                    index_dimensions,
                    processing_input.filename_as_key,
                    processing_input.key_prefix
                )
                return

        # Process input to generate embeddings
        with _create_progress_context(console) as progress:
            progress.add_task(
                f"[cyan]Embedding {processing_input.content_type}..."
            )
            result = processor.process(
                model=model,
                processing_input=processing_input,
                user_ark_params=user_ark_params,
                vector_bucket_name=vector_bucket_name,
                index_name=index_name,
                precomputed_dimensions=index_dimensions
            )

            # Store vectors with batch_size handling
            progress.add_task(
                f"Storing {len(result.vectors)} vector(s)...",
                total=None
            )

            # Handle batch_size for single file processing too: a single
            # input (e.g. a video split into clips) can still yield more
            # vectors than fit in one put_vectors call.
            vector_count = len(result.vectors)

            if vector_count <= batch_size:
                stored_keys = processor.store_vectors(
                    result.vectors,
                    vector_bucket_name,
                    index_name
                )
            else:
                stored_keys = []
                for i in range(0, vector_count, batch_size):
                    chunk = result.vectors[i:i + batch_size]
                    chunk_keys = processor.store_vectors(
                        chunk,
                        vector_bucket_name,
                        index_name
                    )
                    stored_keys.extend(chunk_keys)

        # Prepare output. NOTE(review): the single-vector branch assumes
        # at least one vector was produced and stored (stored_keys[0],
        # result.vectors[0]) — presumably guaranteed by processor.process;
        # verify against UnifiedProcessor.
        if result.result_type == "multiclip":
            output_result = {
                'type': 'multiclip',
                'bucket': vector_bucket_name,
                'index': index_name,
                'model': model.model_id,
                'contentType': processing_input.content_type,
                'totalVectors': len(stored_keys),
                'keys': stored_keys
            }
            if result.job_id:
                output_result['jobId'] = result.job_id
        else:
            output_result = {
                'key': stored_keys[0],
                'bucket': vector_bucket_name,
                'index': index_name,
                'model': model.model_id,
                'contentType': processing_input.content_type,
                'embeddingDimensions': index_dimensions,
                'metadata': result.vectors[0]['metadata']
            }

        console.print_json(data=output_result)

    except Exception as e:
        # Surface every failure to click so it exits nonzero with a
        # clean message instead of a raw traceback.
        raise click.ClickException(str(e))
293
+
294
+
295
def _process_streaming_batch(file_path, content_type, vector_bucket_name, index_name, model, metadata_dict, user_ark_params, processor, console, output, max_workers, batch_size, index_dimensions, filename_as_key, key_prefix):
    """Process wildcard pattern using streaming batch orchestrator.

    Delegates discovery and processing of the files matching *file_path*
    (a glob pattern) to StreamingBatchOrchestrator, then prints a summary
    (JSON or table per *output*). Vector keys and errors in the summary
    are each truncated to the first 10 entries.

    Returns:
        The summary dict when *output* is "json", otherwise None.

    Raises:
        click.ClickException: if orchestration fails for any reason.
    """

    try:
        # Create streaming batch orchestrator
        streaming_orchestrator = StreamingBatchOrchestrator(
            processor,
            max_workers,
            batch_size
        )

        console.print(f"Starting streaming batch processing: {file_path}")

        # Process using streaming approach (no pre-loading of file paths)
        batch_result = streaming_orchestrator.process_streaming_batch(
            file_path,
            content_type,
            vector_bucket_name,
            index_name,
            model,
            metadata_dict,
            user_ark_params,
            index_dimensions,
            filename_as_key,
            key_prefix
        )

        # Display results
        result_dict = {
            "type": "streaming_batch",
            "bucket": vector_bucket_name,
            "index": index_name,
            "model": model.model_id,
            "contentType": content_type,
            "totalFiles": (
                batch_result.processed_count + batch_result.failed_count
            ),
            "processedFiles": batch_result.processed_count,
            "failedFiles": batch_result.failed_count,
            "totalVectors": len(batch_result.processed_keys),
            # Show first 10
            "vectorKeys": (
                batch_result.processed_keys[:10]
                if batch_result.processed_keys else []
            )
        }

        if batch_result.errors:
            result_dict["errors"] = batch_result.errors[:10]

        if output == "table":
            _display_batch_table(result_dict, console)
        else:
            console.print_json(data=result_dict)

        # Print display limit messages after output
        if len(batch_result.processed_keys) > 10:
            console.print(
                f"[dim]Note: Showing first 10 of "
                f"{len(batch_result.processed_keys)} vector keys[/dim]"
            )

        if batch_result.errors and len(batch_result.errors) > 10:
            console.print(
                f"[dim]Note: Showing first 10 of "
                f"{len(batch_result.errors)} errors[/dim]"
            )

        return result_dict if output == "json" else None

    except Exception as e:
        # Print the failure before converting it, so the user sees it
        # even if click's error rendering is suppressed upstream.
        console.print(
            f"[red]Streaming batch processing failed: {str(e)}[/red]"
        )
        raise click.ClickException(
            f"Streaming batch processing failed: {str(e)}"
        )
372
+
373
+
374
def _display_batch_table(result, console):
    """Render the batch-processing summary as a two-column rich table."""
    from rich.table import Table

    # Metric/value pairs in display order.
    summary_rows = (
        ("Total Files", str(result["totalFiles"])),
        ("Processed Files", str(result["processedFiles"])),
        ("Failed Files", str(result["failedFiles"])),
        ("Total Vectors", str(result["totalVectors"])),
        ("Model", result["model"]),
        ("Content Type", result["contentType"]),
    )

    table = Table(title="Batch Processing Results")
    table.add_column("Metric", style="cyan")
    table.add_column("Value", style="green")
    for metric, value in summary_rows:
        table.add_row(metric, value)

    console.print(table)