nv-ingest-client 2025.11.14.dev20251114.tar.gz → 2025.12.24.dev20251224.tar.gz
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registries.
- {nv_ingest_client-2025.11.14.dev20251114/src/nv_ingest_client.egg-info → nv_ingest_client-2025.12.24.dev20251224}/PKG-INFO +2 -1
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/pyproject.toml +1 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/client/client.py +119 -9
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/client/interface.py +301 -83
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/nv_ingest_cli.py +2 -2
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/jobs/job_spec.py +27 -2
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/tasks/caption.py +12 -1
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/tasks/extract.py +51 -2
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/tasks/store.py +18 -13
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/util/file_processing/extract.py +23 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/util/util.py +34 -1
- nv_ingest_client-2025.12.24.dev20251224/src/nv_ingest_client/util/vdb/adt_vdb.py +243 -0
- nv_ingest_client-2025.12.24.dev20251224/src/nv_ingest_client/util/vdb/lancedb.py +276 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/util/vdb/milvus.py +46 -22
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224/src/nv_ingest_client.egg-info}/PKG-INFO +2 -1
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client.egg-info/SOURCES.txt +1 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client.egg-info/requires.txt +1 -0
- nv_ingest_client-2025.11.14.dev20251114/src/nv_ingest_client/util/vdb/adt_vdb.py +0 -27
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/LICENSE +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/MANIFEST.in +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/README.md +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/setup.cfg +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/__init__.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/cli/__init__.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/cli/util/__init__.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/cli/util/click.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/cli/util/processing.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/cli/util/system.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/client/__init__.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/client/ingest_job_handler.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/client/util/processing.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/__init__.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/jobs/__init__.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/jobs/job_state.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/tasks/__init__.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/tasks/audio_extraction.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/tasks/chart_extraction.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/tasks/dedup.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/tasks/embed.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/tasks/filter.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/tasks/infographic_extraction.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/tasks/ocr_extraction.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/tasks/split.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/tasks/table_extraction.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/tasks/task_base.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/tasks/task_factory.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/tasks/udf.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/tasks/vdb_upload.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/util/__init__.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/util/dataset.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/util/document_analysis.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/util/file_processing/__init__.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/util/image_disk_utils.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/util/milvus.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/util/process_json_files.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/util/processing.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/util/system.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/util/transport.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/util/vdb/__init__.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/util/vdb/opensearch.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/util/zipkin.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client.egg-info/dependency_links.txt +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client.egg-info/entry_points.txt +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client.egg-info/top_level.txt +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/version.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nv-ingest-client
-Version: 2025.11.14.dev20251114
+Version: 2025.12.24.dev20251224
 Summary: Python client for the nv-ingest service
 Author-email: Jeremy Dyer <jdyer@nvidia.com>
 License: Apache License
@@ -223,6 +223,7 @@ Requires-Dist: pydantic-settings>2.0.0
 Requires-Dist: requests>=2.28.2
 Requires-Dist: setuptools>=78.1.1
 Requires-Dist: tqdm>=4.67.1
+Requires-Dist: lancedb>=0.25.3
 Provides-Extra: milvus
 Requires-Dist: pymilvus==2.5.10; extra == "milvus"
 Requires-Dist: pymilvus[bulk_writer,model]; extra == "milvus"
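The new `lancedb>=0.25.3` requirement backs the newly added `src/nv_ingest_client/util/vdb/lancedb.py` module listed above. The wrapper's own API is not shown in this diff, so the following is only a minimal standalone sketch of the underlying `lancedb` library that the dependency pulls in; the database path, table name, and record layout are invented for illustration.

```python
# Minimal standalone sketch of the lancedb dependency itself, NOT the
# nv_ingest_client.util.vdb.lancedb wrapper (whose API is not shown in this diff).
# The database path, table name, and record layout below are illustrative only.
import lancedb

db = lancedb.connect("/tmp/nv_ingest_lancedb_demo")  # local, file-backed database
table = db.create_table(
    "demo_chunks",
    data=[
        {"vector": [0.1, 0.2, 0.3], "text": "chunk one", "source": "doc.pdf"},
        {"vector": [0.2, 0.1, 0.4], "text": "chunk two", "source": "doc.pdf"},
    ],
    mode="overwrite",
)

# Nearest-neighbour query against the stored vectors.
hits = table.search([0.1, 0.2, 0.25]).limit(1).to_list()
print(hits[0]["text"])
```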
@@ -202,6 +202,13 @@ class _ConcurrentProcessor:
         if not self.job_queue_id:
             logger.warning("job_queue_id is not set; submission of new jobs will fail.")
 
+        # Executor check required for run_async
+        if not hasattr(client, "_worker_pool"):
+            raise AttributeError("Client object is missing the '_worker_pool' attribute, required for run_async.")
+        if not isinstance(client._worker_pool, ThreadPoolExecutor):
+            raise TypeError("Client's '_worker_pool' must be a ThreadPoolExecutor for run_async.")
+        self._executor = client._worker_pool
+
     # --------------------------------------------------------------------------
     # Private Methods
     # --------------------------------------------------------------------------
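The new constructor check ties `run_async` to the client's existing `_worker_pool`. Below is a hedged, self-contained sketch of the contract being enforced; `FakeClient` is hypothetical and exists only to show why the attribute and type checks matter.

```python
# Hypothetical illustration of the _worker_pool contract checked above.
# FakeClient is not part of nv-ingest; it only mimics the attribute being validated.
from concurrent.futures import ThreadPoolExecutor


class FakeClient:
    def __init__(self, workers: int = 4):
        # run_async submits work here, so it must be a real ThreadPoolExecutor.
        self._worker_pool = ThreadPoolExecutor(max_workers=workers)


client = FakeClient()

if not hasattr(client, "_worker_pool"):
    raise AttributeError("missing '_worker_pool'")
if not isinstance(client._worker_pool, ThreadPoolExecutor):
    raise TypeError("'_worker_pool' must be a ThreadPoolExecutor")

future = client._worker_pool.submit(sum, [1, 2, 3])  # stand-in for _process_all_jobs
print(future.result())  # 6
```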
@@ -246,7 +253,7 @@ class _ConcurrentProcessor:
         # Attempt to mark state as FAILED locally in the client (best effort)
         try:
             # Use a method assumed to safely get the state object
-            job_state = self.client.
+            job_state = self.client._get_and_check_job_state(job_index)
             # Check state exists and is not already terminal before updating
             if (
                 job_state and hasattr(job_state, "state") and job_state.state not in ["FAILED", "COMPLETED"]
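The corrected call routes the lookup through `_get_and_check_job_state` before marking the job failed. A hedged sketch of that "mark FAILED unless already terminal" pattern; the `JobState` dataclass and `states` dict are invented for illustration and are not nv-ingest types.

```python
# Illustrative only: a tiny stand-in for the best-effort failure marking above.
from dataclasses import dataclass


@dataclass
class JobState:
    state: str = "SUBMITTED"


states = {"job-0": JobState()}


def mark_failed(job_index: str) -> None:
    try:
        job_state = states.get(job_index)  # analogous to _get_and_check_job_state
        if job_state and job_state.state not in ("FAILED", "COMPLETED"):
            job_state.state = "FAILED"
    except Exception:
        pass  # best effort: bookkeeping errors must not mask the original failure


mark_failed("job-0")
print(states["job-0"].state)  # FAILED
```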
@@ -495,7 +502,10 @@ class _ConcurrentProcessor:
 
         return batch_futures_dict, normalized_job_indices
 
-
+    # --------------------------------------------------------------------------
+    # Core Processing Logic
+    # --------------------------------------------------------------------------
+    def _process_all_jobs(self) -> Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]:
         """
         Executes the main processing loop in batches.
 
@@ -640,6 +650,44 @@ class _ConcurrentProcessor:
 
         return self.results, self.failures, self.traces if self.return_traces else []
 
+    # --------------------------------------------------------------------------
+    # Public Methods
+    # --------------------------------------------------------------------------
+
+    def run(self) -> Tuple[List[Dict[str, Any]], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]:
+        """
+        Executes the main processing loop synchronously.
+
+        This method orchestrates the job processing by maintaining a constant
+        pool of in-flight jobs, handling submissions, fetches, and retries until
+        all jobs are complete. It blocks until all jobs are processed.
+
+        Returns
+        -------
+        Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]
+            A tuple containing:
+            1. A list of successfully fetched job results.
+            2. A list of tuples for failed jobs (job_index, error_message).
+            3. A list of trace dictionaries if `return_traces` was True.
+        """
+        return self._process_all_jobs()
+
+    def run_async(self) -> Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]]:
+        """
+        Executes the main processing loop asynchronously.
+
+        Submits the entire processing logic to the client's background
+        thread pool and returns a Future that resolves with the final
+        results, failures, and traces once all jobs are complete.
+
+        Returns
+        -------
+        Future
+            A future representing the asynchronous execution. Its result()
+            will be a tuple containing (results, failures, traces).
+        """
+        return self._executor.submit(self._process_all_jobs)
+
 
 class NvIngestClient:
     """
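Taken together, `run()` and `run_async()` expose the same processing loop in blocking and non-blocking form. A hedged usage sketch, assuming a `_ConcurrentProcessor` named `processor` has already been constructed the way `process_jobs_concurrently_async` constructs one further down in this diff:

```python
# Hedged usage sketch: assumes `processor` is a _ConcurrentProcessor built elsewhere.
# Blocking form: returns once every job has completed or failed.
results, failures, traces = processor.run()

# Non-blocking form: the loop runs on the client's _worker_pool.
future = processor.run_async()
# ... do other work here ...
results, failures, traces = future.result(timeout=600)  # same 3-tuple as run()
print(f"{len(results)} succeeded, {len(failures)} failed")
```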
@@ -1221,7 +1269,7 @@ class NvIngestClient:
         ----------
         batch_size : Optional[int]
             The batch_size value to validate. None uses value from
-            NV_INGEST_BATCH_SIZE environment variable or default
+            NV_INGEST_BATCH_SIZE environment variable or default 16.
 
         Returns
         -------
@@ -1231,18 +1279,18 @@ class NvIngestClient:
         # Handle None/default case
         if batch_size is None:
             try:
-                batch_size = int(os.getenv("NV_INGEST_CLIENT_BATCH_SIZE", "
+                batch_size = int(os.getenv("NV_INGEST_CLIENT_BATCH_SIZE", "16"))
             except ValueError:
-                batch_size =
+                batch_size = 16
 
         # Validate type and range
         if not isinstance(batch_size, int):
-            logger.warning(f"batch_size must be an integer, got {type(batch_size).__name__}. Using default
-            return
+            logger.warning(f"batch_size must be an integer, got {type(batch_size).__name__}. Using default 16.")
+            return 16
 
         if batch_size < 1:
-            logger.warning(f"batch_size must be >= 1, got {batch_size}. Using default
-            return
+            logger.warning(f"batch_size must be >= 1, got {batch_size}. Using default 16.")
+            return 16
 
         # Performance guidance warnings
         if batch_size < 8:
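The validation now falls back to a default of 16 whenever `batch_size` is unset, malformed, or out of range, with `NV_INGEST_CLIENT_BATCH_SIZE` consulted only for the `None` case. A hedged sketch of how that resolution behaves from the caller's side; the loop below simply mirrors the logic shown in the hunk rather than calling the private method.

```python
# Hedged sketch of the batch-size resolution above.
import os

os.environ["NV_INGEST_CLIENT_BATCH_SIZE"] = "32"

# batch_size=None         -> env var wins       -> 32
# batch_size="not an int" -> warning, default   -> 16
# batch_size=0            -> warning, default   -> 16
# batch_size=64           -> accepted as given  -> 64
for requested in (None, "not an int", 0, 64):
    if requested is None:
        try:
            resolved = int(os.getenv("NV_INGEST_CLIENT_BATCH_SIZE", "16"))
        except ValueError:
            resolved = 16
    elif not isinstance(requested, int) or requested < 1:
        resolved = 16
    else:
        resolved = requested
    print(requested, "->", resolved)
```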
@@ -1377,6 +1425,68 @@ class NvIngestClient:
         logger.warning(f"{len(failures)} job(s) failed during concurrent processing." " Check logs for details.")
         return results
 
+    def process_jobs_concurrently_async(
+        self,
+        job_indices: Union[str, List[str]],
+        job_queue_id: Optional[str] = None,
+        batch_size: Optional[int] = None,
+        timeout: int = 100,
+        max_job_retries: Optional[int] = None,
+        retry_delay: float = 0.5,
+        initial_fetch_delay: float = 0.3,
+        fail_on_submit_error: bool = False,
+        completion_callback: Optional[Callable[[Any, str], None]] = None,
+        stream_to_callback_only: bool = False,
+        return_full_response: bool = False,
+        verbose: bool = False,
+        return_traces: bool = False,
+    ) -> Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]]:
+        """
+        Submit and fetch multiple jobs concurrently and asynchronously.
+
+        This method initializes the processing and returns a Future immediately. The Future
+        will resolve with a fixed 3-part tuple `(results, failures, traces)` once all
+        jobs have completed.
+
+        Parameters are identical to `process_jobs_concurrently`.
+
+        Returns
+        -------
+        Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]]
+            A future that completes when all jobs are done. Its result is a tuple
+            containing (successful_results, failures, traces).
+        """
+        if isinstance(job_indices, str):
+            job_indices = [job_indices]
+
+        if not job_indices:
+            immediate_future: Future = Future()
+            immediate_future.set_result(([], [], []))
+            return immediate_future
+
+        validated_batch_size = self._validate_batch_size(batch_size)
+        effective_timeout: Tuple[int, Optional[float]] = (int(timeout), None)
+
+        processor = _ConcurrentProcessor(
+            client=self,
+            batch_size=validated_batch_size,
+            job_indices=job_indices,
+            job_queue_id=job_queue_id,
+            timeout=effective_timeout,
+            max_job_retries=max_job_retries,
+            retry_delay=retry_delay,
+            initial_fetch_delay=initial_fetch_delay,
+            completion_callback=completion_callback,
+            fail_on_submit_error=fail_on_submit_error,
+            stream_to_callback_only=stream_to_callback_only,
+            return_full_response=return_full_response,
+            verbose=verbose,
+            return_traces=return_traces,
+        )
+
+        # Asynchronous call
+        return processor.run_async()
+
     def _ensure_submitted(self, job_ids: Union[str, List[str]]) -> None:
         """
         Block until all specified jobs have been marked submitted.
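A hedged end-to-end sketch of the new asynchronous entry point. The job indices and queue name are placeholders, connection settings are left at their defaults, and only `process_jobs_concurrently_async` is used as declared in this diff.

```python
# Hedged usage sketch of process_jobs_concurrently_async as declared in this diff.
# Job indices and queue name are placeholders; connection settings use defaults.
from nv_ingest_client.client.client import NvIngestClient

client = NvIngestClient()  # assumes default connection settings

# `job_indices` would normally come from prior client.add_job(...) calls;
# these ids are illustrative only.
job_indices = ["job-0", "job-1", "job-2"]

future = client.process_jobs_concurrently_async(
    job_indices,
    job_queue_id="ingest_task_queue",  # hypothetical queue name
    batch_size=None,                   # resolved via NV_INGEST_CLIENT_BATCH_SIZE or 16
    return_traces=True,
)

# The caller stays free until the Future resolves with the fixed 3-tuple.
results, failures, traces = future.result()
print(len(results), "results,", len(failures), "failures")
```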