nv-ingest-client 2025.11.14.dev20251114__tar.gz → 2025.12.24.dev20251224__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. {nv_ingest_client-2025.11.14.dev20251114/src/nv_ingest_client.egg-info → nv_ingest_client-2025.12.24.dev20251224}/PKG-INFO +2 -1
  2. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/pyproject.toml +1 -0
  3. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/client/client.py +119 -9
  4. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/client/interface.py +301 -83
  5. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/nv_ingest_cli.py +2 -2
  6. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/jobs/job_spec.py +27 -2
  7. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/tasks/caption.py +12 -1
  8. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/tasks/extract.py +51 -2
  9. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/tasks/store.py +18 -13
  10. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/util/file_processing/extract.py +23 -0
  11. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/util/util.py +34 -1
  12. nv_ingest_client-2025.12.24.dev20251224/src/nv_ingest_client/util/vdb/adt_vdb.py +243 -0
  13. nv_ingest_client-2025.12.24.dev20251224/src/nv_ingest_client/util/vdb/lancedb.py +276 -0
  14. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/util/vdb/milvus.py +46 -22
  15. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224/src/nv_ingest_client.egg-info}/PKG-INFO +2 -1
  16. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client.egg-info/SOURCES.txt +1 -0
  17. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client.egg-info/requires.txt +1 -0
  18. nv_ingest_client-2025.11.14.dev20251114/src/nv_ingest_client/util/vdb/adt_vdb.py +0 -27
  19. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/LICENSE +0 -0
  20. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/MANIFEST.in +0 -0
  21. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/README.md +0 -0
  22. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/setup.cfg +0 -0
  23. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/__init__.py +0 -0
  24. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/cli/__init__.py +0 -0
  25. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/cli/util/__init__.py +0 -0
  26. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/cli/util/click.py +0 -0
  27. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/cli/util/processing.py +0 -0
  28. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/cli/util/system.py +0 -0
  29. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/client/__init__.py +0 -0
  30. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/client/ingest_job_handler.py +0 -0
  31. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/client/util/processing.py +0 -0
  32. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/__init__.py +0 -0
  33. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/jobs/__init__.py +0 -0
  34. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/jobs/job_state.py +0 -0
  35. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/tasks/__init__.py +0 -0
  36. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/tasks/audio_extraction.py +0 -0
  37. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/tasks/chart_extraction.py +0 -0
  38. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/tasks/dedup.py +0 -0
  39. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/tasks/embed.py +0 -0
  40. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/tasks/filter.py +0 -0
  41. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/tasks/infographic_extraction.py +0 -0
  42. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/tasks/ocr_extraction.py +0 -0
  43. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/tasks/split.py +0 -0
  44. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/tasks/table_extraction.py +0 -0
  45. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/tasks/task_base.py +0 -0
  46. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/tasks/task_factory.py +0 -0
  47. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/tasks/udf.py +0 -0
  48. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/primitives/tasks/vdb_upload.py +0 -0
  49. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/util/__init__.py +0 -0
  50. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/util/dataset.py +0 -0
  51. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/util/document_analysis.py +0 -0
  52. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/util/file_processing/__init__.py +0 -0
  53. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/util/image_disk_utils.py +0 -0
  54. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/util/milvus.py +0 -0
  55. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/util/process_json_files.py +0 -0
  56. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/util/processing.py +0 -0
  57. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/util/system.py +0 -0
  58. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/util/transport.py +0 -0
  59. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/util/vdb/__init__.py +0 -0
  60. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/util/vdb/opensearch.py +0 -0
  61. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client/util/zipkin.py +0 -0
  62. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client.egg-info/dependency_links.txt +0 -0
  63. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client.egg-info/entry_points.txt +0 -0
  64. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/nv_ingest_client.egg-info/top_level.txt +0 -0
  65. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.24.dev20251224}/src/version.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-client
3
- Version: 2025.11.14.dev20251114
3
+ Version: 2025.12.24.dev20251224
4
4
  Summary: Python client for the nv-ingest service
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -223,6 +223,7 @@ Requires-Dist: pydantic-settings>2.0.0
223
223
  Requires-Dist: requests>=2.28.2
224
224
  Requires-Dist: setuptools>=78.1.1
225
225
  Requires-Dist: tqdm>=4.67.1
226
+ Requires-Dist: lancedb>=0.25.3
226
227
  Provides-Extra: milvus
227
228
  Requires-Dist: pymilvus==2.5.10; extra == "milvus"
228
229
  Requires-Dist: pymilvus[bulk_writer,model]; extra == "milvus"
@@ -30,6 +30,7 @@ dependencies = [
30
30
  "requests>=2.28.2",
31
31
  "setuptools>=78.1.1",
32
32
  "tqdm>=4.67.1",
33
+ "lancedb>=0.25.3",
33
34
  ]
34
35
 
35
36
  [project.optional-dependencies]
@@ -202,6 +202,13 @@ class _ConcurrentProcessor:
202
202
  if not self.job_queue_id:
203
203
  logger.warning("job_queue_id is not set; submission of new jobs will fail.")
204
204
 
205
+ # Executor check required for run_async
206
+ if not hasattr(client, "_worker_pool"):
207
+ raise AttributeError("Client object is missing the '_worker_pool' attribute, required for run_async.")
208
+ if not isinstance(client._worker_pool, ThreadPoolExecutor):
209
+ raise TypeError("Client's '_worker_pool' must be a ThreadPoolExecutor for run_async.")
210
+ self._executor = client._worker_pool
211
+
205
212
  # --------------------------------------------------------------------------
206
213
  # Private Methods
207
214
  # --------------------------------------------------------------------------
@@ -246,7 +253,7 @@ class _ConcurrentProcessor:
246
253
  # Attempt to mark state as FAILED locally in the client (best effort)
247
254
  try:
248
255
  # Use a method assumed to safely get the state object
249
- job_state = self.client._get_job_state_object(job_index)
256
+ job_state = self.client._get_and_check_job_state(job_index)
250
257
  # Check state exists and is not already terminal before updating
251
258
  if (
252
259
  job_state and hasattr(job_state, "state") and job_state.state not in ["FAILED", "COMPLETED"]
@@ -495,7 +502,10 @@ class _ConcurrentProcessor:
495
502
 
496
503
  return batch_futures_dict, normalized_job_indices
497
504
 
498
- def run(self) -> Tuple[List[Dict[str, Any]], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]:
505
+ # --------------------------------------------------------------------------
506
+ # Core Processing Logic
507
+ # --------------------------------------------------------------------------
508
+ def _process_all_jobs(self) -> Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]:
499
509
  """
500
510
  Executes the main processing loop in batches.
501
511
 
@@ -640,6 +650,44 @@ class _ConcurrentProcessor:
640
650
 
641
651
  return self.results, self.failures, self.traces if self.return_traces else []
642
652
 
653
+ # --------------------------------------------------------------------------
654
+ # Public Methods
655
+ # --------------------------------------------------------------------------
656
+
657
+ def run(self) -> Tuple[List[Dict[str, Any]], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]:
658
+ """
659
+ Executes the main processing loop synchronously.
660
+
661
+ This method orchestrates the job processing by maintaining a constant
662
+ pool of in-flight jobs, handling submissions, fetches, and retries until
663
+ all jobs are complete. It blocks until all jobs are processed.
664
+
665
+ Returns
666
+ -------
667
+ Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]
668
+ A tuple containing:
669
+ 1. A list of successfully fetched job results.
670
+ 2. A list of tuples for failed jobs (job_index, error_message).
671
+ 3. A list of trace dictionaries if `return_traces` was True.
672
+ """
673
+ return self._process_all_jobs()
674
+
675
+ def run_async(self) -> Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]]:
676
+ """
677
+ Executes the main processing loop asynchronously.
678
+
679
+ Submits the entire processing logic to the client's background
680
+ thread pool and returns a Future that resolves with the final
681
+ results, failures, and traces once all jobs are complete.
682
+
683
+ Returns
684
+ -------
685
+ Future
686
+ A future representing the asynchronous execution. Its result()
687
+ will be a tuple containing (results, failures, traces).
688
+ """
689
+ return self._executor.submit(self._process_all_jobs)
690
+
643
691
 
644
692
  class NvIngestClient:
645
693
  """
@@ -1221,7 +1269,7 @@ class NvIngestClient:
1221
1269
  ----------
1222
1270
  batch_size : Optional[int]
1223
1271
  The batch_size value to validate. None uses value from
1224
- NV_INGEST_BATCH_SIZE environment variable or default 32.
1272
+ NV_INGEST_BATCH_SIZE environment variable or default 16.
1225
1273
 
1226
1274
  Returns
1227
1275
  -------
@@ -1231,18 +1279,18 @@ class NvIngestClient:
1231
1279
  # Handle None/default case
1232
1280
  if batch_size is None:
1233
1281
  try:
1234
- batch_size = int(os.getenv("NV_INGEST_CLIENT_BATCH_SIZE", "32"))
1282
+ batch_size = int(os.getenv("NV_INGEST_CLIENT_BATCH_SIZE", "16"))
1235
1283
  except ValueError:
1236
- batch_size = 32
1284
+ batch_size = 16
1237
1285
 
1238
1286
  # Validate type and range
1239
1287
  if not isinstance(batch_size, int):
1240
- logger.warning(f"batch_size must be an integer, got {type(batch_size).__name__}. Using default 32.")
1241
- return 32
1288
+ logger.warning(f"batch_size must be an integer, got {type(batch_size).__name__}. Using default 16.")
1289
+ return 16
1242
1290
 
1243
1291
  if batch_size < 1:
1244
- logger.warning(f"batch_size must be >= 1, got {batch_size}. Using default 32.")
1245
- return 32
1292
+ logger.warning(f"batch_size must be >= 1, got {batch_size}. Using default 16.")
1293
+ return 16
1246
1294
 
1247
1295
  # Performance guidance warnings
1248
1296
  if batch_size < 8:
@@ -1377,6 +1425,68 @@ class NvIngestClient:
1377
1425
  logger.warning(f"{len(failures)} job(s) failed during concurrent processing." " Check logs for details.")
1378
1426
  return results
1379
1427
 
1428
+ def process_jobs_concurrently_async(
1429
+ self,
1430
+ job_indices: Union[str, List[str]],
1431
+ job_queue_id: Optional[str] = None,
1432
+ batch_size: Optional[int] = None,
1433
+ timeout: int = 100,
1434
+ max_job_retries: Optional[int] = None,
1435
+ retry_delay: float = 0.5,
1436
+ initial_fetch_delay: float = 0.3,
1437
+ fail_on_submit_error: bool = False,
1438
+ completion_callback: Optional[Callable[[Any, str], None]] = None,
1439
+ stream_to_callback_only: bool = False,
1440
+ return_full_response: bool = False,
1441
+ verbose: bool = False,
1442
+ return_traces: bool = False,
1443
+ ) -> Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]]:
1444
+ """
1445
+ Submit and fetch multiple jobs concurrently and asynchronously.
1446
+
1447
+ This method initializes the processing and returns a Future immediately. The Future
1448
+ will resolve with a fixed 3-part tuple `(results, failures, traces)` once all
1449
+ jobs have completed.
1450
+
1451
+ Parameters are identical to `process_jobs_concurrently`.
1452
+
1453
+ Returns
1454
+ -------
1455
+ Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]]
1456
+ A future that completes when all jobs are done. Its result is a tuple
1457
+ containing (successful_results, failures, traces).
1458
+ """
1459
+ if isinstance(job_indices, str):
1460
+ job_indices = [job_indices]
1461
+
1462
+ if not job_indices:
1463
+ immediate_future: Future = Future()
1464
+ immediate_future.set_result(([], [], []))
1465
+ return immediate_future
1466
+
1467
+ validated_batch_size = self._validate_batch_size(batch_size)
1468
+ effective_timeout: Tuple[int, Optional[float]] = (int(timeout), None)
1469
+
1470
+ processor = _ConcurrentProcessor(
1471
+ client=self,
1472
+ batch_size=validated_batch_size,
1473
+ job_indices=job_indices,
1474
+ job_queue_id=job_queue_id,
1475
+ timeout=effective_timeout,
1476
+ max_job_retries=max_job_retries,
1477
+ retry_delay=retry_delay,
1478
+ initial_fetch_delay=initial_fetch_delay,
1479
+ completion_callback=completion_callback,
1480
+ fail_on_submit_error=fail_on_submit_error,
1481
+ stream_to_callback_only=stream_to_callback_only,
1482
+ return_full_response=return_full_response,
1483
+ verbose=verbose,
1484
+ return_traces=return_traces,
1485
+ )
1486
+
1487
+ # Asynchronous call
1488
+ return processor.run_async()
1489
+
1380
1490
  def _ensure_submitted(self, job_ids: Union[str, List[str]]) -> None:
1381
1491
  """
1382
1492
  Block until all specified jobs have been marked submitted.