nv-ingest-client 2025.7.24.dev20250724__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nv-ingest-client might be problematic.

Files changed (38)
  1. nv_ingest_client/cli/util/click.py +182 -30
  2. nv_ingest_client/cli/util/processing.py +0 -393
  3. nv_ingest_client/client/client.py +561 -207
  4. nv_ingest_client/client/ingest_job_handler.py +412 -0
  5. nv_ingest_client/client/interface.py +466 -59
  6. nv_ingest_client/client/util/processing.py +11 -1
  7. nv_ingest_client/nv_ingest_cli.py +58 -6
  8. nv_ingest_client/primitives/jobs/job_spec.py +32 -10
  9. nv_ingest_client/primitives/tasks/__init__.py +6 -4
  10. nv_ingest_client/primitives/tasks/audio_extraction.py +27 -23
  11. nv_ingest_client/primitives/tasks/caption.py +10 -16
  12. nv_ingest_client/primitives/tasks/chart_extraction.py +16 -10
  13. nv_ingest_client/primitives/tasks/dedup.py +12 -21
  14. nv_ingest_client/primitives/tasks/embed.py +37 -76
  15. nv_ingest_client/primitives/tasks/extract.py +68 -169
  16. nv_ingest_client/primitives/tasks/filter.py +22 -28
  17. nv_ingest_client/primitives/tasks/infographic_extraction.py +16 -13
  18. nv_ingest_client/primitives/tasks/split.py +17 -18
  19. nv_ingest_client/primitives/tasks/store.py +29 -29
  20. nv_ingest_client/primitives/tasks/task_base.py +1 -72
  21. nv_ingest_client/primitives/tasks/task_factory.py +10 -11
  22. nv_ingest_client/primitives/tasks/udf.py +349 -0
  23. nv_ingest_client/util/dataset.py +8 -2
  24. nv_ingest_client/util/document_analysis.py +314 -0
  25. nv_ingest_client/util/image_disk_utils.py +300 -0
  26. nv_ingest_client/util/transport.py +12 -6
  27. nv_ingest_client/util/util.py +66 -0
  28. nv_ingest_client/util/vdb/milvus.py +220 -75
  29. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/METADATA +1 -3
  30. nv_ingest_client-2025.11.2.dev20251102.dist-info/RECORD +55 -0
  31. nv_ingest_client/cli/util/tasks.py +0 -3
  32. nv_ingest_client/primitives/exceptions.py +0 -0
  33. nv_ingest_client/primitives/tasks/transform.py +0 -0
  34. nv_ingest_client-2025.7.24.dev20250724.dist-info/RECORD +0 -54
  35. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/WHEEL +0 -0
  36. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/entry_points.txt +0 -0
  37. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/licenses/LICENSE +0 -0
  38. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/top_level.txt +0 -0
@@ -6,6 +6,7 @@
 
 import collections
 import glob
+import gzip
 import json
 import logging
 import os
@@ -27,6 +28,16 @@ from typing import Union
 from urllib.parse import urlparse
 
 import fsspec
+from nv_ingest_api.internal.enums.common import PipelinePhase
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskCaptionSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskDedupSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskEmbedSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskExtractSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskFilterSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskSplitSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskStoreEmbedSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskStoreSchema
+from nv_ingest_api.util.introspection.function_inspect import infer_udf_function_name
 from nv_ingest_client.client.client import NvIngestClient
 from nv_ingest_client.client.util.processing import get_valid_filename
 from nv_ingest_client.client.util.processing import save_document_results_to_jsonl
@@ -38,19 +49,12 @@ from nv_ingest_client.primitives.tasks import EmbedTask
 from nv_ingest_client.primitives.tasks import ExtractTask
 from nv_ingest_client.primitives.tasks import FilterTask
 from nv_ingest_client.primitives.tasks import SplitTask
-from nv_ingest_client.primitives.tasks import StoreEmbedTask
 from nv_ingest_client.primitives.tasks import StoreTask
-from nv_ingest_client.primitives.tasks.caption import CaptionTaskSchema
-from nv_ingest_client.primitives.tasks.dedup import DedupTaskSchema
-from nv_ingest_client.primitives.tasks.embed import EmbedTaskSchema
-from nv_ingest_client.primitives.tasks.extract import ExtractTaskSchema
-from nv_ingest_client.primitives.tasks.filter import FilterTaskSchema
-from nv_ingest_client.primitives.tasks.split import SplitTaskSchema
-from nv_ingest_client.primitives.tasks.store import StoreEmbedTaskSchema
-from nv_ingest_client.primitives.tasks.store import StoreTaskSchema
+from nv_ingest_client.primitives.tasks import StoreEmbedTask
+from nv_ingest_client.primitives.tasks import UDFTask
 from nv_ingest_client.util.processing import check_schema
 from nv_ingest_client.util.system import ensure_directory_with_permissions
-from nv_ingest_client.util.util import filter_function_kwargs
+from nv_ingest_client.util.util import filter_function_kwargs, apply_pdf_split_config_to_job_specs
 from nv_ingest_client.util.vdb import VDB, get_vdb_op_cls
 from tqdm import tqdm
 
@@ -90,17 +94,20 @@ def ensure_job_specs(func):
 
 
 class LazyLoadedList(collections.abc.Sequence):
-    def __init__(self, filepath: str, expected_len: Optional[int] = None):
+    def __init__(self, filepath: str, expected_len: Optional[int] = None, compression: Optional[str] = None):
         self.filepath = filepath
         self._len: Optional[int] = expected_len  # Store pre-calculated length
         self._offsets: Optional[List[int]] = None
+        self.compression = compression
 
         if self._len == 0:
             self._offsets = []
 
+        self._open = gzip.open if self.compression == "gzip" else open
+
     def __iter__(self) -> Iterator[Any]:
         try:
-            with open(self.filepath, "r", encoding="utf-8") as f:
+            with self._open(self.filepath, "rt", encoding="utf-8") as f:
                 for line in f:
                     yield json.loads(line)
         except FileNotFoundError:
@@ -117,7 +124,7 @@ class LazyLoadedList(collections.abc.Sequence):
         self._offsets = []
         line_count = 0
         try:
-            with open(self.filepath, "rb") as f:
+            with self._open(self.filepath, "rb") as f:
                 while True:
                     current_pos = f.tell()
                     line = f.readline()
@@ -141,10 +148,12 @@
     def __len__(self) -> int:
         if self._len is not None:
             return self._len
+
         if self._offsets is not None:
             self._len = len(self._offsets)
             return self._len
         self._build_index()
+
        return self._len if self._len is not None else 0
 
     def __getitem__(self, idx: int) -> Any:
@@ -167,7 +176,7 @@
             raise IndexError(f"Index {idx} out of range for {self.filepath} (len: {len(self._offsets)})")
 
         try:
-            with open(self.filepath, "rb") as f:
+            with self._open(self.filepath, "rb") as f:
                 f.seek(self._offsets[idx])
                 line_bytes = f.readline()
                 return json.loads(line_bytes.decode("utf-8"))
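
The compression hook above is what lets saved results stay gzipped on disk while remaining indexable. A minimal sketch of reading such a file back through LazyLoadedList, assuming the class is importable from nv_ingest_client.client.interface (the import path is not shown in this diff):

    import gzip
    import json

    from nv_ingest_client.client.interface import LazyLoadedList  # assumed import path

    # Write a tiny gzip-compressed JSONL file in the format the class expects.
    records = [{"id": 1, "text": "hello"}, {"id": 2, "text": "world"}]
    with gzip.open("sample.results.jsonl.gz", "wt", encoding="utf-8") as f:
        for rec in records:
            f.write(json.dumps(rec) + "\n")

    # compression="gzip" swaps the internal opener to gzip.open, as shown in __init__ above.
    lazy = LazyLoadedList("sample.results.jsonl.gz", expected_len=len(records), compression="gzip")
    print(len(lazy))        # 2
    print(lazy[1]["text"])  # "world", via the lazily built byte-offset index
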
@@ -218,6 +227,7 @@ class Ingestor:
         self._client = client
         self._job_queue_id = job_queue_id
         self._vdb_bulk_upload = None
+        self._purge_results_after_vdb_upload = True
 
         if self._client is None:
             client_kwargs = filter_function_kwargs(NvIngestClient, **kwargs)
@@ -236,6 +246,21 @@ class Ingestor:
         self._output_config = None
         self._created_temp_output_dir = None
 
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        if self._output_config and (self._output_config["cleanup"] is True):
+            dir_to_cleanup = self._output_config["output_directory"]
+            try:
+                shutil.rmtree(dir_to_cleanup)
+            except FileNotFoundError:
+                logger.warning(
+                    f"Directory to be cleaned up not found (might have been removed already): {dir_to_cleanup}"
+                )
+            except OSError as e:
+                logger.error(f"Error removing {dir_to_cleanup}: {e}")
+
     def _create_client(self, **kwargs) -> None:
         """
         Creates an instance of NvIngestClient if `_client` is not set.
@@ -377,15 +402,9 @@ class Ingestor:
         show_progress: bool = False,
         return_failures: bool = False,
         save_to_disk: bool = False,
+        return_traces: bool = False,
         **kwargs: Any,
-    ) -> Union[
-        List[List[Dict[str, Any]]],  # In-memory: List of (response['data'] for each doc)
-        List[LazyLoadedList],  # Disk: List of proxies, one per original doc
-        Tuple[
-            Union[List[List[Dict[str, Any]]], List[LazyLoadedList]],
-            List[Tuple[str, str]],
-        ],
-    ]:  # noqa: E501
+    ) -> Union[List[Any], Tuple[Any, ...]]:
         """
         Ingest documents by submitting jobs and fetching results concurrently.
 
@@ -395,22 +414,36 @@
             Whether to display a progress bar. Default is False.
         return_failures : bool, optional
             If True, return a tuple (results, failures); otherwise, return only results. Default is False.
+        save_to_disk : bool, optional
+            If True, save results to disk and return LazyLoadedList proxies. Default is False.
+        return_traces : bool, optional
+            If True, return trace metrics alongside results. Default is False.
+            Traces contain timing metrics (entry, exit, resident_time) for each stage.
         **kwargs : Any
-            Additional keyword arguments for the underlying client methods. Supported keys:
-            'concurrency_limit', 'timeout', 'max_job_retries', 'retry_delay',
-            'data_only', 'verbose'. Unrecognized keys are passed through to
-            process_jobs_concurrently.
+            Additional keyword arguments for the underlying client methods.
+            Optional flags include `include_parent_trace_ids=True` to also return
+            parent job trace identifiers (V2 API only).
 
         Returns
         -------
-        results : list of dict
-            List of successful job results when `return_failures` is False.
-        results, failures : tuple (list of dict, list of tuple of str)
-            Tuple containing successful results and failure information when `return_failures` is True.
+        list or tuple
+            Returns vary based on flags:
+            - Default: list of results
+            - return_failures=True: (results, failures)
+            - return_traces=True: (results, traces)
+            - return_failures=True, return_traces=True: (results, failures, traces)
+            - Additional combinations with include_parent_trace_ids kwarg
+
+        Notes
+        -----
+        Trace metrics include timing data for each processing stage. For detailed
+        usage and examples, see src/nv_ingest/api/v2/README.md
         """
         if save_to_disk and (not self._output_config):
             self.save_to_disk()
 
+        include_parent_trace_ids = bool(kwargs.pop("include_parent_trace_ids", False))
+
         self._prepare_ingest_run()
 
         # Add jobs locally first
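
For reference, a hedged sketch of how callers unpack the return shapes listed in the docstring above. Only ingest() and its flags come from this diff; the files() call and the import path are assumptions about the wider Ingestor API:

    from nv_ingest_client.client.interface import Ingestor  # assumed import path

    # Assumes a reachable nv-ingest service and documents added via the (assumed) files() helper.
    ingestor = Ingestor().files("./docs/*.pdf")

    results = ingestor.ingest()
    results, failures = ingestor.ingest(return_failures=True)
    results, traces = ingestor.ingest(return_traces=True)
    results, failures, traces = ingestor.ingest(return_failures=True, return_traces=True)

    # include_parent_trace_ids is a plain kwarg (V2 API only) and appends one more element.
    results, failures, traces, parent_trace_ids = ingestor.ingest(
        return_failures=True, return_traces=True, include_parent_trace_ids=True
    )
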
@@ -420,7 +453,7 @@
 
         final_results_payload_list: Union[List[List[Dict[str, Any]]], List[LazyLoadedList]] = []
 
-        # Lock for thread-safe appends to final_results_payload_list by I/O tasks
+        # Lock for thread-safe appending to final_results_payload_list by I/O tasks
         results_lock = threading.Lock() if self._output_config else None
 
         io_executor: Optional[ThreadPoolExecutor] = None
@@ -435,7 +468,9 @@
                 output_dir = self._output_config["output_directory"]
                 clean_source_basename = get_valid_filename(os.path.basename(source_name))
                 file_name, file_ext = os.path.splitext(clean_source_basename)
-                file_suffix = f".{file_ext}.results.jsonl"
+                file_suffix = f".{file_ext.strip('.')}.results.jsonl"
+                if self._output_config["compression"] == "gzip":
+                    file_suffix += ".gz"
                 jsonl_filepath = os.path.join(output_dir, safe_filename(output_dir, file_name, file_suffix))
 
                 num_items_saved = save_document_results_to_jsonl(
@@ -443,10 +478,13 @@
                     jsonl_filepath,
                     source_name,
                     ensure_parent_dir_exists=False,
+                    compression=self._output_config["compression"],
                 )
 
                 if num_items_saved > 0:
-                    results = LazyLoadedList(jsonl_filepath, expected_len=num_items_saved)
+                    results = LazyLoadedList(
+                        jsonl_filepath, expected_len=num_items_saved, compression=self._output_config["compression"]
+                    )
                     if results_lock:
                         with results_lock:
                             final_results_payload_list.append(results)
@@ -519,11 +557,24 @@
 
         proc_kwargs = filter_function_kwargs(self._client.process_jobs_concurrently, **kwargs)
 
-        _return_failures = return_failures
-        if self._vdb_bulk_upload:
-            return_failures = True
-
-        results, failures = self._client.process_jobs_concurrently(
+        # Telemetry controls (optional)
+        enable_telemetry: Optional[bool] = kwargs.pop("enable_telemetry", None)
+        show_telemetry: Optional[bool] = kwargs.pop("show_telemetry", None)
+        if show_telemetry is None:
+            # Fallback to env NV_INGEST_CLIENT_SHOW_TELEMETRY (0/1), default off
+            try:
+                show_telemetry = bool(int(os.getenv("NV_INGEST_CLIENT_SHOW_TELEMETRY", "0")))
+            except ValueError:
+                show_telemetry = False
+        # If user explicitly wants to show telemetry but did not specify enable_telemetry,
+        # ensure collection is enabled so summary isn't empty.
+        if enable_telemetry is None and show_telemetry:
+            enable_telemetry = True
+        if enable_telemetry is not None and hasattr(self._client, "enable_telemetry"):
+            self._client.enable_telemetry(bool(enable_telemetry))
+
+        # Call process_jobs_concurrently
+        proc_result = self._client.process_jobs_concurrently(
             job_indices=self._job_ids,
             job_queue_id=self._job_queue_id,
             timeout=timeout,
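
The telemetry switches handled above can be supplied per call or through the NV_INGEST_CLIENT_SHOW_TELEMETRY environment variable; the summary itself is printed further down in this method when show_telemetry is true. A small sketch, with the import path and files() call assumed rather than taken from this diff:

    import os

    from nv_ingest_client.client.interface import Ingestor  # assumed import path

    ingestor = Ingestor().files("./docs/*.pdf")  # files() assumed from the wider Ingestor API

    # Per call: show_telemetry=True also turns on collection when enable_telemetry is unset.
    results = ingestor.ingest(show_telemetry=True)

    # Via the environment (values "0"/"1", default off), consulted when show_telemetry is not passed.
    os.environ["NV_INGEST_CLIENT_SHOW_TELEMETRY"] = "1"
    results = ingestor.ingest()
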
@@ -532,9 +583,17 @@
             return_failures=True,
             stream_to_callback_only=stream_to_callback_only,
             verbose=verbose,
+            return_traces=return_traces,
             **proc_kwargs,
         )
 
+        # Unpack result based on return_traces flag
+        if return_traces:
+            results, failures, traces_list = proc_result
+        else:
+            results, failures = proc_result
+            traces_list = []  # Empty list when traces not requested
+
         if show_progress and pbar:
             pbar.close()
 
@@ -551,13 +610,64 @@
 
         if self._vdb_bulk_upload:
             if len(failures) > 0:
-                raise RuntimeError(f"Failed to ingest documents, unable to complete vdb bulk upload: {failures}")
+                # Calculate success metrics
+                total_jobs = len(results) + len(failures)
+                successful_jobs = len(results)
+
+                if return_failures:
+                    # Emit message about partial success
+                    logger.warning(
+                        f"Job was not completely successful. "
+                        f"{successful_jobs} out of {total_jobs} records completed successfully. "
+                        f"Uploading successful results to vector database."
+                    )
+
+                    # Upload only the successful results
+                    if successful_jobs > 0:
+                        self._vdb_bulk_upload.run(results)
 
-        self._vdb_bulk_upload.run(results)
+                        if self._purge_results_after_vdb_upload:
+                            logger.info("Purging saved results from disk after successful VDB upload.")
+                            self._purge_saved_results(results)
 
-        return_failures = _return_failures
+                else:
+                    # Original behavior: raise RuntimeError
+                    raise RuntimeError(
+                        "Failed to ingest documents, unable to complete vdb bulk upload due to "
+                        f"no successful results. {len(failures)} out of {total_jobs} records failed "
+                    )
+            else:
+                # No failures - proceed with normal upload
+                self._vdb_bulk_upload.run(results)
 
-        return (results, failures) if return_failures else results
+                if self._purge_results_after_vdb_upload:
+                    logger.info("Purging saved results from disk after successful VDB upload.")
+                    self._purge_saved_results(results)
+
+        # Print telemetry summary if requested
+        if show_telemetry:
+            try:
+                summary = self._client.summarize_telemetry()
+                # Print to stdout and log for convenience
+                print("NvIngestClient Telemetry Summary:", json.dumps(summary, indent=2))
+                logger.info("NvIngestClient Telemetry Summary: %s", json.dumps(summary, indent=2))
+            except Exception:
+                pass
+
+        parent_trace_ids = self._client.consume_completed_parent_trace_ids() if include_parent_trace_ids else []
+
+        # Build return tuple based on requested outputs
+        # Order: results, failures (if requested), traces (if requested), parent_trace_ids (if requested)
+        returns = [results]
+
+        if return_failures:
+            returns.append(failures)
+        if return_traces:
+            returns.append(traces_list)
+        if include_parent_trace_ids:
+            returns.append(parent_trace_ids)
+
+        return tuple(returns) if len(returns) > 1 else results
 
     def ingest_async(self, **kwargs: Any) -> Future:
         """
@@ -658,8 +768,23 @@ class Ingestor:
         Ingestor
             Returns self for chaining.
         """
-        task_options = check_schema(DedupTaskSchema, kwargs, "dedup", json.dumps(kwargs))
-        dedup_task = DedupTask(**task_options.model_dump())
+        # Extract content_type and build params dict for API schema
+        content_type = kwargs.pop("content_type", "text")  # Default to "text" if not specified
+        params = kwargs  # Remaining parameters go into params dict
+
+        # Validate with API schema
+        api_options = {
+            "content_type": content_type,
+            "params": params,
+        }
+        task_options = check_schema(IngestTaskDedupSchema, api_options, "dedup", json.dumps(api_options))
+
+        # Extract individual parameters from API schema for DedupTask constructor
+        dedup_params = {
+            "content_type": task_options.content_type,
+            "filter": task_options.params.filter,
+        }
+        dedup_task = DedupTask(**dedup_params)
         self._job_specs.add_task(dedup_task)
 
         return self
@@ -679,8 +804,14 @@ class Ingestor:
         Ingestor
             Returns self for chaining.
         """
-        task_options = check_schema(EmbedTaskSchema, kwargs, "embed", json.dumps(kwargs))
-        embed_task = EmbedTask(**task_options.model_dump())
+        # Filter out deprecated parameters before API schema validation
+        # The EmbedTask constructor handles these deprecated parameters with warnings
+        filtered_kwargs = {k: v for k, v in kwargs.items() if k not in ["text", "tables"]}
+
+        _ = check_schema(IngestTaskEmbedSchema, filtered_kwargs, "embed", json.dumps(filtered_kwargs))
+
+        # Pass original kwargs to EmbedTask constructor so it can handle deprecated parameters
+        embed_task = EmbedTask(**kwargs)
         self._job_specs.add_task(embed_task)
 
         return self
@@ -727,9 +858,52 @@ class Ingestor:
             extract_page_as_image=extract_page_as_image,
             **kwargs,
         )
-        task_options = check_schema(ExtractTaskSchema, task_options, "extract", json.dumps(task_options))
 
-        extract_task = ExtractTask(**task_options.model_dump())
+        # Extract method from task_options for API schema
+        method = task_options.pop("extract_method", None)
+        if method is None:
+            # Let ExtractTask constructor handle default method selection
+            method = "pdfium"  # Default fallback
+
+        # Build params dict for API schema
+        params = {k: v for k, v in task_options.items() if k != "document_type"}
+
+        # Map document type to API schema expected values
+        # Handle common file extension to DocumentTypeEnum mapping
+        document_type_mapping = {
+            "txt": "text",
+            "md": "text",
+            "sh": "text",
+            "json": "text",
+            "jpg": "jpeg",
+            "jpeg": "jpeg",
+            "png": "png",
+            "pdf": "pdf",
+            "docx": "docx",
+            "pptx": "pptx",
+            "html": "html",
+            "bmp": "bmp",
+            "tiff": "tiff",
+            "svg": "svg",
+            "mp3": "mp3",
+            "wav": "wav",
+        }
+
+        # Use mapped document type for API schema validation
+        api_document_type = document_type_mapping.get(document_type.lower(), document_type)
+
+        # Validate with API schema
+        api_task_options = {
+            "document_type": api_document_type,
+            "method": method,
+            "params": params,
+        }
+
+        check_schema(IngestTaskExtractSchema, api_task_options, "extract", json.dumps(api_task_options))
+
+        # Create ExtractTask with mapped document type for API schema compatibility
+        extract_task_params = {"document_type": api_document_type, "extract_method": method, **params}
+        extract_task = ExtractTask(**extract_task_params)
         self._job_specs.add_task(extract_task, document_type=document_type)
 
         return self
@@ -749,8 +923,27 @@ class Ingestor:
         Ingestor
             Returns self for chaining.
         """
-        task_options = check_schema(FilterTaskSchema, kwargs, "filter", json.dumps(kwargs))
-        filter_task = FilterTask(**task_options.model_dump())
+        # Restructure parameters to match API schema structure
+        params_fields = {"min_size", "max_aspect_ratio", "min_aspect_ratio", "filter"}
+        params = {k: v for k, v in kwargs.items() if k in params_fields}
+        top_level = {k: v for k, v in kwargs.items() if k not in params_fields}
+
+        # Build API schema structure
+        api_kwargs = top_level.copy()
+        if params:
+            api_kwargs["params"] = params
+
+        task_options = check_schema(IngestTaskFilterSchema, api_kwargs, "filter", json.dumps(api_kwargs))
+
+        # Extract individual parameters from API schema for FilterTask constructor
+        filter_params = {
+            "content_type": task_options.content_type,
+            "min_size": task_options.params.min_size,
+            "max_aspect_ratio": task_options.params.max_aspect_ratio,
+            "min_aspect_ratio": task_options.params.min_aspect_ratio,
+            "filter": task_options.params.filter,
+        }
+        filter_task = FilterTask(**filter_params)
         self._job_specs.add_task(filter_task)
 
         return self
@@ -770,7 +963,7 @@ class Ingestor:
         Ingestor
             Returns self for chaining.
         """
-        task_options = check_schema(SplitTaskSchema, kwargs, "split", json.dumps(kwargs))
+        task_options = check_schema(IngestTaskSplitSchema, kwargs, "split", json.dumps(kwargs))
         extract_task = SplitTask(**task_options.model_dump())
         self._job_specs.add_task(extract_task)
 
@@ -791,8 +984,24 @@ class Ingestor:
         Ingestor
             Returns self for chaining.
         """
-        task_options = check_schema(StoreTaskSchema, kwargs, "store", json.dumps(kwargs))
-        store_task = StoreTask(**task_options.model_dump())
+        # Handle parameter name mapping: store_method -> method for API schema
+        if "store_method" in kwargs:
+            kwargs["method"] = kwargs.pop("store_method")
+
+        # Provide default method if not specified (matching client StoreTask behavior)
+        if "method" not in kwargs:
+            kwargs["method"] = "minio"
+
+        task_options = check_schema(IngestTaskStoreSchema, kwargs, "store", json.dumps(kwargs))
+
+        # Map API schema fields back to StoreTask constructor parameters
+        store_params = {
+            "structured": task_options.structured,
+            "images": task_options.images,
+            "store_method": task_options.method,  # Map method back to store_method
+            "params": task_options.params,
+        }
+        store_task = StoreTask(**store_params)
         self._job_specs.add_task(store_task)
 
         return self
@@ -800,30 +1009,106 @@ class Ingestor:
     @ensure_job_specs
     def store_embed(self, **kwargs: Any) -> "Ingestor":
         """
-        Adds a StoreTask to the batch job specification.
+        Adds a StoreEmbedTask to the batch job specification.
 
         Parameters
         ----------
         kwargs : dict
-            Parameters specific to the StoreTask.
+            Parameters specific to the StoreEmbedTask.
 
         Returns
        -------
         Ingestor
             Returns self for chaining.
         """
-        task_options = check_schema(StoreEmbedTaskSchema, kwargs, "store_embedding", json.dumps(kwargs))
+        task_options = check_schema(IngestTaskStoreEmbedSchema, kwargs, "store_embedding", json.dumps(kwargs))
         store_task = StoreEmbedTask(**task_options.model_dump())
         self._job_specs.add_task(store_task)
 
         return self
 
-    def vdb_upload(self, **kwargs: Any) -> "Ingestor":
+    def udf(
+        self,
+        udf_function: str,
+        udf_function_name: Optional[str] = None,
+        phase: Optional[Union[PipelinePhase, int, str]] = None,
+        target_stage: Optional[str] = None,
+        run_before: bool = False,
+        run_after: bool = False,
+    ) -> "Ingestor":
+        """
+        Adds a UDFTask to the batch job specification.
+
+        Parameters
+        ----------
+        udf_function : str
+            UDF specification. Supports three formats:
+            1. Inline function: 'def my_func(control_message): ...'
+            2. Import path: 'my_module.my_function'
+            3. File path: '/path/to/file.py:function_name'
+        udf_function_name : str, optional
+            Name of the function to execute from the UDF specification.
+            If not provided, attempts to infer from udf_function.
+        phase : Union[PipelinePhase, int, str], optional
+            Pipeline phase to execute UDF. Accepts phase names ('extract', 'split', 'embed', 'response')
+            or numbers (1-4). Cannot be used with target_stage.
+        target_stage : str, optional
+            Specific stage name to target for UDF execution. Cannot be used with phase.
+        run_before : bool, optional
+            If True and target_stage is specified, run UDF before the target stage. Default: False.
+        run_after : bool, optional
+            If True and target_stage is specified, run UDF after the target stage. Default: False.
+
+        Returns
+        -------
+        Ingestor
+            Returns self for chaining.
+
+        Raises
+        ------
+        ValueError
+            If udf_function_name cannot be inferred and is not provided explicitly,
+            or if both phase and target_stage are specified, or if neither is specified.
+        """
+        # Validate mutual exclusivity of phase and target_stage
+        if phase is not None and target_stage is not None:
+            raise ValueError("Cannot specify both 'phase' and 'target_stage'. Please specify only one.")
+        elif phase is None and target_stage is None:
+            # Default to response phase for backward compatibility
+            phase = PipelinePhase.RESPONSE
+
+        # Try to infer udf_function_name if not provided
+        if udf_function_name is None:
+            udf_function_name = infer_udf_function_name(udf_function)
+            if udf_function_name is None:
+                raise ValueError(
+                    f"Could not infer UDF function name from '{udf_function}'. "
+                    "Please specify 'udf_function_name' explicitly."
+                )
+            logger.info(f"Inferred UDF function name: {udf_function_name}")
+
+        # Use UDFTask constructor with explicit parameters
+        udf_task = UDFTask(
+            udf_function=udf_function,
+            udf_function_name=udf_function_name,
+            phase=phase,
+            target_stage=target_stage,
+            run_before=run_before,
+            run_after=run_after,
+        )
+        self._job_specs.add_task(udf_task)
+
+        return self
+
+    def vdb_upload(self, purge_results_after_upload: bool = True, **kwargs: Any) -> "Ingestor":
         """
         Adds a VdbUploadTask to the batch job specification.
 
         Parameters
         ----------
+        purge_results_after_upload : bool, optional
+            If True, the saved result files will be deleted from disk after a successful
+            upload. This requires `save_to_disk()` to be active. Defaults to True
         kwargs : dict
             Parameters specific to the VdbUploadTask.
 
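
The udf() docstring above lists three accepted specification formats. A hedged sketch of each follows; the function bodies, module path, file path, and stage name are placeholders, and files()/extract() are assumed from the wider Ingestor API rather than this diff:

    from nv_ingest_client.client.interface import Ingestor  # assumed import path

    # Placeholder inline UDF; the control_message contract is defined by the server pipeline.
    inline_udf = "def add_marker(control_message):\n    return control_message\n"

    ingestor = (
        Ingestor()
        .files("./docs/*.pdf")
        .extract()
        # 1. Inline source: the function name (add_marker) can usually be inferred.
        .udf(udf_function=inline_udf, phase="extract")
        # 2. Import path: phases may also be given as numbers 1-4.
        .udf(udf_function="my_pkg.my_udfs.redact_pii", udf_function_name="redact_pii", phase=3)
        # 3. File path with ":function_name", targeting a named stage instead of a phase.
        .udf(udf_function="/opt/udfs/cleanup.py:cleanup", target_stage="embedding", run_before=True)
    )
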
@@ -842,24 +1127,100 @@ class Ingestor:
             raise ValueError(f"Invalid type for op: {type(vdb_op)}, must be type VDB or str.")
 
         self._vdb_bulk_upload = vdb_op
+        self._purge_results_after_vdb_upload = purge_results_after_upload
 
         return self
 
     def save_to_disk(
         self,
         output_directory: Optional[str] = None,
+        cleanup: bool = True,
+        compression: Optional[str] = "gzip",
     ) -> "Ingestor":
+        """Configures the Ingestor to save results to disk instead of memory.
+
+        This method enables disk-based storage for ingestion results. When called,
+        the `ingest()` method will write the output for each processed document to a
+        separate JSONL file. The return value of `ingest()` will be a list of
+        `LazyLoadedList` objects, which are memory-efficient proxies to these files.
+
+        The output directory can be specified directly, via an environment variable,
+        or a temporary directory will be created automatically.
+
+        Parameters
+        ----------
+        output_directory : str, optional
+            The path to the directory where result files (.jsonl) will be saved.
+            If not provided, it defaults to the value of the environment variable
+            `NV_INGEST_CLIENT_SAVE_TO_DISK_OUTPUT_DIRECTORY`. If the environment
+            variable is also not set, a temporary directory will be created.
+            Defaults to None.
+        cleanup : bool, optional)
+            If True, the entire `output_directory` will be recursively deleted
+            when the Ingestor's context is exited (i.e., when used in a `with`
+            statement).
+            Defaults to True.
+        compression : str, optional
+            The compression algorithm to use for the saved result files.
+            Currently, the only supported value is `'gzip'`. To disable
+            compression, set this parameter to `None`. Defaults to `'gzip'`,
+            which significantly reduces the disk space required for results.
+            When enabled, files are saved with a `.gz` suffix (e.g., `results.jsonl.gz`).
+
+        Returns
+        -------
+        Ingestor
+            Returns self for chaining.
+        """
+        output_directory = output_directory or os.getenv("NV_INGEST_CLIENT_SAVE_TO_DISK_OUTPUT_DIRECTORY")
+
         if not output_directory:
             self._created_temp_output_dir = tempfile.mkdtemp(prefix="ingestor_results_")
             output_directory = self._created_temp_output_dir
 
         self._output_config = {
             "output_directory": output_directory,
+            "cleanup": cleanup,
+            "compression": compression,
         }
         ensure_directory_with_permissions(output_directory)
 
         return self
 
+    def _purge_saved_results(self, saved_results: List[LazyLoadedList]):
+        """
+        Deletes the .jsonl files associated with the results and the temporary
+        output directory if it was created by this Ingestor instance.
+        """
+        if not self._output_config:
+            logger.warning("Purge requested, but save_to_disk was not configured. No files to purge.")
+            return
+
+        deleted_files_count = 0
+        for result_item in saved_results:
+            if isinstance(result_item, LazyLoadedList) and hasattr(result_item, "filepath"):
+                filepath = result_item.filepath
+                try:
+                    if os.path.exists(filepath):
+                        os.remove(filepath)
+                        deleted_files_count += 1
+                        logger.debug(f"Purged result file: {filepath}")
+                except OSError as e:
+                    logger.error(f"Error purging result file {filepath}: {e}", exc_info=True)
+
+        logger.info(f"Purged {deleted_files_count} saved result file(s).")
+
+        if self._created_temp_output_dir:
+            logger.info(f"Removing temporary output directory: {self._created_temp_output_dir}")
+            try:
+                shutil.rmtree(self._created_temp_output_dir)
+                self._created_temp_output_dir = None  # Reset flag after successful removal
+            except OSError as e:
+                logger.error(
+                    f"Error removing temporary output directory {self._created_temp_output_dir}: {e}",
+                    exc_info=True,
+                )
+
     @ensure_job_specs
     def caption(self, **kwargs: Any) -> "Ingestor":
         """
@@ -875,12 +1236,58 @@ class Ingestor:
         Ingestor
             Returns self for chaining.
         """
-        task_options = check_schema(CaptionTaskSchema, kwargs, "caption", json.dumps(kwargs))
-        caption_task = CaptionTask(**task_options.model_dump())
+        task_options = check_schema(IngestTaskCaptionSchema, kwargs, "caption", json.dumps(kwargs))
+
+        # Extract individual parameters from API schema for CaptionTask constructor
+        caption_params = {
+            "api_key": task_options.api_key,
+            "endpoint_url": task_options.endpoint_url,
+            "prompt": task_options.prompt,
+            "model_name": task_options.model_name,
+        }
+        caption_task = CaptionTask(**caption_params)
         self._job_specs.add_task(caption_task)
 
         return self
 
+    @ensure_job_specs
+    def pdf_split_config(self, pages_per_chunk: int = 32) -> "Ingestor":
+        """
+        Configure PDF splitting behavior for V2 API.
+
+        Parameters
+        ----------
+        pages_per_chunk : int, optional
+            Number of pages per PDF chunk (default: 32)
+            Server enforces boundaries: min=1, max=128
+
+        Returns
+        -------
+        Ingestor
+            Self for method chaining
+
+        Notes
+        -----
+        - Only affects V2 API endpoints with PDF splitting support
+        - Server will clamp values outside [1, 128] range
+        - Smaller chunks = more parallelism but more overhead
+        - Larger chunks = less overhead but reduced concurrency
+        """
+        MIN_PAGES = 1
+        MAX_PAGES = 128
+
+        # Warn if value will be clamped by server
+        if pages_per_chunk < MIN_PAGES:
+            logger.warning(f"pages_per_chunk={pages_per_chunk} is below minimum. Server will clamp to {MIN_PAGES}.")
+        elif pages_per_chunk > MAX_PAGES:
+            logger.warning(f"pages_per_chunk={pages_per_chunk} exceeds maximum. Server will clamp to {MAX_PAGES}.")
+
+        # Flatten all job specs and apply PDF config using shared utility
+        all_job_specs = [spec for job_specs in self._job_specs._file_type_to_job_spec.values() for spec in job_specs]
+        apply_pdf_split_config_to_job_specs(all_job_specs, pages_per_chunk)
+
+        return self
+
     def _count_job_states(self, job_states: set[JobStateEnum]) -> int:
         """
         Counts the jobs in specified states.
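
To tie the new pieces together (context manager, disk spill, purge-after-upload, and the pdf_split_config knob), here is a hedged end-to-end sketch; the files() call and the default vector-database operator and connection settings are assumptions about the wider Ingestor API, not taken from this diff:

    from nv_ingest_client.client.interface import Ingestor  # assumed import path

    with Ingestor().files("./docs/*.pdf") as ingestor:  # files() assumed from the wider Ingestor API
        results = (
            ingestor.extract()
            .embed()
            .pdf_split_config(pages_per_chunk=16)  # values outside [1, 128] are clamped server-side
            # Results spill to gzipped .jsonl.gz files instead of accumulating in memory.
            .save_to_disk(output_directory="/tmp/nv_ingest_results", cleanup=True, compression="gzip")
            # purge_results_after_upload=True (the default) deletes those files once the upload succeeds.
            .vdb_upload(purge_results_after_upload=True)
            .ingest(show_progress=True)
        )
    # Leaving the `with` block calls __exit__, which removes /tmp/nv_ingest_results because cleanup=True.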