nv-ingest-client 2025.7.24.dev20250724__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-client might be problematic.
- nv_ingest_client/cli/util/click.py +182 -30
- nv_ingest_client/cli/util/processing.py +0 -393
- nv_ingest_client/client/client.py +561 -207
- nv_ingest_client/client/ingest_job_handler.py +412 -0
- nv_ingest_client/client/interface.py +466 -59
- nv_ingest_client/client/util/processing.py +11 -1
- nv_ingest_client/nv_ingest_cli.py +58 -6
- nv_ingest_client/primitives/jobs/job_spec.py +32 -10
- nv_ingest_client/primitives/tasks/__init__.py +6 -4
- nv_ingest_client/primitives/tasks/audio_extraction.py +27 -23
- nv_ingest_client/primitives/tasks/caption.py +10 -16
- nv_ingest_client/primitives/tasks/chart_extraction.py +16 -10
- nv_ingest_client/primitives/tasks/dedup.py +12 -21
- nv_ingest_client/primitives/tasks/embed.py +37 -76
- nv_ingest_client/primitives/tasks/extract.py +68 -169
- nv_ingest_client/primitives/tasks/filter.py +22 -28
- nv_ingest_client/primitives/tasks/infographic_extraction.py +16 -13
- nv_ingest_client/primitives/tasks/split.py +17 -18
- nv_ingest_client/primitives/tasks/store.py +29 -29
- nv_ingest_client/primitives/tasks/task_base.py +1 -72
- nv_ingest_client/primitives/tasks/task_factory.py +10 -11
- nv_ingest_client/primitives/tasks/udf.py +349 -0
- nv_ingest_client/util/dataset.py +8 -2
- nv_ingest_client/util/document_analysis.py +314 -0
- nv_ingest_client/util/image_disk_utils.py +300 -0
- nv_ingest_client/util/transport.py +12 -6
- nv_ingest_client/util/util.py +66 -0
- nv_ingest_client/util/vdb/milvus.py +220 -75
- {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/METADATA +1 -3
- nv_ingest_client-2025.11.2.dev20251102.dist-info/RECORD +55 -0
- nv_ingest_client/cli/util/tasks.py +0 -3
- nv_ingest_client/primitives/exceptions.py +0 -0
- nv_ingest_client/primitives/tasks/transform.py +0 -0
- nv_ingest_client-2025.7.24.dev20250724.dist-info/RECORD +0 -54
- {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/WHEEL +0 -0
- {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/entry_points.txt +0 -0
- {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/top_level.txt +0 -0
nv_ingest_client/client/interface.py

@@ -6,6 +6,7 @@

 import collections
 import glob
+import gzip
 import json
 import logging
 import os
@@ -27,6 +28,16 @@ from typing import Union
 from urllib.parse import urlparse

 import fsspec
+from nv_ingest_api.internal.enums.common import PipelinePhase
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskCaptionSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskDedupSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskEmbedSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskExtractSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskFilterSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskSplitSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskStoreEmbedSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskStoreSchema
+from nv_ingest_api.util.introspection.function_inspect import infer_udf_function_name
 from nv_ingest_client.client.client import NvIngestClient
 from nv_ingest_client.client.util.processing import get_valid_filename
 from nv_ingest_client.client.util.processing import save_document_results_to_jsonl
@@ -38,19 +49,12 @@ from nv_ingest_client.primitives.tasks import EmbedTask
 from nv_ingest_client.primitives.tasks import ExtractTask
 from nv_ingest_client.primitives.tasks import FilterTask
 from nv_ingest_client.primitives.tasks import SplitTask
-from nv_ingest_client.primitives.tasks import StoreEmbedTask
 from nv_ingest_client.primitives.tasks import StoreTask
-from nv_ingest_client.primitives.tasks
-from nv_ingest_client.primitives.tasks
-from nv_ingest_client.primitives.tasks.embed import EmbedTaskSchema
-from nv_ingest_client.primitives.tasks.extract import ExtractTaskSchema
-from nv_ingest_client.primitives.tasks.filter import FilterTaskSchema
-from nv_ingest_client.primitives.tasks.split import SplitTaskSchema
-from nv_ingest_client.primitives.tasks.store import StoreEmbedTaskSchema
-from nv_ingest_client.primitives.tasks.store import StoreTaskSchema
+from nv_ingest_client.primitives.tasks import StoreEmbedTask
+from nv_ingest_client.primitives.tasks import UDFTask
 from nv_ingest_client.util.processing import check_schema
 from nv_ingest_client.util.system import ensure_directory_with_permissions
-from nv_ingest_client.util.util import filter_function_kwargs
+from nv_ingest_client.util.util import filter_function_kwargs, apply_pdf_split_config_to_job_specs
 from nv_ingest_client.util.vdb import VDB, get_vdb_op_cls
 from tqdm import tqdm

@@ -90,17 +94,20 @@ def ensure_job_specs(func):


 class LazyLoadedList(collections.abc.Sequence):
-    def __init__(self, filepath: str, expected_len: Optional[int] = None):
+    def __init__(self, filepath: str, expected_len: Optional[int] = None, compression: Optional[str] = None):
         self.filepath = filepath
         self._len: Optional[int] = expected_len  # Store pre-calculated length
         self._offsets: Optional[List[int]] = None
+        self.compression = compression

         if self._len == 0:
             self._offsets = []

+        self._open = gzip.open if self.compression == "gzip" else open
+
     def __iter__(self) -> Iterator[Any]:
         try:
-            with
+            with self._open(self.filepath, "rt", encoding="utf-8") as f:
                 for line in f:
                     yield json.loads(line)
         except FileNotFoundError:
@@ -117,7 +124,7 @@ class LazyLoadedList(collections.abc.Sequence):
         self._offsets = []
         line_count = 0
         try:
-            with
+            with self._open(self.filepath, "rb") as f:
                 while True:
                     current_pos = f.tell()
                     line = f.readline()
@@ -141,10 +148,12 @@ class LazyLoadedList(collections.abc.Sequence):
     def __len__(self) -> int:
         if self._len is not None:
             return self._len
+
         if self._offsets is not None:
             self._len = len(self._offsets)
             return self._len
         self._build_index()
+
         return self._len if self._len is not None else 0

     def __getitem__(self, idx: int) -> Any:
@@ -167,7 +176,7 @@ class LazyLoadedList(collections.abc.Sequence):
             raise IndexError(f"Index {idx} out of range for {self.filepath} (len: {len(self._offsets)})")

         try:
-            with
+            with self._open(self.filepath, "rb") as f:
                 f.seek(self._offsets[idx])
                 line_bytes = f.readline()
                 return json.loads(line_bytes.decode("utf-8"))
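The three `with self._open(...)` hunks above route every file access through either `gzip.open` or the built-in `open`, so a saved results file can be iterated, indexed, and measured without loading it into memory. A minimal consumption sketch, assuming `LazyLoadedList` is importable from `nv_ingest_client.client.interface` (the module this diff modifies) and that a gzip-compressed results file already exists at the placeholder path shown:

```python
from nv_ingest_client.client.interface import LazyLoadedList

# Hypothetical file produced by save_to_disk(compression="gzip") during ingest().
results = LazyLoadedList("report.pdf.results.jsonl.gz", compression="gzip")

print(len(results))      # builds the byte-offset index on first use
first = results[0]       # seeks to the stored offset and decodes a single JSON line
for element in results:  # streams the file line by line; the whole file is never held in memory
    pass
```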
@@ -218,6 +227,7 @@ class Ingestor:
         self._client = client
         self._job_queue_id = job_queue_id
         self._vdb_bulk_upload = None
+        self._purge_results_after_vdb_upload = True

         if self._client is None:
             client_kwargs = filter_function_kwargs(NvIngestClient, **kwargs)
@@ -236,6 +246,21 @@ class Ingestor:
         self._output_config = None
         self._created_temp_output_dir = None

+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        if self._output_config and (self._output_config["cleanup"] is True):
+            dir_to_cleanup = self._output_config["output_directory"]
+            try:
+                shutil.rmtree(dir_to_cleanup)
+            except FileNotFoundError:
+                logger.warning(
+                    f"Directory to be cleaned up not found (might have been removed already): {dir_to_cleanup}"
+                )
+            except OSError as e:
+                logger.error(f"Error removing {dir_to_cleanup}: {e}")
+
     def _create_client(self, **kwargs) -> None:
         """
         Creates an instance of NvIngestClient if `_client` is not set.
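Together with the new `cleanup` flag stored in `_output_config`, the `__enter__`/`__exit__` pair added here lets callers scope on-disk results to a `with` block. A sketch of the intended pattern; the connection kwarg and file glob are placeholders, and `files()`/`extract()` come from the existing Ingestor API rather than this hunk:

```python
from nv_ingest_client.client.interface import Ingestor

with Ingestor(message_client_hostname="localhost") as ingestor:  # hostname is illustrative
    results = (
        ingestor.files("./data/*.pdf")
        .extract()
        .save_to_disk(cleanup=True)   # cleanup=True marks the output directory for removal
        .ingest(save_to_disk=True)
    )
    # use `results` (LazyLoadedList proxies) while the files still exist
# __exit__ runs here and shutil.rmtree()s the output directory
```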
@@ -377,15 +402,9 @@
         show_progress: bool = False,
         return_failures: bool = False,
         save_to_disk: bool = False,
+        return_traces: bool = False,
         **kwargs: Any,
-    ) -> Union[
-        List[List[Dict[str, Any]]],  # In-memory: List of (response['data'] for each doc)
-        List[LazyLoadedList],  # Disk: List of proxies, one per original doc
-        Tuple[
-            Union[List[List[Dict[str, Any]]], List[LazyLoadedList]],
-            List[Tuple[str, str]],
-        ],
-    ]:  # noqa: E501
+    ) -> Union[List[Any], Tuple[Any, ...]]:
         """
         Ingest documents by submitting jobs and fetching results concurrently.

@@ -395,22 +414,36 @@
             Whether to display a progress bar. Default is False.
         return_failures : bool, optional
            If True, return a tuple (results, failures); otherwise, return only results. Default is False.
+        save_to_disk : bool, optional
+            If True, save results to disk and return LazyLoadedList proxies. Default is False.
+        return_traces : bool, optional
+            If True, return trace metrics alongside results. Default is False.
+            Traces contain timing metrics (entry, exit, resident_time) for each stage.
         **kwargs : Any
-            Additional keyword arguments for the underlying client methods.
-
-
-            process_jobs_concurrently.
+            Additional keyword arguments for the underlying client methods.
+            Optional flags include `include_parent_trace_ids=True` to also return
+            parent job trace identifiers (V2 API only).

         Returns
         -------
-
-
-
-
+        list or tuple
+            Returns vary based on flags:
+            - Default: list of results
+            - return_failures=True: (results, failures)
+            - return_traces=True: (results, traces)
+            - return_failures=True, return_traces=True: (results, failures, traces)
+            - Additional combinations with include_parent_trace_ids kwarg
+
+        Notes
+        -----
+        Trace metrics include timing data for each processing stage. For detailed
+        usage and examples, see src/nv_ingest/api/v2/README.md
         """
         if save_to_disk and (not self._output_config):
             self.save_to_disk()

+        include_parent_trace_ids = bool(kwargs.pop("include_parent_trace_ids", False))
+
         self._prepare_ingest_run()

         # Add jobs locally first
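The updated docstring fixes the order of the optional return values: results first, then failures, then traces, then parent trace IDs, each appended only when its flag is set. A sketch of unpacking the common combinations, reusing an `ingestor` configured as in the earlier sketches:

```python
# Default: just the results list
results = ingestor.ingest()

# Results plus failure tuples
results, failures = ingestor.ingest(return_failures=True)

# Results, failures, and per-stage trace timings
results, failures, traces = ingestor.ingest(return_failures=True, return_traces=True)

# Additionally request parent job trace identifiers (V2 API only)
results, failures, traces, parent_ids = ingestor.ingest(
    return_failures=True,
    return_traces=True,
    include_parent_trace_ids=True,
)
```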
@@ -420,7 +453,7 @@

         final_results_payload_list: Union[List[List[Dict[str, Any]]], List[LazyLoadedList]] = []

-        # Lock for thread-safe
+        # Lock for thread-safe appending to final_results_payload_list by I/O tasks
         results_lock = threading.Lock() if self._output_config else None

         io_executor: Optional[ThreadPoolExecutor] = None
@@ -435,7 +468,9 @@
             output_dir = self._output_config["output_directory"]
             clean_source_basename = get_valid_filename(os.path.basename(source_name))
             file_name, file_ext = os.path.splitext(clean_source_basename)
-            file_suffix = f".{file_ext}.results.jsonl"
+            file_suffix = f".{file_ext.strip('.')}.results.jsonl"
+            if self._output_config["compression"] == "gzip":
+                file_suffix += ".gz"
             jsonl_filepath = os.path.join(output_dir, safe_filename(output_dir, file_name, file_suffix))

             num_items_saved = save_document_results_to_jsonl(
@@ -443,10 +478,13 @@
                 jsonl_filepath,
                 source_name,
                 ensure_parent_dir_exists=False,
+                compression=self._output_config["compression"],
             )

             if num_items_saved > 0:
-                results = LazyLoadedList(
+                results = LazyLoadedList(
+                    jsonl_filepath, expected_len=num_items_saved, compression=self._output_config["compression"]
+                )
                 if results_lock:
                     with results_lock:
                         final_results_payload_list.append(results)
@@ -519,11 +557,24 @@

         proc_kwargs = filter_function_kwargs(self._client.process_jobs_concurrently, **kwargs)

-
-
-
-
-
+        # Telemetry controls (optional)
+        enable_telemetry: Optional[bool] = kwargs.pop("enable_telemetry", None)
+        show_telemetry: Optional[bool] = kwargs.pop("show_telemetry", None)
+        if show_telemetry is None:
+            # Fallback to env NV_INGEST_CLIENT_SHOW_TELEMETRY (0/1), default off
+            try:
+                show_telemetry = bool(int(os.getenv("NV_INGEST_CLIENT_SHOW_TELEMETRY", "0")))
+            except ValueError:
+                show_telemetry = False
+        # If user explicitly wants to show telemetry but did not specify enable_telemetry,
+        # ensure collection is enabled so summary isn't empty.
+        if enable_telemetry is None and show_telemetry:
+            enable_telemetry = True
+        if enable_telemetry is not None and hasattr(self._client, "enable_telemetry"):
+            self._client.enable_telemetry(bool(enable_telemetry))
+
+        # Call process_jobs_concurrently
+        proc_result = self._client.process_jobs_concurrently(
             job_indices=self._job_ids,
             job_queue_id=self._job_queue_id,
             timeout=timeout,
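The telemetry block above is driven entirely by two kwargs popped off `ingest()` plus one environment variable, so no extra client wiring is required. A sketch of the three entry points it supports, again reusing a configured `ingestor`:

```python
import os

# Collect and print a telemetry summary; collection is enabled implicitly
# because show_telemetry=True was requested without specifying enable_telemetry.
results = ingestor.ingest(show_telemetry=True)

# Collect telemetry silently; inspect it later through the client if needed.
results = ingestor.ingest(enable_telemetry=True)

# Flip the default for the printed summary via the environment (accepts "0" or "1").
os.environ["NV_INGEST_CLIENT_SHOW_TELEMETRY"] = "1"
results = ingestor.ingest()
```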
@@ -532,9 +583,17 @@
             return_failures=True,
             stream_to_callback_only=stream_to_callback_only,
             verbose=verbose,
+            return_traces=return_traces,
             **proc_kwargs,
         )

+        # Unpack result based on return_traces flag
+        if return_traces:
+            results, failures, traces_list = proc_result
+        else:
+            results, failures = proc_result
+            traces_list = []  # Empty list when traces not requested
+
         if show_progress and pbar:
             pbar.close()

@@ -551,13 +610,64 @@

         if self._vdb_bulk_upload:
             if len(failures) > 0:
-
+                # Calculate success metrics
+                total_jobs = len(results) + len(failures)
+                successful_jobs = len(results)
+
+                if return_failures:
+                    # Emit message about partial success
+                    logger.warning(
+                        f"Job was not completely successful. "
+                        f"{successful_jobs} out of {total_jobs} records completed successfully. "
+                        f"Uploading successful results to vector database."
+                    )
+
+                    # Upload only the successful results
+                    if successful_jobs > 0:
+                        self._vdb_bulk_upload.run(results)

-
+                        if self._purge_results_after_vdb_upload:
+                            logger.info("Purging saved results from disk after successful VDB upload.")
+                            self._purge_saved_results(results)

-
+                else:
+                    # Original behavior: raise RuntimeError
+                    raise RuntimeError(
+                        "Failed to ingest documents, unable to complete vdb bulk upload due to "
+                        f"no successful results. {len(failures)} out of {total_jobs} records failed "
+                    )
+            else:
+                # No failures - proceed with normal upload
+                self._vdb_bulk_upload.run(results)

-
+                if self._purge_results_after_vdb_upload:
+                    logger.info("Purging saved results from disk after successful VDB upload.")
+                    self._purge_saved_results(results)
+
+        # Print telemetry summary if requested
+        if show_telemetry:
+            try:
+                summary = self._client.summarize_telemetry()
+                # Print to stdout and log for convenience
+                print("NvIngestClient Telemetry Summary:", json.dumps(summary, indent=2))
+                logger.info("NvIngestClient Telemetry Summary: %s", json.dumps(summary, indent=2))
+            except Exception:
+                pass
+
+        parent_trace_ids = self._client.consume_completed_parent_trace_ids() if include_parent_trace_ids else []
+
+        # Build return tuple based on requested outputs
+        # Order: results, failures (if requested), traces (if requested), parent_trace_ids (if requested)
+        returns = [results]
+
+        if return_failures:
+            returns.append(failures)
+        if return_traces:
+            returns.append(traces_list)
+        if include_parent_trace_ids:
+            returns.append(parent_trace_ids)
+
+        return tuple(returns) if len(returns) > 1 else results

     def ingest_async(self, **kwargs: Any) -> Future:
         """
@@ -658,8 +768,23 @@
         Ingestor
             Returns self for chaining.
         """
-
-
+        # Extract content_type and build params dict for API schema
+        content_type = kwargs.pop("content_type", "text")  # Default to "text" if not specified
+        params = kwargs  # Remaining parameters go into params dict
+
+        # Validate with API schema
+        api_options = {
+            "content_type": content_type,
+            "params": params,
+        }
+        task_options = check_schema(IngestTaskDedupSchema, api_options, "dedup", json.dumps(api_options))
+
+        # Extract individual parameters from API schema for DedupTask constructor
+        dedup_params = {
+            "content_type": task_options.content_type,
+            "filter": task_options.params.filter,
+        }
+        dedup_task = DedupTask(**dedup_params)
         self._job_specs.add_task(dedup_task)

         return self
@@ -679,8 +804,14 @@
         Ingestor
             Returns self for chaining.
         """
-
-
+        # Filter out deprecated parameters before API schema validation
+        # The EmbedTask constructor handles these deprecated parameters with warnings
+        filtered_kwargs = {k: v for k, v in kwargs.items() if k not in ["text", "tables"]}
+
+        _ = check_schema(IngestTaskEmbedSchema, filtered_kwargs, "embed", json.dumps(filtered_kwargs))
+
+        # Pass original kwargs to EmbedTask constructor so it can handle deprecated parameters
+        embed_task = EmbedTask(**kwargs)
         self._job_specs.add_task(embed_task)

         return self
@@ -727,9 +858,52 @@
             extract_page_as_image=extract_page_as_image,
             **kwargs,
         )
-        task_options = check_schema(ExtractTaskSchema, task_options, "extract", json.dumps(task_options))

-
+        # Extract method from task_options for API schema
+        method = task_options.pop("extract_method", None)
+        if method is None:
+            # Let ExtractTask constructor handle default method selection
+            method = "pdfium"  # Default fallback
+
+        # Build params dict for API schema
+        params = {k: v for k, v in task_options.items() if k != "document_type"}
+
+        # Map document type to API schema expected values
+        # Handle common file extension to DocumentTypeEnum mapping
+        document_type_mapping = {
+            "txt": "text",
+            "md": "text",
+            "sh": "text",
+            "json": "text",
+            "jpg": "jpeg",
+            "jpeg": "jpeg",
+            "png": "png",
+            "pdf": "pdf",
+            "docx": "docx",
+            "pptx": "pptx",
+            "html": "html",
+            "bmp": "bmp",
+            "tiff": "tiff",
+            "svg": "svg",
+            "mp3": "mp3",
+            "wav": "wav",
+        }
+
+        # Use mapped document type for API schema validation
+        api_document_type = document_type_mapping.get(document_type.lower(), document_type)
+
+        # Validate with API schema
+        api_task_options = {
+            "document_type": api_document_type,
+            "method": method,
+            "params": params,
+        }
+
+        check_schema(IngestTaskExtractSchema, api_task_options, "extract", json.dumps(api_task_options))
+
+        # Create ExtractTask with mapped document type for API schema compatibility
+        extract_task_params = {"document_type": api_document_type, "extract_method": method, **params}
+        extract_task = ExtractTask(**extract_task_params)
         self._job_specs.add_task(extract_task, document_type=document_type)

         return self
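With the mapping table above, `extract()` now normalizes common file extensions (`txt`, `md`, `jpg`, ...) to the enum values `IngestTaskExtractSchema` expects before validating, while the original `document_type` is still used when attaching the task to job specs. A usage sketch; the extract flags shown are the usual ones from the existing ExtractTask API and are assumptions here, not part of this hunk:

```python
from nv_ingest_client.client.interface import Ingestor

ingestor = (
    Ingestor()
    .files("./data/*.pdf")           # placeholder glob
    .extract(
        extract_method="pdfium",     # omitted -> falls back to "pdfium" per the code above
        extract_text=True,
        extract_tables=True,
        extract_images=False,
    )
)
```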
@@ -749,8 +923,27 @@
         Ingestor
             Returns self for chaining.
         """
-
-
+        # Restructure parameters to match API schema structure
+        params_fields = {"min_size", "max_aspect_ratio", "min_aspect_ratio", "filter"}
+        params = {k: v for k, v in kwargs.items() if k in params_fields}
+        top_level = {k: v for k, v in kwargs.items() if k not in params_fields}
+
+        # Build API schema structure
+        api_kwargs = top_level.copy()
+        if params:
+            api_kwargs["params"] = params
+
+        task_options = check_schema(IngestTaskFilterSchema, api_kwargs, "filter", json.dumps(api_kwargs))
+
+        # Extract individual parameters from API schema for FilterTask constructor
+        filter_params = {
+            "content_type": task_options.content_type,
+            "min_size": task_options.params.min_size,
+            "max_aspect_ratio": task_options.params.max_aspect_ratio,
+            "min_aspect_ratio": task_options.params.min_aspect_ratio,
+            "filter": task_options.params.filter,
+        }
+        filter_task = FilterTask(**filter_params)
         self._job_specs.add_task(filter_task)

         return self
@@ -770,7 +963,7 @@
         Ingestor
             Returns self for chaining.
         """
-        task_options = check_schema(
+        task_options = check_schema(IngestTaskSplitSchema, kwargs, "split", json.dumps(kwargs))
         extract_task = SplitTask(**task_options.model_dump())
         self._job_specs.add_task(extract_task)

@@ -791,8 +984,24 @@
         Ingestor
             Returns self for chaining.
         """
-
-
+        # Handle parameter name mapping: store_method -> method for API schema
+        if "store_method" in kwargs:
+            kwargs["method"] = kwargs.pop("store_method")
+
+        # Provide default method if not specified (matching client StoreTask behavior)
+        if "method" not in kwargs:
+            kwargs["method"] = "minio"
+
+        task_options = check_schema(IngestTaskStoreSchema, kwargs, "store", json.dumps(kwargs))
+
+        # Map API schema fields back to StoreTask constructor parameters
+        store_params = {
+            "structured": task_options.structured,
+            "images": task_options.images,
+            "store_method": task_options.method,  # Map method back to store_method
+            "params": task_options.params,
+        }
+        store_task = StoreTask(**store_params)
         self._job_specs.add_task(store_task)

         return self
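`dedup()`, `filter()`, and `store()` now share one pattern: reshape the caller's flat kwargs into the nested API schema (`content_type`/`params`, or `method`/`params` for store), validate with `check_schema`, then flatten back into the original client task constructors. A chaining sketch with illustrative parameter values, continuing from the `ingestor` built earlier:

```python
ingestor = (
    ingestor
    .dedup(filter=True)                 # validated via IngestTaskDedupSchema
    .filter(min_size=128, filter=True)  # size/aspect kwargs are nested under "params"
    .store(store_method="minio")        # "store_method" is renamed to "method" for the schema
)
```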
@@ -800,30 +1009,106 @@
     @ensure_job_specs
     def store_embed(self, **kwargs: Any) -> "Ingestor":
         """
-        Adds a
+        Adds a StoreEmbedTask to the batch job specification.

         Parameters
         ----------
         kwargs : dict
-            Parameters specific to the
+            Parameters specific to the StoreEmbedTask.

         Returns
         -------
         Ingestor
             Returns self for chaining.
         """
-        task_options = check_schema(
+        task_options = check_schema(IngestTaskStoreEmbedSchema, kwargs, "store_embedding", json.dumps(kwargs))
         store_task = StoreEmbedTask(**task_options.model_dump())
         self._job_specs.add_task(store_task)

         return self

-    def
+    def udf(
+        self,
+        udf_function: str,
+        udf_function_name: Optional[str] = None,
+        phase: Optional[Union[PipelinePhase, int, str]] = None,
+        target_stage: Optional[str] = None,
+        run_before: bool = False,
+        run_after: bool = False,
+    ) -> "Ingestor":
+        """
+        Adds a UDFTask to the batch job specification.
+
+        Parameters
+        ----------
+        udf_function : str
+            UDF specification. Supports three formats:
+            1. Inline function: 'def my_func(control_message): ...'
+            2. Import path: 'my_module.my_function'
+            3. File path: '/path/to/file.py:function_name'
+        udf_function_name : str, optional
+            Name of the function to execute from the UDF specification.
+            If not provided, attempts to infer from udf_function.
+        phase : Union[PipelinePhase, int, str], optional
+            Pipeline phase to execute UDF. Accepts phase names ('extract', 'split', 'embed', 'response')
+            or numbers (1-4). Cannot be used with target_stage.
+        target_stage : str, optional
+            Specific stage name to target for UDF execution. Cannot be used with phase.
+        run_before : bool, optional
+            If True and target_stage is specified, run UDF before the target stage. Default: False.
+        run_after : bool, optional
+            If True and target_stage is specified, run UDF after the target stage. Default: False.
+
+        Returns
+        -------
+        Ingestor
+            Returns self for chaining.
+
+        Raises
+        ------
+        ValueError
+            If udf_function_name cannot be inferred and is not provided explicitly,
+            or if both phase and target_stage are specified, or if neither is specified.
+        """
+        # Validate mutual exclusivity of phase and target_stage
+        if phase is not None and target_stage is not None:
+            raise ValueError("Cannot specify both 'phase' and 'target_stage'. Please specify only one.")
+        elif phase is None and target_stage is None:
+            # Default to response phase for backward compatibility
+            phase = PipelinePhase.RESPONSE
+
+        # Try to infer udf_function_name if not provided
+        if udf_function_name is None:
+            udf_function_name = infer_udf_function_name(udf_function)
+            if udf_function_name is None:
+                raise ValueError(
+                    f"Could not infer UDF function name from '{udf_function}'. "
+                    "Please specify 'udf_function_name' explicitly."
+                )
+            logger.info(f"Inferred UDF function name: {udf_function_name}")
+
+        # Use UDFTask constructor with explicit parameters
+        udf_task = UDFTask(
+            udf_function=udf_function,
+            udf_function_name=udf_function_name,
+            phase=phase,
+            target_stage=target_stage,
+            run_before=run_before,
+            run_after=run_after,
+        )
+        self._job_specs.add_task(udf_task)
+
+        return self
+
+    def vdb_upload(self, purge_results_after_upload: bool = True, **kwargs: Any) -> "Ingestor":
         """
         Adds a VdbUploadTask to the batch job specification.

         Parameters
         ----------
+        purge_results_after_upload : bool, optional
+            If True, the saved result files will be deleted from disk after a successful
+            upload. This requires `save_to_disk()` to be active. Defaults to True
         kwargs : dict
             Parameters specific to the VdbUploadTask.

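The new `udf()` method accepts an inline function body, a module import path, or a `file.py:function` path, inferring the function name with `infer_udf_function_name` when possible and defaulting to the response phase when neither `phase` nor `target_stage` is given. A sketch of the three spec formats; the UDF body, module, file path, and stage name are all placeholders:

```python
inline_udf = """
def add_marker(control_message):
    # placeholder UDF body; it must accept and return the control message
    return control_message
"""

ingestor = (
    ingestor
    # 1. Inline source, run during the extract phase
    .udf(udf_function=inline_udf, phase="extract")
    # 2. Import path; the function name is inferred as "my_function"
    .udf(udf_function="my_module.my_function")
    # 3. File path with an explicit name, pinned to run after a named stage
    .udf(
        udf_function="/path/to/udfs.py:post_embed_hook",
        udf_function_name="post_embed_hook",
        target_stage="embedding_storage",  # placeholder stage name
        run_after=True,
    )
)
```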
@@ -842,24 +1127,100 @@
             raise ValueError(f"Invalid type for op: {type(vdb_op)}, must be type VDB or str.")

         self._vdb_bulk_upload = vdb_op
+        self._purge_results_after_vdb_upload = purge_results_after_upload

         return self

     def save_to_disk(
         self,
         output_directory: Optional[str] = None,
+        cleanup: bool = True,
+        compression: Optional[str] = "gzip",
     ) -> "Ingestor":
+        """Configures the Ingestor to save results to disk instead of memory.
+
+        This method enables disk-based storage for ingestion results. When called,
+        the `ingest()` method will write the output for each processed document to a
+        separate JSONL file. The return value of `ingest()` will be a list of
+        `LazyLoadedList` objects, which are memory-efficient proxies to these files.
+
+        The output directory can be specified directly, via an environment variable,
+        or a temporary directory will be created automatically.
+
+        Parameters
+        ----------
+        output_directory : str, optional
+            The path to the directory where result files (.jsonl) will be saved.
+            If not provided, it defaults to the value of the environment variable
+            `NV_INGEST_CLIENT_SAVE_TO_DISK_OUTPUT_DIRECTORY`. If the environment
+            variable is also not set, a temporary directory will be created.
+            Defaults to None.
+        cleanup : bool, optional
+            If True, the entire `output_directory` will be recursively deleted
+            when the Ingestor's context is exited (i.e., when used in a `with`
+            statement).
+            Defaults to True.
+        compression : str, optional
+            The compression algorithm to use for the saved result files.
+            Currently, the only supported value is `'gzip'`. To disable
+            compression, set this parameter to `None`. Defaults to `'gzip'`,
+            which significantly reduces the disk space required for results.
+            When enabled, files are saved with a `.gz` suffix (e.g., `results.jsonl.gz`).
+
+        Returns
+        -------
+        Ingestor
+            Returns self for chaining.
+        """
+        output_directory = output_directory or os.getenv("NV_INGEST_CLIENT_SAVE_TO_DISK_OUTPUT_DIRECTORY")
+
         if not output_directory:
             self._created_temp_output_dir = tempfile.mkdtemp(prefix="ingestor_results_")
             output_directory = self._created_temp_output_dir

         self._output_config = {
             "output_directory": output_directory,
+            "cleanup": cleanup,
+            "compression": compression,
         }
         ensure_directory_with_permissions(output_directory)

         return self

+    def _purge_saved_results(self, saved_results: List[LazyLoadedList]):
+        """
+        Deletes the .jsonl files associated with the results and the temporary
+        output directory if it was created by this Ingestor instance.
+        """
+        if not self._output_config:
+            logger.warning("Purge requested, but save_to_disk was not configured. No files to purge.")
+            return
+
+        deleted_files_count = 0
+        for result_item in saved_results:
+            if isinstance(result_item, LazyLoadedList) and hasattr(result_item, "filepath"):
+                filepath = result_item.filepath
+                try:
+                    if os.path.exists(filepath):
+                        os.remove(filepath)
+                        deleted_files_count += 1
+                        logger.debug(f"Purged result file: {filepath}")
+                except OSError as e:
+                    logger.error(f"Error purging result file {filepath}: {e}", exc_info=True)
+
+        logger.info(f"Purged {deleted_files_count} saved result file(s).")
+
+        if self._created_temp_output_dir:
+            logger.info(f"Removing temporary output directory: {self._created_temp_output_dir}")
+            try:
+                shutil.rmtree(self._created_temp_output_dir)
+                self._created_temp_output_dir = None  # Reset flag after successful removal
+            except OSError as e:
+                logger.error(
+                    f"Error removing temporary output directory {self._created_temp_output_dir}: {e}",
+                    exc_info=True,
+                )
+
     @ensure_job_specs
     def caption(self, **kwargs: Any) -> "Ingestor":
         """
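`save_to_disk()` and the purge path added above are designed to pair: results are written as gzip-compressed JSONL proxies, handed to the configured VDB uploader, and then deleted because `purge_results_after_upload` defaults to True; `cleanup=True` removes whatever is left when the context exits. A round-trip sketch (paths are placeholders and the embed/VDB configuration is assumed to exist elsewhere):

```python
from nv_ingest_client.client.interface import Ingestor

with Ingestor() as ingestor:
    results, failures = (
        ingestor.files("./data/*.pdf")
        .extract()
        .embed()
        .save_to_disk(output_directory="./results", compression="gzip", cleanup=True)
        .vdb_upload(purge_results_after_upload=True)  # delete .jsonl.gz files after upload
        .ingest(save_to_disk=True, return_failures=True)
    )
# on exit, cleanup=True removes ./results itself
```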
@@ -875,12 +1236,58 @@
         Ingestor
             Returns self for chaining.
         """
-        task_options = check_schema(
-
+        task_options = check_schema(IngestTaskCaptionSchema, kwargs, "caption", json.dumps(kwargs))
+
+        # Extract individual parameters from API schema for CaptionTask constructor
+        caption_params = {
+            "api_key": task_options.api_key,
+            "endpoint_url": task_options.endpoint_url,
+            "prompt": task_options.prompt,
+            "model_name": task_options.model_name,
+        }
+        caption_task = CaptionTask(**caption_params)
         self._job_specs.add_task(caption_task)

         return self

+    @ensure_job_specs
+    def pdf_split_config(self, pages_per_chunk: int = 32) -> "Ingestor":
+        """
+        Configure PDF splitting behavior for V2 API.
+
+        Parameters
+        ----------
+        pages_per_chunk : int, optional
+            Number of pages per PDF chunk (default: 32)
+            Server enforces boundaries: min=1, max=128
+
+        Returns
+        -------
+        Ingestor
+            Self for method chaining
+
+        Notes
+        -----
+        - Only affects V2 API endpoints with PDF splitting support
+        - Server will clamp values outside [1, 128] range
+        - Smaller chunks = more parallelism but more overhead
+        - Larger chunks = less overhead but reduced concurrency
+        """
+        MIN_PAGES = 1
+        MAX_PAGES = 128
+
+        # Warn if value will be clamped by server
+        if pages_per_chunk < MIN_PAGES:
+            logger.warning(f"pages_per_chunk={pages_per_chunk} is below minimum. Server will clamp to {MIN_PAGES}.")
+        elif pages_per_chunk > MAX_PAGES:
+            logger.warning(f"pages_per_chunk={pages_per_chunk} exceeds maximum. Server will clamp to {MAX_PAGES}.")
+
+        # Flatten all job specs and apply PDF config using shared utility
+        all_job_specs = [spec for job_specs in self._job_specs._file_type_to_job_spec.values() for spec in job_specs]
+        apply_pdf_split_config_to_job_specs(all_job_specs, pages_per_chunk)
+
+        return self
+
     def _count_job_states(self, job_states: set[JobStateEnum]) -> int:
         """
         Counts the jobs in specified states.
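`pdf_split_config()` only records a `pages_per_chunk` hint on the already-added job specs; the V2 server performs the actual splitting and clamps values outside [1, 128]. A short sketch, assuming job specs were created by `files()` first (required by the `@ensure_job_specs` decorator) and using a placeholder path:

```python
ingestor = (
    ingestor
    .files("./data/large_report.pdf")       # placeholder path
    .pdf_split_config(pages_per_chunk=16)   # smaller chunks: more parallelism, more overhead
    .extract()
)
```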
|