nv-ingest-client 2025.9.26.dev20250926__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-client might be problematic.
- nv_ingest_client/cli/util/processing.py +0 -393
- nv_ingest_client/client/client.py +511 -205
- nv_ingest_client/client/ingest_job_handler.py +412 -0
- nv_ingest_client/client/interface.py +137 -24
- nv_ingest_client/client/util/processing.py +11 -1
- nv_ingest_client/nv_ingest_cli.py +28 -4
- nv_ingest_client/primitives/jobs/job_spec.py +1 -0
- nv_ingest_client/primitives/tasks/embed.py +16 -0
- nv_ingest_client/primitives/tasks/extract.py +1 -1
- nv_ingest_client/primitives/tasks/filter.py +1 -1
- nv_ingest_client/primitives/tasks/task_factory.py +9 -12
- nv_ingest_client/primitives/tasks/udf.py +24 -27
- nv_ingest_client/util/document_analysis.py +1 -1
- nv_ingest_client/util/util.py +26 -0
- nv_ingest_client/util/vdb/milvus.py +12 -9
- {nv_ingest_client-2025.9.26.dev20250926.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/METADATA +1 -1
- {nv_ingest_client-2025.9.26.dev20250926.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/RECORD +21 -20
- {nv_ingest_client-2025.9.26.dev20250926.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/WHEEL +0 -0
- {nv_ingest_client-2025.9.26.dev20250926.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/entry_points.txt +0 -0
- {nv_ingest_client-2025.9.26.dev20250926.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_client-2025.9.26.dev20250926.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/top_level.txt +0 -0
nv_ingest_client/client/interface.py

@@ -6,6 +6,7 @@
 import collections
 import glob
+import gzip
 import json
 import logging
 import os
@@ -53,7 +54,7 @@ from nv_ingest_client.primitives.tasks import StoreEmbedTask
 from nv_ingest_client.primitives.tasks import UDFTask
 from nv_ingest_client.util.processing import check_schema
 from nv_ingest_client.util.system import ensure_directory_with_permissions
-from nv_ingest_client.util.util import filter_function_kwargs
+from nv_ingest_client.util.util import filter_function_kwargs, apply_pdf_split_config_to_job_specs
 from nv_ingest_client.util.vdb import VDB, get_vdb_op_cls
 from tqdm import tqdm
 
@@ -93,17 +94,20 @@ def ensure_job_specs(func):
 
 
 class LazyLoadedList(collections.abc.Sequence):
-    def __init__(self, filepath: str, expected_len: Optional[int] = None):
+    def __init__(self, filepath: str, expected_len: Optional[int] = None, compression: Optional[str] = None):
         self.filepath = filepath
         self._len: Optional[int] = expected_len  # Store pre-calculated length
         self._offsets: Optional[List[int]] = None
+        self.compression = compression
 
         if self._len == 0:
             self._offsets = []
 
+        self._open = gzip.open if self.compression == "gzip" else open
+
     def __iter__(self) -> Iterator[Any]:
         try:
-            with …
+            with self._open(self.filepath, "rt", encoding="utf-8") as f:
                 for line in f:
                     yield json.loads(line)
         except FileNotFoundError:
@@ -120,7 +124,7 @@ class LazyLoadedList(collections.abc.Sequence):
         self._offsets = []
         line_count = 0
         try:
-            with …
+            with self._open(self.filepath, "rb") as f:
                 while True:
                     current_pos = f.tell()
                     line = f.readline()
@@ -144,10 +148,12 @@ class LazyLoadedList(collections.abc.Sequence):
     def __len__(self) -> int:
         if self._len is not None:
             return self._len
+
        if self._offsets is not None:
            self._len = len(self._offsets)
            return self._len
        self._build_index()
+
        return self._len if self._len is not None else 0
 
     def __getitem__(self, idx: int) -> Any:
@@ -170,7 +176,7 @@ class LazyLoadedList(collections.abc.Sequence):
             raise IndexError(f"Index {idx} out of range for {self.filepath} (len: {len(self._offsets)})")
 
         try:
-            with …
+            with self._open(self.filepath, "rb") as f:
                 f.seek(self._offsets[idx])
                 line_bytes = f.readline()
                 return json.loads(line_bytes.decode("utf-8"))
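The hunks above wire gzip support into the lazy results reader by swapping the file opener and keeping the same byte-offset index for random access. Below is a minimal standalone sketch of that offset-index pattern, not the class itself; for gzip files, offsets refer to the decompressed stream, so seeks remain correct but are slower.

```python
# Sketch only: mirrors the opener selection and offset indexing shown above.
import gzip
import json
from typing import Any, List, Optional


def build_offsets(filepath: str, compression: Optional[str] = None) -> List[int]:
    """Record the starting offset of every JSONL record once."""
    opener = gzip.open if compression == "gzip" else open
    offsets: List[int] = []
    with opener(filepath, "rb") as f:
        while True:
            pos = f.tell()
            line = f.readline()
            if not line:
                break
            offsets.append(pos)
    return offsets


def read_record(filepath: str, offsets: List[int], idx: int, compression: Optional[str] = None) -> Any:
    """Seek directly to one record instead of loading the whole file."""
    opener = gzip.open if compression == "gzip" else open
    with opener(filepath, "rb") as f:
        f.seek(offsets[idx])
        return json.loads(f.readline().decode("utf-8"))
```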
@@ -396,15 +402,9 @@ class Ingestor:
         show_progress: bool = False,
         return_failures: bool = False,
         save_to_disk: bool = False,
+        return_traces: bool = False,
         **kwargs: Any,
-    ) -> Union[
-        List[List[Dict[str, Any]]],  # In-memory: List of (response['data'] for each doc)
-        List[LazyLoadedList],  # Disk: List of proxies, one per original doc
-        Tuple[
-            Union[List[List[Dict[str, Any]]], List[LazyLoadedList]],
-            List[Tuple[str, str]],
-        ],
-    ]:  # noqa: E501
+    ) -> Union[List[Any], Tuple[Any, ...]]:
         """
         Ingest documents by submitting jobs and fetching results concurrently.
 
@@ -414,22 +414,36 @@
             Whether to display a progress bar. Default is False.
         return_failures : bool, optional
             If True, return a tuple (results, failures); otherwise, return only results. Default is False.
+        save_to_disk : bool, optional
+            If True, save results to disk and return LazyLoadedList proxies. Default is False.
+        return_traces : bool, optional
+            If True, return trace metrics alongside results. Default is False.
+            Traces contain timing metrics (entry, exit, resident_time) for each stage.
         **kwargs : Any
-            Additional keyword arguments for the underlying client methods.
-            …
-            process_jobs_concurrently.
+            Additional keyword arguments for the underlying client methods.
+            Optional flags include `include_parent_trace_ids=True` to also return
+            parent job trace identifiers (V2 API only).
 
         Returns
         -------
-        …
+        list or tuple
+            Returns vary based on flags:
+            - Default: list of results
+            - return_failures=True: (results, failures)
+            - return_traces=True: (results, traces)
+            - return_failures=True, return_traces=True: (results, failures, traces)
+            - Additional combinations with include_parent_trace_ids kwarg
+
+        Notes
+        -----
+        Trace metrics include timing data for each processing stage. For detailed
+        usage and examples, see src/nv_ingest/api/v2/README.md
         """
         if save_to_disk and (not self._output_config):
             self.save_to_disk()
 
+        include_parent_trace_ids = bool(kwargs.pop("include_parent_trace_ids", False))
+
         self._prepare_ingest_run()
 
         # Add jobs locally first
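A usage sketch based on the docstring above, assuming `ingestor` is an already-configured Ingestor. With `return_failures=True` the failures element is a list of (source, error) pairs, per the return annotation removed in this diff.

```python
# Hedged sketch; Ingestor setup (files, tasks, client) is not part of this hunk.
results, failures = ingestor.ingest(show_progress=True, return_failures=True)

for source_name, error_message in failures:
    print(f"{source_name} failed: {error_message}")
```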
@@ -455,6 +469,8 @@ class Ingestor:
             clean_source_basename = get_valid_filename(os.path.basename(source_name))
             file_name, file_ext = os.path.splitext(clean_source_basename)
             file_suffix = f".{file_ext.strip('.')}.results.jsonl"
+            if self._output_config["compression"] == "gzip":
+                file_suffix += ".gz"
             jsonl_filepath = os.path.join(output_dir, safe_filename(output_dir, file_name, file_suffix))
 
             num_items_saved = save_document_results_to_jsonl(
@@ -462,10 +478,13 @@ class Ingestor:
                 jsonl_filepath,
                 source_name,
                 ensure_parent_dir_exists=False,
+                compression=self._output_config["compression"],
             )
 
             if num_items_saved > 0:
-                results = LazyLoadedList(…
+                results = LazyLoadedList(
+                    jsonl_filepath, expected_len=num_items_saved, compression=self._output_config["compression"]
+                )
                 if results_lock:
                     with results_lock:
                         final_results_payload_list.append(results)
@@ -538,7 +557,24 @@ class Ingestor:
 
         proc_kwargs = filter_function_kwargs(self._client.process_jobs_concurrently, **kwargs)
 
-        …
+        # Telemetry controls (optional)
+        enable_telemetry: Optional[bool] = kwargs.pop("enable_telemetry", None)
+        show_telemetry: Optional[bool] = kwargs.pop("show_telemetry", None)
+        if show_telemetry is None:
+            # Fallback to env NV_INGEST_CLIENT_SHOW_TELEMETRY (0/1), default off
+            try:
+                show_telemetry = bool(int(os.getenv("NV_INGEST_CLIENT_SHOW_TELEMETRY", "0")))
+            except ValueError:
+                show_telemetry = False
+        # If user explicitly wants to show telemetry but did not specify enable_telemetry,
+        # ensure collection is enabled so summary isn't empty.
+        if enable_telemetry is None and show_telemetry:
+            enable_telemetry = True
+        if enable_telemetry is not None and hasattr(self._client, "enable_telemetry"):
+            self._client.enable_telemetry(bool(enable_telemetry))
+
+        # Call process_jobs_concurrently
+        proc_result = self._client.process_jobs_concurrently(
             job_indices=self._job_ids,
             job_queue_id=self._job_queue_id,
             timeout=timeout,
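Per the hunk above there are two ways to surface the client telemetry summary: a per-call keyword forwarded through `**kwargs`, or the environment variable read as 0/1. The exact shape of `summarize_telemetry()` output is not shown in this diff.

```python
# Sketch; `ingestor` is an already-configured Ingestor.
import os

# 1) Per call (show_telemetry implies enable_telemetry when the latter is unset):
results = ingestor.ingest(show_telemetry=True)

# 2) Globally, without touching call sites (default is off):
os.environ["NV_INGEST_CLIENT_SHOW_TELEMETRY"] = "1"
```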
@@ -547,9 +583,17 @@ class Ingestor:
             return_failures=True,
             stream_to_callback_only=stream_to_callback_only,
             verbose=verbose,
+            return_traces=return_traces,
             **proc_kwargs,
         )
 
+        # Unpack result based on return_traces flag
+        if return_traces:
+            results, failures, traces_list = proc_result
+        else:
+            results, failures = proc_result
+            traces_list = []  # Empty list when traces not requested
+
         if show_progress and pbar:
             pbar.close()
 
@@ -600,7 +644,30 @@ class Ingestor:
             logger.info("Purging saved results from disk after successful VDB upload.")
             self._purge_saved_results(results)
 
-        …
+        # Print telemetry summary if requested
+        if show_telemetry:
+            try:
+                summary = self._client.summarize_telemetry()
+                # Print to stdout and log for convenience
+                print("NvIngestClient Telemetry Summary:", json.dumps(summary, indent=2))
+                logger.info("NvIngestClient Telemetry Summary: %s", json.dumps(summary, indent=2))
+            except Exception:
+                pass
+
+        parent_trace_ids = self._client.consume_completed_parent_trace_ids() if include_parent_trace_ids else []
+
+        # Build return tuple based on requested outputs
+        # Order: results, failures (if requested), traces (if requested), parent_trace_ids (if requested)
+        returns = [results]
+
+        if return_failures:
+            returns.append(failures)
+        if return_traces:
+            returns.append(traces_list)
+        if include_parent_trace_ids:
+            returns.append(parent_trace_ids)
+
+        return tuple(returns) if len(returns) > 1 else results
 
     def ingest_async(self, **kwargs: Any) -> Future:
         """
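A sketch of the full multi-flag form: the tuple is assembled in the documented order (results, then failures, then traces, then parent trace IDs), each element present only if requested. `include_parent_trace_ids` travels through `**kwargs` and only has meaning on the V2 API; `ingestor` is an already-configured Ingestor.

```python
out = ingestor.ingest(
    return_failures=True,
    return_traces=True,
    include_parent_trace_ids=True,
)
results, failures, traces, parent_trace_ids = out
```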
@@ -1068,6 +1135,7 @@ class Ingestor:
         self,
         output_directory: Optional[str] = None,
         cleanup: bool = True,
+        compression: Optional[str] = "gzip",
     ) -> "Ingestor":
         """Configures the Ingestor to save results to disk instead of memory.
 
@@ -1092,6 +1160,12 @@ class Ingestor:
             when the Ingestor's context is exited (i.e., when used in a `with`
             statement).
             Defaults to True.
+        compression : str, optional
+            The compression algorithm to use for the saved result files.
+            Currently, the only supported value is `'gzip'`. To disable
+            compression, set this parameter to `None`. Defaults to `'gzip'`,
+            which significantly reduces the disk space required for results.
+            When enabled, files are saved with a `.gz` suffix (e.g., `results.jsonl.gz`).
 
         Returns
         -------
@@ -1107,6 +1181,7 @@ class Ingestor:
         self._output_config = {
             "output_directory": output_directory,
             "cleanup": cleanup,
+            "compression": compression,
         }
         ensure_directory_with_permissions(output_directory)
 
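A sketch of disk-backed results with the new gzip default; file naming follows the pattern shown earlier (`<name>.<ext>.results.jsonl.gz`), and `ingestor` is an already-configured Ingestor.

```python
ingestor = ingestor.save_to_disk(output_directory="./ingest_results", compression="gzip")
results = ingestor.ingest()     # list of LazyLoadedList proxies, one per document

first_doc = results[0]
for element in first_doc:       # records are decompressed lazily, line by line
    pass
```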
@@ -1175,6 +1250,44 @@ class Ingestor:
 
         return self
 
+    @ensure_job_specs
+    def pdf_split_config(self, pages_per_chunk: int = 32) -> "Ingestor":
+        """
+        Configure PDF splitting behavior for V2 API.
+
+        Parameters
+        ----------
+        pages_per_chunk : int, optional
+            Number of pages per PDF chunk (default: 32)
+            Server enforces boundaries: min=1, max=128
+
+        Returns
+        -------
+        Ingestor
+            Self for method chaining
+
+        Notes
+        -----
+        - Only affects V2 API endpoints with PDF splitting support
+        - Server will clamp values outside [1, 128] range
+        - Smaller chunks = more parallelism but more overhead
+        - Larger chunks = less overhead but reduced concurrency
+        """
+        MIN_PAGES = 1
+        MAX_PAGES = 128
+
+        # Warn if value will be clamped by server
+        if pages_per_chunk < MIN_PAGES:
+            logger.warning(f"pages_per_chunk={pages_per_chunk} is below minimum. Server will clamp to {MIN_PAGES}.")
+        elif pages_per_chunk > MAX_PAGES:
+            logger.warning(f"pages_per_chunk={pages_per_chunk} exceeds maximum. Server will clamp to {MAX_PAGES}.")
+
+        # Flatten all job specs and apply PDF config using shared utility
+        all_job_specs = [spec for job_specs in self._job_specs._file_type_to_job_spec.values() for spec in job_specs]
+        apply_pdf_split_config_to_job_specs(all_job_specs, pages_per_chunk)
+
+        return self
+
     def _count_job_states(self, job_states: set[JobStateEnum]) -> int:
         """
         Counts the jobs in specified states.
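A hedged sketch of the new fluent call. It assumes `ingestor` already has job specs (files added), since the method is guarded by `@ensure_job_specs`, and splitting only takes effect against V2 API endpoints.

```python
ingestor = ingestor.pdf_split_config(pages_per_chunk=64)  # logs a warning outside [1, 128]
results = ingestor.ingest()
```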
nv_ingest_client/client/util/processing.py

@@ -1,3 +1,4 @@
+import gzip
 import io
 import json
 import logging
@@ -6,6 +7,7 @@ import re
 from typing import Any
 from typing import Dict
 from typing import List
+from typing import Optional
 from typing import Tuple
 
 try:
@@ -33,6 +35,7 @@ def save_document_results_to_jsonl(
     jsonl_output_filepath: str,
     original_source_name_for_log: str,
     ensure_parent_dir_exists: bool = True,
+    compression: Optional[str] = None,
 ) -> Tuple[int, Dict[str, str]]:
     """
     Saves a list of extraction items (for a single source document) to a JSON Lines file.
@@ -50,6 +53,13 @@ def save_document_results_to_jsonl(
         if parent_dir:
             os.makedirs(parent_dir, exist_ok=True)
 
+    if compression == "gzip":
+        open_func = gzip.open
+    elif compression is None:
+        open_func = open
+    else:
+        raise ValueError(f"Unsupported compression type: {compression}")
+
     with io.BytesIO() as buffer:
         for extraction_item in doc_response_data:
             if USING_ORJSON:
@@ -60,7 +70,7 @@ def save_document_results_to_jsonl(
 
     count_items_written = len(doc_response_data)
 
-    with …
+    with open_func(jsonl_output_filepath, "wb") as f_jsonl:
         f_jsonl.write(full_byte_content)
 
     logger.info(
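Because the writer only swaps the opener, the saved files remain ordinary JSON Lines, optionally gzip-compressed, and can be read back without the client library. The file name in this sketch is hypothetical.

```python
import gzip
import json

with gzip.open("report.pdf.results.jsonl.gz", "rt", encoding="utf-8") as f:
    records = [json.loads(line) for line in f]
print(f"{len(records)} extraction items")
```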
nv_ingest_client/nv_ingest_cli.py

@@ -25,10 +25,10 @@ from nv_ingest_client.cli.util.click import click_match_and_validate_files
 from nv_ingest_client.cli.util.click import click_validate_batch_size
 from nv_ingest_client.cli.util.click import click_validate_file_exists
 from nv_ingest_client.cli.util.click import click_validate_task
-from nv_ingest_client.cli.util.processing import create_and_process_jobs
 from nv_ingest_client.cli.util.processing import report_statistics
 from nv_ingest_client.cli.util.system import configure_logging
 from nv_ingest_client.client import NvIngestClient
+from nv_ingest_client.client.ingest_job_handler import IngestJobHandler
 from nv_ingest_client.util.dataset import get_dataset_files
 from nv_ingest_client.util.dataset import get_dataset_statistics
 from nv_ingest_client.util.system import ensure_directory_with_permissions
@@ -74,6 +74,12 @@ logger = logging.getLogger(__name__)
 @click.option("--client_host", default="localhost", help="DNS name or URL for the endpoint.")
 @click.option("--client_port", default=7670, type=int, help="Port for the client endpoint.")
 @click.option("--client_kwargs", help="Additional arguments to pass to the client.", default="{}")
+@click.option(
+    "--api_version",
+    default="v1",
+    type=click.Choice(["v1", "v2"], case_sensitive=False),
+    help="API version to use (v1 or v2). V2 required for PDF split page count feature.",
+)
 @click.option(
     "--client_type",
     default="rest",
@@ -119,6 +125,8 @@ Example:
     --task 'extract:{"document_type":"docx", "extract_text":true, "extract_images":true}'
     --task 'embed'
     --task 'caption:{}'
+    --pdf_split_page_count 64  # Configure PDF splitting (requires --api_version v2)
+    --api_version v2  # Use V2 API for PDF splitting support
 
 \b
 Tasks and Options:
@@ -207,6 +215,12 @@ for locating portions of the system that might be bottlenecks for the overall run
 )
 @click.option("--zipkin_host", default="localhost", help="DNS name or Zipkin API.")
 @click.option("--zipkin_port", default=9411, type=int, help="Port for the Zipkin trace API")
+@click.option(
+    "--pdf_split_page_count",
+    default=None,
+    type=int,
+    help="Number of pages per PDF chunk for splitting. Allows per-request tuning of PDF split size in v2 api.",
+)
 @click.option("--version", is_flag=True, help="Show version.")
 @click.pass_context
 def main(
@@ -215,6 +229,7 @@ def main(
     client_host: str,
     client_kwargs: str,
     client_port: int,
+    api_version: str,
     client_type: str,
     concurrency_n: int,
     dataset: str,
@@ -228,6 +243,7 @@ def main(
     collect_profiling_traces: bool,
     zipkin_host: str,
     zipkin_port: int,
+    pdf_split_page_count: int,
     task: [str],
     version: [bool],
 ):
@@ -268,6 +284,10 @@ def main(
         _client_kwargs_obj = json.loads(client_kwargs)
     except Exception:
         _client_kwargs_obj = {"raw": client_kwargs}
+
+    # Merge api_version into client_kwargs
+    _client_kwargs_obj["api_version"] = api_version
+
     _sanitized_client_kwargs = sanitize_for_logging(_client_kwargs_obj)
     logging.debug(
         f"Creating message client: {client_host} and port: {client_port} -> "
@@ -285,20 +305,24 @@ def main(
         message_client_allocator=client_allocator,
         message_client_hostname=client_host,
         message_client_port=client_port,
-        message_client_kwargs=…
+        message_client_kwargs=_client_kwargs_obj,
         worker_pool_size=concurrency_n,
     )
 
     start_time_ns = time.time_ns()
-    …
-        files=docs,
+    handler = IngestJobHandler(
         client=ingest_client,
+        files=docs,
         tasks=task,
         output_directory=output_directory,
         batch_size=batch_size,
         fail_on_error=fail_on_error,
         save_images_separately=save_images_separately,
+        show_progress=True,
+        show_telemetry=True,
+        pdf_split_page_count=pdf_split_page_count,
     )
+    (total_files, trace_times, pages_processed, trace_ids) = handler.run()
 
     report_statistics(start_time_ns, trace_times, pages_processed, total_files)
 
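A sketch that mirrors the new CLI wiring above. Only arguments visible in this hunk are used; the input files, tasks variable, and batch size are hypothetical placeholders, and `pdf_split_page_count` is only honored when the client targets the v2 API.

```python
from nv_ingest_client.client import NvIngestClient
from nv_ingest_client.client.ingest_job_handler import IngestJobHandler

client = NvIngestClient(message_client_hostname="localhost", message_client_port=7670)

handler = IngestJobHandler(
    client=client,
    files=["./data/report.pdf"],   # placeholder input
    tasks=tasks,                   # same structure as the CLI --task options
    output_directory="./out",
    batch_size=256,                # placeholder value
    show_progress=True,
    show_telemetry=True,
    pdf_split_page_count=64,
)
total_files, trace_times, pages_processed, trace_ids = handler.run()
```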
nv_ingest_client/primitives/jobs/job_spec.py

@@ -110,6 +110,7 @@ class JobSpec:
             "job_id": str(self._job_id),
             "tasks": [task.to_dict() for task in self._tasks],
             "tracing_options": self._extended_options.get("tracing_options", {}),
+            "pdf_config": self._extended_options.get("pdf_config", {}),
         }
 
     @property
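Illustrative fragment only: the serialized job spec now carries a `pdf_config` block taken from the job's extended options. The inner key name below is an assumption based on `pdf_split_config(pages_per_chunk=...)` and is not shown in this hunk.

```python
serialized_job = {
    "job_id": "0",
    "tasks": [],
    "tracing_options": {},
    "pdf_config": {"pages_per_chunk": 64},  # assumed key name
}
```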
nv_ingest_client/primitives/tasks/embed.py

@@ -36,6 +36,8 @@ class EmbedTask(Task):
         image_elements_modality: Optional[str] = None,
         structured_elements_modality: Optional[str] = None,
         audio_elements_modality: Optional[str] = None,
+        custom_content_field: Optional[str] = None,
+        result_target_field: Optional[str] = None,
     ) -> None:
         """
         Initialize the EmbedTask configuration.
@@ -76,6 +78,8 @@ class EmbedTask(Task):
             image_elements_modality=image_elements_modality,
             structured_elements_modality=structured_elements_modality,
             audio_elements_modality=audio_elements_modality,
+            custom_content_field=custom_content_field,
+            result_target_field=result_target_field,
         )
 
         self._endpoint_url = validated_data.endpoint_url
@@ -86,6 +90,8 @@ class EmbedTask(Task):
         self._image_elements_modality = validated_data.image_elements_modality
         self._structured_elements_modality = validated_data.structured_elements_modality
         self._audio_elements_modality = validated_data.audio_elements_modality
+        self._custom_content_field = validated_data.custom_content_field
+        self._result_target_field = validated_data.result_target_field
 
     def __str__(self) -> str:
         """
@@ -114,6 +120,10 @@ class EmbedTask(Task):
             info += f"  structured_elements_modality: {self._structured_elements_modality}\n"
         if self._audio_elements_modality:
             info += f"  audio_elements_modality: {self._audio_elements_modality}\n"
+        if self._custom_content_field:
+            info += f"  custom_content_field: {self._custom_content_field}\n"
+        if self._result_target_field:
+            info += f"  result_target_field: {self.result_target_field}\n"
         return info
 
     def to_dict(self) -> Dict[str, Any]:
@@ -149,4 +159,10 @@ class EmbedTask(Task):
         if self._audio_elements_modality:
             task_properties["audio_elements_modality"] = self._audio_elements_modality
 
+        if self._custom_content_field:
+            task_properties["custom_content_field"] = self._custom_content_field
+
+        if self._result_target_field:
+            task_properties["result_target_field"] = self.result_target_field
+
         return {"type": "embed", "task_properties": task_properties}
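A hedged sketch of the two new embed options: a custom source field for the content to embed and a target field for the resulting embedding. That reading follows from the parameter names; server-side semantics are not part of this diff, and the field names below are hypothetical.

```python
from nv_ingest_client.primitives.tasks.embed import EmbedTask

task = EmbedTask(
    custom_content_field="my_summary_text",    # hypothetical source field
    result_target_field="summary_embedding",   # hypothetical destination field
)
# When serialized, both values are forwarded under
# task_properties["custom_content_field"] / ["result_target_field"].
```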
nv_ingest_client/primitives/tasks/extract.py

@@ -86,7 +86,7 @@ class ExtractTask(Task):
         extract_page_as_image: bool = False,
         text_depth: str = "document",
         paddle_output_format: str = "pseudo_markdown",
-        table_output_format: str = "…
+        table_output_format: str = "markdown",
     ) -> None:
         """
         Setup Extract Task Config
nv_ingest_client/primitives/tasks/task_factory.py

@@ -8,18 +8,15 @@ from typing import Dict
 from typing import Type
 from typing import Union
 
-from .…
-from .…
-from .…
-from .…
-from .…
-from .…
-from .…
-from .store import StoreTask
-from .…
-from .task_base import TaskType
-from .task_base import is_valid_task_type
-from .udf import UDFTask
+from nv_ingest_client.primitives.tasks.task_base import Task, TaskType, is_valid_task_type
+from nv_ingest_client.primitives.tasks.caption import CaptionTask
+from nv_ingest_client.primitives.tasks.dedup import DedupTask
+from nv_ingest_client.primitives.tasks.embed import EmbedTask
+from nv_ingest_client.primitives.tasks.extract import ExtractTask
+from nv_ingest_client.primitives.tasks.filter import FilterTask
+from nv_ingest_client.primitives.tasks.split import SplitTask
+from nv_ingest_client.primitives.tasks.store import StoreEmbedTask, StoreTask
+from nv_ingest_client.primitives.tasks.udf import UDFTask
 
 
 class TaskUnimplemented(Task):
nv_ingest_client/primitives/tasks/udf.py

@@ -11,6 +11,7 @@ import logging
 import importlib
 import inspect
 import ast
+import re
 from typing import Dict, Optional, Union
 
 from nv_ingest_api.internal.enums.common import PipelinePhase
@@ -122,54 +123,50 @@ def _resolve_udf_function(udf_function_spec: str) -> str:
     3. File path: '/path/to/file.py:my_function'
     4. Legacy import path: 'my_module.my_function' (function name only, no imports)
     """
-    …
+    # Default to treating as inline unless it clearly matches a
+    # module/file specification. This avoids misclassifying inline code that
+    # contains colons, imports, or annotations before the def line.
 
-    …
+    spec = udf_function_spec.strip()
+
+    # 1) File path with function: /path/to/file.py:function_name
+    if ".py:" in spec:
+        file_path, function_name = spec.split(":", 1)
         return _extract_function_with_context(file_path, function_name)
 
-    …
+    # 2) File path without function name is an explicit error
+    if spec.endswith(".py"):
         raise ValueError(
-            f"File path '{udf_function_spec}' is missing function name. "
-            f"Use format 'file.py:function_name' to specify which function to use."
+            f"File path '{udf_function_spec}' is missing function name. Use format 'file.py:function_name'."
         )
 
-    …
+    # 3) Module path with colon: my.module:function
+    # Be strict: only letters, numbers, underscore, and dots on the left; valid identifier on the right;
+    # no whitespace/newlines.
+    module_colon_pattern = re.compile(r"^[A-Za-z_][\w\.]*:[A-Za-z_][\w]*$")
+    if module_colon_pattern.match(spec):
+        module_path, function_name = spec.split(":", 1)
         try:
-            # Import the module to get its file path
             module = importlib.import_module(module_path)
             module_file = inspect.getfile(module)
-
-            # Extract the function with full module context
             return _extract_function_with_context(module_file, function_name)
-
         except ImportError as e:
             raise ValueError(f"Failed to import module '{module_path}': {e}")
         except Exception as e:
             raise ValueError(f"Failed to resolve module path '{module_path}': {e}")
 
-    …
-    func = _load_function_from_import_path(…
-    …
-    # Get the source code of the function only
+    # 4) Legacy import path: my.module.function (no colon)
+    legacy_import_pattern = re.compile(r"^[A-Za-z_][\w\.]*\.[A-Za-z_][\w]*$")
+    if legacy_import_pattern.match(spec):
+        func = _load_function_from_import_path(spec)
         try:
            source = inspect.getsource(func)
            return source
        except (OSError, TypeError) as e:
            raise ValueError(f"Could not get source code for function from '{udf_function_spec}': {e}")
 
-    …
+    # 5) Default: treat as inline UDF source (entire string)
+    return udf_function_spec
 
 
 class UDFTask(Task):
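The resolver accepts the four spec formats listed in its docstring and now falls back to treating anything else as inline source. A sketch against the private helper shown above; the paths and module names are placeholders, so the non-inline calls would only succeed if those targets exist.

```python
from nv_ingest_client.primitives.tasks.udf import _resolve_udf_function

# Inline source is returned unchanged (the default branch):
inline_src = _resolve_udf_function(
    "def my_udf(control_message):\n    return control_message"
)

# file_src = _resolve_udf_function("/path/to/file.py:my_function")   # file.py:function
# module_src = _resolve_udf_function("my_module:my_function")        # module:function
# legacy_src = _resolve_udf_function("my_module.my_function")        # legacy dotted path
```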
nv_ingest_client/util/document_analysis.py

@@ -20,7 +20,7 @@ logger = logging.getLogger(__name__)
 
 
 def analyze_document_chunks(
-    results: Union[List[List[Dict[str, Any]]], List[Dict[str, Any]]]
+    results: Union[List[List[Dict[str, Any]]], List[Dict[str, Any]]],
 ) -> Dict[str, Dict[str, Dict[str, int]]]:
     """
     Analyze ingestor results to count elements by type and page for each document.