nv-ingest-client 2025.10.14.dev20251014__tar.gz → 2025.10.15.dev20251015__tar.gz
This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-client might be problematic. More details are linked from the original diff page.
- {nv_ingest_client-2025.10.14.dev20251014/src/nv_ingest_client.egg-info → nv_ingest_client-2025.10.15.dev20251015}/PKG-INFO +1 -1
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/client/client.py +18 -4
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/client/ingest_job_handler.py +6 -1
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/client/interface.py +39 -1
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/nv_ingest_cli.py +22 -1
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/primitives/jobs/job_spec.py +1 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/util/document_analysis.py +1 -1
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/util/util.py +26 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/util/vdb/milvus.py +7 -4
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015/src/nv_ingest_client.egg-info}/PKG-INFO +1 -1
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/LICENSE +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/MANIFEST.in +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/README.md +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/pyproject.toml +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/setup.cfg +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/__init__.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/cli/__init__.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/cli/util/__init__.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/cli/util/click.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/cli/util/processing.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/cli/util/system.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/client/__init__.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/client/util/processing.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/primitives/__init__.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/primitives/jobs/__init__.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/primitives/jobs/job_state.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/primitives/tasks/__init__.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/primitives/tasks/audio_extraction.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/primitives/tasks/caption.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/primitives/tasks/chart_extraction.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/primitives/tasks/dedup.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/primitives/tasks/embed.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/primitives/tasks/extract.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/primitives/tasks/filter.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/primitives/tasks/infographic_extraction.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/primitives/tasks/split.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/primitives/tasks/store.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/primitives/tasks/table_extraction.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/primitives/tasks/task_base.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/primitives/tasks/task_factory.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/primitives/tasks/udf.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/primitives/tasks/vdb_upload.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/util/__init__.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/util/dataset.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/util/file_processing/__init__.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/util/file_processing/extract.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/util/image_disk_utils.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/util/milvus.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/util/process_json_files.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/util/processing.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/util/system.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/util/transport.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/util/vdb/__init__.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/util/vdb/adt_vdb.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/util/vdb/opensearch.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client/util/zipkin.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client.egg-info/SOURCES.txt +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client.egg-info/dependency_links.txt +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client.egg-info/entry_points.txt +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client.egg-info/requires.txt +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/nv_ingest_client.egg-info/top_level.txt +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/version.py +0 -0
|
@@ -36,7 +36,11 @@ from nv_ingest_client.primitives.tasks import TaskType
|
|
|
36
36
|
from nv_ingest_client.primitives.tasks import is_valid_task_type
|
|
37
37
|
from nv_ingest_client.primitives.tasks import task_factory
|
|
38
38
|
from nv_ingest_client.util.processing import handle_future_result, IngestJobFailure
|
|
39
|
-
from nv_ingest_client.util.util import
|
|
39
|
+
from nv_ingest_client.util.util import (
|
|
40
|
+
create_job_specs_for_batch,
|
|
41
|
+
check_ingest_result,
|
|
42
|
+
apply_pdf_split_config_to_job_specs,
|
|
43
|
+
)
|
|
40
44
|
|
|
41
45
|
logger = logging.getLogger(__name__)
|
|
42
46
|
|
|
@@ -688,11 +692,12 @@ class NvIngestClient:
|
|
|
688
692
|
message_client_port : int, optional
|
|
689
693
|
Port of the REST/message service. Defaults to 7670.
|
|
690
694
|
message_client_kwargs : dict, optional
|
|
691
|
-
Extra keyword arguments passed to the client allocator.
|
|
695
|
+
Extra keyword arguments passed to the client allocator. For RestClient,
|
|
696
|
+
can include 'api_version' (e.g., 'v1' or 'v2'). Defaults to 'v1'.
|
|
692
697
|
msg_counter_id : str, optional
|
|
693
698
|
Identifier for message counting. Defaults to "nv-ingest-message-id".
|
|
694
699
|
worker_pool_size : int, optional
|
|
695
|
-
Number of workers in the thread pool. Defaults to
|
|
700
|
+
Number of workers in the thread pool. Defaults to 8.
|
|
696
701
|
|
|
697
702
|
Returns
|
|
698
703
|
-------
|
|
@@ -1707,7 +1712,9 @@ class NvIngestClient:
|
|
|
1707
1712
|
|
|
1708
1713
|
return results
|
|
1709
1714
|
|
|
1710
|
-
def create_jobs_for_batch(
|
|
1715
|
+
def create_jobs_for_batch(
|
|
1716
|
+
self, files_batch: List[str], tasks: Dict[str, Any], pdf_split_page_count: int = None
|
|
1717
|
+
) -> List[str]:
|
|
1711
1718
|
"""
|
|
1712
1719
|
Create and submit job specifications (JobSpecs) for a batch of files, returning the job IDs.
|
|
1713
1720
|
This function takes a batch of files, processes each file to extract its content and type,
|
|
@@ -1723,6 +1730,9 @@ class NvIngestClient:
|
|
|
1723
1730
|
A dictionary of tasks to be added to each job. The keys represent task names, and the
|
|
1724
1731
|
values represent task specifications or configurations. Standard tasks include "split",
|
|
1725
1732
|
"extract", "store", "caption", "dedup", "filter", "embed".
|
|
1733
|
+
pdf_split_page_count : int, optional
|
|
1734
|
+
Number of pages per PDF chunk for splitting (1-128). If provided, this will be added
|
|
1735
|
+
to the job spec's extended_options for PDF files.
|
|
1726
1736
|
|
|
1727
1737
|
Returns
|
|
1728
1738
|
-------
|
|
@@ -1769,6 +1779,10 @@ class NvIngestClient:
|
|
|
1769
1779
|
|
|
1770
1780
|
job_specs = create_job_specs_for_batch(files_batch)
|
|
1771
1781
|
|
|
1782
|
+
# Apply PDF split config if provided
|
|
1783
|
+
if pdf_split_page_count is not None:
|
|
1784
|
+
apply_pdf_split_config_to_job_specs(job_specs, pdf_split_page_count)
|
|
1785
|
+
|
|
1772
1786
|
job_ids = []
|
|
1773
1787
|
for job_spec in job_specs:
|
|
1774
1788
|
logger.debug(f"Tasks: {tasks.keys()}")
|
|
@@ -45,6 +45,7 @@ class IngestJobHandler:
|
|
|
45
45
|
show_progress: bool = True,
|
|
46
46
|
show_telemetry: bool = False,
|
|
47
47
|
job_queue_id: str = "ingest_task_queue",
|
|
48
|
+
pdf_split_page_count: int = None,
|
|
48
49
|
) -> None:
|
|
49
50
|
self.client = client
|
|
50
51
|
self.files = files
|
|
@@ -56,6 +57,7 @@ class IngestJobHandler:
|
|
|
56
57
|
self.show_progress = show_progress
|
|
57
58
|
self.show_telemetry = show_telemetry
|
|
58
59
|
self.job_queue_id = job_queue_id
|
|
60
|
+
self.pdf_split_page_count = pdf_split_page_count
|
|
59
61
|
self._pbar = None
|
|
60
62
|
# Internal state used across iterations
|
|
61
63
|
self._retry_job_ids: List[str] = []
|
|
@@ -144,7 +146,9 @@ class IngestJobHandler:
|
|
|
144
146
|
new_job_count: int = min(self.batch_size - cur_job_count, len(self.files) - self._processed)
|
|
145
147
|
batch_files: List[str] = self.files[self._processed : self._processed + new_job_count]
|
|
146
148
|
|
|
147
|
-
new_job_indices: List[str] = self.client.create_jobs_for_batch(
|
|
149
|
+
new_job_indices: List[str] = self.client.create_jobs_for_batch(
|
|
150
|
+
batch_files, self.tasks, pdf_split_page_count=self.pdf_split_page_count
|
|
151
|
+
)
|
|
148
152
|
if len(new_job_indices) != new_job_count:
|
|
149
153
|
missing_jobs: int = new_job_count - len(new_job_indices)
|
|
150
154
|
error_msg: str = (
|
|
@@ -304,6 +308,7 @@ class IngestJobHandler:
|
|
|
304
308
|
trace_ids: Dict[str, str] = defaultdict(list) # type: ignore
|
|
305
309
|
failed_jobs: List[str] = []
|
|
306
310
|
retry_counts: Dict[str, int] = defaultdict(int)
|
|
311
|
+
pages_per_sec: float = None
|
|
307
312
|
|
|
308
313
|
start_time_ns: int = time.time_ns()
|
|
309
314
|
self._init_progress_bar(total_files)
|
|
@@ -54,7 +54,7 @@ from nv_ingest_client.primitives.tasks import StoreEmbedTask
|
|
|
54
54
|
from nv_ingest_client.primitives.tasks import UDFTask
|
|
55
55
|
from nv_ingest_client.util.processing import check_schema
|
|
56
56
|
from nv_ingest_client.util.system import ensure_directory_with_permissions
|
|
57
|
-
from nv_ingest_client.util.util import filter_function_kwargs
|
|
57
|
+
from nv_ingest_client.util.util import filter_function_kwargs, apply_pdf_split_config_to_job_specs
|
|
58
58
|
from nv_ingest_client.util.vdb import VDB, get_vdb_op_cls
|
|
59
59
|
from tqdm import tqdm
|
|
60
60
|
|
|
@@ -1237,6 +1237,44 @@ class Ingestor:
|
|
|
1237
1237
|
|
|
1238
1238
|
return self
|
|
1239
1239
|
|
|
1240
|
+
@ensure_job_specs
|
|
1241
|
+
def pdf_split_config(self, pages_per_chunk: int = 32) -> "Ingestor":
|
|
1242
|
+
"""
|
|
1243
|
+
Configure PDF splitting behavior for V2 API.
|
|
1244
|
+
|
|
1245
|
+
Parameters
|
|
1246
|
+
----------
|
|
1247
|
+
pages_per_chunk : int, optional
|
|
1248
|
+
Number of pages per PDF chunk (default: 32)
|
|
1249
|
+
Server enforces boundaries: min=1, max=128
|
|
1250
|
+
|
|
1251
|
+
Returns
|
|
1252
|
+
-------
|
|
1253
|
+
Ingestor
|
|
1254
|
+
Self for method chaining
|
|
1255
|
+
|
|
1256
|
+
Notes
|
|
1257
|
+
-----
|
|
1258
|
+
- Only affects V2 API endpoints with PDF splitting support
|
|
1259
|
+
- Server will clamp values outside [1, 128] range
|
|
1260
|
+
- Smaller chunks = more parallelism but more overhead
|
|
1261
|
+
- Larger chunks = less overhead but reduced concurrency
|
|
1262
|
+
"""
|
|
1263
|
+
MIN_PAGES = 1
|
|
1264
|
+
MAX_PAGES = 128
|
|
1265
|
+
|
|
1266
|
+
# Warn if value will be clamped by server
|
|
1267
|
+
if pages_per_chunk < MIN_PAGES:
|
|
1268
|
+
logger.warning(f"pages_per_chunk={pages_per_chunk} is below minimum. Server will clamp to {MIN_PAGES}.")
|
|
1269
|
+
elif pages_per_chunk > MAX_PAGES:
|
|
1270
|
+
logger.warning(f"pages_per_chunk={pages_per_chunk} exceeds maximum. Server will clamp to {MAX_PAGES}.")
|
|
1271
|
+
|
|
1272
|
+
# Flatten all job specs and apply PDF config using shared utility
|
|
1273
|
+
all_job_specs = [spec for job_specs in self._job_specs._file_type_to_job_spec.values() for spec in job_specs]
|
|
1274
|
+
apply_pdf_split_config_to_job_specs(all_job_specs, pages_per_chunk)
|
|
1275
|
+
|
|
1276
|
+
return self
|
|
1277
|
+
|
|
1240
1278
|
def _count_job_states(self, job_states: set[JobStateEnum]) -> int:
|
|
1241
1279
|
"""
|
|
1242
1280
|
Counts the jobs in specified states.
|
|
@@ -74,6 +74,12 @@ logger = logging.getLogger(__name__)
|
|
|
74
74
|
@click.option("--client_host", default="localhost", help="DNS name or URL for the endpoint.")
|
|
75
75
|
@click.option("--client_port", default=7670, type=int, help="Port for the client endpoint.")
|
|
76
76
|
@click.option("--client_kwargs", help="Additional arguments to pass to the client.", default="{}")
|
|
77
|
+
@click.option(
|
|
78
|
+
"--api_version",
|
|
79
|
+
default="v1",
|
|
80
|
+
type=click.Choice(["v1", "v2"], case_sensitive=False),
|
|
81
|
+
help="API version to use (v1 or v2). V2 required for PDF split page count feature.",
|
|
82
|
+
)
|
|
77
83
|
@click.option(
|
|
78
84
|
"--client_type",
|
|
79
85
|
default="rest",
|
|
@@ -119,6 +125,8 @@ Example:
|
|
|
119
125
|
--task 'extract:{"document_type":"docx", "extract_text":true, "extract_images":true}'
|
|
120
126
|
--task 'embed'
|
|
121
127
|
--task 'caption:{}'
|
|
128
|
+
--pdf_split_page_count 64 # Configure PDF splitting (requires --api_version v2)
|
|
129
|
+
--api_version v2 # Use V2 API for PDF splitting support
|
|
122
130
|
|
|
123
131
|
\b
|
|
124
132
|
Tasks and Options:
|
|
@@ -207,6 +215,12 @@ for locating portions of the system that might be bottlenecks for the overall ru
|
|
|
207
215
|
)
|
|
208
216
|
@click.option("--zipkin_host", default="localhost", help="DNS name or Zipkin API.")
|
|
209
217
|
@click.option("--zipkin_port", default=9411, type=int, help="Port for the Zipkin trace API")
|
|
218
|
+
@click.option(
|
|
219
|
+
"--pdf_split_page_count",
|
|
220
|
+
default=None,
|
|
221
|
+
type=int,
|
|
222
|
+
help="Number of pages per PDF chunk for splitting. Allows per-request tuning of PDF split size in v2 api.",
|
|
223
|
+
)
|
|
210
224
|
@click.option("--version", is_flag=True, help="Show version.")
|
|
211
225
|
@click.pass_context
|
|
212
226
|
def main(
|
|
@@ -215,6 +229,7 @@ def main(
|
|
|
215
229
|
client_host: str,
|
|
216
230
|
client_kwargs: str,
|
|
217
231
|
client_port: int,
|
|
232
|
+
api_version: str,
|
|
218
233
|
client_type: str,
|
|
219
234
|
concurrency_n: int,
|
|
220
235
|
dataset: str,
|
|
@@ -228,6 +243,7 @@ def main(
|
|
|
228
243
|
collect_profiling_traces: bool,
|
|
229
244
|
zipkin_host: str,
|
|
230
245
|
zipkin_port: int,
|
|
246
|
+
pdf_split_page_count: int,
|
|
231
247
|
task: [str],
|
|
232
248
|
version: [bool],
|
|
233
249
|
):
|
|
@@ -268,6 +284,10 @@ def main(
|
|
|
268
284
|
_client_kwargs_obj = json.loads(client_kwargs)
|
|
269
285
|
except Exception:
|
|
270
286
|
_client_kwargs_obj = {"raw": client_kwargs}
|
|
287
|
+
|
|
288
|
+
# Merge api_version into client_kwargs
|
|
289
|
+
_client_kwargs_obj["api_version"] = api_version
|
|
290
|
+
|
|
271
291
|
_sanitized_client_kwargs = sanitize_for_logging(_client_kwargs_obj)
|
|
272
292
|
logging.debug(
|
|
273
293
|
f"Creating message client: {client_host} and port: {client_port} -> "
|
|
@@ -285,7 +305,7 @@ def main(
|
|
|
285
305
|
message_client_allocator=client_allocator,
|
|
286
306
|
message_client_hostname=client_host,
|
|
287
307
|
message_client_port=client_port,
|
|
288
|
-
message_client_kwargs=
|
|
308
|
+
message_client_kwargs=_client_kwargs_obj,
|
|
289
309
|
worker_pool_size=concurrency_n,
|
|
290
310
|
)
|
|
291
311
|
|
|
@@ -300,6 +320,7 @@ def main(
|
|
|
300
320
|
save_images_separately=save_images_separately,
|
|
301
321
|
show_progress=True,
|
|
302
322
|
show_telemetry=True,
|
|
323
|
+
pdf_split_page_count=pdf_split_page_count,
|
|
303
324
|
)
|
|
304
325
|
(total_files, trace_times, pages_processed, trace_ids) = handler.run()
|
|
305
326
|
|
|
@@ -110,6 +110,7 @@ class JobSpec:
|
|
|
110
110
|
"job_id": str(self._job_id),
|
|
111
111
|
"tasks": [task.to_dict() for task in self._tasks],
|
|
112
112
|
"tracing_options": self._extended_options.get("tracing_options", {}),
|
|
113
|
+
"pdf_config": self._extended_options.get("pdf_config", {}),
|
|
113
114
|
}
|
|
114
115
|
|
|
115
116
|
@property
|
|
@@ -20,7 +20,7 @@ logger = logging.getLogger(__name__)
|
|
|
20
20
|
|
|
21
21
|
|
|
22
22
|
def analyze_document_chunks(
|
|
23
|
-
results: Union[List[List[Dict[str, Any]]], List[Dict[str, Any]]]
|
|
23
|
+
results: Union[List[List[Dict[str, Any]]], List[Dict[str, Any]]],
|
|
24
24
|
) -> Dict[str, Dict[str, Dict[str, int]]]:
|
|
25
25
|
"""
|
|
26
26
|
Analyze ingestor results to count elements by type and page for each document.
|
|
@@ -350,6 +350,32 @@ def create_job_specs_for_batch(files_batch: List[str]) -> List[JobSpec]:
|
|
|
350
350
|
return job_specs
|
|
351
351
|
|
|
352
352
|
|
|
353
|
+
def apply_pdf_split_config_to_job_specs(job_specs: List[JobSpec], pages_per_chunk: int) -> None:
|
|
354
|
+
"""
|
|
355
|
+
Apply PDF split configuration to a list of JobSpec objects.
|
|
356
|
+
|
|
357
|
+
Modifies job specs in-place by adding pdf_config to extended_options for PDF files only.
|
|
358
|
+
|
|
359
|
+
Parameters
|
|
360
|
+
----------
|
|
361
|
+
job_specs : List[JobSpec]
|
|
362
|
+
List of job specifications to potentially modify
|
|
363
|
+
pages_per_chunk : int
|
|
364
|
+
Number of pages per PDF chunk (will be stored as-is; server performs clamping)
|
|
365
|
+
|
|
366
|
+
Notes
|
|
367
|
+
-----
|
|
368
|
+
- Only modifies job specs with document_type == "pdf" (case-insensitive)
|
|
369
|
+
- Modifies job specs in-place
|
|
370
|
+
- Safe to call on mixed document types (only PDFs are affected)
|
|
371
|
+
"""
|
|
372
|
+
for job_spec in job_specs:
|
|
373
|
+
if job_spec.document_type.lower() == "pdf":
|
|
374
|
+
if "pdf_config" not in job_spec._extended_options:
|
|
375
|
+
job_spec._extended_options["pdf_config"] = {}
|
|
376
|
+
job_spec._extended_options["pdf_config"]["split_page_count"] = pages_per_chunk
|
|
377
|
+
|
|
378
|
+
|
|
353
379
|
def filter_function_kwargs(func, **kwargs):
|
|
354
380
|
"""
|
|
355
381
|
Filters and returns keyword arguments that match the parameters of a given function.
|
|
@@ -917,7 +917,9 @@ def wait_for_index(collection_name: str, num_elements: int, client: MilvusClient
|
|
|
917
917
|
break
|
|
918
918
|
# check if indexed_rows is staying the same, too many times means something is wrong
|
|
919
919
|
if new_indexed_rows == indexed_rows:
|
|
920
|
-
pos_movement
|
|
920
|
+
pos_movement -= 1
|
|
921
|
+
else:
|
|
922
|
+
pos_movement = 10
|
|
921
923
|
# if pos_movement is 0, raise an error, means the rows are not getting indexed as expected
|
|
922
924
|
if pos_movement == 0:
|
|
923
925
|
raise ValueError("Rows are not getting indexed as expected")
|
|
@@ -1046,9 +1048,10 @@ def write_to_nvingest_collection(
|
|
|
1046
1048
|
client,
|
|
1047
1049
|
collection_name,
|
|
1048
1050
|
)
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
|
|
1051
|
+
if not local_index:
|
|
1052
|
+
# Make sure all rows are indexed, decided not to wrap in a timeout because we dont
|
|
1053
|
+
# know how long this should take, it is num_elements dependent.
|
|
1054
|
+
wait_for_index(collection_name, num_elements, client)
|
|
1052
1055
|
else:
|
|
1053
1056
|
minio_client = Minio(minio_endpoint, access_key=access_key, secret_key=secret_key, secure=False)
|
|
1054
1057
|
bucket_name = bucket_name if bucket_name else ClientConfigSchema().minio_bucket_name
|
|
File without changes
|
{nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/MANIFEST.in
RENAMED
|
File without changes
|
{nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/README.md
RENAMED
|
File without changes
|
{nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/pyproject.toml
RENAMED
|
File without changes
|
{nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/setup.cfg
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.15.dev20251015}/src/version.py
RENAMED
|
File without changes
|