nv-ingest-client 2025.10.7.dev20251007__py3-none-any.whl → 2025.10.9.dev20251009__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of nv-ingest-client might be problematic.
- nv_ingest_client/cli/util/processing.py +0 -393
- nv_ingest_client/client/client.py +455 -185
- nv_ingest_client/client/ingest_job_handler.py +384 -0
- nv_ingest_client/client/interface.py +36 -6
- nv_ingest_client/nv_ingest_cli.py +6 -3
- nv_ingest_client/primitives/tasks/extract.py +1 -1
- {nv_ingest_client-2025.10.7.dev20251007.dist-info → nv_ingest_client-2025.10.9.dev20251009.dist-info}/METADATA +1 -1
- {nv_ingest_client-2025.10.7.dev20251007.dist-info → nv_ingest_client-2025.10.9.dev20251009.dist-info}/RECORD +12 -11
- {nv_ingest_client-2025.10.7.dev20251007.dist-info → nv_ingest_client-2025.10.9.dev20251009.dist-info}/WHEEL +0 -0
- {nv_ingest_client-2025.10.7.dev20251007.dist-info → nv_ingest_client-2025.10.9.dev20251009.dist-info}/entry_points.txt +0 -0
- {nv_ingest_client-2025.10.7.dev20251007.dist-info → nv_ingest_client-2025.10.9.dev20251009.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_client-2025.10.7.dev20251007.dist-info → nv_ingest_client-2025.10.9.dev20251009.dist-info}/top_level.txt +0 -0
nv_ingest_client/client/ingest_job_handler.py
@@ -0,0 +1,384 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import json
+import logging
+import time
+import os
+import io
+import base64
+from collections import defaultdict
+from typing import Any, Dict, List, Tuple
+
+from tqdm import tqdm
+
+# Reuse existing CLI utilities to avoid duplicating behavior
+from concurrent.futures import as_completed
+from nv_ingest_client.util.util import check_ingest_result
+from PIL import Image
+
+logger = logging.getLogger(__name__)
+
+
+class IngestJobHandler:
+    """
+    A modular job handler that mirrors the CLI's create_and_process_jobs flow,
+    so the same proven scheduling/retry behavior can be reused by other entry points.
+
+    Usage:
+        handler = IngestJobHandler(client, files, tasks, output_dir, batch_size)
+        total_files, trace_times, total_pages, trace_ids = handler.run()
+    """
+
+    def __init__(
+        self,
+        client: Any,
+        files: List[str],
+        tasks: Dict[str, Any],
+        output_directory: str,
+        batch_size: int,
+        fail_on_error: bool = False,
+        save_images_separately: bool = False,
+        show_progress: bool = True,
+        show_telemetry: bool = False,
+        job_queue_id: str = "ingest_task_queue",
+    ) -> None:
+        self.client = client
+        self.files = files
+        self.tasks = tasks
+        self.output_directory = output_directory
+        self.batch_size = batch_size
+        self.fail_on_error = fail_on_error
+        self.save_images_separately = save_images_separately
+        self.show_progress = show_progress
+        self.show_telemetry = show_telemetry
+        self.job_queue_id = job_queue_id
+        self._pbar = None
+        # Internal state used across iterations
+        self._retry_job_ids: List[str] = []
+        self._processed: int = 0
+        self._job_ids_batch: List[str] = []
+        self._job_id_map: Dict[str, str] = {}
+        self._trace_times: Dict[str, List[float]] = defaultdict(list)
+        # Constants
+        self._IMAGE_TYPES: set = {"png", "bmp", "jpeg", "jpg", "tiff"}
+
+    # ---------------------------
+    # Progress bar helpers
+    # ---------------------------
+    def _init_progress_bar(self, total: int) -> None:
+        if self.show_progress:
+            self._pbar = tqdm(total=total, desc="Processing files", unit="file")
+        else:
+            self._pbar = None
+
+    def _update_progress(self, n: int = 1, pages_per_sec: float | None = None) -> None:
+        if not self._pbar:
+            return
+        if pages_per_sec is not None:
+            self._pbar.set_postfix(pages_per_sec=f"{pages_per_sec:.2f}")
+        self._pbar.update(n)
+
+    def _close_progress_bar(self) -> None:
+        if self._pbar:
+            self._pbar.close()
+            self._pbar = None
+
+    def _generate_job_batch_for_iteration(self) -> None:
+        """
+        Build the next batch of jobs for processing and submit newly created jobs.
+
+        This method mirrors the CLI batching semantics: it prioritizes retry jobs,
+        then creates new jobs up to the given ``batch_size``, submits those new jobs
+        asynchronously to the configured queue, and stores the combined list of
+        job indices for this iteration. It also updates the internal progress bar
+        when configured and advances the processed-file counter.
+
+        Side Effects
+        ------------
+        - Populates/overwrites ``self._job_ids_batch`` with the ordered job indices to
+          process this iteration (retry jobs first, then newly created jobs).
+        - Updates ``self._job_id_map`` with any new mappings from job index to source file path
+          for jobs created in this iteration.
+
+        Raises
+        ------
+        RuntimeError
+            If one or more job specs cannot be created (e.g., unreadable files)
+            and ``self.fail_on_error`` is True.
+
+        Notes
+        -----
+        - Side effects:
+            - Creates JobSpecs via ``self.client.create_jobs_for_batch(...)``.
+            - Submits newly created jobs via ``self.client.submit_job_async(..., self.job_queue_id)``.
+            - Updates the class-owned progress bar (``self._pbar``) to account for
+              missing jobs when some files fail to produce specs and
+              ``self.fail_on_error`` is False.
+        - This method does not perform fetching; it only prepares and submits
+          jobs for the current iteration.
+        - The ``processed`` counter advances by the number of files attempted in
+          this iteration, even if some job specs are missing (unless
+          ``self.fail_on_error`` is True).
+
+        Examples
+        --------
+        >>> handler = IngestJobHandler(client, files, tasks, "/tmp/out", batch_size=32)
+        >>> handler._generate_job_batch_for_iteration()
+        >>> len(handler._job_ids_batch) <= 32
+        True
+        """
+        job_indices: List[str] = []
+        job_index_map_updates: Dict[str, str] = {}
+        cur_job_count: int = 0
+
+        if self._retry_job_ids:
+            job_indices.extend(self._retry_job_ids)
+            cur_job_count = len(job_indices)
+
+        if (cur_job_count < self.batch_size) and (self._processed < len(self.files)):
+            new_job_count: int = min(self.batch_size - cur_job_count, len(self.files) - self._processed)
+            batch_files: List[str] = self.files[self._processed : self._processed + new_job_count]
+
+            new_job_indices: List[str] = self.client.create_jobs_for_batch(batch_files, self.tasks)
+            if len(new_job_indices) != new_job_count:
+                missing_jobs: int = new_job_count - len(new_job_indices)
+                error_msg: str = (
+                    f"Missing {missing_jobs} job specs -- this is likely due to bad reads or file corruption"
+                )
+                logger.warning(error_msg)
+
+                if self.fail_on_error:
+                    raise RuntimeError(error_msg)
+
+                if self._pbar:
+                    self._pbar.update(missing_jobs)
+
+            job_index_map_updates = {job_index: file for job_index, file in zip(new_job_indices, batch_files)}
+            self._processed += new_job_count
+            # Submit newly created jobs asynchronously to the configured queue
+            _ = self.client.submit_job_async(new_job_indices, self.job_queue_id)
+            job_indices.extend(new_job_indices)
+
+        # Save into class state
+        self._job_ids_batch = job_indices
+        # Merge new mappings (do not drop existing entries for retry jobs)
+        self._job_id_map.update(job_index_map_updates)
+
+    def _handle_future_result(self, future, timeout: int = 10):
+        """
+        Handle the result of a completed future job and process annotations.
+
+        Parameters
+        ----------
+        future : concurrent.futures.Future
+            Future representing an asynchronous job.
+        timeout : int, optional
+            Maximum seconds to wait for the future result.
+
+        Returns
+        -------
+        Tuple[Dict[str, Any], str]
+            The decoded result dictionary and the trace_id for the job.
+
+        Raises
+        ------
+        RuntimeError
+            If the job result indicates failure per check_ingest_result.
+        """
+        result, _, trace_id = future.result(timeout=timeout)[0]
+        if ("annotations" in result) and result["annotations"]:
+            annotations = result["annotations"]
+            for key, value in annotations.items():
+                logger.debug(f"Annotation: {key} -> {json.dumps(value, indent=2)}")
+
+        failed, description = check_ingest_result(result)
+        if failed:
+            raise RuntimeError(f"Ingest job failed: {description}")
+
+        return result, trace_id
+
+    def _process_response(self, response: Dict[str, Any]) -> None:
+        """
+        Extract trace timing entries from a response and accumulate per-stage elapsed times
+        into ``self._trace_times``.
+
+        Parameters
+        ----------
+        response : Dict[str, Any]
+            Full response payload containing an optional ``trace`` dictionary with
+            entry/exit timestamps.
+        """
+        trace_data: Dict[str, Any] = response.get("trace", {})
+        for key, entry_time in trace_data.items():
+            if "entry" in key:
+                exit_key: str = key.replace("entry", "exit")
+                exit_time: Any = trace_data.get(exit_key)
+                if exit_time:
+                    stage_parts = key.split("::")
+                    if len(stage_parts) >= 3:
+                        stage_name: str = stage_parts[2]
+                        elapsed_time: int = exit_time - entry_time
+                        self._trace_times[stage_name].append(elapsed_time)
+
+    def _save_response_data(
+        self, response: Dict[str, Any], output_directory: str, images_to_disk: bool = False
+    ) -> None:
+        """
+        Save the response data into categorized metadata JSON files and optionally save images to disk.
+
+        Parameters
+        ----------
+        response : Dict[str, Any]
+            Full response payload with a "data" list of documents.
+        output_directory : str
+            Output directory where per-type metadata JSON files (and any media) are written.
+        images_to_disk : bool, optional
+            If True, decode and write image contents to disk and replace content with a file URL.
+        """
+        if ("data" not in response) or (not response["data"]):
+            logger.debug("Data is not in the response or response.data is empty")
+            return
+
+        response_data = response["data"]
+        if not isinstance(response_data, list) or len(response_data) == 0:
+            logger.debug("Response data is not a list or the list is empty.")
+            return
+
+        doc_meta_base = response_data[0]["metadata"]
+        source_meta = doc_meta_base["source_metadata"]
+        doc_name = source_meta["source_id"]
+        clean_doc_name = os.path.basename(doc_name)
+        output_name = f"{clean_doc_name}.metadata.json"
+
+        # Organize by document type
+        doc_map: Dict[str, List[Dict[str, Any]]] = {}
+        for document in response_data:
+            meta: Dict[str, Any] = document.get("metadata", {})
+            content_meta: Dict[str, Any] = meta.get("content_metadata", {})
+            doc_type: str = content_meta.get("type", "unknown")
+            doc_map.setdefault(doc_type, []).append(document)
+
+        for doc_type, documents in doc_map.items():
+            doc_type_path = os.path.join(output_directory, doc_type)
+            os.makedirs(doc_type_path, exist_ok=True)
+
+            if doc_type in ("image", "structured") and images_to_disk:
+                for i, doc in enumerate(documents):
+                    meta = doc.get("metadata", {})
+                    image_content = meta.get("content")
+                    image_type = (
+                        meta.get("image_metadata", {}).get("image_type", "png").lower()
+                        if doc_type == "image"
+                        else "png"
+                    )
+
+                    if image_content and image_type in self._IMAGE_TYPES:
+                        try:
+                            image_data = base64.b64decode(image_content)
+                            image = Image.open(io.BytesIO(image_data))
+
+                            image_ext = "jpg" if image_type == "jpeg" else image_type
+                            image_filename = f"{clean_doc_name}_{i}.{image_ext}"
+                            image_output_path = os.path.join(doc_type_path, "media", image_filename)
+                            os.makedirs(os.path.dirname(image_output_path), exist_ok=True)
+                            image.save(image_output_path, format=image_ext.upper())
+
+                            meta["content"] = ""
+                            meta["content_url"] = os.path.realpath(image_output_path)
+                            logger.debug(f"Saved image to {image_output_path}")
+                        except Exception as e:
+                            logger.error(f"Failed to save image {i} for {clean_doc_name}: {e}")
+
+            # Write the metadata JSON file for this type
+            with open(os.path.join(doc_type_path, output_name), "w") as f:
+                f.write(json.dumps(documents, indent=2))
+
+    def run(self) -> Tuple[int, Dict[str, List[float]], int, Dict[str, str]]:
+        total_files: int = len(self.files)
+        total_pages_processed: int = 0
+        trace_ids: Dict[str, str] = defaultdict(list)  # type: ignore
+        failed_jobs: List[str] = []
+        retry_counts: Dict[str, int] = defaultdict(int)
+
+        start_time_ns: int = time.time_ns()
+        self._init_progress_bar(total_files)
+        try:
+            self._processed = 0
+            while (self._processed < len(self.files)) or self._retry_job_ids:
+                # Create a batch (retries first, then new jobs up to batch_size)
+                self._generate_job_batch_for_iteration()
+                job_id_map = self._job_id_map
+                self._retry_job_ids = []
+
+                futures_dict: Dict[Any, str] = self.client.fetch_job_result_async(self._job_ids_batch, data_only=False)
+                for future in as_completed(futures_dict.keys()):
+                    # Defined before the try block so the finally clause below can
+                    # always reference it, even when the future raises early.
+                    pages_per_sec: float | None = None
+                    try:
+                        # Block as each future completes; this mirrors CLI behavior
+                        future_response, trace_id = self._handle_future_result(future)
+                        job_id: str = futures_dict[future]
+                        trace_ids[job_id_map[job_id]] = trace_id
+
+                        first_page_metadata = future_response["data"][0]["metadata"]
+                        file_page_counts: Dict[str, int] = {
+                            first_page_metadata["source_metadata"]["source_name"]: first_page_metadata[
+                                "content_metadata"
+                            ]["hierarchy"]["page_count"]
+                        }
+
+                        if self.output_directory:
+                            self._save_response_data(
+                                future_response,
+                                self.output_directory,
+                                images_to_disk=self.save_images_separately,
+                            )
+
+                        total_pages_processed += file_page_counts[list(file_page_counts.keys())[0]]
+                        elapsed_time: float = (time.time_ns() - start_time_ns) / 1e9
+                        if elapsed_time > 0:
+                            pages_per_sec = total_pages_processed / elapsed_time
+
+                        self._process_response(future_response)
+
+                    except TimeoutError:
+                        job_id = futures_dict[future]
+                        src_name = job_id_map[job_id]
+                        retry_counts[src_name] += 1
+                        self._retry_job_ids.append(job_id)
+                    except json.JSONDecodeError as e:
+                        job_id = futures_dict[future]
+                        src_name = job_id_map[job_id]
+                        logger.error(f"Decoding error while processing {job_id}({src_name}): {e}")
+                        failed_jobs.append(f"{job_id}::{src_name}")
+                    except RuntimeError as e:
+                        job_id = futures_dict[future]
+                        src_name = job_id_map[job_id]
+                        logger.error(f"Error while processing '{job_id}' - ({src_name}):\n{e}")
+                        failed_jobs.append(f"{job_id}::{src_name}")
+                    except Exception as e:
+                        job_id = futures_dict[future]
+                        src_name = job_id_map[job_id]
+                        logger.exception(f"Unhandled error while processing {job_id}({src_name}): {e}")
+                        failed_jobs.append(f"{job_id}::{src_name}")
+                    finally:
+                        # Do not update the pbar if this job is going to be retried
+                        if futures_dict[future] not in self._retry_job_ids:
+                            self._update_progress(1, pages_per_sec)
+        finally:
+            self._close_progress_bar()
+
+        # Optionally print telemetry summary
+        if self.show_telemetry and hasattr(self.client, "summarize_telemetry"):
+            try:
+                summary = self.client.summarize_telemetry()
+                logger.info("NvIngestClient Telemetry Summary: %s", json.dumps(summary, indent=2))
+            except Exception:
+                pass
+
+        return total_files, self._trace_times, total_pages_processed, trace_ids
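For orientation, here is a minimal sketch of driving the new handler from code rather than the CLI, following the Usage docstring above. Only IngestJobHandler's signature and the run() return tuple come from this diff; the NvIngestClient connection arguments and file paths are illustrative assumptions.

from nv_ingest_client.client import NvIngestClient
from nv_ingest_client.client.ingest_job_handler import IngestJobHandler

# Assumed connection settings; adjust to the deployment in use.
client = NvIngestClient(message_client_hostname="localhost", message_client_port=7670)

handler = IngestJobHandler(
    client=client,
    files=["./data/sample.pdf"],
    tasks={},  # assumption: the same task dict the CLI builds from its --task options
    output_directory="./out",
    batch_size=32,
)
total_files, trace_times, total_pages, trace_ids = handler.run()
print(f"Processed {total_pages} pages across {total_files} files")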
nv_ingest_client/client/interface.py
@@ -404,10 +404,11 @@ class Ingestor:
         save_to_disk: bool = False,
         **kwargs: Any,
     ) -> Union[
-        List[List[Dict[str, Any]]],  # In-memory: List of
+        List[List[Dict[str, Any]]],  # In-memory: List of response['data'] for each doc
+        List[Dict[str, Any]],  # In-memory: Full response envelopes when return_full_response=True
         List[LazyLoadedList],  # Disk: List of proxies, one per original doc
         Tuple[
-            Union[List[List[Dict[str, Any]]], List[LazyLoadedList]],
+            Union[List[List[Dict[str, Any]]], List[Dict[str, Any]], List[LazyLoadedList]],
            List[Tuple[str, str]],
         ],
     ]:  # noqa: E501
@@ -423,13 +424,16 @@ class Ingestor:
         **kwargs : Any
             Additional keyword arguments for the underlying client methods. Supported keys:
             'concurrency_limit', 'timeout', 'max_job_retries', 'retry_delay',
-            'data_only', 'verbose'. Unrecognized keys are passed
-            process_jobs_concurrently.
+            'data_only', 'return_full_response', 'verbose'. Unrecognized keys are passed
+            through to process_jobs_concurrently.
 
         Returns
         -------
-        results : list
-
+        results : list
+            When `return_failures` is False:
+            - Default: List of response['data'] per job (list[list[dict]]).
+            - If `return_full_response=True`: List of full response envelopes (each dict
+              contains keys like 'data', 'trace', 'annotations').
         results, failures : tuple (list of dict, list of tuple of str)
             Tuple containing successful results and failure information when `return_failures` is True.
         """
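A brief sketch of the two in-memory return shapes described above, assuming an Ingestor wired up with the usual fluent API (the client/files/extract setup is illustrative, not part of this diff):

ingestor = Ingestor(client=client).files("./data/sample.pdf").extract()

# Default: one response['data'] list per document.
docs_data = ingestor.ingest()

# Alternative: full envelopes, each dict keeping 'data' alongside 'trace' and 'annotations'.
# envelopes = ingestor.ingest(return_full_response=True)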
@@ -549,6 +553,22 @@ class Ingestor:
 
         proc_kwargs = filter_function_kwargs(self._client.process_jobs_concurrently, **kwargs)
 
+        # Telemetry controls (optional)
+        enable_telemetry: Optional[bool] = kwargs.pop("enable_telemetry", None)
+        show_telemetry: Optional[bool] = kwargs.pop("show_telemetry", None)
+        if show_telemetry is None:
+            # Fallback to env NV_INGEST_CLIENT_SHOW_TELEMETRY (0/1), default off
+            try:
+                show_telemetry = bool(int(os.getenv("NV_INGEST_CLIENT_SHOW_TELEMETRY", "0")))
+            except ValueError:
+                show_telemetry = False
+        # If user explicitly wants to show telemetry but did not specify enable_telemetry,
+        # ensure collection is enabled so summary isn't empty.
+        if enable_telemetry is None and show_telemetry:
+            enable_telemetry = True
+        if enable_telemetry is not None and hasattr(self._client, "enable_telemetry"):
+            self._client.enable_telemetry(bool(enable_telemetry))
+
         results, failures = self._client.process_jobs_concurrently(
             job_indices=self._job_ids,
             job_queue_id=self._job_queue_id,
@@ -611,6 +631,16 @@ class Ingestor:
             logger.info("Purging saved results from disk after successful VDB upload.")
             self._purge_saved_results(results)
 
+        # Print telemetry summary if requested
+        if show_telemetry:
+            try:
+                summary = self._client.summarize_telemetry()
+                # Print to stdout and log for convenience
+                print("NvIngestClient Telemetry Summary:", json.dumps(summary, indent=2))
+                logger.info("NvIngestClient Telemetry Summary: %s", json.dumps(summary, indent=2))
+            except Exception:
+                pass
+
         return (results, failures) if return_failures else results
 
     def ingest_async(self, **kwargs: Any) -> Future:
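The telemetry hooks above can be exercised per call or via the environment fallback; a minimal sketch, assuming an `ingestor` built as in the earlier example:

# Per call: show_telemetry implies enable_telemetry when the latter is unset.
results = ingestor.ingest(show_telemetry=True)

# Or via the environment variable read above (values "0"/"1", default off):
# import os
# os.environ["NV_INGEST_CLIENT_SHOW_TELEMETRY"] = "1"
# results = ingestor.ingest()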
nv_ingest_client/nv_ingest_cli.py
@@ -25,10 +25,10 @@ from nv_ingest_client.cli.util.click import click_match_and_validate_files
 from nv_ingest_client.cli.util.click import click_validate_batch_size
 from nv_ingest_client.cli.util.click import click_validate_file_exists
 from nv_ingest_client.cli.util.click import click_validate_task
-from nv_ingest_client.cli.util.processing import create_and_process_jobs
 from nv_ingest_client.cli.util.processing import report_statistics
 from nv_ingest_client.cli.util.system import configure_logging
 from nv_ingest_client.client import NvIngestClient
+from nv_ingest_client.client.ingest_job_handler import IngestJobHandler
 from nv_ingest_client.util.dataset import get_dataset_files
 from nv_ingest_client.util.dataset import get_dataset_statistics
 from nv_ingest_client.util.system import ensure_directory_with_permissions
@@ -290,15 +290,18 @@ def main(
     )
 
     start_time_ns = time.time_ns()
-    (total_files, trace_times, pages_processed, trace_ids) = create_and_process_jobs(
-        files=docs,
+    handler = IngestJobHandler(
         client=ingest_client,
+        files=docs,
         tasks=task,
         output_directory=output_directory,
         batch_size=batch_size,
         fail_on_error=fail_on_error,
         save_images_separately=save_images_separately,
+        show_progress=True,
+        show_telemetry=True,
     )
+    (total_files, trace_times, pages_processed, trace_ids) = handler.run()
 
     report_statistics(start_time_ns, trace_times, pages_processed, total_files)
 
nv_ingest_client/primitives/tasks/extract.py
@@ -86,7 +86,7 @@ class ExtractTask(Task):
         extract_page_as_image: bool = False,
         text_depth: str = "document",
         paddle_output_format: str = "pseudo_markdown",
-        table_output_format: str = "
+        table_output_format: str = "markdown",
     ) -> None:
         """
         Setup Extract Task Config
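A small sketch of the new default in use (document_type and extract_tables are assumed ExtractTask parameters used here illustratively; only the table_output_format default comes from this diff):

from nv_ingest_client.primitives.tasks.extract import ExtractTask

# table_output_format now defaults to "markdown" as of this release.
task = ExtractTask(document_type="pdf", extract_tables=True)

# Equivalent explicit form:
task_explicit = ExtractTask(document_type="pdf", extract_tables=True, table_output_format="markdown")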
{nv_ingest_client-2025.10.7.dev20251007.dist-info → nv_ingest_client-2025.10.9.dev20251009.dist-info}/RECORD
@@ -1,13 +1,14 @@
 nv_ingest_client/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
-nv_ingest_client/nv_ingest_cli.py,sha256=
+nv_ingest_client/nv_ingest_cli.py,sha256=8HpbU6l0i19M9kjZdZKbf03z-CZIZoikigZuo9wD77g,13693
 nv_ingest_client/cli/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest_client/cli/util/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest_client/cli/util/click.py,sha256=YjQU1uF148FU5D3ozC2m1kkfOOJxO1U8U552-T8PjU4,20029
-nv_ingest_client/cli/util/processing.py,sha256=
+nv_ingest_client/cli/util/processing.py,sha256=ULGCYQF1RTDQV_b35YM1WQRqIjR2wQRMJWu41DogagE,6259
 nv_ingest_client/cli/util/system.py,sha256=AQLq0DD2Ns8jRanrKu1tmVBKPA9rl-F3-ZsGI6FXLqE,1105
 nv_ingest_client/client/__init__.py,sha256=eEX9l1qmkLH2lAAZU3eP17SCV06ZjjrshHAB_xbboHA,375
-nv_ingest_client/client/client.py,sha256=
-nv_ingest_client/client/
+nv_ingest_client/client/client.py,sha256=egPegAoe8sVYxWWbNl1V5xJdaABJxgVxX7AHRoa049w,76058
+nv_ingest_client/client/ingest_job_handler.py,sha256=i2PC5AUaglN5aGAqE4Nrbk95TyLYN7P_Nwba0C_19xA,16810
+nv_ingest_client/client/interface.py,sha256=c1SRgS_ryz-DAYRPr8OIW0HpWzUx-GsnlfEK2_K7fZg,48897
 nv_ingest_client/client/util/processing.py,sha256=Ky7x7QbLn3BlgYwmrmoIc-o1VwmlmrcP9tn7GVTi0t0,2502
 nv_ingest_client/primitives/__init__.py,sha256=3rbpLCI7Bl0pntGatAxXD_V01y6dcLhHFheI3wqet-I,269
 nv_ingest_client/primitives/jobs/__init__.py,sha256=-yohgHv3LcCtSleHSaxjv1oO7nNcMCjN3ZYoOkIypIk,469
@@ -19,7 +20,7 @@ nv_ingest_client/primitives/tasks/caption.py,sha256=I1nOpfGb1Ts7QsElwfayhw-F_UcY
 nv_ingest_client/primitives/tasks/chart_extraction.py,sha256=s5hsljgSXxQMZHGekpAg6OYJ9k3-DHk5NmFpvtKJ6Zs,1493
 nv_ingest_client/primitives/tasks/dedup.py,sha256=qort6p3t6ZJuK_74sfOOLp3vMT3hkB5DAu3467WenyY,1719
 nv_ingest_client/primitives/tasks/embed.py,sha256=I6Irmvm1Qj9oqzDGSgfykCtfz8pz9LNxiXO-t29nXv8,5916
-nv_ingest_client/primitives/tasks/extract.py,sha256=
+nv_ingest_client/primitives/tasks/extract.py,sha256=bRriVkQyXN-UwzprHIt4Lp0iwmAojLEXqBb-IUrf3vY,9328
 nv_ingest_client/primitives/tasks/filter.py,sha256=wjcfSBGhdEyPh2tf42NMcyKZziigm24CO9B4obpQytU,2618
 nv_ingest_client/primitives/tasks/infographic_extraction.py,sha256=SyTjZQbdVA3QwM5yVm4fUzE4Gu4zm4tAfNLDZMvySV8,1537
 nv_ingest_client/primitives/tasks/split.py,sha256=8UkB3EialsOTEbsOZLxzmnDIfTJzC6uvjNv21IbgAVA,2332
@@ -46,9 +47,9 @@ nv_ingest_client/util/vdb/__init__.py,sha256=ZmoEzeM9LzwwrVvu_DVUnjRNx-x8ahkNeIr
 nv_ingest_client/util/vdb/adt_vdb.py,sha256=UubzAMSfyrqqpD-OQErpBs25hC2Mw8zGZ4waenGXPOk,515
 nv_ingest_client/util/vdb/milvus.py,sha256=dYXszrWdwYYASBW6t8lMI6QK9-BzhV6HAUYjt3cIDsE,78602
 nv_ingest_client/util/vdb/opensearch.py,sha256=I4FzF95VWCOkyzhfm-szdfK1Zd9ugUc8AxxpAdEMWGE,7538
-nv_ingest_client-2025.10.
-nv_ingest_client-2025.10.
-nv_ingest_client-2025.10.
-nv_ingest_client-2025.10.
-nv_ingest_client-2025.10.
-nv_ingest_client-2025.10.
+nv_ingest_client-2025.10.9.dev20251009.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+nv_ingest_client-2025.10.9.dev20251009.dist-info/METADATA,sha256=rscM-NBJXWXk1Ghg5KG0KTU56a4yWAV0c419opVrDS0,30626
+nv_ingest_client-2025.10.9.dev20251009.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+nv_ingest_client-2025.10.9.dev20251009.dist-info/entry_points.txt,sha256=3uQVZkTZIjO08_bjTV-g0CDF5H1nrP1zWXU9gJOweuI,137
+nv_ingest_client-2025.10.9.dev20251009.dist-info/top_level.txt,sha256=1eMhBFD3SiWmpXnod2LM66C1HrSLSk96ninZi5XX-cE,17
+nv_ingest_client-2025.10.9.dev20251009.dist-info/RECORD,,