nv-ingest-client 2025.9.26.dev20250926__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.

Potentially problematic release.


This version of nv-ingest-client might be problematic.

@@ -0,0 +1,412 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ from __future__ import annotations
+
+ import json
+ import logging
+ import time
+ import os
+ import io
+ import base64
+ from collections import defaultdict
+ from typing import Any, Dict, List, Tuple
+
+ from tqdm import tqdm
+
+ # Reuse existing CLI utilities to avoid duplicating behavior
+ from concurrent.futures import as_completed
+ from nv_ingest_client.util.util import check_ingest_result
+ from PIL import Image
+
+ logger = logging.getLogger(__name__)
+
+
+ class IngestJobHandler:
+     """
+     A modular job handler that mirrors the CLI's create_and_process_jobs flow,
+     so the same proven scheduling/retry behavior can be reused by other entry points.
+
+     Usage:
+         handler = IngestJobHandler(client, files, tasks, output_dir, batch_size)
+         total_files, trace_times, total_pages, trace_ids = handler.run()
+     """
+
+     def __init__(
+         self,
+         client: Any,
+         files: List[str],
+         tasks: Dict[str, Any],
+         output_directory: str,
+         batch_size: int,
+         fail_on_error: bool = False,
+         save_images_separately: bool = False,
+         show_progress: bool = True,
+         show_telemetry: bool = False,
+         job_queue_id: str = "ingest_task_queue",
+         pdf_split_page_count: int | None = None,
+     ) -> None:
+         self.client = client
+         self.files = files
+         self.tasks = tasks
+         self.output_directory = output_directory
+         self.batch_size = batch_size
+         self.fail_on_error = fail_on_error
+         self.save_images_separately = save_images_separately
+         self.show_progress = show_progress
+         self.show_telemetry = show_telemetry
+         self.job_queue_id = job_queue_id
+         self.pdf_split_page_count = pdf_split_page_count
+         self._pbar = None
+         # Internal state used across iterations
+         self._retry_job_ids: List[str] = []
+         self._processed: int = 0
+         self._job_ids_batch: List[str] = []
+         self._job_id_map: Dict[str, str] = {}
+         self._trace_times: Dict[str, List[float]] = defaultdict(list)
+         # Constants
+         self._IMAGE_TYPES: set = {"png", "bmp", "jpeg", "jpg", "tiff"}
+
+     # ---------------------------
+     # Progress bar helpers
+     # ---------------------------
+     def _init_progress_bar(self, total: int) -> None:
+         if self.show_progress:
+             self._pbar = tqdm(total=total, desc="Processing files", unit="file")
+         else:
+             self._pbar = None
+
+     def _update_progress(self, n: int = 1, pages_per_sec: float | None = None) -> None:
+         if not self._pbar:
+             return
+         if pages_per_sec is not None:
+             self._pbar.set_postfix(pages_per_sec=f"{pages_per_sec:.2f}")
+         self._pbar.update(n)
+
+     def _close_progress_bar(self) -> None:
+         if self._pbar:
+             self._pbar.close()
+             self._pbar = None
+
+     def _generate_job_batch_for_iteration(self) -> None:
+         """
+         Build the next batch of jobs for processing and submit newly created jobs.
+
+         This method mirrors the CLI batching semantics: it prioritizes retry jobs,
+         then creates new jobs up to the given ``batch_size``, submits those new jobs
+         asynchronously to the configured queue, and stores the combined list of
+         job indices for this iteration in ``self._job_ids_batch``. It also updates the
+         internal progress bar when configured and advances the processed-file counter.
+
+         Side Effects
+         ------------
+         - Populates/overwrites ``self._job_ids_batch`` with the ordered job indices to
+           process this iteration (retry jobs first, then newly created jobs).
+         - Updates ``self._job_id_map`` with any new mappings from job index to source file path
+           for jobs created in this iteration.
+
+         Raises
+         ------
+         RuntimeError
+             If one or more job specs cannot be created (e.g., unreadable files)
+             and ``self.fail_on_error`` is True.
+
+         Notes
+         -----
+         - Additional side effects:
+             - Creates JobSpecs via ``self.client.create_jobs_for_batch(...)``.
+             - Submits newly created jobs via ``self.client.submit_job_async(..., self.job_queue_id)``.
+             - Updates the class-owned progress bar (``self._pbar``) to account for
+               missing jobs when some files fail to produce specs and
+               ``self.fail_on_error`` is False.
+         - This method does not perform fetching; it only prepares and submits
+           jobs for the current iteration.
+         - The ``self._processed`` counter advances by the number of files attempted in
+           this iteration, even if some job specs are missing (unless
+           ``self.fail_on_error`` is True).
+
+         Examples
+         --------
+         >>> handler = IngestJobHandler(client, files, tasks, "/tmp/out", batch_size=32)
+         >>> retry_ids = []
+         >>> handler._generate_job_batch_for_iteration()
+         >>> len(handler._job_ids_batch) <= 32
+         True
+         """
+         job_indices: List[str] = []
+         job_index_map_updates: Dict[str, str] = {}
+         cur_job_count: int = 0
+
+         if self._retry_job_ids:
+             job_indices.extend(self._retry_job_ids)
+             cur_job_count = len(job_indices)
+
+         if (cur_job_count < self.batch_size) and (self._processed < len(self.files)):
+             new_job_count: int = min(self.batch_size - cur_job_count, len(self.files) - self._processed)
+             batch_files: List[str] = self.files[self._processed : self._processed + new_job_count]
+
+             new_job_indices: List[str] = self.client.create_jobs_for_batch(
+                 batch_files, self.tasks, pdf_split_page_count=self.pdf_split_page_count
+             )
+             if len(new_job_indices) != new_job_count:
+                 missing_jobs: int = new_job_count - len(new_job_indices)
+                 error_msg: str = (
+                     f"Missing {missing_jobs} job specs -- this is likely due to bad reads or file corruption"
+                 )
+                 logger.warning(error_msg)
+
+                 if self.fail_on_error:
+                     raise RuntimeError(error_msg)
+
+                 if self._pbar:
+                     self._pbar.update(missing_jobs)
+
+             job_index_map_updates = {job_index: file for job_index, file in zip(new_job_indices, batch_files)}
+             self._processed += new_job_count
+             # Submit newly created jobs asynchronously to the configured queue
+             _ = self.client.submit_job_async(new_job_indices, self.job_queue_id)
+             job_indices.extend(new_job_indices)
+
+         # Save into class state
+         self._job_ids_batch = job_indices
+         # Merge new mappings (do not drop existing entries for retry jobs)
+         self._job_id_map.update(job_index_map_updates)
+
+     def _handle_future_result(self, future, timeout: int = 10):
+         """
+         Handle the result of a completed future job and process annotations.
+
+         Parameters
+         ----------
+         future : concurrent.futures.Future
+             Future representing an asynchronous job.
+         timeout : int, optional
+             Maximum seconds to wait for the future result.
+
+         Returns
+         -------
+         Tuple[Dict[str, Any], str]
+             The decoded result dictionary and the trace_id for the job.
+
+         Raises
+         ------
+         RuntimeError
+             If the job result indicates failure per check_ingest_result.
+         """
+         result, _, trace_id = future.result(timeout=timeout)[0]
+         if ("annotations" in result) and result["annotations"]:
+             annotations = result["annotations"]
+             for key, value in annotations.items():
+                 logger.debug(f"Annotation: {key} -> {json.dumps(value, indent=2)}")
+
+         failed, description = check_ingest_result(result)
+         if failed:
+             raise RuntimeError(f"Ingest job failed: {description}")
+
+         return result, trace_id
+
+     def _process_response(self, response: Dict[str, Any]) -> None:
+         """
+         Extract trace timing entries from a response and accumulate per-stage elapsed times
+         into ``self._trace_times``.
+
+         Parameters
+         ----------
+         response : Dict[str, Any]
+             Full response payload containing an optional ``trace`` dictionary with
+             entry/exit timestamps.
+         """
+         trace_data: Dict[str, Any] = response.get("trace", {})
+         for key, entry_time in trace_data.items():
+             if "entry" in key:
+                 exit_key: str = key.replace("entry", "exit")
+                 exit_time: Any = trace_data.get(exit_key)
+                 if exit_time:
+                     stage_parts = key.split("::")
+                     if len(stage_parts) >= 3:
+                         stage_name: str = stage_parts[2]
+                         elapsed_time: int = exit_time - entry_time
+                         self._trace_times[stage_name].append(elapsed_time)
+
+     def _save_response_data(
+         self, response: Dict[str, Any], output_directory: str, images_to_disk: bool = False
+     ) -> None:
+         """
+         Save the response data into categorized metadata JSON files and optionally save images to disk.
+
+         Parameters
+         ----------
+         response : Dict[str, Any]
+             Full response payload with a "data" list of documents.
+         output_directory : str
+             Output directory where per-type metadata JSON files (and any media) are written.
+         images_to_disk : bool, optional
+             If True, decode and write image contents to disk and replace content with a file URL.
+         """
+         if ("data" not in response) or (not response["data"]):
+             logger.debug("Data is not in the response or response.data is empty")
+             return
+
+         response_data = response["data"]
+         if not isinstance(response_data, list) or len(response_data) == 0:
+             logger.debug("Response data is not a list or the list is empty.")
+             return
+
+         doc_meta_base = response_data[0]["metadata"]
+         source_meta = doc_meta_base["source_metadata"]
+         doc_name = source_meta["source_id"]
+         clean_doc_name = os.path.basename(doc_name)
+         output_name = f"{clean_doc_name}.metadata.json"
+
+         # Organize by document type
+         doc_map: Dict[str, List[Dict[str, Any]]] = {}
+         for document in response_data:
+             meta: Dict[str, Any] = document.get("metadata", {})
+             content_meta: Dict[str, Any] = meta.get("content_metadata", {})
+             doc_type: str = content_meta.get("type", "unknown")
+             doc_map.setdefault(doc_type, []).append(document)
+
+         for doc_type, documents in doc_map.items():
+             doc_type_path = os.path.join(output_directory, doc_type)
+             os.makedirs(doc_type_path, exist_ok=True)
+
+             if doc_type in ("image", "structured") and images_to_disk:
+                 for i, doc in enumerate(documents):
+                     meta: Dict[str, Any] = doc.get("metadata", {})
+                     image_content = meta.get("content")
+                     image_type = (
+                         meta.get("image_metadata", {}).get("image_type", "png").lower()
+                         if doc_type == "image"
+                         else "png"
+                     )
+
+                     if image_content and image_type in self._IMAGE_TYPES:
+                         try:
+                             image_data = base64.b64decode(image_content)
+                             image = Image.open(io.BytesIO(image_data))
+
+                             image_ext = "jpg" if image_type == "jpeg" else image_type
+                             image_filename = f"{clean_doc_name}_{i}.{image_ext}"
+                             image_output_path = os.path.join(doc_type_path, "media", image_filename)
+                             os.makedirs(os.path.dirname(image_output_path), exist_ok=True)
+                             image.save(image_output_path, format="JPEG" if image_ext == "jpg" else image_ext.upper())  # PIL expects "JPEG", not "JPG"
+
+                             meta["content"] = ""
+                             meta["content_url"] = os.path.realpath(image_output_path)
+                             logger.debug(f"Saved image to {image_output_path}")
+                         except Exception as e:
+                             logger.error(f"Failed to save image {i} for {clean_doc_name}: {e}")
+
+             # Write the metadata JSON file for this type
+             with open(os.path.join(doc_type_path, output_name), "w") as f:
+                 f.write(json.dumps(documents, indent=2))
+
+     def run(self) -> Tuple[int, Dict[str, List[float]], int, Dict[str, str]]:
+         total_files: int = len(self.files)
+         total_pages_processed: int = 0
+         trace_ids: Dict[str, str] = defaultdict(list)  # type: ignore
+         failed_jobs: List[str] = []
+         retry_counts: Dict[str, int] = defaultdict(int)
+         pages_per_sec: float | None = None
+
+         start_time_ns: int = time.time_ns()
+         self._init_progress_bar(total_files)
+         try:
+             self._processed = 0
+             while (self._processed < len(self.files)) or self._retry_job_ids:
+                 # Create a batch (retries first, then new jobs up to batch_size)
+                 self._generate_job_batch_for_iteration()
+                 job_id_map = self._job_id_map
+                 self._retry_job_ids = []
+
+                 futures_dict: Dict[Any, str] = self.client.fetch_job_result_async(self._job_ids_batch, data_only=False)
+                 for future in as_completed(futures_dict.keys()):
+                     pages_per_sec = None
+                     try:
+                         # Block as each future completes; this mirrors CLI behavior
+                         future_response, trace_id = self._handle_future_result(future)
+                         job_id: str = futures_dict[future]
+                         trace_ids[job_id_map[job_id]] = trace_id
+
+                         # Extract page count: prefer V2 metadata location, fall back to V1
+                         page_count = None
+                         source_name = None
+
+                         # Try V2 metadata location first (top-level metadata.total_pages)
+                         if "metadata" in future_response and future_response["metadata"]:
+                             response_metadata = future_response["metadata"]
+                             page_count = response_metadata.get("total_pages")
+                             source_name = response_metadata.get("original_source_name")
+
+                         # Fall back to V1 location (first data element's hierarchy.page_count)
+                         if page_count is None and future_response.get("data"):
+                             try:
+                                 first_page_metadata = future_response["data"][0]["metadata"]
+                                 page_count = first_page_metadata["content_metadata"]["hierarchy"]["page_count"]
+                                 source_name = first_page_metadata["source_metadata"]["source_name"]
+                             except (KeyError, IndexError, TypeError):
+                                 # If we can't extract from V1 location, use defaults
+                                 pass
+
+                         # Use extracted values or defaults
+                         if page_count is None:
+                             page_count = 0  # Default if not found
+                         if source_name is None:
+                             source_name = "unknown_source"
+
+                         file_page_counts: Dict[str, int] = {source_name: page_count}
+
+                         if self.output_directory:
+                             self._save_response_data(
+                                 future_response,
+                                 self.output_directory,
+                                 images_to_disk=self.save_images_separately,
+                             )
+
+                         total_pages_processed += file_page_counts[list(file_page_counts.keys())[0]]
+                         elapsed_time: float = (time.time_ns() - start_time_ns) / 1e9
+                         if elapsed_time > 0:
+                             pages_per_sec = total_pages_processed / elapsed_time
+                         else:
+                             pages_per_sec = None
+
+                         self._process_response(future_response)
+
+                     except TimeoutError:
+                         job_id = futures_dict[future]
+                         src_name = job_id_map[job_id]
+                         retry_counts[src_name] += 1
+                         self._retry_job_ids.append(job_id)
+                     except json.JSONDecodeError as e:
+                         job_id = futures_dict[future]
+                         src_name = job_id_map[job_id]
+                         logger.error(f"Decoding error while processing {job_id}({src_name}): {e}")
+                         failed_jobs.append(f"{job_id}::{src_name}")
+                     except RuntimeError as e:
+                         job_id = futures_dict[future]
+                         src_name = job_id_map[job_id]
+                         logger.error(f"Error while processing '{job_id}' - ({src_name}):\n{e}")
+                         failed_jobs.append(f"{job_id}::{src_name}")
+                     except Exception as e:
+                         job_id = futures_dict[future]
+                         src_name = job_id_map[job_id]
+                         logger.exception(f"Unhandled error while processing {job_id}({src_name}): {e}")
+                         failed_jobs.append(f"{job_id}::{src_name}")
+                     finally:
+                         # Do not update the pbar if this job is going to be retried
+                         if futures_dict[future] not in self._retry_job_ids:
+                             self._update_progress(1, pages_per_sec)
+         finally:
+             self._close_progress_bar()
+
+         # Optionally print telemetry summary
+         if self.show_telemetry and hasattr(self.client, "summarize_telemetry"):
+             try:
+                 summary = self.client.summarize_telemetry()
+                 logger.info("NvIngestClient Telemetry Summary: %s", json.dumps(summary, indent=2))
+             except Exception:
+                 pass
+
+         return total_files, self._trace_times, total_pages_processed, trace_ids
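
For orientation, here is a minimal sketch of how the new `IngestJobHandler` might be driven, following the `Usage` block in its own docstring. It is illustrative only: the `NvIngestClient` construction arguments, the file paths, and the (empty) `tasks` mapping are assumptions for this example and are not taken from the diff above.

```python
# Illustrative sketch only; mirrors the Usage block in IngestJobHandler's docstring.
# Client configuration, file paths, and the tasks mapping below are assumptions, not part of this diff.
import logging

from nv_ingest_client.client import NvIngestClient  # assumed import path for the client class

logging.basicConfig(level=logging.INFO)

client = NvIngestClient()  # endpoint/transport options omitted; depends on your deployment
files = ["./data/report.pdf", "./data/slides.pdf"]  # hypothetical inputs
tasks = {}  # task map (extract/split/embed, etc.) built by your own task-construction code

# IngestJobHandler is the class added by this diff; import it from wherever this module
# lands in the installed package.
handler = IngestJobHandler(
    client,
    files,
    tasks,
    output_directory="./out",
    batch_size=32,
    show_progress=True,
)
total_files, trace_times, total_pages, trace_ids = handler.run()
print(f"Processed {total_pages} pages across {total_files} files")

# trace_times maps stage name -> list of elapsed trace deltas accumulated by _process_response
for stage, times in trace_times.items():
    print(f"{stage}: {sum(times) / max(len(times), 1):.0f} (average per-job trace delta)")
```

The return tuple matches `run()`'s signature: total file count, per-stage trace times, total pages processed, and a mapping from source file to trace ID.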