nv-ingest-client 2025.7.24.dev20250724__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: the registry flags this version of nv-ingest-client as possibly problematic.
Files changed (38)
  1. nv_ingest_client/cli/util/click.py +182 -30
  2. nv_ingest_client/cli/util/processing.py +0 -393
  3. nv_ingest_client/client/client.py +561 -207
  4. nv_ingest_client/client/ingest_job_handler.py +412 -0
  5. nv_ingest_client/client/interface.py +466 -59
  6. nv_ingest_client/client/util/processing.py +11 -1
  7. nv_ingest_client/nv_ingest_cli.py +58 -6
  8. nv_ingest_client/primitives/jobs/job_spec.py +32 -10
  9. nv_ingest_client/primitives/tasks/__init__.py +6 -4
  10. nv_ingest_client/primitives/tasks/audio_extraction.py +27 -23
  11. nv_ingest_client/primitives/tasks/caption.py +10 -16
  12. nv_ingest_client/primitives/tasks/chart_extraction.py +16 -10
  13. nv_ingest_client/primitives/tasks/dedup.py +12 -21
  14. nv_ingest_client/primitives/tasks/embed.py +37 -76
  15. nv_ingest_client/primitives/tasks/extract.py +68 -169
  16. nv_ingest_client/primitives/tasks/filter.py +22 -28
  17. nv_ingest_client/primitives/tasks/infographic_extraction.py +16 -13
  18. nv_ingest_client/primitives/tasks/split.py +17 -18
  19. nv_ingest_client/primitives/tasks/store.py +29 -29
  20. nv_ingest_client/primitives/tasks/task_base.py +1 -72
  21. nv_ingest_client/primitives/tasks/task_factory.py +10 -11
  22. nv_ingest_client/primitives/tasks/udf.py +349 -0
  23. nv_ingest_client/util/dataset.py +8 -2
  24. nv_ingest_client/util/document_analysis.py +314 -0
  25. nv_ingest_client/util/image_disk_utils.py +300 -0
  26. nv_ingest_client/util/transport.py +12 -6
  27. nv_ingest_client/util/util.py +66 -0
  28. nv_ingest_client/util/vdb/milvus.py +220 -75
  29. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/METADATA +1 -3
  30. nv_ingest_client-2025.11.2.dev20251102.dist-info/RECORD +55 -0
  31. nv_ingest_client/cli/util/tasks.py +0 -3
  32. nv_ingest_client/primitives/exceptions.py +0 -0
  33. nv_ingest_client/primitives/tasks/transform.py +0 -0
  34. nv_ingest_client-2025.7.24.dev20250724.dist-info/RECORD +0 -54
  35. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/WHEEL +0 -0
  36. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/entry_points.txt +0 -0
  37. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/licenses/LICENSE +0 -0
  38. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/top_level.txt +0 -0
@@ -2,25 +2,13 @@
  # All rights reserved.
  # SPDX-License-Identifier: Apache-2.0

- import base64
- import io
- import json
  import logging
- import os
  import re
  import time
  from collections import defaultdict
- from concurrent.futures import as_completed
  from statistics import mean
  from statistics import median
  from typing import Any
- from typing import Dict
- from typing import List
- from typing import Tuple
-
- from nv_ingest_client.util.processing import handle_future_result
- from PIL import Image
- from tqdm import tqdm

  logger = logging.getLogger(__name__)

@@ -135,387 +123,6 @@ def report_statistics(
      report_overall_speed(total_pages_processed, start_time_ns, total_files)


- def process_response(response: Dict[str, Any], stage_elapsed_times: defaultdict) -> None:
-     """
-     Process the response to extract trace data and calculate elapsed time for each stage.
-
-     This function iterates over trace data in the response, identifies entry and exit times for each stage,
-     calculates the elapsed time, and appends the elapsed time to the corresponding stage in the provided
-     `stage_elapsed_times` dictionary.
-
-     Parameters
-     ----------
-     response : Dict[str, Any]
-         The response dictionary containing trace information for processing stages.
-     stage_elapsed_times : defaultdict
-         A defaultdict where keys are stage names (str) and values are lists of elapsed times (int, in nanoseconds).
-
-     Notes
-     -----
-     The function expects trace keys to include "entry" and "exit" substrings. For each entry key, the corresponding
-     exit key is determined by replacing "entry" with "exit". The stage name is assumed to be the third element when
-     splitting the key by "::".
-     """
-     trace_data: Dict[str, Any] = response.get("trace", {})
-     for key, entry_time in trace_data.items():
-         if "entry" in key:
-             exit_key: str = key.replace("entry", "exit")
-             exit_time: Any = trace_data.get(exit_key)
-             if exit_time:
-                 # Assumes the stage name is in the third position when splitting the key
-                 stage_parts = key.split("::")
-                 if len(stage_parts) >= 3:
-                     stage_name: str = stage_parts[2]
-                     elapsed_time: int = exit_time - entry_time
-                     stage_elapsed_times[stage_name].append(elapsed_time)
-
-
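The removed process_response helper is driven entirely by the trace-key convention its docstring describes: an "entry"/"exit" timestamp pair per stage, with the stage name at index 2 of the "::"-separated key. A minimal, self-contained sketch of that parsing; the key names and timestamps below are illustrative, not taken from a real nv-ingest response:

from collections import defaultdict

# Hypothetical trace payload; real key names may differ, but the docstring above
# guarantees "entry"/"exit" substrings and the stage name at index 2 of a
# "::"-separated key. Timestamps are in nanoseconds.
trace = {
    "trace::entry::pdf_extractor": 1_000_000_000,
    "trace::exit::pdf_extractor": 1_750_000_000,
}

stage_elapsed_times = defaultdict(list)
for key, entry_time in trace.items():
    if "entry" in key:
        exit_time = trace.get(key.replace("entry", "exit"))
        if exit_time:
            stage_elapsed_times[key.split("::")[2]].append(exit_time - entry_time)

print(dict(stage_elapsed_times))  # {'pdf_extractor': [750000000]}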
- def organize_documents_by_type(response_data: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
-     """
-     Organize documents by their content type.
-
-     This function takes a list of response documents, extracts the content type from each document's metadata,
-     and organizes the documents into a dictionary, where the keys are content types and the values are lists of
-     documents belonging to each type.
-
-     Parameters
-     ----------
-     response_data : List[Dict[str, Any]]
-         A list of documents, where each document is represented as a dictionary. Each dictionary must contain
-         a 'metadata' field that may be either a JSON string or a dictionary. The metadata is expected to have a
-         "content_metadata" field containing the document's type.
-
-     Returns
-     -------
-     Dict[str, List[Dict[str, Any]]]
-         A dictionary mapping document types (as strings) to lists of documents. Each key represents a document type,
-         and the associated value is a list of documents that belong to that type.
-
-     Notes
-     -----
-     - If the 'metadata' field of a document is a string, it is parsed into a dictionary using `json.loads`.
-     - The function assumes that each document's metadata has a valid "content_metadata" field with a "type" key.
-     - Documents are grouped by the value of the "type" key in their "content_metadata".
-
-     Examples
-     --------
-     >>> response_data = [
-     ...     {"metadata": {"content_metadata": {"type": "report"}}},
-     ...     {"metadata": '{"content_metadata": {"type": "summary"}}'},
-     ...     {"metadata": {"content_metadata": {"type": "report"}}}
-     ... ]
-     >>> organize_documents_by_type(response_data)
-     {'report': [{'metadata': {'content_metadata': {'type': 'report'}}},
-                 {'metadata': {'content_metadata': {'type': 'report'}}}],
-      'summary': [{'metadata': {'content_metadata': {'type': 'summary'}}}]}
-     """
-     doc_map: Dict[str, List[Dict[str, Any]]] = {}
-     for document in response_data:
-         doc_meta: Any = document["metadata"]
-         if isinstance(doc_meta, str):
-             doc_meta = json.loads(doc_meta)
-         doc_content_metadata: Dict[str, Any] = doc_meta["content_metadata"]
-         doc_type: str = doc_content_metadata["type"]
-         if doc_type not in doc_map:
-             doc_map[doc_type] = []
-         doc_map[doc_type].append(document)
-     return doc_map
-
-
- def save_response_data(response: Dict[str, Any], output_directory: str, images_to_disk: bool = False) -> None:
-     """
-     Save the response data into categorized metadata JSON files and optionally save images to disk.
-
-     This function processes the response data, organizes it based on document types, and saves the organized data
-     into a specified output directory as JSON files. If 'images_to_disk' is True and the document type is 'image',
-     it decodes and writes base64 encoded images to disk.
-
-     Parameters
-     ----------
-     response : Dict[str, Any]
-         A dictionary containing the API response data. It must contain a "data" field, which is expected to be a
-         list of document entries. Each document entry should contain metadata, which includes information about
-         the document's source.
-     output_directory : str
-         The path to the directory where the JSON metadata files should be saved. Subdirectories will be created based
-         on the document types, and the metadata files will be stored within these subdirectories.
-     images_to_disk : bool, optional
-         If True, base64 encoded images in the 'metadata.content' field will be decoded and saved to disk.
-         Default is False.
-
-     Returns
-     -------
-     None
-         This function does not return any values. It writes output to the filesystem.
-
-     Notes
-     -----
-     - If 'images_to_disk' is True and 'doc_type' is 'image', images will be decoded and saved to disk with appropriate
-       file types based on 'metadata.image_metadata.image_type'.
-     """
-     if ("data" not in response) or (not response["data"]):
-         logger.debug("Data is not in the response or response.data is empty")
-         return
-
-     response_data = response["data"]
-
-     if not isinstance(response_data, list) or len(response_data) == 0:
-         logger.debug("Response data is not a list or the list is empty.")
-         return
-
-     doc_meta_base = response_data[0]["metadata"]
-     source_meta = doc_meta_base["source_metadata"]
-     doc_name = source_meta["source_id"]
-     clean_doc_name = get_valid_filename(os.path.basename(doc_name))
-     output_name = f"{clean_doc_name}.metadata.json"
-
-     doc_map = organize_documents_by_type(response_data)
-     for doc_type, documents in doc_map.items():
-         doc_type_path = os.path.join(output_directory, doc_type)
-         if not os.path.exists(doc_type_path):
-             os.makedirs(doc_type_path)
-
-         if doc_type in ("image", "structured") and images_to_disk:
-             for i, doc in enumerate(documents):
-                 meta: Dict[str, Any] = doc.get("metadata", {})
-                 image_content = meta.get("content")
-                 if doc_type == "image":
-                     image_type = meta.get("image_metadata", {}).get("image_type", "png").lower()
-                 else:
-                     image_type = "png"
-
-                 if image_content and image_type in {"png", "bmp", "jpeg", "jpg", "tiff"}:
-                     try:
-                         # Decode the base64 content
-                         image_data = base64.b64decode(image_content)
-                         image = Image.open(io.BytesIO(image_data))
-
-                         # Define the output file path
-                         image_ext = "jpg" if image_type == "jpeg" else image_type
-                         image_filename = f"{clean_doc_name}_{i}.{image_ext}"
-                         image_output_path = os.path.join(doc_type_path, "media", image_filename)
-
-                         # Ensure the media directory exists
-                         os.makedirs(os.path.dirname(image_output_path), exist_ok=True)
-
-                         # Save the image to disk
-                         image.save(image_output_path, format=image_ext.upper())
-
-                         # Update the metadata content with the image path
-                         meta["content"] = ""
-                         meta["content_url"] = os.path.realpath(image_output_path)
-                         logger.debug(f"Saved image to {image_output_path}")
-
-                     except Exception as e:
-                         logger.error(f"Failed to save image {i} for {clean_doc_name}: {e}")
-
-         # Write the metadata JSON file
-         with open(os.path.join(doc_type_path, output_name), "w") as f:
-             f.write(json.dumps(documents, indent=2))
-
-
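The removed save_response_data wrote one <doc_type>/<source>.metadata.json file per content type, with decoded images optionally placed under a media/ subdirectory. Against the 2025.7.24 wheel, where the helper still exists, a call would have looked roughly like this; the response dict is a minimal hypothetical stand-in for a real ingestion result:

from nv_ingest_client.cli.util.processing import save_response_data  # present in 2025.7.24, removed in 2025.11.2

# Minimal hypothetical response; a real result carries much richer metadata.
response = {
    "data": [
        {
            "metadata": {
                "source_metadata": {"source_id": "report.pdf"},
                "content_metadata": {"type": "text"},
                "content": "extracted text ...",
            }
        }
    ]
}

save_response_data(response, "./out")
# Writes ./out/text/report.pdf.metadata.json; with images_to_disk=True, image
# content would instead be decoded into ./out/image/media/ and the JSON would
# carry a content_url pointing at the saved file.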
- def generate_job_batch_for_iteration(
-     client: Any,
-     pbar: Any,
-     files: List[str],
-     tasks: Dict[str, Any],
-     processed: int,
-     batch_size: int,
-     retry_job_ids: List[str],
-     fail_on_error: bool = False,
- ) -> Tuple[List[str], Dict[str, str], int]:
-     """
-     Generates a batch of job specifications for the current iteration of file processing.
-     This function handles retrying failed jobs and creating new jobs for unprocessed files.
-     The job specifications are then submitted for processing.
-
-     Parameters
-     ----------
-     client : Any
-         The client object used to submit jobs asynchronously.
-     pbar : Any
-         The progress bar object used to update the progress as jobs are processed.
-     files : List[str]
-         The list of file paths to be processed.
-     tasks : Dict[str, Any]
-         A dictionary of tasks to be executed as part of the job specifications.
-     processed : int
-         The number of files that have been processed so far.
-     batch_size : int
-         The maximum number of jobs to process in one batch.
-     retry_job_ids : List[str]
-         A list of job IDs that need to be retried due to previous failures.
-     fail_on_error : bool, optional
-         Whether to raise an error and stop processing if job specifications are missing. Default is False.
-
-     Returns
-     -------
-     Tuple[List[str], Dict[str, str], int]
-         A tuple containing:
-         - job_ids (List[str]): The list of job IDs created or retried in this iteration.
-         - job_id_map_updates (Dict[str, str]): A dictionary mapping job IDs to their corresponding file names.
-         - processed (int): The updated number of files processed.
-
-     Raises
-     ------
-     RuntimeError
-         If `fail_on_error` is True and there are missing job specifications, a RuntimeError is raised.
-     """
-     job_indices: List[str] = []
-     job_index_map_updates: Dict[str, str] = {}
-     cur_job_count: int = 0
-
-     if retry_job_ids:
-         job_indices.extend(retry_job_ids)
-         cur_job_count = len(job_indices)
-
-     if (cur_job_count < batch_size) and (processed < len(files)):
-         new_job_count: int = min(batch_size - cur_job_count, len(files) - processed)
-         batch_files: List[str] = files[processed : processed + new_job_count]
-
-         new_job_indices: List[str] = client.create_jobs_for_batch(batch_files, tasks)
-         if len(new_job_indices) != new_job_count:
-             missing_jobs: int = new_job_count - len(new_job_indices)
-             error_msg: str = f"Missing {missing_jobs} job specs -- this is likely due to bad reads or file corruption"
-             logger.warning(error_msg)
-
-             if fail_on_error:
-                 raise RuntimeError(error_msg)
-
-             pbar.update(missing_jobs)
-
-         job_index_map_updates = {job_index: file for job_index, file in zip(new_job_indices, batch_files)}
-         processed += new_job_count
-         _ = client.submit_job_async(new_job_indices, "ingest_task_queue")
-
-         job_indices.extend(new_job_indices)
-
-     return job_indices, job_index_map_updates, processed
-
-
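The batching rule in the removed helper is easy to misread: retried job IDs are carried into the next batch first, and only the remaining slots are filled with new files. A small sketch of just that arithmetic, with hypothetical numbers and no client involved:

def batch_composition(batch_size: int, retry_count: int, total_files: int, processed: int):
    # Retries occupy batch slots first; new jobs fill whatever capacity remains.
    new_job_count = max(0, min(batch_size - retry_count, total_files - processed))
    return retry_count, new_job_count

print(batch_composition(batch_size=8, retry_count=3, total_files=20, processed=10))
# (3, 5): three retried jobs plus five newly created jobs in this iteration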
- def create_and_process_jobs(
-     files: List[str],
-     client: Any,
-     tasks: Dict[str, Any],
-     output_directory: str,
-     batch_size: int,
-     fail_on_error: bool = False,
-     save_images_separately: bool = False,
- ) -> Tuple[int, Dict[str, List[float]], int, Dict[str, str]]:
-     """
-     Process a list of files by creating and submitting jobs for each file, then fetching
-     and handling the results asynchronously.
-
-     This function creates job specifications (JobSpecs) for the provided list of files,
-     submits the jobs to the client, and processes the results asynchronously. It handles
-     job retries for timeouts, logs failures, and limits the number of JobSpecs in memory to
-     `batch_size * 2`. Progress is reported on a per-file basis, including the pages processed
-     per second.
-
-     Parameters
-     ----------
-     files : List[str]
-         A list of file paths to be processed. Each file is used to create a job which is then
-         submitted to the client.
-     client : Any
-         An instance of NvIngestClient used to submit jobs and fetch results asynchronously.
-     tasks : Dict[str, Any]
-         A dictionary of tasks to be added to each job. The keys represent task names (e.g., "split",
-         "extract", "store", "caption", etc.) and the values represent task configurations.
-     output_directory : str
-         The directory path where the processed job results will be saved. If an empty string or None
-         is provided, results will not be saved.
-     batch_size : int
-         The number of jobs to process in each batch. Memory is limited to `batch_size * 2` jobs at
-         any time.
-     fail_on_error : bool, optional
-         If True, the function will raise an error and stop processing when encountering an unrecoverable
-         error. If False, the function logs the error and continues processing other jobs. Default is False.
-     save_images_separately : bool, optional
-         If True, images will be saved separately to disk. Default is False.
-
-     Returns
-     -------
-     Tuple[int, Dict[str, List[float]], int, Dict[str, str]]
-         A tuple containing:
-         - total_files (int): The total number of files processed.
-         - trace_times (Dict[str, List[float]]): A dictionary mapping job IDs to a list of trace times
-           for diagnostic purposes.
-         - total_pages_processed (int): The total number of pages processed from the files.
-         - trace_ids (Dict[str, str]): A dictionary mapping a source file to its correlating trace_id.
-
-     Raises
-     ------
-     RuntimeError
-         If `fail_on_error` is True and an error occurs during job submission or processing.
-     """
-     total_files: int = len(files)
-     total_pages_processed: int = 0
-     trace_times: Dict[str, List[float]] = defaultdict(list)
-     trace_ids: Dict[str, str] = defaultdict(list)  # type: ignore
-     failed_jobs: List[str] = []
-     retry_job_ids: List[str] = []
-     job_id_map: Dict[str, str] = {}
-     retry_counts: Dict[str, int] = defaultdict(int)
-
-     start_time_ns: int = time.time_ns()
-     with tqdm(total=total_files, desc="Processing files", unit="file") as pbar:
-         processed: int = 0
-         while (processed < len(files)) or retry_job_ids:
-             # Process new batch of files or retry failed job IDs
-             job_ids, job_id_map_updates, processed = generate_job_batch_for_iteration(
-                 client, pbar, files, tasks, processed, batch_size, retry_job_ids, fail_on_error
-             )
-             job_id_map.update(job_id_map_updates)
-             retry_job_ids = []
-
-             futures_dict: Dict[Any, str] = client.fetch_job_result_async(job_ids, data_only=False)
-             for future in as_completed(futures_dict.keys()):
-                 retry: bool = False
-                 job_id: str = futures_dict[future]
-                 source_name: str = job_id_map[job_id]
-                 try:
-                     future_response, trace_id = handle_future_result(future)
-                     trace_ids[source_name] = trace_id
-
-                     first_page_metadata = future_response["data"][0]["metadata"]
-
-                     file_page_counts: Dict[str, int] = {
-                         first_page_metadata["source_metadata"]["source_name"]: first_page_metadata["content_metadata"][
-                             "hierarchy"
-                         ]["page_count"]
-                     }
-
-                     if output_directory:
-                         save_response_data(future_response, output_directory, images_to_disk=save_images_separately)
-
-                     total_pages_processed += file_page_counts[source_name]
-                     elapsed_time: float = (time.time_ns() - start_time_ns) / 1e9
-                     pages_per_sec: float = total_pages_processed / elapsed_time if elapsed_time > 0 else 0
-                     pbar.set_postfix(pages_per_sec=f"{pages_per_sec:.2f}")
-
-                     process_response(future_response, trace_times)
-
-                 except TimeoutError:
-                     retry_counts[source_name] += 1
-                     retry_job_ids.append(job_id)  # Add job_id back to retry list
-                     retry = True
-                 except json.JSONDecodeError as e:
-                     logger.error(f"Decoding error while processing {job_id}({source_name}): {e}")
-                     failed_jobs.append(f"{job_id}::{source_name}")
-                 except RuntimeError as e:
-                     logger.error(f"Error while processing '{job_id}' - ({source_name}):\n{e}")
-                     failed_jobs.append(f"{job_id}::{source_name}")
-                 except Exception as e:
-                     logger.exception(f"Unhandled error while processing {job_id}({source_name}): {e}")
-                     failed_jobs.append(f"{job_id}::{source_name}")
-                 finally:
-                     # Do not update progress bar if we're going to retry the job.
-                     if not retry:
-                         pbar.update(1)
-
-     return total_files, trace_times, total_pages_processed, trace_ids
-
-
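For callers pinned to 2025.7.24 that drove ingestion through this function directly, the call pattern looked roughly like the sketch below; the client construction and the empty task dictionary are placeholders, not a working recipe, and a running nv-ingest service is required. The newly added nv_ingest_client/client/ingest_job_handler.py appears to take over this batching role in 2025.11.2, though that is inferred from the file list above rather than verified here.

from nv_ingest_client.client import NvIngestClient  # import path assumed from the package layout above
from nv_ingest_client.cli.util.processing import create_and_process_jobs  # removed in 2025.11.2

client = NvIngestClient()  # connection arguments omitted; defaults depend on your deployment
tasks = {}  # task configurations ("extract", "split", ...) omitted; the CLI builds these from its options

total_files, trace_times, total_pages, trace_ids = create_and_process_jobs(
    files=["docs/report.pdf"],
    client=client,
    tasks=tasks,
    output_directory="./out",
    batch_size=32,
)
print(f"{total_files} files, {total_pages} pages processed")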
  def get_valid_filename(name: Any) -> str:
      """
      Return a sanitized version of the given filename.