nv-ingest-client 2025.10.3.dev20251003__tar.gz → 2025.11.14.dev20251114__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. {nv_ingest_client-2025.10.3.dev20251003/src/nv_ingest_client.egg-info → nv_ingest_client-2025.11.14.dev20251114}/PKG-INFO +1 -1
  2. nv_ingest_client-2025.11.14.dev20251114/src/nv_ingest_client/cli/util/processing.py +159 -0
  3. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/client/client.py +511 -208
  4. nv_ingest_client-2025.11.14.dev20251114/src/nv_ingest_client/client/ingest_job_handler.py +412 -0
  5. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/client/interface.py +201 -21
  6. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/nv_ingest_cli.py +28 -4
  7. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/jobs/job_spec.py +4 -0
  8. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/embed.py +24 -0
  9. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/extract.py +1 -1
  10. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/filter.py +1 -1
  11. nv_ingest_client-2025.11.14.dev20251114/src/nv_ingest_client/primitives/tasks/ocr_extraction.py +55 -0
  12. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/task_factory.py +9 -12
  13. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/udf.py +24 -27
  14. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/document_analysis.py +1 -1
  15. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/file_processing/extract.py +4 -0
  16. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/util.py +26 -0
  17. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/vdb/milvus.py +52 -25
  18. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114/src/nv_ingest_client.egg-info}/PKG-INFO +1 -1
  19. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client.egg-info/SOURCES.txt +2 -0
  20. nv_ingest_client-2025.10.3.dev20251003/src/nv_ingest_client/cli/util/processing.py +0 -552
  21. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/LICENSE +0 -0
  22. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/MANIFEST.in +0 -0
  23. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/README.md +0 -0
  24. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/pyproject.toml +0 -0
  25. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/setup.cfg +0 -0
  26. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/__init__.py +0 -0
  27. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/cli/__init__.py +0 -0
  28. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/cli/util/__init__.py +0 -0
  29. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/cli/util/click.py +0 -0
  30. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/cli/util/system.py +0 -0
  31. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/client/__init__.py +0 -0
  32. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/client/util/processing.py +0 -0
  33. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/__init__.py +0 -0
  34. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/jobs/__init__.py +0 -0
  35. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/jobs/job_state.py +0 -0
  36. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/__init__.py +0 -0
  37. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/audio_extraction.py +0 -0
  38. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/caption.py +0 -0
  39. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/chart_extraction.py +0 -0
  40. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/dedup.py +0 -0
  41. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/infographic_extraction.py +0 -0
  42. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/split.py +0 -0
  43. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/store.py +0 -0
  44. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/table_extraction.py +0 -0
  45. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/task_base.py +0 -0
  46. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/vdb_upload.py +0 -0
  47. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/__init__.py +0 -0
  48. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/dataset.py +0 -0
  49. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/file_processing/__init__.py +0 -0
  50. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/image_disk_utils.py +0 -0
  51. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/milvus.py +0 -0
  52. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/process_json_files.py +0 -0
  53. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/processing.py +0 -0
  54. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/system.py +0 -0
  55. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/transport.py +0 -0
  56. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/vdb/__init__.py +0 -0
  57. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/vdb/adt_vdb.py +0 -0
  58. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/vdb/opensearch.py +0 -0
  59. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/zipkin.py +0 -0
  60. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client.egg-info/dependency_links.txt +0 -0
  61. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client.egg-info/entry_points.txt +0 -0
  62. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client.egg-info/requires.txt +0 -0
  63. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client.egg-info/top_level.txt +0 -0
  64. {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/version.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-client
3
- Version: 2025.10.3.dev20251003
3
+ Version: 2025.11.14.dev20251114
4
4
  Summary: Python client for the nv-ingest service
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -0,0 +1,159 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import logging
6
+ import re
7
+ import time
8
+ from collections import defaultdict
9
+ from statistics import mean
10
+ from statistics import median
11
+ from typing import Any
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
def report_stage_statistics(stage_elapsed_times: defaultdict, total_trace_elapsed: float, abs_elapsed: float) -> None:
    """
    Log per-stage timing statistics followed by the share of wall-clock time no stage accounts for.

    Parameters
    ----------
    stage_elapsed_times : defaultdict(list)
        Maps each stage name to the list of elapsed times recorded for it, in nanoseconds.
        Stages whose list is empty are skipped.
    total_trace_elapsed : float
        Sum of all recorded stage times, in nanoseconds. Used as the denominator for each
        stage's percentage; when it is not positive the percentage is reported as 0.
    abs_elapsed : float
        Wall-clock time from start to end of processing, in nanoseconds.

    Notes
    -----
    All durations are converted from nanoseconds to milliseconds for display.
    "Unresolved time" is ``abs_elapsed - total_trace_elapsed``; it is only reported when
    strictly positive, otherwise a message stating that the traces cover the whole
    duration is logged instead.
    """
    ns_per_ms = 1e6

    for stage_name, samples in stage_elapsed_times.items():
        if not samples:
            # Nothing recorded for this stage; mean()/median() would raise on empty data.
            continue

        avg_ms = mean(samples) / ns_per_ms
        median_ms = median(samples) / ns_per_ms
        stage_total_ns = sum(samples)
        stage_total_ms = stage_total_ns / ns_per_ms

        if total_trace_elapsed > 0:
            trace_share = stage_total_ns / total_trace_elapsed * 100
        else:
            trace_share = 0

        logger.info(
            f"{stage_name}: Avg: {avg_ms:.2f} ms, Median: {median_ms:.2f} ms, "
            f"Total Time: {stage_total_ms:.2f} ms, Total % of Trace Computation: {trace_share:.2f}%"
        )

    untraced_ns = abs_elapsed - total_trace_elapsed
    if not untraced_ns > 0:
        logger.info("No unresolved time detected. Trace times account for the entire elapsed duration.")
        return

    untraced_ms = untraced_ns / ns_per_ms
    untraced_share = untraced_ns / abs_elapsed * 100
    logger.info(f"Unresolved time: {untraced_ms:.2f} ms, Percent of Total Elapsed: {untraced_share:.2f}%")
56
+
57
+
58
def report_overall_speed(total_pages_processed: int, start_time_ns: int, total_files: int) -> None:
    """
    Log the overall processing throughput in pages and files per second.

    Parameters
    ----------
    total_pages_processed : int
        The total number of pages processed.
    start_time_ns : int
        The nanosecond timestamp marking the start of processing. It is compared against
        ``time.time_ns()``, so it should come from the same clock.
    total_files : int
        The total number of files processed.

    Notes
    -----
    The elapsed time is converted from nanoseconds to seconds before computing throughput.
    If the elapsed time is zero or negative (for example when the clock has not advanced
    since ``start_time_ns`` was taken, or ``start_time_ns`` lies ahead of the current clock),
    no meaningful rate exists; both throughput figures are then reported as 0.0 instead of
    raising ``ZeroDivisionError`` or logging a negative rate.
    """
    total_elapsed_time_ns: int = time.time_ns() - start_time_ns
    total_elapsed_time_s: float = total_elapsed_time_ns / 1_000_000_000  # Convert nanoseconds to seconds

    if total_elapsed_time_s > 0:
        throughput_pages: float = total_pages_processed / total_elapsed_time_s  # pages/sec
        throughput_files: float = total_files / total_elapsed_time_s  # files/sec
    else:
        # Guard the division: a non-positive duration cannot yield a valid rate.
        throughput_pages = 0.0
        throughput_files = 0.0

    logger.info(f"Processed {total_files} files in {total_elapsed_time_s:.2f} seconds.")
    logger.info(f"Total pages processed: {total_pages_processed}")
    logger.info(f"Throughput (Pages/sec): {throughput_pages:.2f}")
    logger.info(f"Throughput (Files/sec): {throughput_files:.2f}")
88
+
89
+
90
def report_statistics(
    start_time_ns: int,
    stage_elapsed_times: defaultdict,
    total_pages_processed: int,
    total_files: int,
) -> None:
    """
    Compute session-wide timing totals and log the stage breakdown and overall throughput.

    Parameters
    ----------
    start_time_ns : int
        The nanosecond timestamp marking the start of the processing.
    stage_elapsed_times : defaultdict
        Maps each processing stage to the list of elapsed times recorded for it; the
        per-stage lists are summed to obtain the total traced time.
    total_pages_processed : int
        The total number of pages processed during the session.
    total_files : int
        The total number of files processed during the session.

    Notes
    -----
    The wall-clock duration is taken as ``time.time_ns() - start_time_ns``. That value and
    the summed stage times are passed to `report_stage_statistics`; afterwards
    `report_overall_speed` is called with the page and file counts and the original
    start timestamp.
    """
    wall_clock_ns: int = time.time_ns() - start_time_ns

    # Add up every recorded sample across all stages.
    traced_ns: int = 0
    for samples in stage_elapsed_times.values():
        traced_ns += sum(samples)

    report_stage_statistics(stage_elapsed_times, traced_ns, wall_clock_ns)
    report_overall_speed(total_pages_processed, start_time_ns, total_files)
124
+
125
+
126
def get_valid_filename(name: Any) -> str:
    """
    Return a sanitized version of the given filename.

    This function, adapted from Django (https://github.com/django/django/blob/main/django/utils/text.py),
    converts the input to a string, trims leading and trailing whitespace, replaces remaining
    spaces with underscores, and removes any characters that are not alphanumeric, dashes,
    underscores, or dots.

    Parameters
    ----------
    name : Any
        The input value to be converted into a valid filename. It will be converted to a string.

    Returns
    -------
    str
        A sanitized string that can be used as a filename.

    Raises
    ------
    ValueError
        If the sanitized result is empty, ``"."`` or ``".."``.

    Examples
    --------
    >>> get_valid_filename("john's portrait in 2004.jpg")
    'johns_portrait_in_2004.jpg'
    """
    s: str = str(name).strip().replace(" ", "_")
    s = re.sub(r"(?u)[^-\w.]", "", s)
    if s in {"", ".", ".."}:
        # Use an f-string rather than "%s" % name: with %-formatting a tuple argument is
        # unpacked as the format arguments, which raises TypeError (or drops the tuple
        # wrapper) instead of producing the documented ValueError.
        raise ValueError(f"Could not derive file name from '{name}'")
    return s