nv-ingest-client 2025.10.3.dev20251003__tar.gz → 2025.11.14.dev20251114__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {nv_ingest_client-2025.10.3.dev20251003/src/nv_ingest_client.egg-info → nv_ingest_client-2025.11.14.dev20251114}/PKG-INFO +1 -1
- nv_ingest_client-2025.11.14.dev20251114/src/nv_ingest_client/cli/util/processing.py +159 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/client/client.py +511 -208
- nv_ingest_client-2025.11.14.dev20251114/src/nv_ingest_client/client/ingest_job_handler.py +412 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/client/interface.py +201 -21
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/nv_ingest_cli.py +28 -4
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/jobs/job_spec.py +4 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/embed.py +24 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/extract.py +1 -1
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/filter.py +1 -1
- nv_ingest_client-2025.11.14.dev20251114/src/nv_ingest_client/primitives/tasks/ocr_extraction.py +55 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/task_factory.py +9 -12
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/udf.py +24 -27
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/document_analysis.py +1 -1
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/file_processing/extract.py +4 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/util.py +26 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/vdb/milvus.py +52 -25
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114/src/nv_ingest_client.egg-info}/PKG-INFO +1 -1
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client.egg-info/SOURCES.txt +2 -0
- nv_ingest_client-2025.10.3.dev20251003/src/nv_ingest_client/cli/util/processing.py +0 -552
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/LICENSE +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/MANIFEST.in +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/README.md +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/pyproject.toml +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/setup.cfg +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/__init__.py +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/cli/__init__.py +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/cli/util/__init__.py +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/cli/util/click.py +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/cli/util/system.py +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/client/__init__.py +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/client/util/processing.py +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/__init__.py +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/jobs/__init__.py +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/jobs/job_state.py +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/__init__.py +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/audio_extraction.py +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/caption.py +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/chart_extraction.py +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/dedup.py +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/infographic_extraction.py +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/split.py +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/store.py +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/table_extraction.py +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/task_base.py +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/vdb_upload.py +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/__init__.py +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/dataset.py +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/file_processing/__init__.py +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/image_disk_utils.py +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/milvus.py +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/process_json_files.py +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/processing.py +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/system.py +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/transport.py +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/vdb/__init__.py +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/vdb/adt_vdb.py +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/vdb/opensearch.py +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/zipkin.py +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client.egg-info/dependency_links.txt +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client.egg-info/entry_points.txt +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client.egg-info/requires.txt +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client.egg-info/top_level.txt +0 -0
- {nv_ingest_client-2025.10.3.dev20251003 → nv_ingest_client-2025.11.14.dev20251114}/src/version.py +0 -0
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import re
|
|
7
|
+
import time
|
|
8
|
+
from collections import defaultdict
|
|
9
|
+
from statistics import mean
|
|
10
|
+
from statistics import median
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
# Module-level logger; the reporting helpers in this module emit all of their output through it.
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def report_stage_statistics(stage_elapsed_times: defaultdict, total_trace_elapsed: float, abs_elapsed: float) -> None:
    """
    Log per-stage timing figures and the share of wall-clock time no stage accounts for.

    Parameters
    ----------
    stage_elapsed_times : defaultdict(list)
        Maps each stage name to the list of elapsed times recorded for it, in nanoseconds.
    total_trace_elapsed : float
        Sum of all recorded stage times, in nanoseconds.
    abs_elapsed : float
        Wall-clock time from the start to the end of processing, in nanoseconds.

    Notes
    -----
    For every stage with at least one sample, the average, median and total time are logged in
    milliseconds together with the stage's percentage of ``total_trace_elapsed``. Stages with no
    samples are skipped. Afterwards the difference ``abs_elapsed - total_trace_elapsed`` is logged
    as "unresolved" time when it is positive.
    """
    for stage, times in stage_elapsed_times.items():
        if not times:
            # Nothing was recorded for this stage; mean()/median() would raise on empty data.
            continue

        avg_ms = mean(times) / 1e6
        median_ms = median(times) / 1e6
        stage_total_ns = sum(times)
        if total_trace_elapsed > 0:
            trace_share = stage_total_ns / total_trace_elapsed * 100
        else:
            # Avoid dividing by zero when no trace time was accumulated at all.
            trace_share = 0

        logger.info(
            f"{stage}: Avg: {avg_ms:.2f} ms, Median: {median_ms:.2f} ms, "
            f"Total Time: {stage_total_ns / 1e6:.2f} ms, Total % of Trace Computation: {trace_share:.2f}%"
        )

    unresolved_ns = abs_elapsed - total_trace_elapsed
    if unresolved_ns > 0:
        unresolved_share = unresolved_ns / abs_elapsed * 100
        summary = f"Unresolved time: {unresolved_ns / 1e6:.2f} ms, Percent of Total Elapsed: {unresolved_share:.2f}%"
    else:
        summary = "No unresolved time detected. Trace times account for the entire elapsed duration."
    logger.info(summary)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def report_overall_speed(total_pages_processed: int, start_time_ns: int, total_files: int) -> None:
    """
    Report the overall processing speed based on the number of pages and files processed.

    The elapsed time is measured from ``start_time_ns`` to the moment of the call and the
    throughput is logged in pages per second and files per second.

    Parameters
    ----------
    total_pages_processed : int
        The total number of pages processed.
    start_time_ns : int
        The nanosecond timestamp marking the start of processing; it is compared against
        ``time.time_ns()``.
    total_files : int
        The total number of files processed.

    Notes
    -----
    If the measured elapsed time is not positive (the clock did not advance between the two
    readings, or it was adjusted backwards), both throughput figures are reported as 0.0 instead
    of raising ``ZeroDivisionError`` or logging a negative rate.
    """
    total_elapsed_time_ns: int = time.time_ns() - start_time_ns
    total_elapsed_time_s: float = total_elapsed_time_ns / 1_000_000_000  # Convert nanoseconds to seconds

    if total_elapsed_time_s > 0:
        throughput_pages: float = total_pages_processed / total_elapsed_time_s  # pages/sec
        throughput_files: float = total_files / total_elapsed_time_s  # files/sec
    else:
        # A rate is undefined without a positive time span; report zero rather than crash.
        throughput_pages = 0.0
        throughput_files = 0.0

    logger.info(f"Processed {total_files} files in {total_elapsed_time_s:.2f} seconds.")
    logger.info(f"Total pages processed: {total_pages_processed}")
    logger.info(f"Throughput (Pages/sec): {throughput_pages:.2f}")
    logger.info(f"Throughput (Files/sec): {throughput_files:.2f}")
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def report_statistics(
    start_time_ns: int,
    stage_elapsed_times: defaultdict,
    total_pages_processed: int,
    total_files: int,
) -> None:
    """
    Summarize a processing session: per-stage timings first, then overall throughput.

    Parameters
    ----------
    start_time_ns : int
        The nanosecond timestamp marking the start of the processing.
    stage_elapsed_times : defaultdict
        Maps each processing stage to the list of elapsed times (in nanoseconds) recorded for it.
    total_pages_processed : int
        The total number of pages processed during the session.
    total_files : int
        The total number of files processed during the session.

    Notes
    -----
    The wall-clock duration is taken as ``time.time_ns() - start_time_ns`` and the traced duration
    as the sum of every recorded stage time. Both are handed to `report_stage_statistics`;
    `report_overall_speed` is then called to log the throughput.
    """
    wall_clock_ns: int = time.time_ns() - start_time_ns

    # Add up every sample of every stage to get the total traced time.
    traced_ns: int = 0
    for stage_samples in stage_elapsed_times.values():
        traced_ns += sum(stage_samples)

    report_stage_statistics(stage_elapsed_times, traced_ns, wall_clock_ns)
    report_overall_speed(total_pages_processed, start_time_ns, total_files)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def get_valid_filename(name: Any) -> str:
    """
    Return a sanitized version of the given filename.

    This function, adapted from Django (https://github.com/django/django/blob/main/django/utils/text.py),
    converts the input to a string that is safe to use as a filename. It trims leading and trailing
    whitespace, replaces remaining spaces with underscores, and removes any characters that are not
    alphanumeric, dashes, underscores, or dots.

    Parameters
    ----------
    name : Any
        The input value to be converted into a valid filename. It will be converted to a string.

    Returns
    -------
    str
        A sanitized string that can be used as a filename.

    Raises
    ------
    ValueError
        If the sanitized result is empty, ``"."`` or ``".."``.

    Examples
    --------
    >>> get_valid_filename("john's portrait in 2004.jpg")
    'johns_portrait_in_2004.jpg'
    """
    s: str = str(name).strip().replace(" ", "_")
    s = re.sub(r"(?u)[^-\w.]", "", s)
    if s in {"", ".", ".."}:
        # Wrap ``name`` in a 1-tuple: a bare tuple on the right of ``%`` is unpacked as the
        # argument list, which would raise TypeError (or print the wrong value) for tuple inputs.
        raise ValueError("Could not derive file name from '%s'" % (name,))
    return s
|