nv-ingest-api 2025.4.18.dev20250418__py3-none-any.whl → 2025.4.20.dev20250420__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. Click here for more details.
- nv_ingest_api/__init__.py +0 -3
- nv_ingest_api/{internal/primitives → primitives}/control_message_task.py +0 -4
- nv_ingest_api/{internal/primitives → primitives}/ingest_control_message.py +2 -5
- {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.20.dev20250420.dist-info}/METADATA +1 -1
- nv_ingest_api-2025.4.20.dev20250420.dist-info/RECORD +9 -0
- {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.20.dev20250420.dist-info}/WHEEL +1 -1
- nv_ingest_api/interface/__init__.py +0 -215
- nv_ingest_api/interface/extract.py +0 -972
- nv_ingest_api/interface/mutate.py +0 -154
- nv_ingest_api/interface/store.py +0 -218
- nv_ingest_api/interface/transform.py +0 -382
- nv_ingest_api/interface/utility.py +0 -200
- nv_ingest_api/internal/enums/__init__.py +0 -3
- nv_ingest_api/internal/enums/common.py +0 -494
- nv_ingest_api/internal/extract/__init__.py +0 -3
- nv_ingest_api/internal/extract/audio/__init__.py +0 -3
- nv_ingest_api/internal/extract/audio/audio_extraction.py +0 -149
- nv_ingest_api/internal/extract/docx/__init__.py +0 -5
- nv_ingest_api/internal/extract/docx/docx_extractor.py +0 -205
- nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +0 -3
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +0 -122
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +0 -895
- nv_ingest_api/internal/extract/image/__init__.py +0 -3
- nv_ingest_api/internal/extract/image/chart_extractor.py +0 -353
- nv_ingest_api/internal/extract/image/image_extractor.py +0 -204
- nv_ingest_api/internal/extract/image/image_helpers/__init__.py +0 -3
- nv_ingest_api/internal/extract/image/image_helpers/common.py +0 -403
- nv_ingest_api/internal/extract/image/infographic_extractor.py +0 -253
- nv_ingest_api/internal/extract/image/table_extractor.py +0 -344
- nv_ingest_api/internal/extract/pdf/__init__.py +0 -3
- nv_ingest_api/internal/extract/pdf/engines/__init__.py +0 -19
- nv_ingest_api/internal/extract/pdf/engines/adobe.py +0 -484
- nv_ingest_api/internal/extract/pdf/engines/llama.py +0 -243
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +0 -597
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +0 -146
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +0 -603
- nv_ingest_api/internal/extract/pdf/engines/tika.py +0 -96
- nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +0 -426
- nv_ingest_api/internal/extract/pdf/pdf_extractor.py +0 -74
- nv_ingest_api/internal/extract/pptx/__init__.py +0 -5
- nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +0 -799
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +0 -187
- nv_ingest_api/internal/mutate/__init__.py +0 -3
- nv_ingest_api/internal/mutate/deduplicate.py +0 -110
- nv_ingest_api/internal/mutate/filter.py +0 -133
- nv_ingest_api/internal/primitives/__init__.py +0 -0
- nv_ingest_api/internal/primitives/nim/__init__.py +0 -8
- nv_ingest_api/internal/primitives/nim/default_values.py +0 -15
- nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +0 -3
- nv_ingest_api/internal/primitives/nim/model_interface/cached.py +0 -274
- nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +0 -56
- nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +0 -270
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +0 -275
- nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +0 -238
- nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +0 -462
- nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +0 -367
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +0 -132
- nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +0 -152
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +0 -1400
- nv_ingest_api/internal/primitives/nim/nim_client.py +0 -344
- nv_ingest_api/internal/primitives/nim/nim_model_interface.py +0 -81
- nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- nv_ingest_api/internal/primitives/tracing/latency.py +0 -69
- nv_ingest_api/internal/primitives/tracing/logging.py +0 -96
- nv_ingest_api/internal/primitives/tracing/tagging.py +0 -197
- nv_ingest_api/internal/schemas/__init__.py +0 -3
- nv_ingest_api/internal/schemas/extract/__init__.py +0 -3
- nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +0 -130
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +0 -135
- nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +0 -124
- nv_ingest_api/internal/schemas/extract/extract_image_schema.py +0 -124
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +0 -128
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +0 -218
- nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +0 -124
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +0 -129
- nv_ingest_api/internal/schemas/message_brokers/__init__.py +0 -3
- nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +0 -23
- nv_ingest_api/internal/schemas/message_brokers/request_schema.py +0 -34
- nv_ingest_api/internal/schemas/message_brokers/response_schema.py +0 -19
- nv_ingest_api/internal/schemas/meta/__init__.py +0 -3
- nv_ingest_api/internal/schemas/meta/base_model_noext.py +0 -11
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +0 -237
- nv_ingest_api/internal/schemas/meta/metadata_schema.py +0 -221
- nv_ingest_api/internal/schemas/mutate/__init__.py +0 -3
- nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +0 -16
- nv_ingest_api/internal/schemas/store/__init__.py +0 -3
- nv_ingest_api/internal/schemas/store/store_embedding_schema.py +0 -28
- nv_ingest_api/internal/schemas/store/store_image_schema.py +0 -30
- nv_ingest_api/internal/schemas/transform/__init__.py +0 -3
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +0 -15
- nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +0 -17
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +0 -25
- nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +0 -22
- nv_ingest_api/internal/store/__init__.py +0 -3
- nv_ingest_api/internal/store/embed_text_upload.py +0 -236
- nv_ingest_api/internal/store/image_upload.py +0 -232
- nv_ingest_api/internal/transform/__init__.py +0 -3
- nv_ingest_api/internal/transform/caption_image.py +0 -205
- nv_ingest_api/internal/transform/embed_text.py +0 -496
- nv_ingest_api/internal/transform/split_text.py +0 -157
- nv_ingest_api/util/__init__.py +0 -0
- nv_ingest_api/util/control_message/__init__.py +0 -0
- nv_ingest_api/util/control_message/validators.py +0 -47
- nv_ingest_api/util/converters/__init__.py +0 -0
- nv_ingest_api/util/converters/bytetools.py +0 -78
- nv_ingest_api/util/converters/containers.py +0 -65
- nv_ingest_api/util/converters/datetools.py +0 -90
- nv_ingest_api/util/converters/dftools.py +0 -127
- nv_ingest_api/util/converters/formats.py +0 -64
- nv_ingest_api/util/converters/type_mappings.py +0 -27
- nv_ingest_api/util/detectors/__init__.py +0 -5
- nv_ingest_api/util/detectors/language.py +0 -38
- nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- nv_ingest_api/util/exception_handlers/converters.py +0 -72
- nv_ingest_api/util/exception_handlers/decorators.py +0 -223
- nv_ingest_api/util/exception_handlers/detectors.py +0 -74
- nv_ingest_api/util/exception_handlers/pdf.py +0 -116
- nv_ingest_api/util/exception_handlers/schemas.py +0 -68
- nv_ingest_api/util/image_processing/__init__.py +0 -5
- nv_ingest_api/util/image_processing/clustering.py +0 -260
- nv_ingest_api/util/image_processing/processing.py +0 -179
- nv_ingest_api/util/image_processing/table_and_chart.py +0 -449
- nv_ingest_api/util/image_processing/transforms.py +0 -407
- nv_ingest_api/util/logging/__init__.py +0 -0
- nv_ingest_api/util/logging/configuration.py +0 -31
- nv_ingest_api/util/message_brokers/__init__.py +0 -3
- nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +0 -9
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +0 -465
- nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +0 -71
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +0 -451
- nv_ingest_api/util/metadata/__init__.py +0 -5
- nv_ingest_api/util/metadata/aggregators.py +0 -469
- nv_ingest_api/util/multi_processing/__init__.py +0 -8
- nv_ingest_api/util/multi_processing/mp_pool_singleton.py +0 -194
- nv_ingest_api/util/nim/__init__.py +0 -56
- nv_ingest_api/util/pdf/__init__.py +0 -3
- nv_ingest_api/util/pdf/pdfium.py +0 -427
- nv_ingest_api/util/schema/__init__.py +0 -0
- nv_ingest_api/util/schema/schema_validator.py +0 -10
- nv_ingest_api/util/service_clients/__init__.py +0 -3
- nv_ingest_api/util/service_clients/client_base.py +0 -86
- nv_ingest_api/util/service_clients/kafka/__init__.py +0 -3
- nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
- nv_ingest_api/util/service_clients/redis/redis_client.py +0 -823
- nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- nv_ingest_api/util/service_clients/rest/rest_client.py +0 -531
- nv_ingest_api/util/string_processing/__init__.py +0 -51
- nv_ingest_api-2025.4.18.dev20250418.dist-info/RECORD +0 -152
- /nv_ingest_api/{internal → primitives}/__init__.py +0 -0
- {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.20.dev20250420.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.20.dev20250420.dist-info}/top_level.txt +0 -0
|
@@ -1,197 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
-
# All rights reserved.
|
|
3
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
import functools
|
|
7
|
-
import inspect
|
|
8
|
-
import string
|
|
9
|
-
from datetime import datetime
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
def traceable(trace_name=None):
    """
    Decorate a function so it records entry and exit trace timestamps on its message.

    The wrapper treats the first positional argument as the message. Tracing is active
    only when both ``message.has_metadata("config::add_trace_tagging")`` and
    ``message.get_metadata("config::add_trace_tagging")`` return exactly ``True``;
    otherwise the decorated function is simply called and nothing is written.

    Parameters
    ----------
    trace_name : str, optional
        A custom name for the trace entries written to the message. If not provided
        (or empty), the decorated function's ``__name__`` is used.

    Returns
    -------
    decorator_trace_tagging : Callable
        A decorator that wraps the target function with trace tagging.

    Notes
    -----
    The decorated function must receive the message as its first *positional* argument.
    The message object must implement ``has_metadata``, ``get_metadata``,
    ``get_timestamp`` and ``set_timestamp``; these are the only methods the wrapper calls.

    All timestamps are produced by ``datetime.now()`` (wall-clock time, not a monotonic
    clock).

    When tracing is active the wrapper writes the following timestamps:

    - ``trace::entry::<trace_name>``: taken just before the decorated function is called.
    - ``trace::exit::<trace_name>``: taken right after the decorated function returns.
    - ``latency::ts_send``: set to the same value as the exit timestamp.
    - If ``latency::ts_send`` already held a truthy value on entry:
      ``trace::entry::<trace_name>_channel_in`` is set to that previous value and
      ``trace::exit::<trace_name>_channel_in`` to the time the wrapper was entered.

    If the decorated function raises, the exit timestamps are not written.

    Example
    -------
    Applying the decorator without a custom trace name:

    >>> @traceable()
    ... def process_message(message):
    ...     pass

    Applying the decorator with a custom trace name:

    >>> @traceable(trace_name="CustomTraceName")
    ... def process_message(message):
    ...     pass

    In both examples, `process_message` will have entry and exit timestamps added to the
    message if 'config::add_trace_tagging' is True.
    """

    def decorator_trace_tagging(func):
        # The key prefix is the same for every call, so resolve it once per decoration.
        trace_prefix = trace_name if trace_name else func.__name__

        @functools.wraps(func)
        def wrapper_trace_tagging(*args, **kwargs):
            # Captured first so it reflects when the message reached this stage.
            ts_fetched = datetime.now()
            # The message is assumed to always be the first positional argument.
            message = args[0]

            do_trace_tagging = (message.has_metadata("config::add_trace_tagging") is True) and (
                message.get_metadata("config::add_trace_tagging") is True
            )

            if do_trace_tagging:
                # Value left behind by the previous traced stage, if any.
                ts_send = message.get_timestamp("latency::ts_send")
                ts_entry = datetime.now()
                message.set_timestamp(f"trace::entry::{trace_prefix}", ts_entry)
                if ts_send:
                    message.set_timestamp(f"trace::entry::{trace_prefix}_channel_in", ts_send)
                    message.set_timestamp(f"trace::exit::{trace_prefix}_channel_in", ts_fetched)

            # Call the decorated function
            result = func(*args, **kwargs)

            if do_trace_tagging:
                ts_exit = datetime.now()
                message.set_timestamp(f"trace::exit::{trace_prefix}", ts_exit)
                # Recorded so a later traced stage can derive its "channel_in" interval.
                message.set_timestamp("latency::ts_send", ts_exit)

            return result

        return wrapper_trace_tagging

    return decorator_trace_tagging
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
def traceable_func(trace_name=None, dedupe=True):
    """
    Decorate a function so that its entry and exit times are written to a ``trace_info`` dict.

    The wrapper removes an optional ``trace_info`` keyword argument from the call,
    records ``datetime.now()`` before and after invoking the decorated function, and
    stores both values in that dictionary.

    Parameters
    ----------
    trace_name : str, optional
        Prefix for the trace keys. If not provided (or empty), the decorated function's
        ``__name__`` is used verbatim. The string may contain named format placeholders
        (e.g., "pdf_extractor::{model_name}") that are replaced with the value of the
        matching positional or keyword argument of the call. A placeholder with no
        matching argument is replaced by its own name.
    dedupe : bool, optional
        If True, an index suffix (``_0``, ``_1``, ...) is appended to both keys, using
        the first index for which neither key already exists in ``trace_info``.
        Default is True.

    Returns
    -------
    function
        A decorator producing a wrapped function that records trace information around
        each call.

    Notes
    -----
    - ``trace_info`` is popped from the keyword arguments and is NOT forwarded to the
      decorated function. If the caller does not supply one, a throwaway dictionary is
      used and the recorded values are not observable by the caller.
    - Keys are written in the format:
        - `trace::entry::{trace_name}` for the entry timestamp.
        - `trace::exit::{trace_name}` for the exit timestamp.
    - Only positionally passed arguments and explicit keyword arguments are considered
      when filling placeholders; parameter defaults are not consulted.
    - If the decorated function raises, the exit timestamp is not written.

    Example
    -------
    >>> @traceable_func(trace_name="pdf_extractor::{model_name}")
    ... def extract_pdf(model_name):
    ...     pass
    >>> trace_info = {}
    >>> extract_pdf("my_model", trace_info=trace_info)

    In this example, `model_name` is dynamically replaced in the trace_name, and the
    trace information is stored under unique keys if deduplication is enabled.
    """

    def decorator_inject_trace_info(func):
        # Everything below depends only on the decorator arguments and the decorated
        # function, so it is computed once instead of on every call.
        base_prefix = trace_name if trace_name else func.__name__
        arg_names = list(inspect.signature(func).parameters)

        # Only a user-supplied trace_name is treated as a template. Parsing is skipped
        # when trace_name is None/empty: string.Formatter().parse(None) raises TypeError,
        # which previously made the default `@traceable_func()` fail on every call.
        if trace_name:
            placeholders = [
                field_name for _, field_name, _, _ in string.Formatter().parse(trace_name) if field_name is not None
            ]
        else:
            placeholders = []

        @functools.wraps(func)
        def wrapper_inject_trace_info(*args, **kwargs):
            trace_info = kwargs.pop("trace_info", None)
            if trace_info is None:
                trace_info = {}

            trace_prefix = base_prefix
            if placeholders:
                # Map positional arguments onto parameter names so they can be looked
                # up the same way as keyword arguments.
                args_name_to_val = dict(zip(arg_names, args))
                format_kwargs = {}
                for name in placeholders:
                    if name in args_name_to_val:
                        format_kwargs[name] = args_name_to_val[name]
                    elif name in kwargs:
                        format_kwargs[name] = kwargs.get(name)
                    else:
                        # No matching argument: fall back to the placeholder's name.
                        format_kwargs[name] = name
                trace_prefix = base_prefix.format(**format_kwargs)

            trace_entry_key = f"trace::entry::{trace_prefix}"
            trace_exit_key = f"trace::exit::{trace_prefix}"

            ts_entry = datetime.now()

            if dedupe:
                # Build the suffixed keys with f-strings rather than str.format so that
                # braces coming from substituted argument values cannot be
                # re-interpreted as format fields.
                i = 0
                while (f"{trace_entry_key}_{i}" in trace_info) or (f"{trace_exit_key}_{i}" in trace_info):
                    i += 1
                trace_entry_key = f"{trace_entry_key}_{i}"
                trace_exit_key = f"{trace_exit_key}_{i}"

            trace_info[trace_entry_key] = ts_entry

            # Call the decorated function
            result = func(*args, **kwargs)

            ts_exit = datetime.now()

            trace_info[trace_exit_key] = ts_exit

            return result

        return wrapper_inject_trace_info

    return decorator_inject_trace_info
|
|
@@ -1,130 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
-
# All rights reserved.
|
|
3
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
import logging
|
|
7
|
-
from typing import Optional
|
|
8
|
-
from typing import Tuple
|
|
9
|
-
|
|
10
|
-
from pydantic import BaseModel
|
|
11
|
-
from pydantic import root_validator
|
|
12
|
-
|
|
13
|
-
logger = logging.getLogger(__name__)
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
class AudioConfigSchema(BaseModel):
    """
    Configuration schema for audio extraction endpoints and options.

    Parameters
    ----------
    auth_token : Optional[str], default=None
        Authentication token required for secure services.

    audio_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        A tuple containing the gRPC and HTTP services for the audio endpoint.
        Either the gRPC or HTTP service can be empty, but not both.

    audio_infer_protocol : Optional[str], default=None
        Inference protocol name. When not supplied it is derived from the endpoints
        ("http" if an HTTP service is set, otherwise "grpc"). Always stored lower-cased.

    function_id : Optional[str], default=None
        Optional value; not interpreted by this schema.

    use_ssl : Optional[bool], default=None
        Optional value; not interpreted by this schema.

    ssl_cert : Optional[str], default=None
        Optional value; not interpreted by this schema.

    Methods
    -------
    validate_endpoints(values)
        Validates that at least one of the gRPC or HTTP services is provided.

    Raises
    ------
    ValueError
        If both gRPC and HTTP services are empty for the endpoint.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    auth_token: Optional[str] = None
    audio_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    audio_infer_protocol: Optional[str] = None
    function_id: Optional[str] = None
    use_ssl: Optional[bool] = None
    ssl_cert: Optional[str] = None

    @root_validator(pre=True)
    def validate_endpoints(cls, values):
        """
        Normalize and validate the audio endpoint pair and its inference protocol.

        Parameters
        ----------
        values : dict
            Dictionary containing the raw input values for the class. It is updated
            in place with the cleaned endpoint tuple and the resolved protocol.

        Returns
        -------
        dict
            The validated dictionary of values.

        Raises
        ------
        ValueError
            If both gRPC and HTTP services are empty, or the endpoint pair is missing.
        """

        def clean_service(service):
            """Set service to None if it's an empty string or contains only spaces or quotes."""
            if service is None or not service.strip() or service.strip(" \"'") == "":
                return None
            return service

        endpoint_name = "audio_endpoints"
        # A missing (or None) entry is treated as "no services configured" so that the
        # check below reports a clear ValueError instead of failing on tuple unpacking.
        grpc_service, http_service = values.get(endpoint_name) or (None, None)
        grpc_service = clean_service(grpc_service)
        http_service = clean_service(http_service)

        if not grpc_service and not http_service:
            raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")

        values[endpoint_name] = (grpc_service, http_service)

        protocol_name = "audio_infer_protocol"
        protocol_value = values.get(protocol_name)

        if not protocol_value:
            # Prefer HTTP when both services are available.
            protocol_value = "http" if http_service else "grpc" if grpc_service else ""

        protocol_value = protocol_value.lower()
        values[protocol_name] = protocol_value

        return values

    class Config:
        extra = "forbid"
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
class AudioExtractorSchema(BaseModel):
    """
    Configuration schema for the audio extractor settings.

    Parameters
    ----------
    max_queue_size : int, default=1
        The maximum number of items allowed in the processing queue.

    n_workers : int, default=16
        The number of workers to use for processing.

    raise_on_failure : bool, default=False
        A flag indicating whether to raise an exception on processing failure.

    audio_extraction_config : Optional[AudioConfigSchema], default=None
        Configuration schema for the audio extraction stage.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    max_queue_size: int = 1
    n_workers: int = 16
    raise_on_failure: bool = False

    audio_extraction_config: Optional[AudioConfigSchema] = None

    class Config:
        extra = "forbid"
|
|
@@ -1,135 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
-
# All rights reserved.
|
|
3
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
-
|
|
5
|
-
import logging
|
|
6
|
-
from typing import Optional
|
|
7
|
-
from typing import Tuple
|
|
8
|
-
|
|
9
|
-
from pydantic import field_validator, model_validator, ConfigDict, BaseModel
|
|
10
|
-
|
|
11
|
-
logger = logging.getLogger(__name__)
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
class ChartExtractorConfigSchema(BaseModel):
    """
    Endpoint and option settings for the chart extraction services.

    Parameters
    ----------
    auth_token : Optional[str], default=None
        Authentication token required for secure services.

    yolox_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        The (gRPC, HTTP) service pair for the yolox endpoint. One of the two may be
        empty, but not both.

    yolox_infer_protocol : str, default=""
        Protocol name for yolox; the validator in this class does not modify it.

    paddle_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        The (gRPC, HTTP) service pair for the paddle endpoint. One of the two may be
        empty, but not both.

    paddle_infer_protocol : str, default=""
        Protocol name for paddle; the validator in this class does not modify it.

    nim_batch_size : int, default=2
        Batch size setting; not interpreted by this schema.

    workers_per_progress_engine : int, default=5
        Worker count setting; not interpreted by this schema.

    Methods
    -------
    validate_endpoints(values)
        Checks that every endpoint pair has at least one usable service.

    Raises
    ------
    ValueError
        If both gRPC and HTTP services are empty for any endpoint.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    auth_token: Optional[str] = None

    yolox_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    yolox_infer_protocol: str = ""

    paddle_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    paddle_infer_protocol: str = ""

    nim_batch_size: int = 2
    workers_per_progress_engine: int = 5

    @model_validator(mode="before")
    @classmethod
    def validate_endpoints(cls, values):
        """
        Clean each endpoint pair and require at least one service per endpoint.

        Blank or quote-only service strings are replaced by None before the check,
        and the cleaned pair is written back into ``values``.

        Parameters
        ----------
        values : dict
            Raw input values for the model; updated in place.

        Returns
        -------
        dict
            The same dictionary with normalized endpoint tuples.

        Raises
        ------
        ValueError
            If both gRPC and HTTP services are empty for any endpoint.
        """

        def clean_service(service):
            """Return None for a missing, blank, or quote-only service; otherwise the service unchanged."""
            if service is None:
                return None
            is_blank = not service.strip()
            is_only_quotes = service.strip(" \"'") == ""
            return None if (is_blank or is_only_quotes) else service

        for endpoint_name in ("yolox_endpoints", "paddle_endpoints"):
            raw_grpc, raw_http = values.get(endpoint_name, (None, None))
            cleaned_pair = (clean_service(raw_grpc), clean_service(raw_http))

            # Neither slot holds a usable service.
            if not any(cleaned_pair):
                raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")

            values[endpoint_name] = cleaned_pair

        return values

    model_config = ConfigDict(extra="forbid")
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
class ChartExtractorSchema(BaseModel):
    """
    Configuration schema for chart extraction processing settings.

    Parameters
    ----------
    max_queue_size : int, default=1
        The maximum number of items allowed in the processing queue. Must be positive.

    n_workers : int, default=2
        The number of workers to use for processing. Must be positive.

    raise_on_failure : bool, default=False
        A flag indicating whether to raise an exception if a failure occurs during chart extraction.

    endpoint_config : Optional[ChartExtractorConfigSchema], default=None
        Configuration for the chart extraction stage, including yolox and paddle service endpoints.

    Raises
    ------
    ValueError
        If `max_queue_size` or `n_workers` is not greater than 0.
    """

    max_queue_size: int = 1
    n_workers: int = 2
    raise_on_failure: bool = False

    endpoint_config: Optional[ChartExtractorConfigSchema] = None

    @field_validator("max_queue_size", "n_workers")
    @classmethod
    def check_positive(cls, v, field):
        """
        Reject zero and negative values for the validated fields.

        Parameters
        ----------
        v : int
            The value being validated.
        field
            Validation info object supplied by pydantic; only its ``field_name``
            attribute is read, for the error message.

        Returns
        -------
        int
            The value unchanged when it is greater than 0.

        Raises
        ------
        ValueError
            If the value is less than or equal to 0.
        """
        if v <= 0:
            # The message states the bound that is actually enforced (0).
            raise ValueError(f"{field.field_name} must be greater than 0.")
        return v

    model_config = ConfigDict(extra="forbid")
|
|
@@ -1,124 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
-
# All rights reserved.
|
|
3
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
import logging
|
|
7
|
-
from typing import Optional
|
|
8
|
-
from typing import Tuple
|
|
9
|
-
|
|
10
|
-
from pydantic import model_validator, ConfigDict, BaseModel
|
|
11
|
-
|
|
12
|
-
logger = logging.getLogger(__name__)
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
class DocxConfigSchema(BaseModel):
    """
    Configuration schema for docx extraction endpoints and options.

    Parameters
    ----------
    auth_token : Optional[str], default=None
        Authentication token required for secure services.

    yolox_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        A tuple containing the gRPC and HTTP services for the yolox endpoint.
        Either the gRPC or HTTP service can be empty, but not both.

    yolox_infer_protocol : str, default=""
        Inference protocol name for yolox. When not supplied it is derived from the
        endpoints ("http" if an HTTP service is set, otherwise "grpc"). Always stored
        lower-cased.

    Methods
    -------
    validate_endpoints(values)
        Validates that at least one of the gRPC or HTTP services is provided for each endpoint.

    Raises
    ------
    ValueError
        If both gRPC and HTTP services are empty for any endpoint.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    auth_token: Optional[str] = None

    yolox_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    yolox_infer_protocol: str = ""

    @model_validator(mode="before")
    @classmethod
    def validate_endpoints(cls, values):
        """
        Normalize and validate the endpoint pair and inference protocol of each model.

        Parameters
        ----------
        values : dict
            Dictionary containing the raw input values for the class. It is updated
            in place with the cleaned endpoint tuples and the resolved protocols.

        Returns
        -------
        dict
            The validated dictionary of values.

        Raises
        ------
        ValueError
            If both gRPC and HTTP services are empty for any endpoint, or the endpoint
            pair is missing.
        """

        def clean_service(service):
            """Set service to None if it's an empty string or contains only spaces or quotes."""
            if service is None or not service.strip() or service.strip(" \"'") == "":
                return None
            return service

        for model_name in ["yolox"]:
            endpoint_name = f"{model_name}_endpoints"
            # A missing (or None) entry is treated as "no services configured" so that
            # the check below reports a clear ValueError instead of failing on unpacking.
            grpc_service, http_service = values.get(endpoint_name) or (None, None)
            grpc_service = clean_service(grpc_service)
            http_service = clean_service(http_service)

            if not grpc_service and not http_service:
                raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")

            values[endpoint_name] = (grpc_service, http_service)

            protocol_name = f"{model_name}_infer_protocol"
            protocol_value = values.get(protocol_name)
            if not protocol_value:
                # Prefer HTTP when both services are available.
                protocol_value = "http" if http_service else "grpc" if grpc_service else ""
            protocol_value = protocol_value.lower()
            values[protocol_name] = protocol_value

        return values

    model_config = ConfigDict(extra="forbid")
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
class DocxExtractorSchema(BaseModel):
    """
    Configuration schema for the docx extractor settings.

    Parameters
    ----------
    max_queue_size : int, default=1
        The maximum number of items allowed in the processing queue.

    n_workers : int, default=16
        The number of workers to use for processing.

    raise_on_failure : bool, default=False
        A flag indicating whether to raise an exception on processing failure.

    docx_extraction_config : Optional[DocxConfigSchema], default=None
        Configuration schema for the docx extraction stage.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    max_queue_size: int = 1
    n_workers: int = 16
    raise_on_failure: bool = False

    docx_extraction_config: Optional[DocxConfigSchema] = None
    model_config = ConfigDict(extra="forbid")
|