nv-ingest-api 2025.5.12.dev20250512__py3-none-any.whl → 2025.5.14.dev20250514__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. Click here for more details.
- nv_ingest_api/interface/transform.py +1 -1
- nv_ingest_api/internal/extract/docx/docx_extractor.py +3 -3
- nv_ingest_api/internal/extract/image/chart_extractor.py +3 -3
- nv_ingest_api/internal/extract/image/image_extractor.py +5 -5
- nv_ingest_api/internal/extract/image/image_helpers/common.py +1 -1
- nv_ingest_api/internal/extract/image/infographic_extractor.py +1 -1
- nv_ingest_api/internal/extract/image/table_extractor.py +2 -2
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +2 -2
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +1 -1
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +44 -17
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +1 -1
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +0 -1
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +7 -1
- nv_ingest_api/internal/primitives/nim/nim_client.py +1 -1
- nv_ingest_api/internal/primitives/tracing/tagging.py +20 -16
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +1 -1
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +2 -2
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +1 -1
- nv_ingest_api/internal/transform/caption_image.py +1 -1
- nv_ingest_api/internal/transform/embed_text.py +75 -56
- nv_ingest_api/util/exception_handlers/converters.py +1 -1
- nv_ingest_api/util/exception_handlers/decorators.py +309 -51
- nv_ingest_api/util/image_processing/processing.py +1 -1
- nv_ingest_api/util/logging/configuration.py +15 -8
- nv_ingest_api/util/pdf/pdfium.py +2 -2
- nv_ingest_api/util/service_clients/redis/redis_client.py +1 -1
- nv_ingest_api/util/service_clients/rest/rest_client.py +1 -1
- nv_ingest_api/util/system/__init__.py +0 -0
- nv_ingest_api/util/system/hardware_info.py +426 -0
- {nv_ingest_api-2025.5.12.dev20250512.dist-info → nv_ingest_api-2025.5.14.dev20250514.dist-info}/METADATA +1 -1
- {nv_ingest_api-2025.5.12.dev20250512.dist-info → nv_ingest_api-2025.5.14.dev20250514.dist-info}/RECORD +34 -32
- {nv_ingest_api-2025.5.12.dev20250512.dist-info → nv_ingest_api-2025.5.14.dev20250514.dist-info}/WHEEL +1 -1
- {nv_ingest_api-2025.5.12.dev20250512.dist-info → nv_ingest_api-2025.5.14.dev20250514.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.5.12.dev20250512.dist-info → nv_ingest_api-2025.5.14.dev20250514.dist-info}/top_level.txt +0 -0
|
@@ -2,77 +2,321 @@
|
|
|
2
2
|
# All rights reserved.
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
|
+
import asyncio
|
|
5
6
|
import logging
|
|
6
7
|
import functools
|
|
7
8
|
import inspect
|
|
8
9
|
import re
|
|
9
|
-
import
|
|
10
|
+
from typing import Any, Optional, Callable, Tuple
|
|
10
11
|
from functools import wraps
|
|
11
12
|
|
|
12
13
|
from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage
|
|
13
14
|
from nv_ingest_api.internal.primitives.tracing.logging import TaskResultStatus, annotate_task_result
|
|
14
15
|
from nv_ingest_api.util.control_message.validators import cm_ensure_payload_not_null, cm_set_failure
|
|
15
16
|
|
|
16
|
-
|
|
17
17
|
logger = logging.getLogger(__name__)
|
|
18
18
|
|
|
19
19
|
|
|
20
|
-
|
|
20
|
+
def nv_ingest_node_failure_try_except( # New name to distinguish
|
|
21
|
+
annotation_id: str,
|
|
22
|
+
payload_can_be_empty: bool = False,
|
|
23
|
+
raise_on_failure: bool = False,
|
|
24
|
+
skip_processing_if_failed: bool = True,
|
|
25
|
+
forward_func: Optional[Callable[[Any], Any]] = None,
|
|
26
|
+
) -> Callable:
|
|
27
|
+
"""
|
|
28
|
+
Decorator that wraps function execution in a try/except block to handle
|
|
29
|
+
failures by annotating an IngestControlMessage. Replaces the context
|
|
30
|
+
manager approach for potentially simpler interaction with frameworks like Ray.
|
|
31
|
+
|
|
32
|
+
Parameters are the same as nv_ingest_node_failure_context_manager.
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
def extract_message_and_prefix(args: Tuple) -> Tuple[Any, Tuple]:
|
|
36
|
+
"""Extracts control_message and potential 'self' prefix."""
|
|
37
|
+
# (Keep the implementation from the original decorator)
|
|
38
|
+
if args and hasattr(args[0], "get_metadata"):
|
|
39
|
+
return args[0], ()
|
|
40
|
+
elif len(args) >= 2 and hasattr(args[1], "get_metadata"):
|
|
41
|
+
return args[1], (args[0],)
|
|
42
|
+
else:
|
|
43
|
+
# Be more specific in error if possible
|
|
44
|
+
arg_types = [type(arg).__name__ for arg in args]
|
|
45
|
+
raise ValueError(f"No IngestControlMessage found in first or second argument. Got types: {arg_types}")
|
|
46
|
+
|
|
47
|
+
def decorator(func: Callable) -> Callable:
|
|
48
|
+
func_name = func.__name__ # Get function name for logging/errors
|
|
49
|
+
|
|
50
|
+
# --- ASYNC WRAPPER ---
|
|
51
|
+
if asyncio.iscoroutinefunction(func):
|
|
52
|
+
|
|
53
|
+
@functools.wraps(func)
|
|
54
|
+
async def async_wrapper(*args: Any, **kwargs: Any) -> Any:
|
|
55
|
+
logger.debug(f"async_wrapper for {func_name}: Entering.")
|
|
56
|
+
try:
|
|
57
|
+
control_message, prefix = extract_message_and_prefix(args)
|
|
58
|
+
except ValueError as e:
|
|
59
|
+
logger.error(f"async_wrapper for {func_name}: Failed to extract control message. Error: {e}")
|
|
60
|
+
raise # Cannot proceed without the message
|
|
61
|
+
|
|
62
|
+
# --- Skip logic ---
|
|
63
|
+
is_failed = control_message.get_metadata("cm_failed", False)
|
|
64
|
+
if is_failed and skip_processing_if_failed:
|
|
65
|
+
logger.debug(f"async_wrapper for {func_name}: Skipping processing, message already marked failed.")
|
|
66
|
+
if forward_func:
|
|
67
|
+
logger.debug("async_wrapper: Forwarding skipped message.")
|
|
68
|
+
# Await forward_func if it's async
|
|
69
|
+
if asyncio.iscoroutinefunction(forward_func):
|
|
70
|
+
return await forward_func(control_message)
|
|
71
|
+
else:
|
|
72
|
+
return forward_func(control_message)
|
|
73
|
+
else:
|
|
74
|
+
logger.debug("async_wrapper: Returning skipped message as is.")
|
|
75
|
+
return control_message
|
|
76
|
+
|
|
77
|
+
# --- Main execution block ---
|
|
78
|
+
result = None
|
|
79
|
+
try:
|
|
80
|
+
# Payload check
|
|
81
|
+
if not payload_can_be_empty:
|
|
82
|
+
cm_ensure_payload_not_null(control_message)
|
|
83
|
+
|
|
84
|
+
# Rebuild args and call original async function
|
|
85
|
+
new_args = prefix + (control_message,) + args[len(prefix) + 1 :]
|
|
86
|
+
logger.debug(f"async_wrapper for {func_name}: Calling await func...")
|
|
87
|
+
result = await func(*new_args, **kwargs)
|
|
88
|
+
logger.debug(f"async_wrapper for {func_name}: func call completed.")
|
|
89
|
+
|
|
90
|
+
# Success annotation
|
|
91
|
+
logger.debug(f"async_wrapper for {func_name}: Annotating success.")
|
|
92
|
+
annotate_task_result(
|
|
93
|
+
control_message=result if result is not None else control_message,
|
|
94
|
+
# Annotate result if func returns it, else original message
|
|
95
|
+
result=TaskResultStatus.SUCCESS,
|
|
96
|
+
task_id=annotation_id,
|
|
97
|
+
)
|
|
98
|
+
logger.debug(f"async_wrapper for {func_name}: Success annotation done. Returning result.")
|
|
99
|
+
return result
|
|
100
|
+
|
|
101
|
+
except Exception as e:
|
|
102
|
+
# --- Failure Handling ---
|
|
103
|
+
error_message = f"Error in {func_name}: {e}"
|
|
104
|
+
logger.error(f"async_wrapper for {func_name}: Caught exception: {error_message}", exc_info=True)
|
|
105
|
+
|
|
106
|
+
# Annotate failure on the original message object
|
|
107
|
+
try:
|
|
108
|
+
cm_set_failure(control_message, error_message)
|
|
109
|
+
annotate_task_result(
|
|
110
|
+
control_message=control_message,
|
|
111
|
+
result=TaskResultStatus.FAILURE,
|
|
112
|
+
task_id=annotation_id,
|
|
113
|
+
message=error_message,
|
|
114
|
+
)
|
|
115
|
+
logger.debug(f"async_wrapper for {func_name}: Failure annotation complete.")
|
|
116
|
+
except Exception as anno_err:
|
|
117
|
+
# Log error during annotation but proceed based on raise_on_failure
|
|
118
|
+
logger.exception(
|
|
119
|
+
f"async_wrapper for {func_name}: CRITICAL - Error during failure annotation: {anno_err}"
|
|
120
|
+
)
|
|
121
|
+
|
|
122
|
+
# Decide whether to raise or return annotated message
|
|
123
|
+
if raise_on_failure:
|
|
124
|
+
logger.debug(f"async_wrapper for {func_name}: Re-raising exception as configured.")
|
|
125
|
+
raise e # Re-raise the original exception
|
|
126
|
+
else:
|
|
127
|
+
logger.debug(
|
|
128
|
+
f"async_wrapper for {func_name}: Suppressing exception and returning annotated message."
|
|
129
|
+
)
|
|
130
|
+
# Return the original control_message, now annotated with failure
|
|
131
|
+
return control_message
|
|
132
|
+
|
|
133
|
+
return async_wrapper
|
|
134
|
+
|
|
135
|
+
# --- SYNC WRAPPER ---
|
|
136
|
+
else:
|
|
137
|
+
|
|
138
|
+
@functools.wraps(func)
|
|
139
|
+
def sync_wrapper(*args: Any, **kwargs: Any) -> Any:
|
|
140
|
+
logger.debug(f"sync_wrapper for {func_name}: Entering.")
|
|
141
|
+
try:
|
|
142
|
+
control_message, prefix = extract_message_and_prefix(args)
|
|
143
|
+
except ValueError as e:
|
|
144
|
+
logger.error(f"sync_wrapper for {func_name}: Failed to extract control message. Error: {e}")
|
|
145
|
+
raise
|
|
146
|
+
|
|
147
|
+
# --- Skip logic ---
|
|
148
|
+
is_failed = control_message.get_metadata("cm_failed", False)
|
|
149
|
+
if is_failed and skip_processing_if_failed:
|
|
150
|
+
logger.warning(f"sync_wrapper for {func_name}: Skipping processing, message already marked failed.")
|
|
151
|
+
if forward_func:
|
|
152
|
+
logger.debug("sync_wrapper: Forwarding skipped message.")
|
|
153
|
+
return forward_func(control_message) # Assume forward_func is sync here
|
|
154
|
+
else:
|
|
155
|
+
logger.debug("sync_wrapper: Returning skipped message as is.")
|
|
156
|
+
return control_message
|
|
157
|
+
|
|
158
|
+
# --- Main execution block ---
|
|
159
|
+
result = None
|
|
160
|
+
try:
|
|
161
|
+
# Payload check
|
|
162
|
+
if not payload_can_be_empty:
|
|
163
|
+
cm_ensure_payload_not_null(control_message)
|
|
164
|
+
|
|
165
|
+
# Rebuild args and call original sync function
|
|
166
|
+
new_args = prefix + (control_message,) + args[len(prefix) + 1 :]
|
|
167
|
+
logger.debug(f"sync_wrapper for {func_name}: Calling func...")
|
|
168
|
+
result = func(*new_args, **kwargs)
|
|
169
|
+
logger.debug(f"sync_wrapper for {func_name}: func call completed.")
|
|
170
|
+
|
|
171
|
+
# Success annotation
|
|
172
|
+
logger.debug(f"sync_wrapper for {func_name}: Annotating success.")
|
|
173
|
+
annotate_task_result(
|
|
174
|
+
control_message=result if result is not None else control_message,
|
|
175
|
+
# Annotate result or original message
|
|
176
|
+
result=TaskResultStatus.SUCCESS,
|
|
177
|
+
task_id=annotation_id,
|
|
178
|
+
)
|
|
179
|
+
logger.debug(f"sync_wrapper for {func_name}: Success annotation done. Returning result.")
|
|
180
|
+
return result
|
|
181
|
+
|
|
182
|
+
except Exception as e:
|
|
183
|
+
# --- Failure Handling ---
|
|
184
|
+
error_message = f"Error in {func_name}: {e}"
|
|
185
|
+
logger.error(f"sync_wrapper for {func_name}: Caught exception: {error_message}", exc_info=True)
|
|
186
|
+
|
|
187
|
+
# Annotate failure on the original message object
|
|
188
|
+
try:
|
|
189
|
+
cm_set_failure(control_message, error_message)
|
|
190
|
+
annotate_task_result(
|
|
191
|
+
control_message=control_message,
|
|
192
|
+
result=TaskResultStatus.FAILURE,
|
|
193
|
+
task_id=annotation_id,
|
|
194
|
+
message=error_message,
|
|
195
|
+
)
|
|
196
|
+
logger.debug(f"sync_wrapper for {func_name}: Failure annotation complete.")
|
|
197
|
+
except Exception as anno_err:
|
|
198
|
+
logger.exception(
|
|
199
|
+
f"sync_wrapper for {func_name}: CRITICAL - Error during failure annotation: {anno_err}"
|
|
200
|
+
)
|
|
201
|
+
|
|
202
|
+
# Decide whether to raise or return annotated message
|
|
203
|
+
if raise_on_failure:
|
|
204
|
+
logger.debug(f"sync_wrapper for {func_name}: Re-raising exception as configured.")
|
|
205
|
+
raise e # Re-raise the original exception
|
|
206
|
+
else:
|
|
207
|
+
logger.debug(
|
|
208
|
+
f"sync_wrapper for {func_name}: Suppressing exception and returning annotated message."
|
|
209
|
+
)
|
|
210
|
+
# Return the original control_message, now annotated with failure
|
|
211
|
+
return control_message
|
|
212
|
+
|
|
213
|
+
return sync_wrapper
|
|
214
|
+
|
|
215
|
+
return decorator
|
|
216
|
+
|
|
217
|
+
|
|
21
218
|
def nv_ingest_node_failure_context_manager(
|
|
22
219
|
annotation_id: str,
|
|
23
220
|
payload_can_be_empty: bool = False,
|
|
24
221
|
raise_on_failure: bool = False,
|
|
25
222
|
skip_processing_if_failed: bool = True,
|
|
26
|
-
forward_func=None,
|
|
27
|
-
) ->
|
|
223
|
+
forward_func: Optional[Callable[[Any], Any]] = None,
|
|
224
|
+
) -> Callable:
|
|
28
225
|
"""
|
|
29
|
-
|
|
30
|
-
|
|
226
|
+
Decorator that applies a failure context manager around a function processing an IngestControlMessage.
|
|
227
|
+
Works with both synchronous and asynchronous functions, and supports class methods (with 'self').
|
|
31
228
|
|
|
32
229
|
Parameters
|
|
33
230
|
----------
|
|
34
231
|
annotation_id : str
|
|
35
|
-
A unique identifier
|
|
232
|
+
A unique identifier for annotation.
|
|
36
233
|
payload_can_be_empty : bool, optional
|
|
37
|
-
If False, the payload
|
|
38
|
-
raising an exception if it is null. Defaults to False, enforcing payload presence.
|
|
234
|
+
If False, the message payload must not be null.
|
|
39
235
|
raise_on_failure : bool, optional
|
|
40
|
-
If True,
|
|
41
|
-
Otherwise, the error is handled silently by annotating the IngestControlMessage. Defaults to False.
|
|
236
|
+
If True, exceptions are raised; otherwise, they are annotated.
|
|
42
237
|
skip_processing_if_failed : bool, optional
|
|
43
|
-
If True,
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
forward_func : callable, optional
|
|
47
|
-
A function to forward the IngestControlMessage if it has already been marked as failed.
|
|
238
|
+
If True, skip processing if the message is already marked as failed.
|
|
239
|
+
forward_func : Optional[Callable[[Any], Any]]
|
|
240
|
+
If provided, a function to forward the message when processing is skipped.
|
|
48
241
|
|
|
49
242
|
Returns
|
|
50
243
|
-------
|
|
51
244
|
Callable
|
|
52
|
-
|
|
245
|
+
The decorated function.
|
|
53
246
|
"""
|
|
54
247
|
|
|
55
|
-
def
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
cm_ensure_payload_not_null(control_message=control_message)
|
|
69
|
-
control_message = func(ctx_mgr.control_message, *args, **kwargs)
|
|
70
|
-
else:
|
|
71
|
-
if forward_func:
|
|
72
|
-
control_message = forward_func(control_message)
|
|
73
|
-
return control_message
|
|
248
|
+
def extract_message_and_prefix(args: Tuple) -> Tuple[Any, Tuple]:
|
|
249
|
+
"""
|
|
250
|
+
Determines if the function is a method (first argument is self) or a standalone function.
|
|
251
|
+
Returns a tuple (control_message, prefix) where prefix is a tuple of preceding arguments to be preserved.
|
|
252
|
+
"""
|
|
253
|
+
if args and hasattr(args[0], "get_metadata"):
|
|
254
|
+
# Standalone function: first argument is the message.
|
|
255
|
+
return args[0], ()
|
|
256
|
+
elif len(args) >= 2 and hasattr(args[1], "get_metadata"):
|
|
257
|
+
# Method: first argument is self, second is the message.
|
|
258
|
+
return args[1], (args[0],)
|
|
259
|
+
else:
|
|
260
|
+
raise ValueError("No IngestControlMessage found in the first or second argument.")
|
|
74
261
|
|
|
75
|
-
|
|
262
|
+
def decorator(func: Callable) -> Callable:
|
|
263
|
+
if asyncio.iscoroutinefunction(func):
|
|
264
|
+
|
|
265
|
+
@functools.wraps(func)
|
|
266
|
+
async def async_wrapper(*args: Any, **kwargs: Any) -> Any:
|
|
267
|
+
control_message, prefix = extract_message_and_prefix(args)
|
|
268
|
+
is_failed = control_message.get_metadata("cm_failed", False)
|
|
269
|
+
if not is_failed or not skip_processing_if_failed:
|
|
270
|
+
ctx_mgr = CMNVIngestFailureContextManager(
|
|
271
|
+
control_message=control_message,
|
|
272
|
+
annotation_id=annotation_id,
|
|
273
|
+
raise_on_failure=raise_on_failure,
|
|
274
|
+
func_name=func.__name__,
|
|
275
|
+
)
|
|
276
|
+
try:
|
|
277
|
+
ctx_mgr.__enter__()
|
|
278
|
+
if not payload_can_be_empty:
|
|
279
|
+
cm_ensure_payload_not_null(control_message)
|
|
280
|
+
# Rebuild argument list preserving any prefix (e.g. self).
|
|
281
|
+
new_args = prefix + (ctx_mgr.control_message,) + args[len(prefix) + 1 :]
|
|
282
|
+
result = await func(*new_args, **kwargs)
|
|
283
|
+
except Exception as e:
|
|
284
|
+
ctx_mgr.__exit__(type(e), e, e.__traceback__)
|
|
285
|
+
raise
|
|
286
|
+
else:
|
|
287
|
+
ctx_mgr.__exit__(None, None, None)
|
|
288
|
+
return result
|
|
289
|
+
else:
|
|
290
|
+
if forward_func:
|
|
291
|
+
return await forward_func(control_message)
|
|
292
|
+
else:
|
|
293
|
+
return control_message
|
|
294
|
+
|
|
295
|
+
return async_wrapper
|
|
296
|
+
else:
|
|
297
|
+
|
|
298
|
+
@functools.wraps(func)
|
|
299
|
+
def sync_wrapper(*args: Any, **kwargs: Any) -> Any:
|
|
300
|
+
control_message, prefix = extract_message_and_prefix(args)
|
|
301
|
+
is_failed = control_message.get_metadata("cm_failed", False)
|
|
302
|
+
if not is_failed or not skip_processing_if_failed:
|
|
303
|
+
with CMNVIngestFailureContextManager(
|
|
304
|
+
control_message=control_message,
|
|
305
|
+
annotation_id=annotation_id,
|
|
306
|
+
raise_on_failure=raise_on_failure,
|
|
307
|
+
func_name=func.__name__,
|
|
308
|
+
) as ctx_mgr:
|
|
309
|
+
if not payload_can_be_empty:
|
|
310
|
+
cm_ensure_payload_not_null(control_message)
|
|
311
|
+
new_args = prefix + (ctx_mgr.control_message,) + args[len(prefix) + 1 :]
|
|
312
|
+
return func(*new_args, **kwargs)
|
|
313
|
+
else:
|
|
314
|
+
if forward_func:
|
|
315
|
+
return forward_func(control_message)
|
|
316
|
+
else:
|
|
317
|
+
return control_message
|
|
318
|
+
|
|
319
|
+
return sync_wrapper
|
|
76
320
|
|
|
77
321
|
return decorator
|
|
78
322
|
|
|
@@ -81,7 +325,7 @@ def nv_ingest_source_failure_context_manager(
|
|
|
81
325
|
annotation_id: str,
|
|
82
326
|
payload_can_be_empty: bool = False,
|
|
83
327
|
raise_on_failure: bool = False,
|
|
84
|
-
) ->
|
|
328
|
+
) -> Callable:
|
|
85
329
|
"""
|
|
86
330
|
A decorator that ensures any function's output is treated as a IngestControlMessage for annotation.
|
|
87
331
|
It applies a context manager to handle success and failure annotations based on the function's execution.
|
|
@@ -209,15 +453,29 @@ class CMNVIngestFailureContextManager:
|
|
|
209
453
|
|
|
210
454
|
|
|
211
455
|
def unified_exception_handler(func):
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
456
|
+
if asyncio.iscoroutinefunction(func):
|
|
457
|
+
|
|
458
|
+
@functools.wraps(func)
|
|
459
|
+
async def async_wrapper(*args, **kwargs):
|
|
460
|
+
try:
|
|
461
|
+
return await func(*args, **kwargs)
|
|
462
|
+
except Exception as e:
|
|
463
|
+
func_name = func.__name__
|
|
464
|
+
err_msg = f"{func_name}: error: {e}"
|
|
465
|
+
logger.exception(err_msg, exc_info=True)
|
|
466
|
+
raise type(e)(err_msg) from e
|
|
467
|
+
|
|
468
|
+
return async_wrapper
|
|
469
|
+
else:
|
|
470
|
+
|
|
471
|
+
@functools.wraps(func)
|
|
472
|
+
def sync_wrapper(*args, **kwargs):
|
|
473
|
+
try:
|
|
474
|
+
return func(*args, **kwargs)
|
|
475
|
+
except Exception as e:
|
|
476
|
+
func_name = func.__name__
|
|
477
|
+
err_msg = f"{func_name}: error: {e}"
|
|
478
|
+
logger.exception(err_msg, exc_info=True)
|
|
479
|
+
raise type(e)(err_msg) from e
|
|
480
|
+
|
|
481
|
+
return sync_wrapper
|
|
@@ -150,7 +150,7 @@ def extract_tables_and_charts_yolox(
|
|
|
150
150
|
min_score=YOLOX_MIN_SCORE,
|
|
151
151
|
final_thresh=YOLOX_FINAL_SCORE,
|
|
152
152
|
trace_info=trace_info,
|
|
153
|
-
stage_name="
|
|
153
|
+
stage_name="pdf_extraction",
|
|
154
154
|
)
|
|
155
155
|
|
|
156
156
|
# Process results: iterate over each image's inference output.
|
|
@@ -9,6 +9,7 @@ from enum import Enum
|
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class LogLevel(str, Enum):
|
|
12
|
+
DEFAULT = "DEFAULT"
|
|
12
13
|
DEBUG = "DEBUG"
|
|
13
14
|
INFO = "INFO"
|
|
14
15
|
WARNING = "WARNING"
|
|
@@ -16,16 +17,22 @@ class LogLevel(str, Enum):
|
|
|
16
17
|
CRITICAL = "CRITICAL"
|
|
17
18
|
|
|
18
19
|
|
|
19
|
-
def configure_logging(
|
|
20
|
-
"""
|
|
21
|
-
Parameters:
|
|
22
|
-
- level_name (str): The name of the logging level (e.g., "DEBUG", "INFO").
|
|
20
|
+
def configure_logging(level_name: str) -> None:
|
|
23
21
|
"""
|
|
22
|
+
Configures global logging.
|
|
24
23
|
|
|
25
|
-
|
|
24
|
+
Parameters
|
|
25
|
+
----------
|
|
26
|
+
level_name : str
|
|
27
|
+
The name of the logging level (e.g., "DEBUG", "INFO").
|
|
28
|
+
"""
|
|
29
|
+
numeric_level = getattr(logging, level_name.upper(), None)
|
|
26
30
|
if not isinstance(numeric_level, int):
|
|
27
31
|
raise ValueError(f"Invalid log level: {level_name}")
|
|
28
32
|
|
|
29
|
-
logging.
|
|
30
|
-
|
|
31
|
-
|
|
33
|
+
logging.basicConfig(
|
|
34
|
+
level=numeric_level,
|
|
35
|
+
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
|
36
|
+
stream=sys.stdout,
|
|
37
|
+
force=True, # <- reconfigures even if basicConfig was called earlier (Python 3.8+)
|
|
38
|
+
)
|
nv_ingest_api/util/pdf/pdfium.py
CHANGED
|
@@ -119,7 +119,7 @@ def pdfium_try_get_bitmap_as_numpy(image_obj) -> np.ndarray:
|
|
|
119
119
|
return img_array
|
|
120
120
|
|
|
121
121
|
|
|
122
|
-
@traceable_func(trace_name="
|
|
122
|
+
@traceable_func(trace_name="pdf_extraction::pdfium_pages_to_numpy")
|
|
123
123
|
def pdfium_pages_to_numpy(
|
|
124
124
|
pages: List[pdfium.PdfPage],
|
|
125
125
|
render_dpi: int = 300,
|
|
@@ -394,7 +394,7 @@ def extract_image_like_objects_from_pdfium_page(page, merge=True, **kwargs):
|
|
|
394
394
|
try:
|
|
395
395
|
original_images, _ = pdfium_pages_to_numpy(
|
|
396
396
|
[page], # A batch with a single image.
|
|
397
|
-
render_dpi=
|
|
397
|
+
render_dpi=300, # dpi = 72 is equivalent to scale = 1.
|
|
398
398
|
rotation=rotation, # Without rotation, coordinates from page.get_pos() will not match.
|
|
399
399
|
)
|
|
400
400
|
image_bboxes = extract_merged_images_from_pdfium_page(page, merge=merge, **kwargs)
|
|
@@ -446,7 +446,7 @@ class RedisClient(MessageBrokerClientBase):
|
|
|
446
446
|
current_time: float = time.monotonic()
|
|
447
447
|
elapsed_time: float = current_time - start_time
|
|
448
448
|
if elapsed_time > timeout:
|
|
449
|
-
logger.
|
|
449
|
+
logger.debug(f"Overall timeout ({timeout}s) exceeded for non-destructive fetch of '{channel_name}'.")
|
|
450
450
|
if expected_count:
|
|
451
451
|
raise TimeoutError(
|
|
452
452
|
f"Timeout collecting fragments for {channel_name}. "
|
|
@@ -470,7 +470,7 @@ class RestClient(MessageBrokerClientBase):
|
|
|
470
470
|
f"Requires a requests.Session compatible API."
|
|
471
471
|
)
|
|
472
472
|
except requests.exceptions.RequestException as err:
|
|
473
|
-
logger.
|
|
473
|
+
logger.debug(
|
|
474
474
|
f"RequestException submitting job: {err}. Attempting retry ({retries + 1}/{self._max_retries})..."
|
|
475
475
|
)
|
|
476
476
|
try:
|
|
File without changes
|