nv-ingest-api 2025.5.12.dev20250512__py3-none-any.whl → 2025.5.13.dev20250513__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (28)
  1. nv_ingest_api/interface/transform.py +1 -1
  2. nv_ingest_api/internal/extract/docx/docx_extractor.py +3 -3
  3. nv_ingest_api/internal/extract/image/image_extractor.py +5 -5
  4. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +1 -1
  5. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +44 -17
  6. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +1 -1
  7. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +0 -1
  8. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +7 -1
  9. nv_ingest_api/internal/primitives/nim/nim_client.py +1 -1
  10. nv_ingest_api/internal/primitives/tracing/tagging.py +20 -16
  11. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +1 -1
  12. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +2 -2
  13. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +1 -1
  14. nv_ingest_api/internal/transform/caption_image.py +1 -1
  15. nv_ingest_api/internal/transform/embed_text.py +75 -56
  16. nv_ingest_api/util/exception_handlers/converters.py +1 -1
  17. nv_ingest_api/util/exception_handlers/decorators.py +309 -51
  18. nv_ingest_api/util/logging/configuration.py +15 -8
  19. nv_ingest_api/util/pdf/pdfium.py +1 -1
  20. nv_ingest_api/util/service_clients/redis/redis_client.py +1 -1
  21. nv_ingest_api/util/service_clients/rest/rest_client.py +1 -1
  22. nv_ingest_api/util/system/__init__.py +0 -0
  23. nv_ingest_api/util/system/hardware_info.py +426 -0
  24. {nv_ingest_api-2025.5.12.dev20250512.dist-info → nv_ingest_api-2025.5.13.dev20250513.dist-info}/METADATA +1 -1
  25. {nv_ingest_api-2025.5.12.dev20250512.dist-info → nv_ingest_api-2025.5.13.dev20250513.dist-info}/RECORD +28 -26
  26. {nv_ingest_api-2025.5.12.dev20250512.dist-info → nv_ingest_api-2025.5.13.dev20250513.dist-info}/WHEEL +0 -0
  27. {nv_ingest_api-2025.5.12.dev20250512.dist-info → nv_ingest_api-2025.5.13.dev20250513.dist-info}/licenses/LICENSE +0 -0
  28. {nv_ingest_api-2025.5.12.dev20250512.dist-info → nv_ingest_api-2025.5.13.dev20250513.dist-info}/top_level.txt +0 -0
@@ -2,77 +2,321 @@
2
2
  # All rights reserved.
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
+ import asyncio
5
6
  import logging
6
7
  import functools
7
8
  import inspect
8
9
  import re
9
- import typing
10
+ from typing import Any, Optional, Callable, Tuple
10
11
  from functools import wraps
11
12
 
12
13
  from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage
13
14
  from nv_ingest_api.internal.primitives.tracing.logging import TaskResultStatus, annotate_task_result
14
15
  from nv_ingest_api.util.control_message.validators import cm_ensure_payload_not_null, cm_set_failure
15
16
 
16
-
17
17
  logger = logging.getLogger(__name__)
18
18
 
19
19
 
20
- # TODO(Devin): move back to framework
20
+ def nv_ingest_node_failure_try_except( # New name to distinguish
21
+ annotation_id: str,
22
+ payload_can_be_empty: bool = False,
23
+ raise_on_failure: bool = False,
24
+ skip_processing_if_failed: bool = True,
25
+ forward_func: Optional[Callable[[Any], Any]] = None,
26
+ ) -> Callable:
27
+ """
28
+ Decorator that wraps function execution in a try/except block to handle
29
+ failures by annotating an IngestControlMessage. Replaces the context
30
+ manager approach for potentially simpler interaction with frameworks like Ray.
31
+
32
+ Parameters are the same as nv_ingest_node_failure_context_manager.
33
+ """
34
+
35
+ def extract_message_and_prefix(args: Tuple) -> Tuple[Any, Tuple]:
36
+ """Extracts control_message and potential 'self' prefix."""
37
+ # (Keep the implementation from the original decorator)
38
+ if args and hasattr(args[0], "get_metadata"):
39
+ return args[0], ()
40
+ elif len(args) >= 2 and hasattr(args[1], "get_metadata"):
41
+ return args[1], (args[0],)
42
+ else:
43
+ # Be more specific in error if possible
44
+ arg_types = [type(arg).__name__ for arg in args]
45
+ raise ValueError(f"No IngestControlMessage found in first or second argument. Got types: {arg_types}")
46
+
47
+ def decorator(func: Callable) -> Callable:
48
+ func_name = func.__name__ # Get function name for logging/errors
49
+
50
+ # --- ASYNC WRAPPER ---
51
+ if asyncio.iscoroutinefunction(func):
52
+
53
+ @functools.wraps(func)
54
+ async def async_wrapper(*args: Any, **kwargs: Any) -> Any:
55
+ logger.debug(f"async_wrapper for {func_name}: Entering.")
56
+ try:
57
+ control_message, prefix = extract_message_and_prefix(args)
58
+ except ValueError as e:
59
+ logger.error(f"async_wrapper for {func_name}: Failed to extract control message. Error: {e}")
60
+ raise # Cannot proceed without the message
61
+
62
+ # --- Skip logic ---
63
+ is_failed = control_message.get_metadata("cm_failed", False)
64
+ if is_failed and skip_processing_if_failed:
65
+ logger.debug(f"async_wrapper for {func_name}: Skipping processing, message already marked failed.")
66
+ if forward_func:
67
+ logger.debug("async_wrapper: Forwarding skipped message.")
68
+ # Await forward_func if it's async
69
+ if asyncio.iscoroutinefunction(forward_func):
70
+ return await forward_func(control_message)
71
+ else:
72
+ return forward_func(control_message)
73
+ else:
74
+ logger.debug("async_wrapper: Returning skipped message as is.")
75
+ return control_message
76
+
77
+ # --- Main execution block ---
78
+ result = None
79
+ try:
80
+ # Payload check
81
+ if not payload_can_be_empty:
82
+ cm_ensure_payload_not_null(control_message)
83
+
84
+ # Rebuild args and call original async function
85
+ new_args = prefix + (control_message,) + args[len(prefix) + 1 :]
86
+ logger.debug(f"async_wrapper for {func_name}: Calling await func...")
87
+ result = await func(*new_args, **kwargs)
88
+ logger.debug(f"async_wrapper for {func_name}: func call completed.")
89
+
90
+ # Success annotation
91
+ logger.debug(f"async_wrapper for {func_name}: Annotating success.")
92
+ annotate_task_result(
93
+ control_message=result if result is not None else control_message,
94
+ # Annotate result if func returns it, else original message
95
+ result=TaskResultStatus.SUCCESS,
96
+ task_id=annotation_id,
97
+ )
98
+ logger.debug(f"async_wrapper for {func_name}: Success annotation done. Returning result.")
99
+ return result
100
+
101
+ except Exception as e:
102
+ # --- Failure Handling ---
103
+ error_message = f"Error in {func_name}: {e}"
104
+ logger.error(f"async_wrapper for {func_name}: Caught exception: {error_message}", exc_info=True)
105
+
106
+ # Annotate failure on the original message object
107
+ try:
108
+ cm_set_failure(control_message, error_message)
109
+ annotate_task_result(
110
+ control_message=control_message,
111
+ result=TaskResultStatus.FAILURE,
112
+ task_id=annotation_id,
113
+ message=error_message,
114
+ )
115
+ logger.debug(f"async_wrapper for {func_name}: Failure annotation complete.")
116
+ except Exception as anno_err:
117
+ # Log error during annotation but proceed based on raise_on_failure
118
+ logger.exception(
119
+ f"async_wrapper for {func_name}: CRITICAL - Error during failure annotation: {anno_err}"
120
+ )
121
+
122
+ # Decide whether to raise or return annotated message
123
+ if raise_on_failure:
124
+ logger.debug(f"async_wrapper for {func_name}: Re-raising exception as configured.")
125
+ raise e # Re-raise the original exception
126
+ else:
127
+ logger.debug(
128
+ f"async_wrapper for {func_name}: Suppressing exception and returning annotated message."
129
+ )
130
+ # Return the original control_message, now annotated with failure
131
+ return control_message
132
+
133
+ return async_wrapper
134
+
135
+ # --- SYNC WRAPPER ---
136
+ else:
137
+
138
+ @functools.wraps(func)
139
+ def sync_wrapper(*args: Any, **kwargs: Any) -> Any:
140
+ logger.debug(f"sync_wrapper for {func_name}: Entering.")
141
+ try:
142
+ control_message, prefix = extract_message_and_prefix(args)
143
+ except ValueError as e:
144
+ logger.error(f"sync_wrapper for {func_name}: Failed to extract control message. Error: {e}")
145
+ raise
146
+
147
+ # --- Skip logic ---
148
+ is_failed = control_message.get_metadata("cm_failed", False)
149
+ if is_failed and skip_processing_if_failed:
150
+ logger.warning(f"sync_wrapper for {func_name}: Skipping processing, message already marked failed.")
151
+ if forward_func:
152
+ logger.debug("sync_wrapper: Forwarding skipped message.")
153
+ return forward_func(control_message) # Assume forward_func is sync here
154
+ else:
155
+ logger.debug("sync_wrapper: Returning skipped message as is.")
156
+ return control_message
157
+
158
+ # --- Main execution block ---
159
+ result = None
160
+ try:
161
+ # Payload check
162
+ if not payload_can_be_empty:
163
+ cm_ensure_payload_not_null(control_message)
164
+
165
+ # Rebuild args and call original sync function
166
+ new_args = prefix + (control_message,) + args[len(prefix) + 1 :]
167
+ logger.debug(f"sync_wrapper for {func_name}: Calling func...")
168
+ result = func(*new_args, **kwargs)
169
+ logger.debug(f"sync_wrapper for {func_name}: func call completed.")
170
+
171
+ # Success annotation
172
+ logger.debug(f"sync_wrapper for {func_name}: Annotating success.")
173
+ annotate_task_result(
174
+ control_message=result if result is not None else control_message,
175
+ # Annotate result or original message
176
+ result=TaskResultStatus.SUCCESS,
177
+ task_id=annotation_id,
178
+ )
179
+ logger.debug(f"sync_wrapper for {func_name}: Success annotation done. Returning result.")
180
+ return result
181
+
182
+ except Exception as e:
183
+ # --- Failure Handling ---
184
+ error_message = f"Error in {func_name}: {e}"
185
+ logger.error(f"sync_wrapper for {func_name}: Caught exception: {error_message}", exc_info=True)
186
+
187
+ # Annotate failure on the original message object
188
+ try:
189
+ cm_set_failure(control_message, error_message)
190
+ annotate_task_result(
191
+ control_message=control_message,
192
+ result=TaskResultStatus.FAILURE,
193
+ task_id=annotation_id,
194
+ message=error_message,
195
+ )
196
+ logger.debug(f"sync_wrapper for {func_name}: Failure annotation complete.")
197
+ except Exception as anno_err:
198
+ logger.exception(
199
+ f"sync_wrapper for {func_name}: CRITICAL - Error during failure annotation: {anno_err}"
200
+ )
201
+
202
+ # Decide whether to raise or return annotated message
203
+ if raise_on_failure:
204
+ logger.debug(f"sync_wrapper for {func_name}: Re-raising exception as configured.")
205
+ raise e # Re-raise the original exception
206
+ else:
207
+ logger.debug(
208
+ f"sync_wrapper for {func_name}: Suppressing exception and returning annotated message."
209
+ )
210
+ # Return the original control_message, now annotated with failure
211
+ return control_message
212
+
213
+ return sync_wrapper
214
+
215
+ return decorator
216
+
217
+
21
218
  def nv_ingest_node_failure_context_manager(
22
219
  annotation_id: str,
23
220
  payload_can_be_empty: bool = False,
24
221
  raise_on_failure: bool = False,
25
222
  skip_processing_if_failed: bool = True,
26
- forward_func=None,
27
- ) -> typing.Callable:
223
+ forward_func: Optional[Callable[[Any], Any]] = None,
224
+ ) -> Callable:
28
225
  """
29
- A decorator that applies a default failure context manager around a function to manage
30
- the execution and potential failure of operations involving IngestControlMessages.
226
+ Decorator that applies a failure context manager around a function processing an IngestControlMessage.
227
+ Works with both synchronous and asynchronous functions, and supports class methods (with 'self').
31
228
 
32
229
  Parameters
33
230
  ----------
34
231
  annotation_id : str
35
- A unique identifier used for annotating the task's result.
232
+ A unique identifier for annotation.
36
233
  payload_can_be_empty : bool, optional
37
- If False, the payload of the IngestControlMessage will be checked to ensure it's not null,
38
- raising an exception if it is null. Defaults to False, enforcing payload presence.
234
+ If False, the message payload must not be null.
39
235
  raise_on_failure : bool, optional
40
- If True, an exception is raised if the decorated function encounters an error.
41
- Otherwise, the error is handled silently by annotating the IngestControlMessage. Defaults to False.
236
+ If True, exceptions are raised; otherwise, they are annotated.
42
237
  skip_processing_if_failed : bool, optional
43
- If True, skips the processing of the decorated function if the control message has already
44
- been marked as failed. If False, the function will be processed regardless of the failure
45
- status of the IngestControlMessage. Defaults to True.
46
- forward_func : callable, optional
47
- A function to forward the IngestControlMessage if it has already been marked as failed.
238
+ If True, skip processing if the message is already marked as failed.
239
+ forward_func : Optional[Callable[[Any], Any]]
240
+ If provided, a function to forward the message when processing is skipped.
48
241
 
49
242
  Returns
50
243
  -------
51
244
  Callable
52
- A decorator that wraps the given function with failure handling logic.
245
+ The decorated function.
53
246
  """
54
247
 
55
- def decorator(func):
56
- @wraps(func)
57
- def wrapper(control_message: IngestControlMessage, *args, **kwargs):
58
- # Quick return if the IngestControlMessage has already failed
59
- is_failed = control_message.get_metadata("cm_failed", False)
60
- if not is_failed or not skip_processing_if_failed:
61
- with CMNVIngestFailureContextManager(
62
- control_message=control_message,
63
- annotation_id=annotation_id,
64
- raise_on_failure=raise_on_failure,
65
- func_name=func.__name__,
66
- ) as ctx_mgr:
67
- if not payload_can_be_empty:
68
- cm_ensure_payload_not_null(control_message=control_message)
69
- control_message = func(ctx_mgr.control_message, *args, **kwargs)
70
- else:
71
- if forward_func:
72
- control_message = forward_func(control_message)
73
- return control_message
248
+ def extract_message_and_prefix(args: Tuple) -> Tuple[Any, Tuple]:
249
+ """
250
+ Determines if the function is a method (first argument is self) or a standalone function.
251
+ Returns a tuple (control_message, prefix) where prefix is a tuple of preceding arguments to be preserved.
252
+ """
253
+ if args and hasattr(args[0], "get_metadata"):
254
+ # Standalone function: first argument is the message.
255
+ return args[0], ()
256
+ elif len(args) >= 2 and hasattr(args[1], "get_metadata"):
257
+ # Method: first argument is self, second is the message.
258
+ return args[1], (args[0],)
259
+ else:
260
+ raise ValueError("No IngestControlMessage found in the first or second argument.")
74
261
 
75
- return wrapper
262
+ def decorator(func: Callable) -> Callable:
263
+ if asyncio.iscoroutinefunction(func):
264
+
265
+ @functools.wraps(func)
266
+ async def async_wrapper(*args: Any, **kwargs: Any) -> Any:
267
+ control_message, prefix = extract_message_and_prefix(args)
268
+ is_failed = control_message.get_metadata("cm_failed", False)
269
+ if not is_failed or not skip_processing_if_failed:
270
+ ctx_mgr = CMNVIngestFailureContextManager(
271
+ control_message=control_message,
272
+ annotation_id=annotation_id,
273
+ raise_on_failure=raise_on_failure,
274
+ func_name=func.__name__,
275
+ )
276
+ try:
277
+ ctx_mgr.__enter__()
278
+ if not payload_can_be_empty:
279
+ cm_ensure_payload_not_null(control_message)
280
+ # Rebuild argument list preserving any prefix (e.g. self).
281
+ new_args = prefix + (ctx_mgr.control_message,) + args[len(prefix) + 1 :]
282
+ result = await func(*new_args, **kwargs)
283
+ except Exception as e:
284
+ ctx_mgr.__exit__(type(e), e, e.__traceback__)
285
+ raise
286
+ else:
287
+ ctx_mgr.__exit__(None, None, None)
288
+ return result
289
+ else:
290
+ if forward_func:
291
+ return await forward_func(control_message)
292
+ else:
293
+ return control_message
294
+
295
+ return async_wrapper
296
+ else:
297
+
298
+ @functools.wraps(func)
299
+ def sync_wrapper(*args: Any, **kwargs: Any) -> Any:
300
+ control_message, prefix = extract_message_and_prefix(args)
301
+ is_failed = control_message.get_metadata("cm_failed", False)
302
+ if not is_failed or not skip_processing_if_failed:
303
+ with CMNVIngestFailureContextManager(
304
+ control_message=control_message,
305
+ annotation_id=annotation_id,
306
+ raise_on_failure=raise_on_failure,
307
+ func_name=func.__name__,
308
+ ) as ctx_mgr:
309
+ if not payload_can_be_empty:
310
+ cm_ensure_payload_not_null(control_message)
311
+ new_args = prefix + (ctx_mgr.control_message,) + args[len(prefix) + 1 :]
312
+ return func(*new_args, **kwargs)
313
+ else:
314
+ if forward_func:
315
+ return forward_func(control_message)
316
+ else:
317
+ return control_message
318
+
319
+ return sync_wrapper
76
320
 
77
321
  return decorator
78
322
 
@@ -81,7 +325,7 @@ def nv_ingest_source_failure_context_manager(
81
325
  annotation_id: str,
82
326
  payload_can_be_empty: bool = False,
83
327
  raise_on_failure: bool = False,
84
- ) -> typing.Callable:
328
+ ) -> Callable:
85
329
  """
86
330
  A decorator that ensures any function's output is treated as a IngestControlMessage for annotation.
87
331
  It applies a context manager to handle success and failure annotations based on the function's execution.
@@ -209,15 +453,29 @@ class CMNVIngestFailureContextManager:
209
453
 
210
454
 
211
455
  def unified_exception_handler(func):
212
- @functools.wraps(func)
213
- def wrapper(*args, **kwargs):
214
- try:
215
- return func(*args, **kwargs)
216
- except Exception as e:
217
- # Use the function's name in the error message
218
- func_name = func.__name__
219
- err_msg = f"{func_name}: error: {e}"
220
- logger.exception(err_msg, exc_info=True)
221
- raise type(e)(err_msg) from e
222
-
223
- return wrapper
456
+ if asyncio.iscoroutinefunction(func):
457
+
458
+ @functools.wraps(func)
459
+ async def async_wrapper(*args, **kwargs):
460
+ try:
461
+ return await func(*args, **kwargs)
462
+ except Exception as e:
463
+ func_name = func.__name__
464
+ err_msg = f"{func_name}: error: {e}"
465
+ logger.exception(err_msg, exc_info=True)
466
+ raise type(e)(err_msg) from e
467
+
468
+ return async_wrapper
469
+ else:
470
+
471
+ @functools.wraps(func)
472
+ def sync_wrapper(*args, **kwargs):
473
+ try:
474
+ return func(*args, **kwargs)
475
+ except Exception as e:
476
+ func_name = func.__name__
477
+ err_msg = f"{func_name}: error: {e}"
478
+ logger.exception(err_msg, exc_info=True)
479
+ raise type(e)(err_msg) from e
480
+
481
+ return sync_wrapper
@@ -9,6 +9,7 @@ from enum import Enum
9
9
 
10
10
 
11
11
  class LogLevel(str, Enum):
12
+ DEFAULT = "DEFAULT"
12
13
  DEBUG = "DEBUG"
13
14
  INFO = "INFO"
14
15
  WARNING = "WARNING"
@@ -16,16 +17,22 @@ class LogLevel(str, Enum):
16
17
  CRITICAL = "CRITICAL"
17
18
 
18
19
 
19
- def configure_logging(logger, level_name):
20
- """
21
- Parameters:
22
- - level_name (str): The name of the logging level (e.g., "DEBUG", "INFO").
20
+ def configure_logging(level_name: str) -> None:
23
21
  """
22
+ Configures global logging.
24
23
 
25
- numeric_level = getattr(logging, level_name, None)
24
+ Parameters
25
+ ----------
26
+ level_name : str
27
+ The name of the logging level (e.g., "DEBUG", "INFO").
28
+ """
29
+ numeric_level = getattr(logging, level_name.upper(), None)
26
30
  if not isinstance(numeric_level, int):
27
31
  raise ValueError(f"Invalid log level: {level_name}")
28
32
 
29
- logging.StreamHandler(sys.stdout)
30
- logging.basicConfig(level=numeric_level, format="%(asctime)s - %(levelname)s - %(message)s")
31
- logger.setLevel(numeric_level)
33
+ logging.basicConfig(
34
+ level=numeric_level,
35
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
36
+ stream=sys.stdout,
37
+ force=True, # <- reconfigures even if basicConfig was called earlier (Python 3.8+)
38
+ )
@@ -394,7 +394,7 @@ def extract_image_like_objects_from_pdfium_page(page, merge=True, **kwargs):
394
394
  try:
395
395
  original_images, _ = pdfium_pages_to_numpy(
396
396
  [page], # A batch with a single image.
397
- render_dpi=72, # dpi = 72 is equivalent to scale = 1.
397
+ render_dpi=300, # dpi = 72 is equivalent to scale = 1.
398
398
  rotation=rotation, # Without rotation, coordinates from page.get_pos() will not match.
399
399
  )
400
400
  image_bboxes = extract_merged_images_from_pdfium_page(page, merge=merge, **kwargs)
@@ -446,7 +446,7 @@ class RedisClient(MessageBrokerClientBase):
446
446
  current_time: float = time.monotonic()
447
447
  elapsed_time: float = current_time - start_time
448
448
  if elapsed_time > timeout:
449
- logger.warning(f"Overall timeout ({timeout}s) exceeded for non-destructive fetch of '{channel_name}'.")
449
+ logger.debug(f"Overall timeout ({timeout}s) exceeded for non-destructive fetch of '{channel_name}'.")
450
450
  if expected_count:
451
451
  raise TimeoutError(
452
452
  f"Timeout collecting fragments for {channel_name}. "
@@ -470,7 +470,7 @@ class RestClient(MessageBrokerClientBase):
470
470
  f"Requires a requests.Session compatible API."
471
471
  )
472
472
  except requests.exceptions.RequestException as err:
473
- logger.warning(
473
+ logger.debug(
474
474
  f"RequestException submitting job: {err}. Attempting retry ({retries + 1}/{self._max_retries})..."
475
475
  )
476
476
  try:
File without changes