nv-ingest-api 26.1.0rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (177)
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +218 -0
  3. nv_ingest_api/interface/extract.py +977 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +200 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +186 -0
  8. nv_ingest_api/internal/__init__.py +0 -0
  9. nv_ingest_api/internal/enums/__init__.py +3 -0
  10. nv_ingest_api/internal/enums/common.py +550 -0
  11. nv_ingest_api/internal/extract/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  13. nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
  14. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  15. nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
  16. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
  19. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
  20. nv_ingest_api/internal/extract/html/__init__.py +3 -0
  21. nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
  22. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
  24. nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
  25. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  26. nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
  27. nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
  28. nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
  29. nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
  30. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  31. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  32. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  33. nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
  34. nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
  35. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
  36. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
  37. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  38. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  39. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  40. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  41. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  42. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
  43. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
  44. nv_ingest_api/internal/meta/__init__.py +3 -0
  45. nv_ingest_api/internal/meta/udf.py +232 -0
  46. nv_ingest_api/internal/mutate/__init__.py +3 -0
  47. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  48. nv_ingest_api/internal/mutate/filter.py +133 -0
  49. nv_ingest_api/internal/primitives/__init__.py +0 -0
  50. nv_ingest_api/internal/primitives/control_message_task.py +16 -0
  51. nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
  52. nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
  53. nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
  59. nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
  60. nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
  61. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  62. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
  63. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
  64. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
  65. nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
  66. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
  67. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  68. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  69. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  70. nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
  71. nv_ingest_api/internal/schemas/__init__.py +3 -0
  72. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  73. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
  74. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
  75. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
  76. nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
  77. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
  78. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
  79. nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
  80. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
  81. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
  82. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
  83. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
  85. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  86. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  87. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  88. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  89. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
  90. nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
  91. nv_ingest_api/internal/schemas/meta/udf.py +23 -0
  92. nv_ingest_api/internal/schemas/mixins.py +39 -0
  93. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  94. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  95. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  96. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  97. nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
  98. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  99. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
  100. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  101. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
  102. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
  103. nv_ingest_api/internal/store/__init__.py +3 -0
  104. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  105. nv_ingest_api/internal/store/image_upload.py +251 -0
  106. nv_ingest_api/internal/transform/__init__.py +3 -0
  107. nv_ingest_api/internal/transform/caption_image.py +219 -0
  108. nv_ingest_api/internal/transform/embed_text.py +702 -0
  109. nv_ingest_api/internal/transform/split_text.py +182 -0
  110. nv_ingest_api/util/__init__.py +3 -0
  111. nv_ingest_api/util/control_message/__init__.py +0 -0
  112. nv_ingest_api/util/control_message/validators.py +47 -0
  113. nv_ingest_api/util/converters/__init__.py +0 -0
  114. nv_ingest_api/util/converters/bytetools.py +78 -0
  115. nv_ingest_api/util/converters/containers.py +65 -0
  116. nv_ingest_api/util/converters/datetools.py +90 -0
  117. nv_ingest_api/util/converters/dftools.py +127 -0
  118. nv_ingest_api/util/converters/formats.py +64 -0
  119. nv_ingest_api/util/converters/type_mappings.py +27 -0
  120. nv_ingest_api/util/dataloader/__init__.py +9 -0
  121. nv_ingest_api/util/dataloader/dataloader.py +409 -0
  122. nv_ingest_api/util/detectors/__init__.py +5 -0
  123. nv_ingest_api/util/detectors/language.py +38 -0
  124. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  125. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  126. nv_ingest_api/util/exception_handlers/decorators.py +429 -0
  127. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  128. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  129. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  130. nv_ingest_api/util/image_processing/__init__.py +5 -0
  131. nv_ingest_api/util/image_processing/clustering.py +260 -0
  132. nv_ingest_api/util/image_processing/processing.py +177 -0
  133. nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
  134. nv_ingest_api/util/image_processing/transforms.py +850 -0
  135. nv_ingest_api/util/imports/__init__.py +3 -0
  136. nv_ingest_api/util/imports/callable_signatures.py +108 -0
  137. nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
  138. nv_ingest_api/util/introspection/__init__.py +3 -0
  139. nv_ingest_api/util/introspection/class_inspect.py +145 -0
  140. nv_ingest_api/util/introspection/function_inspect.py +65 -0
  141. nv_ingest_api/util/logging/__init__.py +0 -0
  142. nv_ingest_api/util/logging/configuration.py +102 -0
  143. nv_ingest_api/util/logging/sanitize.py +84 -0
  144. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  145. nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
  146. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  147. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  148. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  149. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
  150. nv_ingest_api/util/metadata/__init__.py +5 -0
  151. nv_ingest_api/util/metadata/aggregators.py +516 -0
  152. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  153. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
  154. nv_ingest_api/util/nim/__init__.py +161 -0
  155. nv_ingest_api/util/pdf/__init__.py +3 -0
  156. nv_ingest_api/util/pdf/pdfium.py +428 -0
  157. nv_ingest_api/util/schema/__init__.py +3 -0
  158. nv_ingest_api/util/schema/schema_validator.py +10 -0
  159. nv_ingest_api/util/service_clients/__init__.py +3 -0
  160. nv_ingest_api/util/service_clients/client_base.py +86 -0
  161. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  162. nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
  163. nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
  164. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  165. nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
  166. nv_ingest_api/util/string_processing/__init__.py +51 -0
  167. nv_ingest_api/util/string_processing/configuration.py +682 -0
  168. nv_ingest_api/util/string_processing/yaml.py +109 -0
  169. nv_ingest_api/util/system/__init__.py +0 -0
  170. nv_ingest_api/util/system/hardware_info.py +594 -0
  171. nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
  172. nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
  173. nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
  174. nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
  175. nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
  176. udfs/__init__.py +5 -0
  177. udfs/llm_summarizer_udf.py +259 -0
@@ -0,0 +1,429 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import asyncio
6
+ import logging
7
+ import functools
8
+ import inspect
9
+ import re
10
+ from typing import Any, Optional, Callable, Tuple
11
+ from functools import wraps
12
+
13
+ from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage
14
+ from nv_ingest_api.internal.primitives.tracing.logging import TaskResultStatus, annotate_task_result
15
+ from nv_ingest_api.util.control_message.validators import cm_ensure_payload_not_null, cm_set_failure
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
def nv_ingest_node_failure_try_except(
    annotation_id: Optional[str] = None,
    payload_can_be_empty: bool = False,
    raise_on_failure: bool = False,
    skip_processing_if_failed: bool = True,
    forward_func: Optional[Callable[[Any], Any]] = None,
) -> Callable:
    """
    Decorator that wraps function execution in a try/except block to handle
    failures by annotating an IngestControlMessage.

    The generated wrapper is synchronous: it calls the decorated function directly and
    never awaits it. The control message is located by duck typing (any object exposing
    ``get_metadata``) in the first positional argument (plain function) or the second
    one (method, after ``self``).

    Parameters
    ----------
    annotation_id : Optional[str]
        A unique identifier for annotation. If None, the wrapper uses the truthy
        ``stage_name`` attribute of the first positional argument when present,
        and otherwise falls back to the decorated function's name.
    payload_can_be_empty : bool, optional
        If False, ``cm_ensure_payload_not_null`` is applied to the message before the call.
    raise_on_failure : bool, optional
        If True, exceptions are re-raised after the failure has been annotated;
        otherwise the annotated control message is returned.
    skip_processing_if_failed : bool, optional
        If True, skip processing if the message metadata ``cm_failed`` is truthy.
    forward_func : Optional[Callable[[Any], Any]]
        If provided, a synchronous function that receives the message when processing
        is skipped; its return value is returned by the wrapper.

    Returns
    -------
    Callable
        A decorator producing a wrapper that returns the decorated function's result on
        success, the (forwarded) control message when processing is skipped, or the
        failure-annotated control message when an exception is suppressed.

    Raises
    ------
    ValueError
        Raised by the wrapper when neither the first nor the second positional argument
        exposes ``get_metadata``.
    """

    def extract_message_and_prefix(args: Tuple) -> Tuple[Any, Tuple]:
        """Return ``(control_message, prefix)``; ``prefix`` is ``(self,)`` for methods, else ``()``."""
        if args and hasattr(args[0], "get_metadata"):
            return args[0], ()
        if len(args) >= 2 and hasattr(args[1], "get_metadata"):
            return args[1], (args[0],)
        arg_types = [type(arg).__name__ for arg in args]
        raise ValueError(f"No IngestControlMessage found in first or second argument. Got types: {arg_types}")

    def decorator(func: Callable) -> Callable:
        func_name = func.__name__  # Used in log lines and error messages.

        def resolve_annotation_id(args: Tuple) -> str:
            """Pick the task id: explicit ``annotation_id``, else ``args[0].stage_name``, else ``func_name``."""
            if annotation_id is not None:
                return annotation_id
            if not args:
                logger.debug(
                    "No annotation_id provided and no instance available, using function name: '%s'",
                    func_name,
                )
                return func_name
            # For method calls args[0] is 'self'; a stage instance may expose stage_name.
            stage_name = getattr(args[0], "stage_name", None)
            if stage_name:
                logger.debug("Using auto-detected annotation_id from stage_name: '%s'", stage_name)
                return stage_name
            logger.debug("No stage_name available, using function name as annotation_id: '%s'", func_name)
            return func_name

        @functools.wraps(func)
        def sync_wrapper(*args: Any, **kwargs: Any) -> Any:
            logger.debug("sync_wrapper for %s: Entering.", func_name)
            resolved_annotation_id = resolve_annotation_id(args)

            try:
                # Only the message is needed here: the call below forwards ``args`` unchanged.
                control_message, _ = extract_message_and_prefix(args)
            except ValueError as e:
                logger.error("sync_wrapper for %s: Failed to extract control message. Error: %s", func_name, e)
                raise

            # --- Skip logic ---
            is_failed = control_message.get_metadata("cm_failed", False)
            if is_failed and skip_processing_if_failed:
                logger.warning("sync_wrapper for %s: Skipping processing, message already marked failed.", func_name)
                if forward_func:
                    logger.debug("sync_wrapper: Forwarding skipped message.")
                    return forward_func(control_message)  # forward_func is assumed to be synchronous
                logger.debug("sync_wrapper: Returning skipped message as is.")
                return control_message

            # --- Main execution block ---
            try:
                if not payload_can_be_empty:
                    cm_ensure_payload_not_null(control_message)

                logger.debug("sync_wrapper for %s: Calling func...", func_name)
                result = func(*args, **kwargs)
                logger.debug("sync_wrapper for %s: func call completed.", func_name)

                # Annotate the returned object when there is one, else the incoming message.
                logger.debug("sync_wrapper for %s: Annotating success.", func_name)
                annotate_task_result(
                    control_message=result if result is not None else control_message,
                    result=TaskResultStatus.SUCCESS,
                    task_id=resolved_annotation_id,
                )
                logger.debug("sync_wrapper for %s: Success annotation done. Returning result.", func_name)
                return result

            except Exception as e:
                # --- Failure handling ---
                error_message = f"Error in {func_name}: {e}"
                logger.error(
                    "sync_wrapper for %s: Caught exception: %s", func_name, error_message, exc_info=True
                )

                # Annotate failure on the original message object. Annotation problems are
                # logged but must not hide the original error.
                try:
                    cm_set_failure(control_message, error_message)
                    annotate_task_result(
                        control_message=control_message,
                        result=TaskResultStatus.FAILURE,
                        task_id=resolved_annotation_id,
                        message=error_message,
                    )
                    logger.debug("sync_wrapper for %s: Failure annotation complete.", func_name)
                except Exception as anno_err:
                    logger.exception(
                        "sync_wrapper for %s: CRITICAL - Error during failure annotation: %s", func_name, anno_err
                    )

                if raise_on_failure:
                    logger.debug("sync_wrapper for %s: Re-raising exception as configured.", func_name)
                    raise  # Bare raise re-raises the original exception.

                logger.debug(
                    "sync_wrapper for %s: Suppressing exception and returning annotated message.", func_name
                )
                # Return the original control_message, now annotated with failure.
                return control_message

        return sync_wrapper

    return decorator
164
+
165
+
166
def nv_ingest_node_failure_context_manager(
    annotation_id: str,
    payload_can_be_empty: bool = False,
    raise_on_failure: bool = False,
    skip_processing_if_failed: bool = True,
    forward_func: Optional[Callable[[Any], Any]] = None,
) -> Callable:
    """
    Decorator that applies a failure context manager around a function processing an IngestControlMessage.
    Works with both synchronous and asynchronous functions, and supports class methods (with 'self').

    Parameters
    ----------
    annotation_id : str
        A unique identifier for annotation.
    payload_can_be_empty : bool, optional
        If False, the message payload must not be null.
    raise_on_failure : bool, optional
        If True, exceptions are raised; otherwise, they are annotated and the
        control message is returned in place of the function result.
    skip_processing_if_failed : bool, optional
        If True, skip processing if the message is already marked as failed.
    forward_func : Optional[Callable[[Any], Any]]
        If provided, a function to forward the message when processing is skipped.
        For asynchronous functions it may be either a plain callable or a coroutine
        function; an awaitable result is awaited.

    Returns
    -------
    Callable
        The decorated function.

    Raises
    ------
    ValueError
        Raised by the wrapper when neither the first nor the second positional argument
        exposes ``get_metadata``.
    """

    def extract_message_and_prefix(args: Tuple) -> Tuple[Any, Tuple]:
        """
        Determines if the function is a method (first argument is self) or a standalone function.
        Returns a tuple (control_message, prefix) where prefix is a tuple of preceding arguments to be preserved.
        """
        if args and hasattr(args[0], "get_metadata"):
            # Standalone function: first argument is the message.
            return args[0], ()
        if len(args) >= 2 and hasattr(args[1], "get_metadata"):
            # Method: first argument is self, second is the message.
            return args[1], (args[0],)
        raise ValueError("No IngestControlMessage found in the first or second argument.")

    def should_skip(control_message: Any) -> bool:
        """True when the message is already marked failed and skipping is enabled."""
        is_failed = control_message.get_metadata("cm_failed", False)
        return bool(is_failed) and skip_processing_if_failed

    def decorator(func: Callable) -> Callable:
        if asyncio.iscoroutinefunction(func):

            @functools.wraps(func)
            async def async_wrapper(*args: Any, **kwargs: Any) -> Any:
                control_message, prefix = extract_message_and_prefix(args)

                if should_skip(control_message):
                    if not forward_func:
                        return control_message
                    forwarded = forward_func(control_message)
                    # Accept both sync and async forwarders.
                    if inspect.isawaitable(forwarded):
                        forwarded = await forwarded
                    return forwarded

                ctx_mgr = CMNVIngestFailureContextManager(
                    control_message=control_message,
                    annotation_id=annotation_id,
                    raise_on_failure=raise_on_failure,
                    func_name=func.__name__,
                )
                # The context manager is driven by hand (rather than via ``with``) so that
                # only ``Exception`` subclasses are routed through the failure annotation.
                ctx_mgr.__enter__()
                try:
                    if not payload_can_be_empty:
                        cm_ensure_payload_not_null(control_message)
                    # Rebuild argument list preserving any prefix (e.g. self).
                    new_args = prefix + (ctx_mgr.control_message,) + args[len(prefix) + 1 :]
                    result = await func(*new_args, **kwargs)
                except Exception as e:
                    # __exit__ annotates the failure and reports whether to suppress it,
                    # so raise_on_failure is honoured the same way as in the sync path.
                    if not ctx_mgr.__exit__(type(e), e, e.__traceback__):
                        raise
                    return ctx_mgr.control_message
                ctx_mgr.__exit__(None, None, None)
                return result

            return async_wrapper

        @functools.wraps(func)
        def sync_wrapper(*args: Any, **kwargs: Any) -> Any:
            control_message, prefix = extract_message_and_prefix(args)

            if should_skip(control_message):
                if forward_func:
                    return forward_func(control_message)
                return control_message

            with CMNVIngestFailureContextManager(
                control_message=control_message,
                annotation_id=annotation_id,
                raise_on_failure=raise_on_failure,
                func_name=func.__name__,
            ) as ctx_mgr:
                if not payload_can_be_empty:
                    cm_ensure_payload_not_null(control_message)
                new_args = prefix + (ctx_mgr.control_message,) + args[len(prefix) + 1 :]
                return func(*new_args, **kwargs)

            # Reached only when the context manager suppressed an exception: hand back the
            # failure-annotated message instead of implicitly returning None.
            return control_message

        return sync_wrapper

    return decorator
270
+
271
+
272
def nv_ingest_source_failure_context_manager(
    annotation_id: str,
    payload_can_be_empty: bool = False,
    raise_on_failure: bool = False,
) -> Callable:
    """
    A decorator that ensures any function's output is treated as a IngestControlMessage for annotation.
    The wrapper annotates the output with a success or failure task result depending on how the call went.

    Parameters
    ----------
    annotation_id : str
        Unique identifier used for annotating the function's output.
    payload_can_be_empty : bool, optional
        Specifies if the function's output IngestControlMessage payload can be empty, default is False.
    raise_on_failure : bool, optional
        Determines if an exception should be raised upon function failure, default is False.
        The failure is annotated before the exception is re-raised.

    Returns
    -------
    Callable
        A decorator that ensures function output is processed for success or failure annotation.
        When a failure is suppressed, the wrapper returns a failure-annotated IngestControlMessage:
        the function's own output if it produced one, otherwise a freshly created message.
    """

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs) -> IngestControlMessage:
            # Bound up front so the failure path can inspect it even when func itself raised.
            result = None
            try:
                result = func(*args, **kwargs)
                if not isinstance(result, IngestControlMessage):
                    raise TypeError(f"{func.__name__} output is not a IngestControlMessage as expected.")
                # NOTE(review): the payload is looked up through get_metadata("payload") here, whereas
                # the node decorators in this module call cm_ensure_payload_not_null - confirm this
                # is the intended check.
                if not payload_can_be_empty and result.get_metadata("payload") is None:
                    raise ValueError(f"{func.__name__} IngestControlMessage payload cannot be null.")

                # Success annotation.
                annotate_task_result(result, result=TaskResultStatus.SUCCESS, task_id=annotation_id)
            except Exception as e:
                error_message = f"Error in {func.__name__}: {e}"
                # Prepare a new IngestControlMessage for failure annotation if needed.
                if not isinstance(result, IngestControlMessage):
                    result = IngestControlMessage()
                cm_set_failure(result, error_message)
                annotate_task_result(
                    result,
                    result=TaskResultStatus.FAILURE,
                    task_id=annotation_id,
                    message=error_message,
                )
                if raise_on_failure:
                    raise
            return result

        return wrapper

    return decorator
327
+
328
+
329
class CMNVIngestFailureContextManager:
    """
    Context manager for handling IngestControlMessage failures during processing, providing
    a structured way to annotate and manage failures and successes.

    On exit without an exception the message is annotated with a SUCCESS task result. On exit
    with an exception the message is marked failed and annotated with a FAILURE task result;
    the exception is then suppressed unless ``raise_on_failure`` is True.

    Parameters
    ----------
    control_message : IngestControlMessage
        The IngestControlMessage instance to be managed.
    annotation_id : str
        The task's unique identifier for annotation purposes.
    raise_on_failure : bool, optional
        Determines whether to raise an exception upon failure. Defaults to False, which
        means failures are annotated rather than raising exceptions.
    func_name : str, optional
        The name of the function being wrapped, used to annotate error messages uniformly.
        If None, stack introspection is used to deduce a likely function name. Defaults to None.
    """

    def __init__(
        self,
        control_message: IngestControlMessage,
        annotation_id: str,
        raise_on_failure: bool = False,
        func_name: Optional[str] = None,
    ):
        self.control_message = control_message
        self.annotation_id = annotation_id
        self.raise_on_failure = raise_on_failure
        if func_name is not None:
            self._func_name = func_name
        else:
            try:
                # Use stack introspection to get a candidate function name. Only the function
                # name is read, so skip loading source context for every frame.
                stack = inspect.stack(context=0)
                # Use the third frame as a heuristic; adjust if needed.
                candidate = stack[2].function if len(stack) > 2 else "UnknownFunction"
                # Remove any whitespace and limit the length to 50 characters.
                candidate = re.sub(r"\s+", "", candidate)[:50]
                self._func_name = candidate if candidate else "UnknownFunction"
            except Exception:
                # Introspection is best-effort; never let it break construction.
                self._func_name = "UnknownFunction"

    def __enter__(self):
        """Return the manager itself so callers can reach ``control_message``."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """
        Annotate the outcome on the control message.

        Returns True (suppress) when an exception occurred and ``raise_on_failure`` is
        False; returns False otherwise.
        """
        if exc_type is not None:  # An exception occurred
            error_message = f"Error in {self._func_name}: {exc_value}"
            if self.control_message is not None:
                cm_set_failure(self.control_message, error_message)
                annotate_task_result(
                    self.control_message,
                    result=TaskResultStatus.FAILURE,
                    task_id=self.annotation_id,
                    message=error_message,
                )
            # Propagate the exception if raise_on_failure is True; otherwise, suppress it.
            return not self.raise_on_failure

        # Mirror the failure branch: there is nothing to annotate without a message.
        if self.control_message is not None:
            annotate_task_result(
                self.control_message,
                result=TaskResultStatus.SUCCESS,
                task_id=self.annotation_id,
            )
        return False
401
+
402
+
403
def unified_exception_handler(func):
    """
    Decorator that logs any exception raised by ``func`` and re-raises it with the
    function name prepended to the message. Works for both plain and coroutine functions.

    The re-raised exception is a new instance of the same type built from the message
    ``"<func name>: error: <original>"`` and chained to the original via ``__cause__``.
    If the exception type cannot be constructed from a single message string, the
    original exception is re-raised unchanged instead.

    Parameters
    ----------
    func : Callable
        The function or coroutine function to wrap.

    Returns
    -------
    Callable
        The wrapped function; it is a coroutine function when ``func`` is one.
    """

    def _log_and_rebuild(exc):
        """Log ``exc`` with its traceback and return a same-typed copy carrying the prefixed message, or None."""
        err_msg = f"{func.__name__}: error: {exc}"
        # Called while the exception is being handled, so the traceback is attached.
        logger.exception(err_msg)
        try:
            return type(exc)(err_msg)
        except Exception:
            # Some exception classes need more than one constructor argument
            # (e.g. UnicodeDecodeError); the caller re-raises the original then.
            return None

    if asyncio.iscoroutinefunction(func):

        @functools.wraps(func)
        async def async_wrapper(*args, **kwargs):
            try:
                return await func(*args, **kwargs)
            except Exception as e:
                rebuilt = _log_and_rebuild(e)
                if rebuilt is None:
                    raise
                raise rebuilt from e

        return async_wrapper

    @functools.wraps(func)
    def sync_wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            rebuilt = _log_and_rebuild(e)
            if rebuilt is None:
                raise
            raise rebuilt from e

    return sync_wrapper
@@ -0,0 +1,74 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+ from typing import Any
8
+ from typing import Callable
9
+ from typing import Dict
10
+
11
+ from langdetect.lang_detect_exception import LangDetectException
12
+
13
+ from nv_ingest_api.internal.enums.common import LanguageEnum
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
def langdetect_exception_handler(func: Callable, **kwargs: Dict[str, Any]) -> Callable:
    """
    A decorator that handles `LangDetectException` for language detection functions.

    This decorator wraps a function that performs language detection and catches any `LangDetectException` that occurs
    during its execution.
    If such an exception is raised, it logs a warning and returns a default value of `LanguageEnum.UNKNOWN`.
    Any other exception propagates unchanged.

    Parameters
    ----------
    func : callable
        The function to be decorated. This function is expected to handle language detection.

    kwargs : dict
        Accepted for signature compatibility; not used by the decorator.

    Returns
    -------
    callable
        The wrapped function that executes `func` with exception handling. The wrapper keeps
        the name and docstring of `func`.

    Notes
    -----
    If a `LangDetectException` is raised while executing the wrapped function, the exception is logged,
    and `LanguageEnum.UNKNOWN` is returned as a fallback value.

    Examples
    --------
    >>> @langdetect_exception_handler
    ... def detect_language(text):
    ...     # Function implementation here
    ...     pass
    ...
    >>> detect_language('This is a test sentence.')
    <LanguageEnum.EN: 'en'>

    If a `LangDetectException` is encountered, the function will return `LanguageEnum.UNKNOWN`:

    >>> detect_language('')
    <LanguageEnum.UNKNOWN: 'unknown'>
    """
    # Imported locally so this block does not depend on a module-level functools import.
    import functools

    @functools.wraps(func)
    def inner_function(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except LangDetectException as e:
            log_error_message = f"LangDetectException: {e}"
            logger.warning(log_error_message)
            return LanguageEnum.UNKNOWN

    return inner_function
@@ -0,0 +1,116 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+
8
+ from nv_ingest_api.internal.enums.common import StatusEnum, TaskTypeEnum
9
+ from nv_ingest_api.internal.schemas.meta.metadata_schema import validate_metadata
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
def pdfium_exception_handler(descriptor):
    """
    Build a decorator that turns any exception from the wrapped function into an empty list.

    The decorated function is called with the caller's arguments. If it raises any `Exception`,
    a warning containing `descriptor`, the wrapped function's name and the exception text is
    logged, and an empty list is returned instead of propagating the error.

    Parameters
    ----------
    descriptor : str
        A string identifying the context or source of the function being wrapped.
        It is included in the log message if an exception occurs.

    Returns
    -------
    callable
        A decorator. Applying it to a function returns a wrapper that carries the original
        function's metadata (via `functools.wraps`) and adds the exception handling described above.

    Notes
    -----
    The broad `except Exception` is intentional best-effort behaviour: callers receive `[]`
    rather than an exception, so a failure in one wrapped call does not abort the caller.

    Examples
    --------
    >>> @pdfium_exception_handler("PDF Processing")
    ... def process_pdf(file_path):
    ...     # Function implementation here
    ...     pass
    ...
    >>> process_pdf.__name__
    'process_pdf'
    """
    # Local import: keeps this block self-contained without touching the module's import section.
    import functools

    def outer_function(func):
        # Keep the wrapped function's name/docstring so stacked decorators and
        # introspection report the real function instead of "inner_function".
        @functools.wraps(func)
        def inner_function(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                log_error_message = f"{descriptor}:{func.__name__} error:{e}"
                logger.warning(log_error_message)
                return []

        return inner_function

    return outer_function
66
+
67
+
68
def create_exception_tag(error_message, source_id=None):
    """
    Build a validated error-metadata entry describing a failure.

    An `error_metadata` section is assembled with the task set to `TaskTypeEnum.EXTRACT`,
    the status set to `StatusEnum.ERROR`, and the supplied source identifier and message.
    The enclosing metadata dictionary is passed through `validate_metadata`, and the
    validated result is dumped back to a plain dictionary.

    Parameters
    ----------
    error_message : str
        The error message describing the exception.
    source_id : Optional[str], default=None
        The identifier for the source related to the error, if available.

    Returns
    -------
    list
        A list holding one entry. That entry is itself a two-element list: `None` followed by
        the validated metadata as returned by `model_dump()`.

    Raises
    ------
    Exception
        Nothing is caught here, so any exception raised by `validate_metadata` propagates to
        the caller.
    """
    # Keys are inserted in the same order as the original implementation.
    payload = {
        "error_metadata": {
            "task": TaskTypeEnum.EXTRACT,
            "status": StatusEnum.ERROR,
            "source_id": source_id,
            "error_msg": error_message,
        }
    }

    validated = validate_metadata(payload)
    entry = [None, validated.model_dump()]

    return [entry]
@@ -0,0 +1,68 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+
8
+ from pydantic import ValidationError
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
def schema_exception_handler(func, **kwargs):
    """
    Decorate a schema-validation function so Pydantic `ValidationError` becomes a logged `ValueError`.

    The returned wrapper calls `func` with whatever arguments it receives. If `func` raises
    `ValidationError`, each reported error is rendered as ``"<location>: <message>"``, the
    pieces are joined with ``"; "``, the combined message is logged at error level, and a
    `ValueError` carrying that message is raised with the original exception as its cause.

    Parameters
    ----------
    func : callable
        The function to be decorated. This function is expected to perform schema validation.

    kwargs : dict
        Accepted for backward compatibility only. These decoration-time keyword arguments are not
        used; the wrapper forwards its own call-time arguments to `func`.

    Returns
    -------
    callable
        A wrapper around `func` that carries the same `__name__`, `__doc__` and other metadata
        (via `functools.wraps`) and adds the exception handling described above.

    Raises
    ------
    ValueError
        If `func` raises `ValidationError`. The message is
        ``"Invalid configuration: "`` followed by the combined error messages.

    Notes
    -----
    `<location>` is the first element of the error's ``loc`` tuple. When ``loc`` is empty
    (an error not tied to a specific field), ``"__root__"`` is used so that the handler
    itself cannot fail with `IndexError` and hide the real validation problem.

    Examples
    --------
    >>> @schema_exception_handler
    ... def validate_config(config_data):
    ...     schema = MySchema(**config_data)
    ...     return schema
    ...
    >>> try:
    ...     validate_config(invalid_config)
    ... except ValueError as e:
    ...     print(f"Caught error: {e}")
    Caught error: Invalid configuration: field1: value is not a valid integer; field2: field required
    """
    # Local import: keeps this block self-contained without touching the module's import section.
    import functools

    @functools.wraps(func)
    def inner_function(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except ValidationError as e:
            parts = []
            for error in e.errors():
                loc = error["loc"]
                # An empty loc would make loc[0] raise IndexError; fall back to a root label.
                field = loc[0] if loc else "__root__"
                parts.append(f"{field}: {error['msg']}")
            error_messages = "; ".join(parts)
            log_error_message = f"Invalid configuration: {error_messages}"
            logger.error(log_error_message)
            # Chain explicitly so the original validation details stay attached as __cause__.
            raise ValueError(log_error_message) from e

    return inner_function
@@ -0,0 +1,5 @@
1
+ from .transforms import scale_image_to_encoding_size
2
+
3
# Public names re-exported by this package.
__all__ = ["scale_image_to_encoding_size"]