nv-ingest-api 2025.4.21.dev20250421__py3-none-any.whl → 2025.4.22.dev20250422__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (153) hide show
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +215 -0
  3. nv_ingest_api/interface/extract.py +972 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +218 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +200 -0
  8. nv_ingest_api/internal/enums/__init__.py +3 -0
  9. nv_ingest_api/internal/enums/common.py +494 -0
  10. nv_ingest_api/internal/extract/__init__.py +3 -0
  11. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/audio_extraction.py +149 -0
  13. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  14. nv_ingest_api/internal/extract/docx/docx_extractor.py +205 -0
  15. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  16. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +122 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +895 -0
  19. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  20. nv_ingest_api/internal/extract/image/chart_extractor.py +353 -0
  21. nv_ingest_api/internal/extract/image/image_extractor.py +204 -0
  22. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/image_helpers/common.py +403 -0
  24. nv_ingest_api/internal/extract/image/infographic_extractor.py +253 -0
  25. nv_ingest_api/internal/extract/image/table_extractor.py +344 -0
  26. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  27. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  28. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  29. nv_ingest_api/internal/extract/pdf/engines/llama.py +243 -0
  30. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +597 -0
  31. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +146 -0
  32. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +603 -0
  33. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  34. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  35. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  36. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  37. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  38. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +799 -0
  39. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +187 -0
  40. nv_ingest_api/internal/mutate/__init__.py +3 -0
  41. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  42. nv_ingest_api/internal/mutate/filter.py +133 -0
  43. nv_ingest_api/internal/primitives/__init__.py +0 -0
  44. nv_ingest_api/{primitives → internal/primitives}/control_message_task.py +4 -0
  45. nv_ingest_api/{primitives → internal/primitives}/ingest_control_message.py +5 -2
  46. nv_ingest_api/internal/primitives/nim/__init__.py +8 -0
  47. nv_ingest_api/internal/primitives/nim/default_values.py +15 -0
  48. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  49. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  50. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  51. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  52. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +275 -0
  53. nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +238 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +462 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +132 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +152 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1400 -0
  59. nv_ingest_api/internal/primitives/nim/nim_client.py +344 -0
  60. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +81 -0
  61. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  62. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  63. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  64. nv_ingest_api/internal/primitives/tracing/tagging.py +197 -0
  65. nv_ingest_api/internal/schemas/__init__.py +3 -0
  66. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  67. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +130 -0
  68. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +135 -0
  69. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +124 -0
  70. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +124 -0
  71. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +128 -0
  72. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +218 -0
  73. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +124 -0
  74. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +129 -0
  75. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  76. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +23 -0
  77. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  78. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  79. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  80. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  81. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +237 -0
  82. nv_ingest_api/internal/schemas/meta/metadata_schema.py +221 -0
  83. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  85. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  86. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  87. nv_ingest_api/internal/schemas/store/store_image_schema.py +30 -0
  88. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  89. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +15 -0
  90. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  91. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +25 -0
  92. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +22 -0
  93. nv_ingest_api/internal/store/__init__.py +3 -0
  94. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  95. nv_ingest_api/internal/store/image_upload.py +232 -0
  96. nv_ingest_api/internal/transform/__init__.py +3 -0
  97. nv_ingest_api/internal/transform/caption_image.py +205 -0
  98. nv_ingest_api/internal/transform/embed_text.py +496 -0
  99. nv_ingest_api/internal/transform/split_text.py +157 -0
  100. nv_ingest_api/util/__init__.py +0 -0
  101. nv_ingest_api/util/control_message/__init__.py +0 -0
  102. nv_ingest_api/util/control_message/validators.py +47 -0
  103. nv_ingest_api/util/converters/__init__.py +0 -0
  104. nv_ingest_api/util/converters/bytetools.py +78 -0
  105. nv_ingest_api/util/converters/containers.py +65 -0
  106. nv_ingest_api/util/converters/datetools.py +90 -0
  107. nv_ingest_api/util/converters/dftools.py +127 -0
  108. nv_ingest_api/util/converters/formats.py +64 -0
  109. nv_ingest_api/util/converters/type_mappings.py +27 -0
  110. nv_ingest_api/util/detectors/__init__.py +5 -0
  111. nv_ingest_api/util/detectors/language.py +38 -0
  112. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  113. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  114. nv_ingest_api/util/exception_handlers/decorators.py +223 -0
  115. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  116. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  117. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  118. nv_ingest_api/util/image_processing/__init__.py +5 -0
  119. nv_ingest_api/util/image_processing/clustering.py +260 -0
  120. nv_ingest_api/util/image_processing/processing.py +179 -0
  121. nv_ingest_api/util/image_processing/table_and_chart.py +449 -0
  122. nv_ingest_api/util/image_processing/transforms.py +407 -0
  123. nv_ingest_api/util/logging/__init__.py +0 -0
  124. nv_ingest_api/util/logging/configuration.py +31 -0
  125. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  126. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  127. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  128. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  129. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +451 -0
  130. nv_ingest_api/util/metadata/__init__.py +5 -0
  131. nv_ingest_api/util/metadata/aggregators.py +469 -0
  132. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  133. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +194 -0
  134. nv_ingest_api/util/nim/__init__.py +56 -0
  135. nv_ingest_api/util/pdf/__init__.py +3 -0
  136. nv_ingest_api/util/pdf/pdfium.py +427 -0
  137. nv_ingest_api/util/schema/__init__.py +0 -0
  138. nv_ingest_api/util/schema/schema_validator.py +10 -0
  139. nv_ingest_api/util/service_clients/__init__.py +3 -0
  140. nv_ingest_api/util/service_clients/client_base.py +86 -0
  141. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  142. nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  143. nv_ingest_api/util/service_clients/redis/redis_client.py +823 -0
  144. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  145. nv_ingest_api/util/service_clients/rest/rest_client.py +531 -0
  146. nv_ingest_api/util/string_processing/__init__.py +51 -0
  147. {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/METADATA +1 -1
  148. nv_ingest_api-2025.4.22.dev20250422.dist-info/RECORD +152 -0
  149. nv_ingest_api-2025.4.21.dev20250421.dist-info/RECORD +0 -9
  150. /nv_ingest_api/{primitives → internal}/__init__.py +0 -0
  151. {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/WHEEL +0 -0
  152. {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/licenses/LICENSE +0 -0
  153. {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,38 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import langdetect
7
+
8
+ from nv_ingest_api.internal.enums.common import LanguageEnum
9
+ from nv_ingest_api.util.exception_handlers.detectors import langdetect_exception_handler
10
+
11
+
12
@langdetect_exception_handler
def detect_language(text):
    """
    Identify the language a piece of text is written in.

    Parameters
    ----------
    text : str
        The text to analyse.

    Returns
    -------
    LanguageEnum
        The `LanguageEnum` member matching the detected language code, or
        `LanguageEnum.UNKNOWN` when the code is not a known enum value or
        `langdetect` raises a `LangDetectException`.
    """

    try:
        detected_code = langdetect.detect(text)

        if not LanguageEnum.has_value(detected_code):
            return LanguageEnum.UNKNOWN

        # Members are looked up by name: upper-case the code and swap "-" for "_".
        member_name = detected_code.upper().replace("-", "_")
        return LanguageEnum[member_name]
    except langdetect.lang_detect_exception.LangDetectException:
        return LanguageEnum.UNKNOWN
File without changes
@@ -0,0 +1,72 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+ from datetime import datetime
8
+ from datetime import timezone
9
+ from typing import Any
10
+ from typing import Callable
11
+ from typing import Dict
12
+
13
+ from nv_ingest_api.util.converters import datetools
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
def datetools_exception_handler(func: Callable, **kwargs: Dict[str, Any]) -> Callable:
    """
    A decorator that handles exceptions for date-related functions.

    The wrapped function is executed normally; if it raises any `Exception`, a warning is logged and the
    current UTC time, passed through `datetools.remove_tz` and formatted with `isoformat()`, is returned
    instead.

    Parameters
    ----------
    func : Callable
        The function to be decorated. This function is expected to handle date operations.

    kwargs : dict
        Accepted for backward compatibility only. These decorator-level keyword arguments are not used and
        are NOT forwarded to `func`; only the arguments supplied at call time are.

    Returns
    -------
    Callable
        The wrapped function. Its `__name__`, `__doc__` and other metadata mirror those of `func`.

    Notes
    -----
    The fallback value is a string, whereas the wrapped function may return another type (for example a
    `datetime`); callers must be prepared for both.

    Examples
    --------
    >>> @datetools_exception_handler
    ... def parse_date(date_str):
    ...     return datetime.strptime(date_str, '%Y-%m-%d')
    ...
    >>> parse_date('2024-08-22')
    datetime.datetime(2024, 8, 22, 0, 0)

    If the input is invalid, the current UTC time is returned as a string:

    >>> parse_date('invalid-date')
    '2024-08-22T12:34:56'
    """
    # Imported locally so the decorator does not rely on a module-level `functools` import.
    from functools import wraps

    @wraps(func)  # keep the decorated function's name/docstring for logging and introspection
    def inner_function(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            log_error_message = f"Invalid date format: {e}"
            logger.warning(log_error_message)
            return datetools.remove_tz(datetime.now(timezone.utc)).isoformat()

    return inner_function
@@ -0,0 +1,223 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import logging
6
+ import functools
7
+ import inspect
8
+ import re
9
+ import typing
10
+ from functools import wraps
11
+
12
+ from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage
13
+ from nv_ingest_api.internal.primitives.tracing.logging import TaskResultStatus, annotate_task_result
14
+ from nv_ingest_api.util.control_message.validators import cm_ensure_payload_not_null, cm_set_failure
15
+
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
+ # TODO(Devin): move back to framework
21
def nv_ingest_node_failure_context_manager(
    annotation_id: str,
    payload_can_be_empty: bool = False,
    raise_on_failure: bool = False,
    skip_processing_if_failed: bool = True,
    forward_func=None,
) -> typing.Callable:
    """
    Build a decorator that runs a node function inside a `CMNVIngestFailureContextManager`.

    Parameters
    ----------
    annotation_id : str
        Identifier handed to the context manager for annotating the task result.
    payload_can_be_empty : bool, optional
        When False (the default), `cm_ensure_payload_not_null` is called on the incoming
        IngestControlMessage before the decorated function runs.
    raise_on_failure : bool, optional
        Forwarded to the context manager; when True, errors propagate to the caller instead of being
        suppressed after annotation. Defaults to False.
    skip_processing_if_failed : bool, optional
        When True (the default), a message whose "cm_failed" metadata is truthy bypasses the decorated
        function entirely.
    forward_func : callable, optional
        Called with the IngestControlMessage when processing is bypassed; its return value is returned.

    Returns
    -------
    Callable
        A decorator that wraps the given function with failure handling logic.
    """

    def decorator(func):
        @wraps(func)
        def wrapper(control_message: IngestControlMessage, *args, **kwargs):
            already_failed = control_message.get_metadata("cm_failed", False)

            # Bypass path: the message has failed before and we were asked not to reprocess it.
            if already_failed and skip_processing_if_failed:
                if forward_func:
                    control_message = forward_func(control_message)
                return control_message

            failure_context = CMNVIngestFailureContextManager(
                control_message=control_message,
                annotation_id=annotation_id,
                raise_on_failure=raise_on_failure,
                func_name=func.__name__,
            )
            with failure_context as active_context:
                if not payload_can_be_empty:
                    cm_ensure_payload_not_null(control_message=control_message)
                # If this raises and the context manager suppresses the error, `control_message`
                # keeps its original binding and is what gets returned below.
                control_message = func(active_context.control_message, *args, **kwargs)

            return control_message

        return wrapper

    return decorator
78
+
79
+
80
def nv_ingest_source_failure_context_manager(
    annotation_id: str,
    payload_can_be_empty: bool = False,
    raise_on_failure: bool = False,
) -> typing.Callable:
    """
    A decorator that ensures a source function's output is an IngestControlMessage and annotates it
    with a success or failure result.

    Parameters
    ----------
    annotation_id : str
        Unique identifier used for annotating the function's output.
    payload_can_be_empty : bool, optional
        Specifies if the function's output IngestControlMessage payload can be empty, default is False.
    raise_on_failure : bool, optional
        Determines if the original exception should be re-raised after the failure has been annotated,
        default is False.

    Returns
    -------
    Callable
        A decorator whose wrapper returns the function's IngestControlMessage on success, or an
        IngestControlMessage marked as failed (a fresh one if the function did not produce one) on error.
    """

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs) -> IngestControlMessage:
            # Bound before the `try` so the failure path can inspect it directly instead of probing
            # `locals()` to find out whether `func` got far enough to return something.
            result = None
            try:
                result = func(*args, **kwargs)
                if not isinstance(result, IngestControlMessage):
                    raise TypeError(f"{func.__name__} output is not a IngestControlMessage as expected.")
                # NOTE(review): emptiness is judged via the "payload" metadata key here, while the node
                # decorator in this module delegates to `cm_ensure_payload_not_null` -- confirm both
                # checks look at the same thing.
                if not payload_can_be_empty and result.get_metadata("payload") is None:
                    raise ValueError(f"{func.__name__} IngestControlMessage payload cannot be null.")

                # Success annotation.
                annotate_task_result(result, result=TaskResultStatus.SUCCESS, task_id=annotation_id)
            except Exception as e:
                error_message = f"Error in {func.__name__}: {e}"
                # Failure must be recorded on an IngestControlMessage; create one if `func` raised
                # before returning or returned some other type.
                if not isinstance(result, IngestControlMessage):
                    result = IngestControlMessage()
                cm_set_failure(result, error_message)
                annotate_task_result(
                    result,
                    result=TaskResultStatus.FAILURE,
                    task_id=annotation_id,
                    message=error_message,
                )
                if raise_on_failure:
                    raise
            return result

        return wrapper

    return decorator
135
+
136
+
137
class CMNVIngestFailureContextManager:
    """
    Context manager for handling IngestControlMessage failures during processing, providing
    a structured way to annotate and manage failures and successes.

    On a clean exit the control message is annotated with a SUCCESS result. When the managed block
    raises, the control message (if any) is marked as failed and annotated with a FAILURE result;
    ordinary exceptions are then suppressed unless `raise_on_failure` is True.

    Parameters
    ----------
    control_message : IngestControlMessage
        The IngestControlMessage instance to be managed.
    annotation_id : str
        The task's unique identifier for annotation purposes.
    raise_on_failure : bool, optional
        Determines whether to raise an exception upon failure. Defaults to False, which
        means failures are annotated rather than raising exceptions.
    func_name : str, optional
        The name of the function being wrapped, used to annotate error messages uniformly.
        If None, stack introspection is used to deduce a likely function name. Defaults to None.

    Notes
    -----
    Exceptions that do not derive from `Exception` (for example `KeyboardInterrupt` and `SystemExit`)
    are annotated like any other failure but are never suppressed, so interpreter shutdown and user
    interrupts still propagate.
    """

    def __init__(
        self,
        control_message: IngestControlMessage,
        annotation_id: str,
        raise_on_failure: bool = False,
        func_name: str = None,
    ):
        self.control_message = control_message
        self.annotation_id = annotation_id
        self.raise_on_failure = raise_on_failure
        if func_name is not None:
            self._func_name = func_name
        else:
            try:
                # Use stack introspection to get a candidate function name.
                stack = inspect.stack()
                # Use the third frame as a heuristic; adjust if needed.
                candidate = stack[2].function if len(stack) > 2 else "UnknownFunction"
                # Remove any whitespace and limit the length to 50 characters.
                candidate = re.sub(r"\s+", "", candidate)[:50]
                self._func_name = candidate if candidate else "UnknownFunction"
            except Exception:
                self._func_name = "UnknownFunction"

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        if exc_type is None:
            # Clean exit: record success and let normal control flow continue.
            annotate_task_result(
                self.control_message,
                result=TaskResultStatus.SUCCESS,
                task_id=self.annotation_id,
            )
            return False

        error_message = f"Error in {self._func_name}: {exc_value}"
        if self.control_message is not None:
            cm_set_failure(self.control_message, error_message)
            annotate_task_result(
                self.control_message,
                result=TaskResultStatus.FAILURE,
                task_id=self.annotation_id,
                message=error_message,
            )
        else:
            # There is nowhere to annotate the failure; log it so a suppressed error is not lost.
            logger.error("%s (no control message available to annotate)", error_message)

        # Propagate the exception if raise_on_failure is True.
        if self.raise_on_failure:
            return False

        # Suppress ordinary errors only; returning True for KeyboardInterrupt/SystemExit would
        # swallow user interrupts and interpreter exit requests.
        return issubclass(exc_type, Exception)
209
+
210
+
211
def unified_exception_handler(func):
    """
    A decorator that logs any exception raised by `func` and re-raises it with the function name
    prefixed to the message.

    Parameters
    ----------
    func : callable
        The function to be decorated.

    Returns
    -------
    callable
        A wrapper that returns whatever `func` returns.

    Raises
    ------
    Exception
        A new exception of the same type as the one raised by `func`, with the message
        ``"<func name>: error: <original message>"`` and the original exception as its ``__cause__``.
        If that exception type cannot be constructed from a single message string, the original
        exception is re-raised unchanged.
    """

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            # Use the function's name in the error message; not every callable has `__name__`.
            func_name = getattr(func, "__name__", repr(func))
            err_msg = f"{func_name}: error: {e}"
            # `logger.exception` already attaches the active traceback.
            logger.exception(err_msg)

            # Some exception classes (e.g. UnicodeDecodeError) require constructor arguments other
            # than a single message; building them from `err_msg` would raise a TypeError that hides
            # the real failure.
            try:
                prefixed_error = type(e)(err_msg)
            except Exception:
                prefixed_error = None

            if prefixed_error is None:
                raise
            raise prefixed_error from e

    return wrapper
@@ -0,0 +1,74 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+ from typing import Any
8
+ from typing import Callable
9
+ from typing import Dict
10
+
11
+ from langdetect.lang_detect_exception import LangDetectException
12
+
13
+ from nv_ingest_api.internal.enums.common import LanguageEnum
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
def langdetect_exception_handler(func: Callable, **kwargs: Dict[str, Any]) -> Callable:
    """
    A decorator that handles `LangDetectException` for language detection functions.

    The wrapped function is executed normally; if it raises a `LangDetectException`, a warning is
    logged and `LanguageEnum.UNKNOWN` is returned. Any other exception propagates unchanged.

    Parameters
    ----------
    func : callable
        The function to be decorated. This function is expected to handle language detection.

    kwargs : dict
        Accepted for backward compatibility only. These decorator-level keyword arguments are not used and
        are NOT forwarded to `func`; only the arguments supplied at call time are.

    Returns
    -------
    callable
        The wrapped function. Its `__name__`, `__doc__` and other metadata mirror those of `func`.

    Examples
    --------
    >>> @langdetect_exception_handler
    ... def detect_language(text):
    ...     # Function implementation here
    ...     pass
    ...
    >>> detect_language('This is a test sentence.')
    <LanguageEnum.EN: 'en'>

    If a `LangDetectException` is encountered, the function will return `LanguageEnum.UNKNOWN`:

    >>> detect_language('')
    <LanguageEnum.UNKNOWN: 'unknown'>
    """
    # Imported locally so the decorator does not rely on a module-level `functools` import.
    from functools import wraps

    @wraps(func)  # keep the decorated function's name/docstring for logging and introspection
    def inner_function(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except LangDetectException as e:
            log_error_message = f"LangDetectException: {e}"
            logger.warning(log_error_message)
            return LanguageEnum.UNKNOWN

    return inner_function
@@ -0,0 +1,116 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+
8
+ from nv_ingest_api.internal.enums.common import StatusEnum, TaskTypeEnum
9
+ from nv_ingest_api.internal.schemas.meta.metadata_schema import validate_metadata
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
def pdfium_exception_handler(descriptor):
    """
    A decorator factory that handles exceptions for functions interacting with PDFium.

    The returned decorator wraps a function and catches any `Exception` raised during its execution.
    When that happens, a warning containing the descriptor, the function name and the error is logged,
    and an empty list is returned as a fallback value.

    Parameters
    ----------
    descriptor : str
        A string descriptor to identify the context or source of the function being wrapped.
        This descriptor is included in the log message if an exception occurs.

    Returns
    -------
    callable
        A decorator that wraps the target function with exception handling. The wrapper keeps the
        target's `__name__`, `__doc__` and other metadata.

    Notes
    -----
    Because failures are converted into an empty list, callers cannot tell "no results" apart from
    "processing failed" without consulting the log.

    Examples
    --------
    >>> @pdfium_exception_handler("PDF Processing")
    ... def process_pdf(file_path):
    ...     raise RuntimeError("corrupt file")
    ...
    >>> process_pdf("example.pdf")
    []
    """
    # Imported locally so the decorator does not rely on a module-level `functools` import.
    from functools import wraps

    def outer_function(func):
        @wraps(func)  # keep the decorated function's name/docstring for logging and introspection
        def inner_function(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                log_error_message = f"{descriptor}:{func.__name__} error:{e}"
                logger.warning(log_error_message)
                return []

        return inner_function

    return outer_function
66
+
67
+
68
def create_exception_tag(error_message, source_id=None):
    """
    Build a validated error-metadata entry describing an exception.

    An "error_metadata" record is assembled from the given message and source identifier, tagged with
    `TaskTypeEnum.EXTRACT` and `StatusEnum.ERROR`, and passed through `validate_metadata`.

    Parameters
    ----------
    error_message : str
        The error message describing the exception.
    source_id : Optional[str], default=None
        The identifier for the source related to the error, if available.

    Returns
    -------
    list
        A list holding one two-element list: `None` followed by the validated metadata as returned by
        its `model_dump()`.

    Raises
    ------
    Exception
        Whatever `validate_metadata` raises when the metadata does not pass validation propagates
        to the caller.
    """
    metadata_payload = {
        "error_metadata": {
            "task": TaskTypeEnum.EXTRACT,
            "status": StatusEnum.ERROR,
            "source_id": source_id,
            "error_msg": error_message,
        }
    }

    validated = validate_metadata(metadata_payload)
    dumped = validated.model_dump()

    return [[None, dumped]]
@@ -0,0 +1,68 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+
8
+ from pydantic import ValidationError
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
def schema_exception_handler(func, **kwargs):
    """
    A decorator that handles `ValidationError` exceptions for schema validation functions.

    The wrapped function is executed normally; if it raises a Pydantic `ValidationError`, the individual
    errors are combined into one message, logged at error level, and re-raised as a `ValueError` chained
    to the original exception.

    Parameters
    ----------
    func : callable
        The function to be decorated. This function is expected to perform schema validation.

    kwargs : dict
        Accepted for backward compatibility only. These decorator-level keyword arguments are not used and
        are NOT forwarded to `func`; only the arguments supplied at call time are.

    Returns
    -------
    callable
        The wrapped function. Its `__name__`, `__doc__` and other metadata mirror those of `func`.

    Raises
    ------
    ValueError
        If a `ValidationError` is caught. The message has the form
        ``"Invalid configuration: <loc>: <msg>; <loc>: <msg>"`` where ``<loc>`` is the first element of
        each error's location, or ``__root__`` when the location is empty.

    Examples
    --------
    >>> @schema_exception_handler
    ... def validate_config(config_data):
    ...     schema = MySchema(**config_data)
    ...     return schema
    ...
    >>> try:
    ...     validate_config(invalid_config)
    ... except ValueError as e:
    ...     print(f"Caught error: {e}")
    Caught error: Invalid configuration: field1: value is not a valid integer; field2: field required
    """
    # Imported locally so the decorator does not rely on a module-level `functools` import.
    from functools import wraps

    @wraps(func)  # keep the decorated function's name/docstring for logging and introspection
    def inner_function(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except ValidationError as e:
            # An error's `loc` can be empty (an error not tied to a single field); indexing it
            # blindly would raise IndexError here and hide the validation failure.
            error_messages = "; ".join(
                f"{error['loc'][0] if error['loc'] else '__root__'}: {error['msg']}" for error in e.errors()
            )
            log_error_message = f"Invalid configuration: {error_messages}"
            logger.error(log_error_message)
            # Chain explicitly so the original ValidationError stays available as `__cause__`.
            raise ValueError(log_error_message) from e

    return inner_function
@@ -0,0 +1,5 @@
1
+ from .transforms import scale_image_to_encoding_size
2
+
3
+ __all__ = [
4
+ "scale_image_to_encoding_size",
5
+ ]