nv-ingest-api 2025.4.20.dev20250420__py3-none-any.whl → 2025.4.22.dev20250422__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. Click here for more details.
- nv_ingest_api/__init__.py +3 -0
- nv_ingest_api/interface/__init__.py +215 -0
- nv_ingest_api/interface/extract.py +972 -0
- nv_ingest_api/interface/mutate.py +154 -0
- nv_ingest_api/interface/store.py +218 -0
- nv_ingest_api/interface/transform.py +382 -0
- nv_ingest_api/interface/utility.py +200 -0
- nv_ingest_api/internal/enums/__init__.py +3 -0
- nv_ingest_api/internal/enums/common.py +494 -0
- nv_ingest_api/internal/extract/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/audio_extraction.py +149 -0
- nv_ingest_api/internal/extract/docx/__init__.py +5 -0
- nv_ingest_api/internal/extract/docx/docx_extractor.py +205 -0
- nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +122 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +895 -0
- nv_ingest_api/internal/extract/image/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/chart_extractor.py +353 -0
- nv_ingest_api/internal/extract/image/image_extractor.py +204 -0
- nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/image_helpers/common.py +403 -0
- nv_ingest_api/internal/extract/image/infographic_extractor.py +253 -0
- nv_ingest_api/internal/extract/image/table_extractor.py +344 -0
- nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
- nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
- nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
- nv_ingest_api/internal/extract/pdf/engines/llama.py +243 -0
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +597 -0
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +146 -0
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +603 -0
- nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
- nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
- nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
- nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
- nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +799 -0
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +187 -0
- nv_ingest_api/internal/mutate/__init__.py +3 -0
- nv_ingest_api/internal/mutate/deduplicate.py +110 -0
- nv_ingest_api/internal/mutate/filter.py +133 -0
- nv_ingest_api/internal/primitives/__init__.py +0 -0
- nv_ingest_api/{primitives → internal/primitives}/control_message_task.py +4 -0
- nv_ingest_api/{primitives → internal/primitives}/ingest_control_message.py +5 -2
- nv_ingest_api/internal/primitives/nim/__init__.py +8 -0
- nv_ingest_api/internal/primitives/nim/default_values.py +15 -0
- nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
- nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
- nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
- nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +275 -0
- nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +238 -0
- nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +462 -0
- nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +132 -0
- nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +152 -0
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1400 -0
- nv_ingest_api/internal/primitives/nim/nim_client.py +344 -0
- nv_ingest_api/internal/primitives/nim/nim_model_interface.py +81 -0
- nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
- nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
- nv_ingest_api/internal/primitives/tracing/tagging.py +197 -0
- nv_ingest_api/internal/schemas/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +130 -0
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +135 -0
- nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_image_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +128 -0
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +218 -0
- nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +129 -0
- nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
- nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +23 -0
- nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
- nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
- nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
- nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +237 -0
- nv_ingest_api/internal/schemas/meta/metadata_schema.py +221 -0
- nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
- nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
- nv_ingest_api/internal/schemas/store/__init__.py +3 -0
- nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
- nv_ingest_api/internal/schemas/store/store_image_schema.py +30 -0
- nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +15 -0
- nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +25 -0
- nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +22 -0
- nv_ingest_api/internal/store/__init__.py +3 -0
- nv_ingest_api/internal/store/embed_text_upload.py +236 -0
- nv_ingest_api/internal/store/image_upload.py +232 -0
- nv_ingest_api/internal/transform/__init__.py +3 -0
- nv_ingest_api/internal/transform/caption_image.py +205 -0
- nv_ingest_api/internal/transform/embed_text.py +496 -0
- nv_ingest_api/internal/transform/split_text.py +157 -0
- nv_ingest_api/util/__init__.py +0 -0
- nv_ingest_api/util/control_message/__init__.py +0 -0
- nv_ingest_api/util/control_message/validators.py +47 -0
- nv_ingest_api/util/converters/__init__.py +0 -0
- nv_ingest_api/util/converters/bytetools.py +78 -0
- nv_ingest_api/util/converters/containers.py +65 -0
- nv_ingest_api/util/converters/datetools.py +90 -0
- nv_ingest_api/util/converters/dftools.py +127 -0
- nv_ingest_api/util/converters/formats.py +64 -0
- nv_ingest_api/util/converters/type_mappings.py +27 -0
- nv_ingest_api/util/detectors/__init__.py +5 -0
- nv_ingest_api/util/detectors/language.py +38 -0
- nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- nv_ingest_api/util/exception_handlers/converters.py +72 -0
- nv_ingest_api/util/exception_handlers/decorators.py +223 -0
- nv_ingest_api/util/exception_handlers/detectors.py +74 -0
- nv_ingest_api/util/exception_handlers/pdf.py +116 -0
- nv_ingest_api/util/exception_handlers/schemas.py +68 -0
- nv_ingest_api/util/image_processing/__init__.py +5 -0
- nv_ingest_api/util/image_processing/clustering.py +260 -0
- nv_ingest_api/util/image_processing/processing.py +179 -0
- nv_ingest_api/util/image_processing/table_and_chart.py +449 -0
- nv_ingest_api/util/image_processing/transforms.py +407 -0
- nv_ingest_api/util/logging/__init__.py +0 -0
- nv_ingest_api/util/logging/configuration.py +31 -0
- nv_ingest_api/util/message_brokers/__init__.py +3 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +451 -0
- nv_ingest_api/util/metadata/__init__.py +5 -0
- nv_ingest_api/util/metadata/aggregators.py +469 -0
- nv_ingest_api/util/multi_processing/__init__.py +8 -0
- nv_ingest_api/util/multi_processing/mp_pool_singleton.py +194 -0
- nv_ingest_api/util/nim/__init__.py +56 -0
- nv_ingest_api/util/pdf/__init__.py +3 -0
- nv_ingest_api/util/pdf/pdfium.py +427 -0
- nv_ingest_api/util/schema/__init__.py +0 -0
- nv_ingest_api/util/schema/schema_validator.py +10 -0
- nv_ingest_api/util/service_clients/__init__.py +3 -0
- nv_ingest_api/util/service_clients/client_base.py +86 -0
- nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
- nv_ingest_api/util/service_clients/redis/redis_client.py +823 -0
- nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- nv_ingest_api/util/service_clients/rest/rest_client.py +531 -0
- nv_ingest_api/util/string_processing/__init__.py +51 -0
- {nv_ingest_api-2025.4.20.dev20250420.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/METADATA +1 -1
- nv_ingest_api-2025.4.22.dev20250422.dist-info/RECORD +152 -0
- nv_ingest_api-2025.4.20.dev20250420.dist-info/RECORD +0 -9
- /nv_ingest_api/{primitives → internal}/__init__.py +0 -0
- {nv_ingest_api-2025.4.20.dev20250420.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/WHEEL +0 -0
- {nv_ingest_api-2025.4.20.dev20250420.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.4.20.dev20250420.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import langdetect
|
|
7
|
+
|
|
8
|
+
from nv_ingest_api.internal.enums.common import LanguageEnum
|
|
9
|
+
from nv_ingest_api.util.exception_handlers.detectors import langdetect_exception_handler
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@langdetect_exception_handler
def detect_language(text):
    """
    Map a piece of text to the `LanguageEnum` member for its detected language.

    The text is handed to `langdetect.detect`; the resulting code is accepted only if
    `LanguageEnum.has_value` recognises it, in which case it is converted to a member name
    by upper-casing it and replacing hyphens with underscores.

    Parameters
    ----------
    text : str
        The text whose language should be detected.

    Returns
    -------
    LanguageEnum
        The matching `LanguageEnum` member, or `LanguageEnum.UNKNOWN` when the detected code
        is not recognised or `langdetect` raises `LangDetectException`.
    """
    try:
        detected_code = langdetect.detect(text)

        if LanguageEnum.has_value(detected_code):
            # Enum member names use upper case and underscores (e.g. "zh-cn" -> "ZH_CN").
            member_name = detected_code.upper().replace("-", "_")
            return LanguageEnum[member_name]

        return LanguageEnum.UNKNOWN
    except langdetect.lang_detect_exception.LangDetectException:
        return LanguageEnum.UNKNOWN
|
|
File without changes
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
from datetime import timezone
|
|
9
|
+
from typing import Any
|
|
10
|
+
from typing import Callable
|
|
11
|
+
from typing import Dict
|
|
12
|
+
|
|
13
|
+
from nv_ingest_api.util.converters import datetools
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def datetools_exception_handler(func: Callable, **kwargs: Dict[str, Any]) -> Callable:
    """
    A decorator that handles exceptions for date-related functions.

    The wrapped function is called with the caller's arguments. If it raises any `Exception`, a warning is
    logged and the current UTC time, passed through `datetools.remove_tz`, is returned as an ISO 8601
    formatted string instead of propagating the error.

    Parameters
    ----------
    func : Callable
        The function to be decorated. This function is expected to handle date operations.

    kwargs : dict
        Accepted for backward compatibility only. These decorator-level keyword arguments are not used and
        are not forwarded to `func`.

    Returns
    -------
    Callable
        The wrapped function that executes `func` with exception handling. The wrapper keeps the name,
        docstring and other metadata of `func`.

    Notes
    -----
    On failure the return value is a string, even if `func` normally returns another type
    (see the example below).

    Examples
    --------
    >>> @datetools_exception_handler
    ... def parse_date(date_str):
    ...     return datetime.strptime(date_str, '%Y-%m-%d')
    ...
    >>> parse_date('2024-08-22')
    datetime.datetime(2024, 8, 22, 0, 0)

    If the input is invalid, the current UTC time is returned as a string:

    >>> parse_date('invalid-date')
    '2024-08-22T12:34:56'
    """
    # Local import keeps this block self-contained.
    import functools

    @functools.wraps(func)
    def inner_function(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            # Deliberate best-effort: any failure degrades to "now" rather than aborting the caller.
            log_error_message = f"Invalid date format: {e}"
            logger.warning(log_error_message)
            return datetools.remove_tz(datetime.now(timezone.utc)).isoformat()

    return inner_function
|
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import functools
|
|
7
|
+
import inspect
|
|
8
|
+
import re
|
|
9
|
+
import typing
|
|
10
|
+
from functools import wraps
|
|
11
|
+
|
|
12
|
+
from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage
|
|
13
|
+
from nv_ingest_api.internal.primitives.tracing.logging import TaskResultStatus, annotate_task_result
|
|
14
|
+
from nv_ingest_api.util.control_message.validators import cm_ensure_payload_not_null, cm_set_failure
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# TODO(Devin): move back to framework
|
|
21
|
+
def nv_ingest_node_failure_context_manager(
    annotation_id: str,
    payload_can_be_empty: bool = False,
    raise_on_failure: bool = False,
    skip_processing_if_failed: bool = True,
    forward_func=None,
) -> typing.Callable:
    """
    Build a decorator that runs a node function inside a `CMNVIngestFailureContextManager`.

    The decorated function must take an IngestControlMessage as its first positional argument.

    Parameters
    ----------
    annotation_id : str
        Identifier passed to the context manager for annotating the task result.
    payload_can_be_empty : bool, optional
        When False (the default), `cm_ensure_payload_not_null` is called on the message before
        the decorated function runs.
    raise_on_failure : bool, optional
        Forwarded to the context manager; when True, exceptions from the decorated function
        propagate, otherwise they are suppressed after annotation. Defaults to False.
    skip_processing_if_failed : bool, optional
        When True (the default), a message whose "cm_failed" metadata is truthy is not processed.
        When False, the decorated function runs regardless of that flag.
    forward_func : callable, optional
        Called with the message, and its result returned, when processing is skipped because
        the message has already failed.

    Returns
    -------
    Callable
        A decorator that wraps the given function with the failure handling described above.
    """

    def decorator(func):
        @wraps(func)
        def wrapper(control_message: IngestControlMessage, *args, **kwargs):
            already_failed = control_message.get_metadata("cm_failed", False)

            # Short-circuit: an already-failed message is forwarded (if possible) and returned untouched.
            if already_failed and skip_processing_if_failed:
                if forward_func:
                    return forward_func(control_message)
                return control_message

            failure_scope = CMNVIngestFailureContextManager(
                control_message=control_message,
                annotation_id=annotation_id,
                raise_on_failure=raise_on_failure,
                func_name=func.__name__,
            )
            with failure_scope as ctx_mgr:
                if not payload_can_be_empty:
                    cm_ensure_payload_not_null(control_message=control_message)
                # If func raises and the exception is suppressed, the incoming message is returned below.
                control_message = func(ctx_mgr.control_message, *args, **kwargs)

            return control_message

        return wrapper

    return decorator
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def nv_ingest_source_failure_context_manager(
    annotation_id: str,
    payload_can_be_empty: bool = False,
    raise_on_failure: bool = False,
) -> typing.Callable:
    """
    A decorator that ensures a source function's output is an IngestControlMessage and annotates it
    with a success or failure result.

    Parameters
    ----------
    annotation_id : str
        Unique identifier used for annotating the function's output.
    payload_can_be_empty : bool, optional
        When False (the default), the output is rejected with a `ValueError` if its "payload"
        metadata entry is None.
    raise_on_failure : bool, optional
        When True, the original exception is re-raised after the failure has been annotated.
        Defaults to False.

    Returns
    -------
    Callable
        A decorator whose wrapper returns the function's IngestControlMessage on success, or a
        failure-annotated IngestControlMessage (a fresh one if the function produced none) on error.
    """

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs) -> IngestControlMessage:
            # Bind up front so the failure path can inspect it directly instead of probing locals().
            result = None
            try:
                result = func(*args, **kwargs)
                if not isinstance(result, IngestControlMessage):
                    raise TypeError(f"{func.__name__} output is not a IngestControlMessage as expected.")
                # NOTE(review): the check reads the "payload" metadata key; confirm this is where the
                # payload is expected to live for source outputs.
                if not payload_can_be_empty and result.get_metadata("payload") is None:
                    raise ValueError(f"{func.__name__} IngestControlMessage payload cannot be null.")

                # Success annotation.
                annotate_task_result(result, result=TaskResultStatus.SUCCESS, task_id=annotation_id)
            except Exception as e:
                error_message = f"Error in {func.__name__}: {e}"
                # func raised or returned the wrong type: create a message to carry the failure.
                if not isinstance(result, IngestControlMessage):
                    result = IngestControlMessage()
                cm_set_failure(result, error_message)
                annotate_task_result(
                    result,
                    result=TaskResultStatus.FAILURE,
                    task_id=annotation_id,
                    message=error_message,
                )
                if raise_on_failure:
                    raise
            return result

        return wrapper

    return decorator
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
class CMNVIngestFailureContextManager:
    """
    Context manager for handling IngestControlMessage failures during processing, providing
    a structured way to annotate and manage failures and successes.

    On a clean exit the message is annotated with a SUCCESS result. If the body raises, the message
    is marked as failed and annotated with a FAILURE result, and the exception is either suppressed
    or propagated depending on `raise_on_failure`. No annotation is attempted when the message is None.

    Parameters
    ----------
    control_message : IngestControlMessage
        The IngestControlMessage instance to be managed.
    annotation_id : str
        The task's unique identifier for annotation purposes.
    raise_on_failure : bool, optional
        Determines whether to raise an exception upon failure. Defaults to False, which
        means failures are annotated rather than raising exceptions.
    func_name : str, optional
        The name of the function being wrapped, used to annotate error messages uniformly.
        If None, stack introspection is used to deduce a likely function name. Defaults to None.
    """

    def __init__(
        self,
        control_message: IngestControlMessage,
        annotation_id: str,
        raise_on_failure: bool = False,
        func_name: typing.Optional[str] = None,
    ):
        self.control_message = control_message
        self.annotation_id = annotation_id
        self.raise_on_failure = raise_on_failure
        if func_name is not None:
            self._func_name = func_name
        else:
            try:
                # Use stack introspection to get a candidate function name.
                stack = inspect.stack()
                # Use the third frame as a heuristic; adjust if needed.
                # NOTE(review): index 0 is this __init__, so index 2 is the caller of the code that
                # constructs the manager - confirm that is the intended frame.
                candidate = stack[2].function if len(stack) > 2 else "UnknownFunction"
                # Remove any whitespace and limit the length to 50 characters.
                candidate = re.sub(r"\s+", "", candidate)[:50]
                self._func_name = candidate if candidate else "UnknownFunction"
            except Exception:
                # Name discovery is best-effort; never let it break construction.
                self._func_name = "UnknownFunction"

    def __enter__(self):
        """Return the manager itself so callers can reach `control_message` via the `as` target."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """
        Annotate the managed message with the outcome of the `with` body.

        Returns
        -------
        bool
            True to suppress an exception raised in the body (only when `raise_on_failure` is falsy);
            False otherwise.
        """
        if exc_type is not None:  # An exception occurred
            error_message = f"Error in {self._func_name}: {exc_value}"
            if self.control_message is not None:
                cm_set_failure(self.control_message, error_message)
                annotate_task_result(
                    self.control_message,
                    result=TaskResultStatus.FAILURE,
                    task_id=self.annotation_id,
                    message=error_message,
                )
            # Returning False propagates the exception; returning True suppresses it.
            return not self.raise_on_failure

        # Mirror the failure path: there is nothing to annotate without a message.
        if self.control_message is not None:
            annotate_task_result(
                self.control_message,
                result=TaskResultStatus.SUCCESS,
                task_id=self.annotation_id,
            )
        return False
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def unified_exception_handler(func):
    """
    A decorator that logs any exception raised by `func` and re-raises it with the function
    name prefixed to the message.

    Parameters
    ----------
    func : callable
        The function to wrap.

    Returns
    -------
    callable
        A wrapper that returns whatever `func` returns.

    Raises
    ------
    Exception
        A new exception of the same type as the original, carrying the message
        "<func name>: error: <original message>" and chained to the original. If the exception
        type cannot be constructed from a single message (for example `UnicodeDecodeError`),
        the original exception is re-raised unchanged instead.
    """

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            # Use the function's name in the error message; not every callable has __name__.
            func_name = getattr(func, "__name__", repr(func))
            err_msg = f"{func_name}: error: {e}"
            # logger.exception already records the active traceback.
            logger.exception(err_msg)

            # Some exception classes need more than one constructor argument; building them from
            # a lone message would raise TypeError and hide the real failure.
            try:
                annotated = type(e)(err_msg)
            except Exception:
                annotated = None

            if annotated is None:
                raise
            raise annotated from e

    return wrapper
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
from typing import Any
|
|
8
|
+
from typing import Callable
|
|
9
|
+
from typing import Dict
|
|
10
|
+
|
|
11
|
+
from langdetect.lang_detect_exception import LangDetectException
|
|
12
|
+
|
|
13
|
+
from nv_ingest_api.internal.enums.common import LanguageEnum
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def langdetect_exception_handler(func: Callable, **kwargs: Dict[str, Any]) -> Callable:
    """
    A decorator that handles `LangDetectException` for language detection functions.

    The wrapped function is called with the caller's arguments. If it raises `LangDetectException`,
    a warning is logged and `LanguageEnum.UNKNOWN` is returned. Any other exception propagates.

    Parameters
    ----------
    func : callable
        The function to be decorated. This function is expected to handle language detection.

    kwargs : dict
        Accepted for backward compatibility only. These decorator-level keyword arguments are not used
        and are not forwarded to `func`.

    Returns
    -------
    callable
        The wrapped function that executes `func` with exception handling. The wrapper keeps the name,
        docstring and other metadata of `func`.

    Examples
    --------
    >>> @langdetect_exception_handler
    ... def detect_language(text):
    ...     # Function implementation here
    ...     pass
    ...
    >>> detect_language('This is a test sentence.')
    <LanguageEnum.EN: 'en'>

    If a `LangDetectException` is encountered, the function will return `LanguageEnum.UNKNOWN`:

    >>> detect_language('')
    <LanguageEnum.UNKNOWN: 'unknown'>
    """
    # Local import keeps this block self-contained.
    import functools

    @functools.wraps(func)
    def inner_function(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except LangDetectException as e:
            log_error_message = f"LangDetectException: {e}"
            logger.warning(log_error_message)
            return LanguageEnum.UNKNOWN

    return inner_function
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
|
|
8
|
+
from nv_ingest_api.internal.enums.common import StatusEnum, TaskTypeEnum
|
|
9
|
+
from nv_ingest_api.internal.schemas.meta.metadata_schema import validate_metadata
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def pdfium_exception_handler(descriptor):
    """
    A decorator factory that handles exceptions for functions interacting with PDFium.

    The produced decorator wraps a function and catches any `Exception` raised during its execution.
    When that happens it logs a warning containing the descriptor, the function name and the error,
    then returns an empty list as a fallback value.

    Parameters
    ----------
    descriptor : str
        A string descriptor to identify the context or source of the function being wrapped.
        This descriptor is included in the log message if an exception occurs.

    Returns
    -------
    callable
        A decorator function that wraps the target function with exception handling. The wrapper
        keeps the name, docstring and other metadata of the target function.

    Notes
    -----
    The fallback is always `[]`, regardless of what the wrapped function normally returns.

    Examples
    --------
    >>> @pdfium_exception_handler("PDF Processing")
    ... def process_pdf(file_path):
    ...     raise RuntimeError("corrupt file")
    ...
    >>> process_pdf("example.pdf")
    []
    """
    # Local import keeps this block self-contained.
    import functools

    def outer_function(func):
        @functools.wraps(func)
        def inner_function(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                # Deliberate best-effort: one failing call must not interrupt the pipeline.
                log_error_message = f"{descriptor}:{func.__name__} error:{e}"
                logger.warning(log_error_message)
                return []

        return inner_function

    return outer_function
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def create_exception_tag(error_message, source_id=None):
    """
    Build a validated error-metadata entry describing an exception.

    An "error_metadata" record is assembled with the EXTRACT task type, the ERROR status, the given
    source identifier and the error message. The record is passed through `validate_metadata` and
    the validated model is dumped back to a plain dictionary.

    Parameters
    ----------
    error_message : str
        The error message describing the exception.
    source_id : Optional[str], default=None
        The identifier for the source related to the error, if available.

    Returns
    -------
    list
        A list with one entry, itself a two-element list: `None` followed by the validated
        metadata dictionary.

    Raises
    ------
    ValidationError
        If the metadata does not pass validation.
    """
    unified_metadata = {
        "error_metadata": {
            "task": TaskTypeEnum.EXTRACT,
            "status": StatusEnum.ERROR,
            "source_id": source_id,
            "error_msg": error_message,
        },
    }

    validated = validate_metadata(unified_metadata)
    dumped = validated.model_dump()

    return [[None, dumped]]
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
|
|
8
|
+
from pydantic import ValidationError
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def schema_exception_handler(func, **kwargs):
    """
    A decorator that handles `ValidationError` exceptions for schema validation functions.

    The wrapped function is called with the caller's arguments. If it raises a Pydantic
    `ValidationError`, each error is summarised as "<first location element>: <message>", the
    summaries are joined with "; ", logged at error level, and raised as a `ValueError`.
    Any other exception propagates unchanged.

    Parameters
    ----------
    func : callable
        The function to be decorated. This function is expected to perform schema validation.

    kwargs : dict
        Accepted for backward compatibility only. These decorator-level keyword arguments are not used
        and are not forwarded to `func`.

    Returns
    -------
    callable
        The wrapped function that executes `func` with exception handling. The wrapper keeps the name,
        docstring and other metadata of `func`.

    Raises
    ------
    ValueError
        If a `ValidationError` is caught; the message is "Invalid configuration: " followed by the
        combined error summaries, and the original `ValidationError` is attached as the cause.

    Examples
    --------
    >>> @schema_exception_handler
    ... def validate_config(config_data):
    ...     schema = MySchema(**config_data)
    ...     return schema
    ...
    >>> try:
    ...     validate_config(invalid_config)
    ... except ValueError as e:
    ...     print(f"Caught error: {e}")
    Caught error: Invalid configuration: field1: value is not a valid integer; field2: field required
    """
    # Local import keeps this block self-contained.
    import functools

    @functools.wraps(func)
    def inner_function(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except ValidationError as e:
            summaries = []
            for error in e.errors():
                location = error["loc"]
                # Model-level errors can carry an empty location; indexing it would raise
                # IndexError here and hide the validation failure.
                field = location[0] if location else "__root__"
                summaries.append(f"{field}: {error['msg']}")
            error_messages = "; ".join(summaries)
            log_error_message = f"Invalid configuration: {error_messages}"
            logger.error(log_error_message)
            raise ValueError(log_error_message) from e

    return inner_function
|