nv-ingest-api 26.1.0rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. Click here for more details.
- nv_ingest_api/__init__.py +3 -0
- nv_ingest_api/interface/__init__.py +218 -0
- nv_ingest_api/interface/extract.py +977 -0
- nv_ingest_api/interface/mutate.py +154 -0
- nv_ingest_api/interface/store.py +200 -0
- nv_ingest_api/interface/transform.py +382 -0
- nv_ingest_api/interface/utility.py +186 -0
- nv_ingest_api/internal/__init__.py +0 -0
- nv_ingest_api/internal/enums/__init__.py +3 -0
- nv_ingest_api/internal/enums/common.py +550 -0
- nv_ingest_api/internal/extract/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
- nv_ingest_api/internal/extract/docx/__init__.py +5 -0
- nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
- nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
- nv_ingest_api/internal/extract/html/__init__.py +3 -0
- nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
- nv_ingest_api/internal/extract/image/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
- nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
- nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
- nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
- nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
- nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
- nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
- nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
- nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
- nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
- nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
- nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
- nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
- nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
- nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
- nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
- nv_ingest_api/internal/meta/__init__.py +3 -0
- nv_ingest_api/internal/meta/udf.py +232 -0
- nv_ingest_api/internal/mutate/__init__.py +3 -0
- nv_ingest_api/internal/mutate/deduplicate.py +110 -0
- nv_ingest_api/internal/mutate/filter.py +133 -0
- nv_ingest_api/internal/primitives/__init__.py +0 -0
- nv_ingest_api/internal/primitives/control_message_task.py +16 -0
- nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
- nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
- nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
- nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
- nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
- nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
- nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
- nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
- nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
- nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
- nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
- nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
- nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
- nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
- nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
- nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
- nv_ingest_api/internal/schemas/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
- nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
- nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
- nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
- nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
- nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
- nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
- nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
- nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
- nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
- nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
- nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
- nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
- nv_ingest_api/internal/schemas/meta/udf.py +23 -0
- nv_ingest_api/internal/schemas/mixins.py +39 -0
- nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
- nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
- nv_ingest_api/internal/schemas/store/__init__.py +3 -0
- nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
- nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
- nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
- nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
- nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
- nv_ingest_api/internal/store/__init__.py +3 -0
- nv_ingest_api/internal/store/embed_text_upload.py +236 -0
- nv_ingest_api/internal/store/image_upload.py +251 -0
- nv_ingest_api/internal/transform/__init__.py +3 -0
- nv_ingest_api/internal/transform/caption_image.py +219 -0
- nv_ingest_api/internal/transform/embed_text.py +702 -0
- nv_ingest_api/internal/transform/split_text.py +182 -0
- nv_ingest_api/util/__init__.py +3 -0
- nv_ingest_api/util/control_message/__init__.py +0 -0
- nv_ingest_api/util/control_message/validators.py +47 -0
- nv_ingest_api/util/converters/__init__.py +0 -0
- nv_ingest_api/util/converters/bytetools.py +78 -0
- nv_ingest_api/util/converters/containers.py +65 -0
- nv_ingest_api/util/converters/datetools.py +90 -0
- nv_ingest_api/util/converters/dftools.py +127 -0
- nv_ingest_api/util/converters/formats.py +64 -0
- nv_ingest_api/util/converters/type_mappings.py +27 -0
- nv_ingest_api/util/dataloader/__init__.py +9 -0
- nv_ingest_api/util/dataloader/dataloader.py +409 -0
- nv_ingest_api/util/detectors/__init__.py +5 -0
- nv_ingest_api/util/detectors/language.py +38 -0
- nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- nv_ingest_api/util/exception_handlers/converters.py +72 -0
- nv_ingest_api/util/exception_handlers/decorators.py +429 -0
- nv_ingest_api/util/exception_handlers/detectors.py +74 -0
- nv_ingest_api/util/exception_handlers/pdf.py +116 -0
- nv_ingest_api/util/exception_handlers/schemas.py +68 -0
- nv_ingest_api/util/image_processing/__init__.py +5 -0
- nv_ingest_api/util/image_processing/clustering.py +260 -0
- nv_ingest_api/util/image_processing/processing.py +177 -0
- nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
- nv_ingest_api/util/image_processing/transforms.py +850 -0
- nv_ingest_api/util/imports/__init__.py +3 -0
- nv_ingest_api/util/imports/callable_signatures.py +108 -0
- nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
- nv_ingest_api/util/introspection/__init__.py +3 -0
- nv_ingest_api/util/introspection/class_inspect.py +145 -0
- nv_ingest_api/util/introspection/function_inspect.py +65 -0
- nv_ingest_api/util/logging/__init__.py +0 -0
- nv_ingest_api/util/logging/configuration.py +102 -0
- nv_ingest_api/util/logging/sanitize.py +84 -0
- nv_ingest_api/util/message_brokers/__init__.py +3 -0
- nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
- nv_ingest_api/util/metadata/__init__.py +5 -0
- nv_ingest_api/util/metadata/aggregators.py +516 -0
- nv_ingest_api/util/multi_processing/__init__.py +8 -0
- nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
- nv_ingest_api/util/nim/__init__.py +161 -0
- nv_ingest_api/util/pdf/__init__.py +3 -0
- nv_ingest_api/util/pdf/pdfium.py +428 -0
- nv_ingest_api/util/schema/__init__.py +3 -0
- nv_ingest_api/util/schema/schema_validator.py +10 -0
- nv_ingest_api/util/service_clients/__init__.py +3 -0
- nv_ingest_api/util/service_clients/client_base.py +86 -0
- nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
- nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
- nv_ingest_api/util/string_processing/__init__.py +51 -0
- nv_ingest_api/util/string_processing/configuration.py +682 -0
- nv_ingest_api/util/string_processing/yaml.py +109 -0
- nv_ingest_api/util/system/__init__.py +0 -0
- nv_ingest_api/util/system/hardware_info.py +594 -0
- nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
- nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
- nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
- nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
- nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
- udfs/__init__.py +5 -0
- udfs/llm_summarizer_udf.py +259 -0
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import inspect
|
|
7
|
+
from pydantic import BaseModel
|
|
8
|
+
|
|
9
|
+
from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def ingest_stage_callable_signature(sig: inspect.Signature):
    """
    Validates that a callable has the signature:
        (IngestControlMessage, BaseModel) -> IngestControlMessage

    Also allows for generic (*args, **kwargs) signatures for flexibility with class constructors.

    The first parameter and the return type may be annotated either with the
    IngestControlMessage class (or a subclass) or with the string forward reference
    "IngestControlMessage". The second parameter must be annotated with an actual class
    derived from BaseModel; a string annotation cannot be verified here and is rejected.

    Parameters
    ----------
    sig : inspect.Signature
        The signature of the callable to validate.

    Raises
    ------
    TypeError
        If the signature does not match the expected pattern.
    """
    params = list(sig.parameters.values())

    # If the signature accepts arbitrary keyword arguments, it's flexible enough.
    if any(p.kind == inspect.Parameter.VAR_KEYWORD for p in params):
        return

    if len(params) != 2:
        raise TypeError(f"Expected exactly 2 parameters, got {len(params)}")

    if params[0].name != "control_message" or params[1].name != "stage_config":
        raise TypeError("Expected parameter names: 'control_message', 'stage_config'")

    first_param = params[0].annotation
    second_param = params[1].annotation
    return_type = sig.return_annotation

    if first_param is inspect.Parameter.empty:
        raise TypeError("First parameter must be annotated with IngestControlMessage")

    if second_param is inspect.Parameter.empty:
        raise TypeError("Second parameter must be annotated with a subclass of BaseModel")

    if return_type is inspect.Signature.empty:
        raise TypeError("Return type must be annotated with IngestControlMessage")

    # String annotations (forward references) are compared by name. Anything else must be a
    # real class: the inspect.isclass guard keeps issubclass() from raising its own opaque
    # "arg 1 must be a class" error for typing constructs and other non-class annotations.
    if isinstance(first_param, str):
        if first_param != "IngestControlMessage":
            raise TypeError(f"First parameter must be IngestControlMessage, got {first_param}")
    elif not (inspect.isclass(first_param) and issubclass(first_param, IngestControlMessage)):
        raise TypeError(f"First parameter must be IngestControlMessage, got {first_param}")

    # The config model cannot be resolved from a bare string, so only real classes pass.
    if not (inspect.isclass(second_param) and issubclass(second_param, BaseModel)):
        raise TypeError(f"Second parameter must be a subclass of BaseModel, got {second_param}")

    if isinstance(return_type, str):
        if return_type != "IngestControlMessage":
            raise TypeError(f"Return type must be IngestControlMessage, got {return_type}")
    elif not (inspect.isclass(return_type) and issubclass(return_type, IngestControlMessage)):
        raise TypeError(f"Return type must be IngestControlMessage, got {return_type}")
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def ingest_callable_signature(sig: inspect.Signature):
    """
    Check that a callable's signature has the shape:
        (IngestControlMessage) -> IngestControlMessage

    A signature that declares a **kwargs parameter is accepted without further checks,
    which keeps generic (*args, **kwargs) class constructors usable.

    Both the parameter and the return annotation may be given either as a class or as
    the string forward reference "IngestControlMessage".

    Parameters
    ----------
    sig : inspect.Signature
        The signature to validate.

    Raises
    ------
    TypeError
        If the signature does not match the expected pattern.
    """
    parameters = tuple(sig.parameters.values())

    # A **kwargs catch-all is treated as flexible enough; nothing else is inspected.
    for parameter in parameters:
        if parameter.kind == inspect.Parameter.VAR_KEYWORD:
            return

    param_count = len(parameters)
    if param_count != 1:
        raise TypeError(f"Expected exactly 1 parameter, got {param_count}")

    (only_param,) = parameters
    if only_param.name != "control_message":
        raise TypeError("Expected parameter name: 'control_message'")

    param_annotation = only_param.annotation
    return_annotation = sig.return_annotation

    if param_annotation is inspect.Parameter.empty:
        raise TypeError("Parameter must be annotated with IngestControlMessage")

    if return_annotation is inspect.Signature.empty:
        raise TypeError("Return type must be annotated with IngestControlMessage")

    # The parameter is checked before the return type. String annotations are matched
    # by name; everything else is handed to issubclass().
    for label, annotation in (("Parameter", param_annotation), ("Return type", return_annotation)):
        if isinstance(annotation, str):
            acceptable = annotation == "IngestControlMessage"
        else:
            acceptable = issubclass(annotation, IngestControlMessage)
        if not acceptable:
            raise TypeError(f"{label} must be IngestControlMessage, got {annotation}")
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import importlib
|
|
6
|
+
import inspect
|
|
7
|
+
from typing import Callable, Union, List, Optional
|
|
8
|
+
|
|
9
|
+
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def resolve_obj_from_path(path: str, allowed_base_paths: Optional[List[str]] = None) -> object:
    """
    Load the object named by a 'module.sub:attr' style path.

    The text before the first ':' is imported as a module and the text after it is
    looked up as an attribute of that module.

    Parameters
    ----------
    path : str
        The import path in the form 'module.sub:attr'.
    allowed_base_paths : Optional[List[str]]
        When non-empty, the module must equal one of these entries or be a dotted
        sub-module of one; otherwise the import is refused.

    Returns
    -------
    object
        The attribute found on the imported module.

    Raises
    ------
    ValueError
        If the path contains no ':' separator.
    ImportError
        If the module is outside the allowed base paths or cannot be found.
    AttributeError
        If the module does not define the requested attribute.
    """
    module_path, separator, attr_name = path.partition(":")
    if not separator:
        raise ValueError(f"Invalid path '{path}': expected format 'module.sub:attr'")

    # Security check: only allow imports from specified base paths if provided.
    if allowed_base_paths:
        permitted = False
        for base in allowed_base_paths:
            # Match the package itself or a dotted child, never a mere name prefix.
            if module_path == base or module_path.startswith(base + "."):
                permitted = True
                break
        if not permitted:
            raise ImportError(
                f"Module '{module_path}' is not in the list of allowed base paths. "
                f"Allowed paths: {allowed_base_paths}"
            )

    try:
        module = importlib.import_module(module_path)
    except ModuleNotFoundError as exc:
        raise ImportError(f"Could not import module '{module_path}'") from exc

    try:
        return getattr(module, attr_name)
    except AttributeError as exc:
        raise AttributeError(f"Module '{module_path}' has no attribute '{attr_name}'") from exc
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def resolve_callable_from_path(
    callable_path: str,
    signature_schema: Union[List[str], Callable[[inspect.Signature], None], str],
    allowed_base_paths: Optional[List[str]] = None,
) -> Callable:
    """
    Resolve a callable from a 'module.submodule:callable_name' path and validate its
    signature against the supplied signature_schema.

    Parameters
    ----------
    callable_path : str
        The module path and callable in the format 'module.sub:callable'.
    signature_schema : Union[List[str], Callable, str]
        One of:
            - a list of parameter names the callable must declare;
            - a checker that receives an inspect.Signature and raises on failure;
            - a string path ('module.sub:schema_checker') to a list or checker.
    allowed_base_paths : Optional[List[str]]
        Optional list of base module paths from which imports are allowed. It is
        applied both to the callable and to a schema given as a string path.

    Returns
    -------
    Callable
        The resolved and validated callable.

    Raises
    ------
    ValueError
        If a path is not correctly formatted.
    ImportError
        If a module cannot be imported or is not in the allowed paths.
    AttributeError
        If the attribute does not exist in the module.
    TypeError
        If the resolved attribute is not callable, the schema is of an unsupported
        kind, or the signature does not satisfy the schema.
    """
    target = resolve_obj_from_path(callable_path, allowed_base_paths=allowed_base_paths)
    if not callable(target):
        raise TypeError(f"Object '{callable_path}' is not callable")

    # A schema supplied by path is loaded under the same import restrictions.
    if isinstance(signature_schema, str):
        checker = resolve_obj_from_path(signature_schema, allowed_base_paths=allowed_base_paths)
    else:
        checker = signature_schema

    signature = inspect.signature(target)

    # List schema: every listed name must appear among the callable's parameters.
    if isinstance(checker, list):
        param_names = list(signature.parameters.keys())
        absent = [required for required in checker if required not in param_names]
        if absent:
            raise TypeError(
                f"Callable at '{callable_path}' is missing required parameters: {absent}\n"
                f"Actual parameters: {param_names}"
            )
        return target

    if not callable(checker):
        raise TypeError(f"Invalid signature_schema: expected list, callable, or str, got {type(signature_schema)}")

    # Callable schema: any exception it raises is reported as a TypeError.
    try:
        checker(signature)
    except Exception as exc:
        raise TypeError(f"Signature validation for '{callable_path}' failed: {exc}") from exc

    return target
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def resolve_actor_class_from_path(
    path: str, expected_base_class: type, allowed_base_paths: Optional[List[str]] = None
) -> type:
    """
    Resolve an actor class from an import path and confirm that it derives from the
    expected base class.

    When the resolved object is not itself a class, the MRO of the object's class is
    searched for the first strict subclass of RayActorStage, and that class is the
    one validated.

    Parameters
    ----------
    path : str
        The full import path to the actor class.
    expected_base_class : type
        The base class that the resolved class must inherit from.
    allowed_base_paths : Optional[List[str]]
        An optional list of base module paths from which imports are allowed.

    Returns
    -------
    type
        The object found at the path, unchanged (a class, or a non-class actor factory).

    Raises
    ------
    TypeError
        If no class could be identified for validation, or the identified class does
        not inherit from expected_base_class.
    """
    resolved = resolve_obj_from_path(path, allowed_base_paths=allowed_base_paths)

    if inspect.isclass(resolved):
        candidate = resolved
    else:
        # Non-class object: take the first strict RayActorStage subclass in its class's MRO.
        candidate = next(
            (
                base
                for base in resolved.__class__.__mro__
                if inspect.isclass(base) and issubclass(base, RayActorStage) and base is not RayActorStage
            ),
            None,
        )

    if candidate is None:
        raise TypeError(
            f"Could not resolve a valid actor class from path '{path}'. "
            f"The object is not a class and not a recognized actor factory."
        )

    if not issubclass(candidate, expected_base_class):
        raise TypeError(
            f"Actor class '{candidate.__name__}' at '{path}' must inherit from '{expected_base_class.__name__}'."
        )

    # The originally resolved object is returned, not the class used for validation.
    return resolved
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import inspect
|
|
6
|
+
from typing import Optional, Type, Union, Callable
|
|
7
|
+
|
|
8
|
+
from pydantic import BaseModel
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def find_pydantic_config_schema(
    actor_class: Type,
    base_class_to_find: Type,
    param_name: str = "config",
) -> Optional[Type[BaseModel]]:
    """
    Locate the Pydantic model used to annotate an __init__ parameter of an actor class.

    The search runs in two steps: first the class to inspect is determined (the argument
    itself when it is a class, otherwise a class taken from the MRO of the argument's
    class), then that class's MRO is walked looking for an __init__ whose annotated
    ``param_name`` parameter is a strict subclass of BaseModel.

    Parameters
    ----------
    actor_class : Type
        The actor class or proxy object to inspect.
    base_class_to_find : Type
        For non-class arguments, the first strict subclass of this base found in the
        MRO of the argument's class is the one inspected.
    param_name : str, optional
        The name of the __init__ parameter to inspect, by default "config".

    Returns
    -------
    Optional[Type[BaseModel]]
        The Pydantic BaseModel subclass if found, otherwise None.
    """
    # 1. Work out which class to inspect.
    if inspect.isclass(actor_class):
        real_cls = actor_class
    else:
        real_cls = next(
            (
                base
                for base in actor_class.__class__.__mro__
                if inspect.isclass(base) and issubclass(base, base_class_to_find) and base is not base_class_to_find
            ),
            None,
        )

    if not real_cls:
        return None

    # 2. Walk the MRO for an __init__ that annotates the requested parameter.
    for klass in real_cls.__mro__:
        init_annotations = getattr(klass.__init__, "__annotations__", {})
        if param_name not in init_annotations:
            continue

        try:
            parameter = inspect.signature(klass.__init__).parameters.get(param_name)
            if parameter is None:
                continue
            annotation = parameter.annotation
            # BaseModel itself does not count; only a subclass of it does.
            if annotation is not BaseModel and issubclass(annotation, BaseModel):
                return annotation
        except (ValueError, TypeError):
            # Signature not inspectable, or the annotation is not a class: try the next class.
            continue

    return None
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def find_pydantic_config_schema_for_callable(
    callable_fn: Callable,
    param_name: str = "stage_config",
) -> Optional[Type[BaseModel]]:
    """
    Locate the Pydantic model used to annotate a named parameter of a callable.

    Parameters
    ----------
    callable_fn : Callable
        The callable to inspect.
    param_name : str, optional
        The name of the parameter whose annotation is examined,
        by default "stage_config".

    Returns
    -------
    Optional[Type[BaseModel]]
        The annotation when it is a strict subclass of BaseModel; None when the
        parameter is absent, annotated otherwise, or the signature cannot be inspected.
    """
    try:
        parameter = inspect.signature(callable_fn).parameters.get(param_name)
        if parameter is None:
            return None

        annotation = parameter.annotation
        # BaseModel itself does not count; only a subclass of it does.
        if annotation is BaseModel:
            return None

        # The __mro__ probe filters out annotations that are not classes before issubclass().
        if hasattr(annotation, "__mro__") and issubclass(annotation, BaseModel):
            return annotation
    except (ValueError, TypeError):
        # Function signature is not inspectable.
        pass

    return None
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def find_pydantic_config_schema_unified(
    target: Union[Type, Callable],
    base_class_to_find: Optional[Type] = None,
    param_name: str = "config",
) -> Optional[Type[BaseModel]]:
    """
    Find the Pydantic schema for a target that may be a class, a proxy object, or a callable.

    Callable targets that are not classes are delegated to
    find_pydantic_config_schema_for_callable; all other targets are delegated to
    find_pydantic_config_schema, which requires ``base_class_to_find``.

    Parameters
    ----------
    target : Union[Type, Callable]
        The class or callable to inspect.
    base_class_to_find : Optional[Type], optional
        The base class used when resolving actor classes from proxies. Only used for
        class inspection; when it is None, class inspection returns None.
    param_name : str, optional
        The name of the parameter to inspect for the Pydantic schema.
        For classes: defaults to "config"
        For callables: should be "stage_config"

    Returns
    -------
    Optional[Type[BaseModel]]
        The Pydantic BaseModel class if found, otherwise None.
    """
    target_is_class = inspect.isclass(target)

    # Plain callables (functions and other non-class callables) take the callable route.
    if callable(target) and not target_is_class:
        return find_pydantic_config_schema_for_callable(target, param_name)

    # Everything else is treated as a class or proxy object.
    class_like = target_is_class or hasattr(target, "__class__")
    if not class_like or base_class_to_find is None:
        return None

    return find_pydantic_config_schema(target, base_class_to_find, param_name)
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
"""
|
|
6
|
+
Utilities for introspecting and analyzing UDF function specifications.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import re
|
|
10
|
+
from typing import Optional
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def infer_udf_function_name(udf_function: str) -> Optional[str]:
    """
    Attempts to infer the UDF function name from the provided function string.

    Supports these formats:
    1. Inline function: 'def my_func(control_message): ...' -> 'my_func'
    2. Import path: 'my_module.my_function' -> 'my_function'
    3. Path with explicit function name: '/path/to/file.py:function_name' or
       'my_module.sub:function_name' -> 'function_name'

    Inline definitions are recognised first, because their source always contains ':'
    and often '/' or '.', which would otherwise be mistaken for a path specification.
    A candidate name is only returned when it is a valid Python identifier.

    Parameters
    ----------
    udf_function : str
        The UDF function string.

    Returns
    -------
    Optional[str]
        The inferred UDF function name, or None if inference is not possible
        (for example a bare file path without a ':function_name' suffix).

    Examples
    --------
    >>> infer_udf_function_name("def my_custom_func(control_message): pass")
    'my_custom_func'

    >>> infer_udf_function_name("my_module.submodule.process_data")
    'process_data'

    >>> infer_udf_function_name("/path/to/script.py:custom_function")
    'custom_function'

    >>> infer_udf_function_name("/path/to/script.py") is None
    True
    """
    spec = udf_function.strip()

    # Format 1: inline function definition.
    if spec.startswith("def "):
        match = re.match(r"def\s+(\w+)\s*\(", spec)
        return match.group(1) if match else None

    if ":" in spec:
        # Format 3: the name follows the LAST ':' so that a Windows drive prefix
        # ('C:\\...') is never taken for the separator.
        candidate = spec.rsplit(":", 1)[1].strip()
    elif "/" in spec or "\\" in spec:
        # A file path without an explicit function name carries no usable name.
        return None
    elif "." in spec:
        # Format 2: dotted import path; the last component is the function name.
        candidate = spec.rsplit(".", 1)[1].strip()
    else:
        return None

    # Reject leftovers such as '\\dir\\file.py' or an empty suffix.
    return candidate if candidate.isidentifier() else None
|
|
File without changes
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
import logging.config
|
|
8
|
+
from enum import Enum
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class LogLevel(str, Enum):
    """
    Symbolic names for the supported logging levels.

    Each member is also a ``str`` whose value equals its own name, so a member
    can be passed wherever a level-name string is expected.
    """

    # NOTE(review): "DEFAULT" is not a level name defined by the standard
    # ``logging`` module; presumably callers map it to a concrete level before
    # configuring logging -- confirm against call sites.
    DEFAULT = "DEFAULT"
    # The remaining members mirror the standard ``logging`` level names.
    DEBUG = "DEBUG"
    INFO = "INFO"
    WARNING = "WARNING"
    ERROR = "ERROR"
    CRITICAL = "CRITICAL"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def configure_logging(level_name: str) -> None:
    """
    Reset global logging and install one stdout console handler on the root logger.

    All handlers attached to the root logger and to every already-created named
    logger are detached and closed. Named loggers are switched to propagate and
    have their level reset to ``NOTSET`` so that filtering happens only at the
    root. Finally the ``warnings`` module is routed through logging.

    Parameters
    ----------
    level_name : str
        Case-insensitive name of the logging level (e.g., "DEBUG", "INFO").

    Raises
    ------
    ValueError
        If ``level_name`` does not resolve to an integer attribute of ``logging``.
    """

    def _discard(owner: logging.Logger, handler: logging.Handler) -> None:
        # Detach first, then close best-effort: a handler that fails to close
        # must not abort the reconfiguration.
        owner.removeHandler(handler)
        try:
            handler.close()
        except Exception:
            pass

    resolved_level = getattr(logging, level_name.upper(), None)
    if not isinstance(resolved_level, int):
        raise ValueError(f"Invalid log level: {level_name}")

    # Scorched-earth reset, step 1: strip the root logger.
    root = logging.getLogger()
    for handler in tuple(root.handlers):
        _discard(root, handler)

    # Step 2: strip every known named logger and make it defer to the root.
    for candidate in tuple(logging.Logger.manager.loggerDict.values()):
        if not isinstance(candidate, logging.Logger):
            # Registry entries that are not real Logger objects are left alone.
            continue
        for handler in tuple(candidate.handlers):
            _discard(candidate, handler)
        candidate.propagate = True
        candidate.setLevel(logging.NOTSET)

    # Step 3: declare the single console handler and attach it to the root.
    console_handler = {
        "class": "logging.StreamHandler",
        "level": resolved_level,
        "formatter": "standard",
        "stream": "ext://sys.stdout",
    }
    standard_formatter = {
        "format": "%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    }
    logging.config.dictConfig(
        {
            "version": 1,
            # Handlers were cleared above; loggers stay enabled so they propagate to root.
            "disable_existing_loggers": False,
            "formatters": {"standard": standard_formatter},
            "handlers": {"console": console_handler},
            "root": {"level": resolved_level, "handlers": ["console"]},
        }
    )

    # Step 4: if more than one root handler is present, keep only the first
    # StreamHandler and discard everything else.
    root = logging.getLogger()
    if len(root.handlers) > 1:
        survivor = next(
            (h for h in root.handlers if isinstance(h, logging.StreamHandler)),
            None,
        )
        for handler in [h for h in root.handlers if h is not survivor]:
            _discard(root, handler)

    # Route the warnings module through logging (best-effort).
    try:
        logging.captureWarnings(True)
    except Exception:
        pass
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import Any, Mapping, MutableMapping, Sequence, Set
|
|
8
|
+
|
|
9
|
+
try:
|
|
10
|
+
# Pydantic is optional at runtime for this helper; import if available
|
|
11
|
+
from pydantic import BaseModel # type: ignore
|
|
12
|
+
except Exception: # pragma: no cover - pydantic always present in this repo
|
|
13
|
+
BaseModel = None # type: ignore
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Key names whose values are redacted when the caller of sanitize_for_logging
# supplies no custom key set. Entries are written in lower case because the
# lookup lower-cases each mapping key before comparing; matching is on the
# exact key name only.
_DEFAULT_SENSITIVE_KEYS: Set[str] = {
    "access_token",
    "api_key",
    "authorization",
    "auth_token",
    "client_secret",
    "hf_access_token",
    "hugging_face_access_token",
    "password",
    "refresh_token",
    "secret",
    "ssl_cert",
    "x-api-key",
}

# Default placeholder written in place of a redacted value.
_REDACTION = "***REDACTED***"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _is_mapping(obj: Any) -> bool:
    """Return True if ``obj`` is a ``Mapping``; False otherwise or if the check itself raises."""
    try:
        verdict = isinstance(obj, Mapping)
    except Exception:
        # Treat an object that cannot even be type-checked as "not a mapping".
        verdict = False
    return verdict
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _is_sequence(obj: Any) -> bool:
    """Return True if ``obj`` is a ``Sequence`` that should be traversed element by element."""
    # Text and byte strings are sequences too, but they are treated as leaf values.
    if isinstance(obj, (str, bytes, bytearray)):
        return False
    return isinstance(obj, Sequence)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def sanitize_for_logging(
    data: Any,
    sensitive_keys: Set[str] | None = None,
    redaction: str = _REDACTION,
) -> Any:
    """
    Recursively sanitize common secret fields from dicts, lists, tuples, and Pydantic models.

    - Key comparison is case-insensitive and matches exact keys only.
    - Does not mutate input; returns a sanitized copy of every traversed container.
    - For Pydantic BaseModel instances, uses model_dump() before redaction.
    - Container types are preserved where they can be rebuilt; containers that
      cannot be rebuilt (e.g. read-only mappings, ``range``, ``memoryview``)
      are returned as a plain ``dict`` / ``list`` instead of raising.

    Parameters
    ----------
    data : Any
        The value to sanitize. Non-container values are returned unchanged.
    sensitive_keys : Set[str] | None
        Key names whose values must be redacted. ``None`` or an empty set
        selects the module default key set.
    redaction : str
        Placeholder stored in place of each redacted value.

    Returns
    -------
    Any
        The sanitized counterpart of ``data``.
    """
    # str() mirrors how mapping keys are normalised below, so a non-string
    # entry in a caller-supplied key set cannot crash the sanitizer.
    keys = {str(k).lower() for k in (sensitive_keys or _DEFAULT_SENSITIVE_KEYS)}

    # Pydantic models: dump to plain data first, then sanitize that.
    if BaseModel is not None and isinstance(data, BaseModel):  # type: ignore[arg-type]
        try:
            return sanitize_for_logging(data.model_dump(), keys, redaction)
        except Exception:
            # Fall through and try generic handling below
            pass

    # Dict-like
    if _is_mapping(data):
        redacted = {
            k: redaction if str(k).lower() in keys else sanitize_for_logging(v, keys, redaction)
            for k, v in data.items()
        }
        # Preserve the mapping type where possible. Types that cannot be built
        # without arguments or that reject item assignment fall back to dict.
        try:
            out: MutableMapping[Any, Any] = type(data)()
            for k, v in redacted.items():
                out[k] = v
        except Exception:
            return redacted
        return out

    # List/Tuple/Sequence
    if _is_sequence(data):
        items = [sanitize_for_logging(v, keys, redaction) for v in data]
        try:
            if isinstance(data, tuple) and hasattr(data, "_fields"):
                # namedtuple constructors take one positional argument per field.
                return type(data)(*items)
            return type(data)(items)
        except Exception:
            # Sequence types that cannot be rebuilt from an iterable.
            return items

    # Fallback: return as-is
    return data
|