nv-ingest-api 26.1.0rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (177) hide show
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +218 -0
  3. nv_ingest_api/interface/extract.py +977 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +200 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +186 -0
  8. nv_ingest_api/internal/__init__.py +0 -0
  9. nv_ingest_api/internal/enums/__init__.py +3 -0
  10. nv_ingest_api/internal/enums/common.py +550 -0
  11. nv_ingest_api/internal/extract/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  13. nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
  14. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  15. nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
  16. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
  19. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
  20. nv_ingest_api/internal/extract/html/__init__.py +3 -0
  21. nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
  22. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
  24. nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
  25. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  26. nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
  27. nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
  28. nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
  29. nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
  30. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  31. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  32. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  33. nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
  34. nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
  35. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
  36. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
  37. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  38. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  39. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  40. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  41. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  42. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
  43. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
  44. nv_ingest_api/internal/meta/__init__.py +3 -0
  45. nv_ingest_api/internal/meta/udf.py +232 -0
  46. nv_ingest_api/internal/mutate/__init__.py +3 -0
  47. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  48. nv_ingest_api/internal/mutate/filter.py +133 -0
  49. nv_ingest_api/internal/primitives/__init__.py +0 -0
  50. nv_ingest_api/internal/primitives/control_message_task.py +16 -0
  51. nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
  52. nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
  53. nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
  59. nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
  60. nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
  61. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  62. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
  63. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
  64. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
  65. nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
  66. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
  67. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  68. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  69. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  70. nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
  71. nv_ingest_api/internal/schemas/__init__.py +3 -0
  72. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  73. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
  74. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
  75. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
  76. nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
  77. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
  78. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
  79. nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
  80. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
  81. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
  82. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
  83. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
  85. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  86. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  87. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  88. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  89. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
  90. nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
  91. nv_ingest_api/internal/schemas/meta/udf.py +23 -0
  92. nv_ingest_api/internal/schemas/mixins.py +39 -0
  93. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  94. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  95. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  96. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  97. nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
  98. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  99. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
  100. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  101. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
  102. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
  103. nv_ingest_api/internal/store/__init__.py +3 -0
  104. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  105. nv_ingest_api/internal/store/image_upload.py +251 -0
  106. nv_ingest_api/internal/transform/__init__.py +3 -0
  107. nv_ingest_api/internal/transform/caption_image.py +219 -0
  108. nv_ingest_api/internal/transform/embed_text.py +702 -0
  109. nv_ingest_api/internal/transform/split_text.py +182 -0
  110. nv_ingest_api/util/__init__.py +3 -0
  111. nv_ingest_api/util/control_message/__init__.py +0 -0
  112. nv_ingest_api/util/control_message/validators.py +47 -0
  113. nv_ingest_api/util/converters/__init__.py +0 -0
  114. nv_ingest_api/util/converters/bytetools.py +78 -0
  115. nv_ingest_api/util/converters/containers.py +65 -0
  116. nv_ingest_api/util/converters/datetools.py +90 -0
  117. nv_ingest_api/util/converters/dftools.py +127 -0
  118. nv_ingest_api/util/converters/formats.py +64 -0
  119. nv_ingest_api/util/converters/type_mappings.py +27 -0
  120. nv_ingest_api/util/dataloader/__init__.py +9 -0
  121. nv_ingest_api/util/dataloader/dataloader.py +409 -0
  122. nv_ingest_api/util/detectors/__init__.py +5 -0
  123. nv_ingest_api/util/detectors/language.py +38 -0
  124. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  125. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  126. nv_ingest_api/util/exception_handlers/decorators.py +429 -0
  127. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  128. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  129. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  130. nv_ingest_api/util/image_processing/__init__.py +5 -0
  131. nv_ingest_api/util/image_processing/clustering.py +260 -0
  132. nv_ingest_api/util/image_processing/processing.py +177 -0
  133. nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
  134. nv_ingest_api/util/image_processing/transforms.py +850 -0
  135. nv_ingest_api/util/imports/__init__.py +3 -0
  136. nv_ingest_api/util/imports/callable_signatures.py +108 -0
  137. nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
  138. nv_ingest_api/util/introspection/__init__.py +3 -0
  139. nv_ingest_api/util/introspection/class_inspect.py +145 -0
  140. nv_ingest_api/util/introspection/function_inspect.py +65 -0
  141. nv_ingest_api/util/logging/__init__.py +0 -0
  142. nv_ingest_api/util/logging/configuration.py +102 -0
  143. nv_ingest_api/util/logging/sanitize.py +84 -0
  144. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  145. nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
  146. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  147. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  148. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  149. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
  150. nv_ingest_api/util/metadata/__init__.py +5 -0
  151. nv_ingest_api/util/metadata/aggregators.py +516 -0
  152. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  153. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
  154. nv_ingest_api/util/nim/__init__.py +161 -0
  155. nv_ingest_api/util/pdf/__init__.py +3 -0
  156. nv_ingest_api/util/pdf/pdfium.py +428 -0
  157. nv_ingest_api/util/schema/__init__.py +3 -0
  158. nv_ingest_api/util/schema/schema_validator.py +10 -0
  159. nv_ingest_api/util/service_clients/__init__.py +3 -0
  160. nv_ingest_api/util/service_clients/client_base.py +86 -0
  161. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  162. nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
  163. nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
  164. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  165. nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
  166. nv_ingest_api/util/string_processing/__init__.py +51 -0
  167. nv_ingest_api/util/string_processing/configuration.py +682 -0
  168. nv_ingest_api/util/string_processing/yaml.py +109 -0
  169. nv_ingest_api/util/system/__init__.py +0 -0
  170. nv_ingest_api/util/system/hardware_info.py +594 -0
  171. nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
  172. nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
  173. nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
  174. nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
  175. nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
  176. udfs/__init__.py +5 -0
  177. udfs/llm_summarizer_udf.py +259 -0
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,108 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import inspect
7
+ from pydantic import BaseModel
8
+
9
+ from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage
10
+
11
+
12
def ingest_stage_callable_signature(sig: inspect.Signature):
    """
    Validates that a callable has the signature:
    (control_message: IngestControlMessage, stage_config: BaseModel) -> IngestControlMessage

    A signature that declares a ``**kwargs`` parameter is accepted without any
    further checks, for flexibility with class constructors.

    String (forward-reference) annotations are accepted for ``control_message``
    and for the return type when they are exactly ``"IngestControlMessage"``.
    A string annotation on ``stage_config`` cannot be checked against BaseModel
    here and is rejected, as is any other annotation that is not a class.

    Parameters
    ----------
    sig : inspect.Signature
        The signature of the callable being validated.

    Raises
    ------
    TypeError
        If the signature does not match the expected pattern.
    """
    params = list(sig.parameters.values())

    # If the signature accepts arbitrary keyword arguments, it's flexible enough.
    if any(p.kind is inspect.Parameter.VAR_KEYWORD for p in params):
        return

    if len(params) != 2:
        raise TypeError(f"Expected exactly 2 parameters, got {len(params)}")

    if params[0].name != "control_message" or params[1].name != "stage_config":
        raise TypeError("Expected parameter names: 'control_message', 'stage_config'")

    first_param = params[0].annotation
    second_param = params[1].annotation
    return_type = sig.return_annotation

    if first_param is inspect.Parameter.empty:
        raise TypeError("First parameter must be annotated with IngestControlMessage")

    if second_param is inspect.Parameter.empty:
        raise TypeError("Second parameter must be annotated with a subclass of BaseModel")

    if return_type is inspect.Signature.empty:
        raise TypeError("Return type must be annotated with IngestControlMessage")

    def _is_control_message(annotation) -> bool:
        # Forward references arrive as plain strings; compare them by name.
        if isinstance(annotation, str):
            return annotation == "IngestControlMessage"
        # Guard with isclass so typing constructs (Optional[...], etc.) produce the
        # descriptive TypeError below instead of issubclass()'s own complaint.
        return inspect.isclass(annotation) and issubclass(annotation, IngestControlMessage)

    if not _is_control_message(first_param):
        raise TypeError(f"First parameter must be IngestControlMessage, got {first_param}")

    if not (inspect.isclass(second_param) and issubclass(second_param, BaseModel)):
        raise TypeError(f"Second parameter must be a subclass of BaseModel, got {second_param}")

    if not _is_control_message(return_type):
        raise TypeError(f"Return type must be IngestControlMessage, got {return_type}")
57
+
58
+
59
def ingest_callable_signature(sig: inspect.Signature):
    """
    Check that a callable's signature has the form:
    (control_message: IngestControlMessage) -> IngestControlMessage

    A signature declaring a ``**kwargs`` parameter is accepted as-is. Annotations
    may be real classes (checked with ``issubclass``) or the literal string
    ``"IngestControlMessage"`` (a forward reference).

    Raises
    ------
    TypeError
        If the parameter count, parameter name, or either annotation does not
        match the expected pattern.
    """
    parameters = list(sig.parameters.values())

    # A **kwargs parameter makes the callable flexible enough; nothing more to verify.
    for parameter in parameters:
        if parameter.kind == inspect.Parameter.VAR_KEYWORD:
            return

    if len(parameters) != 1:
        raise TypeError(f"Expected exactly 1 parameter, got {len(parameters)}")

    only_param = parameters[0]
    if only_param.name != "control_message":
        raise TypeError("Expected parameter name: 'control_message'")

    # (label used in messages, annotation, sentinel meaning "not annotated")
    checks = (
        ("Parameter", only_param.annotation, inspect.Parameter.empty),
        ("Return type", sig.return_annotation, inspect.Signature.empty),
    )

    # First pass: both annotations must be present.
    for label, annotation, missing in checks:
        if annotation is missing:
            raise TypeError(f"{label} must be annotated with IngestControlMessage")

    # Second pass: each annotation must name or subclass IngestControlMessage.
    for label, annotation, _ in checks:
        if isinstance(annotation, str):
            # String annotations (forward references) are compared by name.
            matches = annotation == "IngestControlMessage"
        else:
            # Actual class annotations are checked by inheritance.
            matches = issubclass(annotation, IngestControlMessage)
        if not matches:
            raise TypeError(f"{label} must be IngestControlMessage, got {annotation}")
@@ -0,0 +1,158 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import importlib
6
+ import inspect
7
+ from typing import Callable, Union, List, Optional
8
+
9
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
10
+
11
+
12
def resolve_obj_from_path(path: str, allowed_base_paths: Optional[List[str]] = None) -> object:
    """
    Import a module and return one of its attributes, given 'module.sub:attr'.

    Parameters
    ----------
    path : str
        Module path and attribute name separated by the first ':'.
    allowed_base_paths : Optional[List[str]]
        When non-empty, the module path must equal one of these entries or be a
        dotted descendant of one; otherwise the import is refused. ``None`` or an
        empty list applies no restriction.

    Raises
    ------
    ValueError
        If ``path`` contains no ':'.
    ImportError
        If the module is outside the allowed base paths or cannot be found.
    AttributeError
        If the module has no attribute with the requested name.
    """
    module_path, separator, attr_name = path.partition(":")
    if not separator:
        raise ValueError(f"Invalid path '{path}': expected format 'module.sub:attr'")

    # Security check: only allow imports from specified base paths if provided.
    if allowed_base_paths:
        for base in allowed_base_paths:
            if module_path == base or module_path.startswith(base + "."):
                break
        else:
            raise ImportError(
                f"Module '{module_path}' is not in the list of allowed base paths. "
                f"Allowed paths: {allowed_base_paths}"
            )

    try:
        module = importlib.import_module(module_path)
    except ModuleNotFoundError as err:
        raise ImportError(f"Could not import module '{module_path}'") from err

    try:
        return getattr(module, attr_name)
    except AttributeError as err:
        raise AttributeError(f"Module '{module_path}' has no attribute '{attr_name}'") from err
40
+
41
+
42
def resolve_callable_from_path(
    callable_path: str,
    signature_schema: Union[List[str], Callable[[inspect.Signature], None], str],
    allowed_base_paths: Optional[List[str]] = None,
) -> Callable:
    """
    Import and return a callable from a module path string like 'module.submodule:callable_name',
    and validate its signature using the required signature_schema (callable or path to callable).

    Parameters
    ----------
    callable_path : str
        The module path and callable in the format 'module.sub:callable'.
    signature_schema : Union[List[str], Callable, str]
        Either:
          - A list of parameter names to require.
          - A callable that takes an inspect.Signature and raises on failure.
          - A string path to such a callable ('module.sub:schema_checker').
    allowed_base_paths : Optional[List[str]]
        An optional list of base module paths from which imports are allowed.
        If provided, both the callable and any signature schema specified by path
        must reside within one of these paths.

    Returns
    -------
    Callable
        The resolved and validated callable.

    Raises
    ------
    ValueError
        If a path is not correctly formatted. ``inspect.signature`` may also raise
        ValueError for callables that expose no signature; that is not translated.
    ImportError
        If the module cannot be imported or is not in the allowed paths.
    AttributeError
        If the attribute does not exist in the module.
    TypeError
        If the resolved attribute is not callable, the signature does not match,
        or the signature schema is of an unsupported kind.
    """
    obj = resolve_obj_from_path(callable_path, allowed_base_paths=allowed_base_paths)
    if not callable(obj):
        raise TypeError(f"Object '{callable_path}' is not callable")

    # Load/check signature_schema
    schema_checker = signature_schema
    schema_given_as_path = isinstance(signature_schema, str)
    if schema_given_as_path:
        # When loading the schema checker, apply the same security restrictions.
        schema_checker = resolve_obj_from_path(signature_schema, allowed_base_paths=allowed_base_paths)

    sig = inspect.signature(obj)
    if isinstance(schema_checker, list):
        actual_params = list(sig.parameters.keys())
        missing = [p for p in schema_checker if p not in actual_params]
        if missing:
            raise TypeError(
                f"Callable at '{callable_path}' is missing required parameters: {missing}\n"
                f"Actual parameters: {actual_params}"
            )
    elif callable(schema_checker):
        try:
            schema_checker(sig)
        except Exception as e:
            # Any failure inside the checker is reported uniformly as a signature mismatch.
            raise TypeError(f"Signature validation for '{callable_path}' failed: {e}") from e
    elif schema_given_as_path:
        # The path itself was a valid str; the problem is what it resolved to, so
        # report the resolved object's type rather than "got <class 'str'>".
        raise TypeError(
            f"Invalid signature_schema: '{signature_schema}' resolved to {type(schema_checker)}, "
            f"expected a list or a callable"
        )
    else:
        raise TypeError(f"Invalid signature_schema: expected list, callable, or str, got {type(signature_schema)}")

    return obj
109
+
110
+
111
def resolve_actor_class_from_path(
    path: str, expected_base_class: type, allowed_base_paths: Optional[List[str]] = None
) -> type:
    """
    Resolve an actor class from an import path and confirm it derives from
    the expected base class.

    If the resolved object is not itself a class, the MRO of its type is searched
    for the first proper subclass of RayActorStage and that class is validated
    instead (this is how decorated Ray actors are handled).

    Parameters
    ----------
    path : str
        The full import path to the actor class.
    expected_base_class : type
        The base class that the resolved class must inherit from.
    allowed_base_paths : Optional[List[str]]
        An optional list of base module paths from which imports are allowed.

    Returns
    -------
    type
        The object found at ``path`` (the class itself or the Ray actor factory),
        not the class extracted for validation.

    Raises
    ------
    TypeError
        If no class can be determined for validation, or if that class does not
        inherit from ``expected_base_class``.
    """
    resolved = resolve_obj_from_path(path, allowed_base_paths=allowed_base_paths)

    if inspect.isclass(resolved):
        cls_to_validate = resolved
    else:
        # For actor factories, pick the first MRO entry that is a strict subclass
        # of RayActorStage; None when there is no such entry.
        cls_to_validate = next(
            (
                candidate
                for candidate in resolved.__class__.__mro__
                if inspect.isclass(candidate)
                and issubclass(candidate, RayActorStage)
                and candidate is not RayActorStage
            ),
            None,
        )

    if cls_to_validate is None:
        raise TypeError(
            f"Could not resolve a valid actor class from path '{path}'. "
            f"The object is not a class and not a recognized actor factory."
        )

    if issubclass(cls_to_validate, expected_base_class):
        return resolved

    raise TypeError(
        f"Actor class '{cls_to_validate.__name__}' at '{path}' must inherit from '{expected_base_class.__name__}'."
    )
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,145 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import inspect
6
+ from typing import Optional, Type, Union, Callable
7
+
8
+ from pydantic import BaseModel
9
+
10
+
11
+ def find_pydantic_config_schema(
12
+ actor_class: Type,
13
+ base_class_to_find: Type,
14
+ param_name: str = "config",
15
+ ) -> Optional[Type[BaseModel]]:
16
+ """
17
+ Introspects a class's MRO to find a Pydantic model in its __init__ signature.
18
+
19
+ This function is designed to find the specific Pydantic configuration model
20
+ for a pipeline actor, which might be a direct class or a proxy object.
21
+
22
+ Parameters
23
+ ----------
24
+ actor_class : Type
25
+ The actor class or proxy object to inspect.
26
+ base_class_to_find : Type
27
+ The specific base class (e.g., RaySource, RayStage) to look for when
28
+ resolving the true actor class from a proxy.
29
+ param_name : str, optional
30
+ The name of the __init__ parameter to inspect for the Pydantic schema,
31
+ by default "config".
32
+
33
+ Returns
34
+ -------
35
+ Optional[Type[BaseModel]]
36
+ The Pydantic BaseModel class if found, otherwise None.
37
+ """
38
+ # 1. Find the actual class to inspect, handling proxy objects.
39
+ cls_to_inspect = None
40
+ if inspect.isclass(actor_class):
41
+ cls_to_inspect = actor_class
42
+ else:
43
+ for base in actor_class.__class__.__mro__:
44
+ if inspect.isclass(base) and issubclass(base, base_class_to_find) and base is not base_class_to_find:
45
+ cls_to_inspect = base
46
+ break
47
+
48
+ if not cls_to_inspect:
49
+ return None
50
+
51
+ # 2. Walk the MRO of the real class to find the __init__ with the typed parameter.
52
+ for cls in cls_to_inspect.__mro__:
53
+ if param_name in getattr(cls.__init__, "__annotations__", {}):
54
+ try:
55
+ init_sig = inspect.signature(cls.__init__)
56
+ config_param = init_sig.parameters.get(param_name)
57
+ if (
58
+ config_param
59
+ and config_param.annotation is not BaseModel
60
+ and issubclass(config_param.annotation, BaseModel)
61
+ ):
62
+ return config_param.annotation # Found the schema
63
+ except (ValueError, TypeError):
64
+ # This class's __init__ is not inspectable (e.g., a C-extension), continue up the MRO.
65
+ continue
66
+
67
+ return None
68
+
69
+
70
+ def find_pydantic_config_schema_for_callable(
71
+ callable_fn: Callable,
72
+ param_name: str = "stage_config",
73
+ ) -> Optional[Type[BaseModel]]:
74
+ """
75
+ Introspects a callable's signature to find a Pydantic model parameter.
76
+
77
+ This function is designed to find the specific Pydantic configuration model
78
+ for a pipeline callable function.
79
+
80
+ Parameters
81
+ ----------
82
+ callable_fn : Callable
83
+ The callable function to inspect.
84
+ param_name : str, optional
85
+ The name of the parameter to inspect for the Pydantic schema,
86
+ by default "stage_config".
87
+
88
+ Returns
89
+ -------
90
+ Optional[Type[BaseModel]]
91
+ The Pydantic BaseModel class if found, otherwise None.
92
+ """
93
+ try:
94
+ sig = inspect.signature(callable_fn)
95
+ config_param = sig.parameters.get(param_name)
96
+ if (
97
+ config_param
98
+ and config_param.annotation is not BaseModel
99
+ and hasattr(config_param.annotation, "__mro__")
100
+ and issubclass(config_param.annotation, BaseModel)
101
+ ):
102
+ return config_param.annotation
103
+ except (ValueError, TypeError):
104
+ # Function signature is not inspectable
105
+ pass
106
+
107
+ return None
108
+
109
+
110
def find_pydantic_config_schema_unified(
    target: Union[Type, Callable],
    base_class_to_find: Optional[Type] = None,
    param_name: str = "config",
) -> Optional[Type[BaseModel]]:
    """
    Unified function to find Pydantic schema for either classes or callables.

    Dispatch rule: a callable that is not a class is inspected as a function;
    everything else (a class, or any other object such as a proxy) is inspected
    as a class and requires ``base_class_to_find``.

    Parameters
    ----------
    target : Union[Type, Callable]
        The class or callable to inspect.
    base_class_to_find : Optional[Type], optional
        The specific base class to look for when resolving actor classes from proxies.
        Only used for class inspection; when it is None, class inspection returns None.
    param_name : str, optional
        The name of the parameter to inspect for the Pydantic schema.
        The default is "config" for every kind of target, so callers inspecting a
        callable whose parameter is named "stage_config" must pass that name explicitly.

    Returns
    -------
    Optional[Type[BaseModel]]
        The Pydantic BaseModel class if found, otherwise None.
    """
    if callable(target) and not inspect.isclass(target):
        # Handle callable function
        return find_pydantic_config_schema_for_callable(target, param_name)

    # Handle class or proxy object. Every Python object has __class__, so there is
    # no third case to distinguish here.
    if base_class_to_find is None:
        # Without a base class the class-based lookup cannot resolve proxies.
        return None
    return find_pydantic_config_schema(target, base_class_to_find, param_name)
@@ -0,0 +1,65 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ """
6
+ Utilities for introspecting and analyzing UDF function specifications.
7
+ """
8
+
9
+ import re
10
+ from typing import Optional
11
+
12
+
13
def infer_udf_function_name(udf_function: str) -> Optional[str]:
    """
    Attempts to infer the UDF function name from the provided function string.

    Supports three formats:
    1. Inline function: 'def my_func(control_message): ...' -> 'my_func'
    2. Import path: 'my_module.my_function' -> 'my_function'
    3. Explicit name after a colon: '/path/to/file.py:function_name' -> 'function_name'
       (also 'my_module.sub:function_name')

    Inline definitions are recognised first, so a function body containing ':'
    together with '/' or '\\' is not mistaken for a file path. A candidate name is
    only returned when it is a valid Python identifier.

    Parameters
    ----------
    udf_function : str
        The UDF function string.

    Returns
    -------
    Optional[str]
        The inferred UDF function name, or None if inference is not possible
        (for example a bare file path with no function name).

    Examples
    --------
    >>> infer_udf_function_name("def my_custom_func(control_message): pass")
    'my_custom_func'

    >>> infer_udf_function_name("my_module.submodule.process_data")
    'process_data'

    >>> infer_udf_function_name("/path/to/script.py:custom_function")
    'custom_function'

    >>> infer_udf_function_name("/path/to/script.py") is None
    True
    """
    spec = udf_function.strip()

    # Format 1: inline function definition. Checked first because function bodies
    # routinely contain ':', '.', '/' and '\\', which the other formats key on.
    if spec.startswith("def "):
        match = re.match(r"def\s+(\w+)\s*\(", spec)
        return match.group(1) if match else None

    # Format 3: explicit function name after the last ':'.
    # A Windows drive colon ('C:\\dir\\file.py') leaves a non-identifier tail -> None.
    if ":" in spec:
        candidate = spec.rpartition(":")[2].strip()
        return candidate if candidate.isidentifier() else None

    # A path without ':<name>' carries no function name; do not report its extension.
    if "/" in spec or "\\" in spec:
        return None

    # Format 2: dotted import path; the last component is the function name.
    if "." in spec:
        candidate = spec.rpartition(".")[2].strip()
        return candidate if candidate.isidentifier() else None

    return None
File without changes
@@ -0,0 +1,102 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+ import logging.config
8
+ from enum import Enum
9
+
10
+
11
class LogLevel(str, Enum):
    """
    Log level names as a string-valued enumeration.

    Each member's value equals its name. Because the class mixes in ``str``,
    members compare equal to, and can be used as, their plain string value.
    """

    # Not a level name defined by the standard logging module.
    DEFAULT = "DEFAULT"

    # Names matching the standard logging level constants.
    DEBUG = "DEBUG"
    INFO = "INFO"
    WARNING = "WARNING"
    ERROR = "ERROR"
    CRITICAL = "CRITICAL"
18
+
19
+
20
def _detach_and_close_handlers(logger_obj: logging.Logger, keep: logging.Handler = None) -> None:
    """Remove every handler except *keep* from *logger_obj* and close it, ignoring close errors."""
    for handler in list(logger_obj.handlers):
        if handler is keep:
            continue
        logger_obj.removeHandler(handler)
        try:
            handler.close()
        except Exception:
            # Best effort: a handler that fails to close must not abort reconfiguration.
            pass


def configure_logging(level_name: str) -> None:
    """
    Configures global logging.

    All handlers are removed from the root logger and from every logger already
    registered with the logging manager; those loggers are set to propagate with
    level NOTSET. A single stdout StreamHandler is then installed on the root
    logger at the requested level, and the ``warnings`` module is routed through
    logging.

    Parameters
    ----------
    level_name : str
        The name of the logging level (e.g., "DEBUG", "INFO"). Case-insensitive.

    Raises
    ------
    ValueError
        If ``level_name`` does not name an integer level constant of the
        ``logging`` module.
    """
    numeric_level = getattr(logging, level_name.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError(f"Invalid log level: {level_name}")

    # Scorched-earth reset: remove ALL existing handlers from root and named loggers
    # to ensure there is exactly one handler after configuration.
    _detach_and_close_handlers(logging.getLogger())

    # loggerDict also holds PlaceHolder entries; only real Logger objects are touched.
    for logger_obj in list(logging.Logger.manager.loggerDict.values()):
        if isinstance(logger_obj, logging.Logger):
            _detach_and_close_handlers(logger_obj)
            # Ensure messages bubble to root; levels will be controlled centrally
            logger_obj.propagate = True
            logger_obj.setLevel(logging.NOTSET)

    # Use dictConfig to establish a single console handler on the root logger.
    config_dict = {
        "version": 1,
        # We already cleared handlers above; keep loggers enabled so they propagate to root
        "disable_existing_loggers": False,
        "formatters": {
            "standard": {
                "format": "%(asctime)s - %(levelname)s - %(name)s - %(message)s",
            }
        },
        "handlers": {
            "console": {
                "class": "logging.StreamHandler",
                "level": numeric_level,
                "formatter": "standard",
                "stream": "ext://sys.stdout",
            }
        },
        "root": {
            "level": numeric_level,
            "handlers": ["console"],
        },
    }

    logging.config.dictConfig(config_dict)

    # Defensive guard: if more than one handler ended up on root, keep only the
    # first StreamHandler and drop the rest.
    root_logger = logging.getLogger()
    if len(root_logger.handlers) > 1:
        first_stream = next(
            (h for h in root_logger.handlers if isinstance(h, logging.StreamHandler)),
            None,
        )
        _detach_and_close_handlers(root_logger, keep=first_stream)

    # Route warnings module through logging
    logging.captureWarnings(True)
@@ -0,0 +1,84 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import Any, Mapping, MutableMapping, Sequence, Set
8
+
9
+ try:
10
+ # Pydantic is optional at runtime for this helper; import if available
11
+ from pydantic import BaseModel # type: ignore
12
+ except Exception: # pragma: no cover - pydantic always present in this repo
13
+ BaseModel = None # type: ignore
14
+
15
+
16
+ _DEFAULT_SENSITIVE_KEYS: Set[str] = {
17
+ "access_token",
18
+ "api_key",
19
+ "authorization",
20
+ "auth_token",
21
+ "client_secret",
22
+ "hf_access_token",
23
+ "hugging_face_access_token",
24
+ "password",
25
+ "refresh_token",
26
+ "secret",
27
+ "ssl_cert",
28
+ "x-api-key",
29
+ }
30
+
31
+ _REDACTION = "***REDACTED***"
32
+
33
+
34
+ def _is_mapping(obj: Any) -> bool:
35
+ try:
36
+ return isinstance(obj, Mapping)
37
+ except Exception:
38
+ return False
39
+
40
+
41
+ def _is_sequence(obj: Any) -> bool:
42
+ # Exclude strings/bytes from sequences we want to traverse
43
+ return isinstance(obj, Sequence) and not isinstance(obj, (str, bytes, bytearray))
44
+
45
+
46
+ def sanitize_for_logging(
47
+ data: Any,
48
+ sensitive_keys: Set[str] | None = None,
49
+ redaction: str = _REDACTION,
50
+ ) -> Any:
51
+ """
52
+ Recursively sanitize common secret fields from dicts, lists, tuples, and Pydantic models.
53
+
54
+ - Key comparison is case-insensitive and matches exact keys only.
55
+ - Does not mutate input; returns a sanitized deep copy.
56
+ - For Pydantic BaseModel instances, uses model_dump() before redaction.
57
+ """
58
+ keys = {k.lower() for k in (sensitive_keys or _DEFAULT_SENSITIVE_KEYS)}
59
+
60
+ # Handle Pydantic models without importing pydantic at module import time
61
+ if BaseModel is not None and isinstance(data, BaseModel): # type: ignore[arg-type]
62
+ try:
63
+ return sanitize_for_logging(data.model_dump(), keys, redaction)
64
+ except Exception:
65
+ # Fall through and try generic handling below
66
+ pass
67
+
68
+ # Dict-like
69
+ if _is_mapping(data):
70
+ out: MutableMapping[str, Any] = type(data)() # preserve mapping type where possible
71
+ for k, v in data.items(): # type: ignore[assignment]
72
+ key_lower = str(k).lower()
73
+ if key_lower in keys:
74
+ out[k] = redaction
75
+ else:
76
+ out[k] = sanitize_for_logging(v, keys, redaction)
77
+ return out
78
+
79
+ # List/Tuple/Sequence
80
+ if _is_sequence(data):
81
+ return type(data)(sanitize_for_logging(v, keys, redaction) for v in data)
82
+
83
+ # Fallback: return as-is
84
+ return data
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0