nv-ingest 2025.8.13.dev20250813__py3-none-any.whl → 2025.8.15.dev20250815__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest might be problematic. Click here for more details.

Files changed (56)
  1. nv_ingest/framework/orchestration/execution/__init__.py +3 -0
  2. nv_ingest/framework/orchestration/execution/helpers.py +85 -0
  3. nv_ingest/framework/orchestration/execution/options.py +112 -0
  4. nv_ingest/framework/orchestration/process/__init__.py +3 -0
  5. nv_ingest/framework/orchestration/process/dependent_services.py +55 -0
  6. nv_ingest/framework/orchestration/process/execution.py +497 -0
  7. nv_ingest/framework/orchestration/process/lifecycle.py +122 -0
  8. nv_ingest/framework/orchestration/process/strategies.py +182 -0
  9. nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +1 -1
  10. nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +4 -4
  11. nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +23 -23
  12. nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +5 -5
  13. nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +8 -4
  14. nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +16 -16
  15. nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +9 -5
  16. nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +8 -4
  17. nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +10 -6
  18. nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +22 -10
  19. nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +18 -17
  20. nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +10 -5
  21. nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +14 -13
  22. nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +15 -13
  23. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +3 -0
  24. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +3 -3
  25. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +22 -13
  26. nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +10 -7
  27. nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +10 -8
  28. nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +4 -4
  29. nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +5 -2
  30. nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +71 -61
  31. nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +7 -5
  32. nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +8 -4
  33. nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +8 -4
  34. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +17 -7
  35. nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +7 -5
  36. nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +13 -14
  37. nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +18 -12
  38. nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +11 -3
  39. nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +1 -2
  40. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +33 -326
  41. nv_ingest/framework/orchestration/ray/util/pipeline/tools.py +13 -3
  42. nv_ingest/framework/util/flow_control/udf_intercept.py +352 -0
  43. nv_ingest/pipeline/__init__.py +3 -0
  44. nv_ingest/pipeline/config/__init__.py +3 -0
  45. nv_ingest/pipeline/config/loaders.py +198 -0
  46. nv_ingest/pipeline/config/replica_resolver.py +227 -0
  47. nv_ingest/pipeline/default_pipeline_impl.py +517 -0
  48. nv_ingest/pipeline/ingest_pipeline.py +389 -0
  49. nv_ingest/pipeline/pipeline_schema.py +398 -0
  50. {nv_ingest-2025.8.13.dev20250813.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/METADATA +1 -1
  51. {nv_ingest-2025.8.13.dev20250813.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/RECORD +54 -40
  52. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +0 -359
  53. nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +0 -649
  54. {nv_ingest-2025.8.13.dev20250813.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/WHEEL +0 -0
  55. {nv_ingest-2025.8.13.dev20250813.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/licenses/LICENSE +0 -0
  56. {nv_ingest-2025.8.13.dev20250813.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,352 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import functools
6
+ import hashlib
7
+ import inspect
8
+ import logging
9
+ import os
10
+ import time
11
+ from typing import Dict, List, Any, Optional, Callable
12
+ from dataclasses import dataclass
13
+
14
+ from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_all_tasks_by_type
15
+ from nv_ingest_api.util.imports.callable_signatures import ingest_callable_signature
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
@dataclass
class CachedUDF:
    """A compiled UDF callable plus the bookkeeping metadata kept by the UDF cache.

    Instances are created by ``UDFCache.put`` and mutated (``last_used``,
    ``use_count``) on each cache hit.
    """

    # Compiled, signature-validated UDF callable ready for execution.
    # Annotated with typing.Callable (the proper type) rather than the
    # builtin callable() predicate.
    function: Callable
    # Name of the function as declared inside the submitted UDF source string.
    function_name: str
    # True when the signature passed ingest_callable_signature validation.
    signature_validated: bool
    # Wall-clock timestamps (time.time()) used for TTL expiry and usage stats.
    created_at: float
    last_used: float
    # Number of times this entry has been served (starts at 1 on insert).
    use_count: int
30
+
31
+
32
class UDFCache:
    """LRU cache with optional TTL expiry for compiled and validated UDF functions.

    Entries are keyed by a SHA-256 digest of the UDF source plus the target
    function name, so identical submissions reuse the same compiled callable.
    """

    def __init__(self, max_size: int = 128, ttl_seconds: Optional[int] = 3600):
        self.max_size = max_size
        # None (or 0) disables TTL-based expiry entirely.
        self.ttl_seconds = ttl_seconds
        self.cache: Dict[str, CachedUDF] = {}
        self.access_order: List[str] = []  # Oldest key first, for LRU eviction.

    def _generate_cache_key(self, udf_function_str: str, udf_function_name: str) -> str:
        """Generate cache key from UDF string and function name."""
        content = f"{udf_function_str.strip()}:{udf_function_name}"
        return hashlib.sha256(content.encode()).hexdigest()

    def _evict_lru(self):
        """Remove least recently used item."""
        if self.access_order:
            lru_key = self.access_order.pop(0)
            self.cache.pop(lru_key, None)

    def _cleanup_expired(self):
        """Remove expired entries if TTL is configured."""
        if not self.ttl_seconds:
            return

        current_time = time.time()
        expired_keys = [
            key for key, cached_udf in self.cache.items() if current_time - cached_udf.created_at > self.ttl_seconds
        ]

        for key in expired_keys:
            self.cache.pop(key, None)
            if key in self.access_order:
                self.access_order.remove(key)

    def get(self, udf_function_str: str, udf_function_name: str) -> Optional[CachedUDF]:
        """Return the cached UDF entry for this source/name pair, or None on a miss."""
        self._cleanup_expired()

        cache_key = self._generate_cache_key(udf_function_str, udf_function_name)

        if cache_key in self.cache:
            # Move the key to the MRU (tail) position.
            if cache_key in self.access_order:
                self.access_order.remove(cache_key)
            self.access_order.append(cache_key)

            # Update usage stats.
            cached_udf = self.cache[cache_key]
            cached_udf.last_used = time.time()
            cached_udf.use_count += 1

            return cached_udf

        return None

    def put(
        self, udf_function_str: str, udf_function_name: str, function: callable, signature_validated: bool = True
    ) -> str:
        """Cache a compiled and validated UDF function and return its cache key."""
        cache_key = self._generate_cache_key(udf_function_str, udf_function_name)

        # BUGFIX: drop any stale entry for this key first. Re-inserting an
        # existing key used to append a duplicate entry to access_order, which
        # could make a later eviction remove a recently-used key, and could
        # also trigger a needless eviction of an unrelated entry below.
        if cache_key in self.cache:
            self.cache.pop(cache_key, None)
        if cache_key in self.access_order:
            self.access_order.remove(cache_key)

        # Evict LRU entries until there is room for the new entry.
        while len(self.cache) >= self.max_size:
            self._evict_lru()

        current_time = time.time()
        cached_udf = CachedUDF(
            function=function,
            function_name=udf_function_name,
            signature_validated=signature_validated,
            created_at=current_time,
            last_used=current_time,
            use_count=1,
        )

        self.cache[cache_key] = cached_udf
        self.access_order.append(cache_key)

        return cache_key

    def get_stats(self) -> Dict[str, Any]:
        """Get cache statistics."""
        total_uses = sum(udf.use_count for udf in self.cache.values())
        most_used = max(self.cache.values(), key=lambda x: x.use_count, default=None)
        return {
            "size": len(self.cache),
            "max_size": self.max_size,
            "total_uses": total_uses,
            "most_used_function": most_used.function_name if most_used else None,
            "most_used_count": most_used.use_count if most_used else 0,
        }
124
+
125
+
126
# Process-wide cache instance shared by every stage in this worker process;
# see UDFCache for the LRU/TTL semantics.
_udf_cache = UDFCache(max_size=128, ttl_seconds=3600)
128
+
129
+
130
def compile_and_validate_udf(udf_function_str: str, udf_function_name: str, task_num: int) -> callable:
    """Compile a UDF source string and validate the named function's signature.

    Parameters
    ----------
    udf_function_str : str
        Python source text defining the UDF function.
    udf_function_name : str
        Name of the function to extract from the executed namespace.
    task_num : int
        Identifier used in error messages (callers may also pass a task-id string).

    Returns
    -------
    callable
        The compiled, signature-validated UDF function.

    Raises
    ------
    ValueError
        If the source fails to execute, the named function is missing or not
        callable, or its signature fails validation.
    """
    # SECURITY NOTE: exec() runs arbitrary code in-process; UDF strings must
    # come from trusted submitters only.
    namespace: Dict[str, Any] = {}
    try:
        exec(udf_function_str, namespace)
    except Exception as e:
        # Chain the original exception so the root cause stays visible.
        raise ValueError(f"UDF task {task_num} failed to execute: {str(e)}") from e

    # Extract the specified function from the namespace.
    udf_function = namespace.get(udf_function_name)
    if not callable(udf_function):
        raise ValueError(f"UDF task {task_num}: Specified UDF function '{udf_function_name}' not found or not callable")

    # Validate the UDF function signature.
    try:
        ingest_callable_signature(inspect.signature(udf_function))
    except Exception as e:
        raise ValueError(f"UDF task {task_num} has invalid function signature: {str(e)}") from e

    return udf_function
152
+
153
+
154
def execute_targeted_udfs(
    control_message: IngestControlMessage, stage_name: str, directive: str
) -> IngestControlMessage:
    """Execute UDF tasks that target *stage_name* with the given directive.

    UDF tasks whose properties match this stage (``target_stage`` equals
    *stage_name* and the ``run_before``/``run_after`` flag corresponding to
    *directive* is set) are compiled (with caching), executed against the
    control message, and consumed. All other UDF tasks — including ones that
    fail — are re-attached so downstream stages can still see them.

    Parameters
    ----------
    control_message : IngestControlMessage
        Message whose UDF tasks are inspected and possibly executed.
    stage_name : str
        Name of the currently executing stage.
    directive : str
        Either ``"run_before"`` or ``"run_after"``.

    Returns
    -------
    IngestControlMessage
        The (possibly UDF-modified) control message.
    """
    # Early exit if no UDF tasks exist - check by task type, not task ID
    udf_tasks_exist = any(task.type == "udf" for task in control_message.get_tasks())
    if not udf_tasks_exist:
        return control_message

    # Remove all UDF tasks and get them - handle case where no tasks found
    try:
        all_udf_tasks = remove_all_tasks_by_type(control_message, "udf")
    except ValueError:
        # No UDF tasks found - this can happen due to race conditions
        logger.debug(f"No UDF tasks found for stage '{stage_name}' directive '{directive}'")
        return control_message

    # Execute applicable UDFs and collect remaining ones
    remaining_tasks = []

    for task_properties in all_udf_tasks:
        # BUGFIX: bind task_id before the try block so the except-logging below
        # can never hit an unbound local if an exception fires early.
        task_id = task_properties.get("task_id", "unknown")

        # Check if this UDF targets this stage with the specified directive
        target_stage = task_properties.get("target_stage", "")
        run_before = task_properties.get("run_before", False)
        run_after = task_properties.get("run_after", False)

        # Determine if this UDF should execute
        should_execute = False
        if directive == "run_before" and run_before and target_stage == stage_name:
            should_execute = True
        elif directive == "run_after" and run_after and target_stage == stage_name:
            should_execute = True

        if not should_execute:
            # Keep non-applicable task for next stage
            remaining_tasks.append(task_properties)
            continue

        try:
            # Get UDF function details
            udf_function_str = task_properties.get("udf_function", "").strip()
            udf_function_name = task_properties.get("udf_function_name", "").strip()

            # Skip empty UDF functions
            if not udf_function_str:
                logger.debug(f"UDF task {task_id} has empty function, skipping")
                remaining_tasks.append(task_properties)
                continue

            # Validate function name
            if not udf_function_name:
                raise ValueError(f"UDF task {task_id} missing required 'udf_function_name' property")

            # Get or compile UDF function
            cached_udf = _udf_cache.get(udf_function_str, udf_function_name)
            if cached_udf:
                udf_function = cached_udf.function
                logger.debug(f"UDF task {task_id}: Using cached function '{udf_function_name}'")
            else:
                udf_function = compile_and_validate_udf(udf_function_str, udf_function_name, task_id)
                _udf_cache.put(udf_function_str, udf_function_name, udf_function)
                logger.debug(f"UDF task {task_id}: Cached function '{udf_function_name}'")

            # Execute the UDF into a temporary so a bad return value cannot
            # clobber control_message. BUGFIX: previously control_message was
            # reassigned before the isinstance check, so an invalid return
            # poisoned subsequent iterations and the task re-add below.
            result = udf_function(control_message)

            # Validate return type
            if not isinstance(result, IngestControlMessage):
                raise ValueError(f"UDF task {task_id} must return IngestControlMessage")

            control_message = result

            logger.info(f"Executed UDF {task_id} '{udf_function_name}' {directive} stage '{stage_name}'")

        except Exception as e:
            logger.error(f"UDF {task_id} failed {directive} stage '{stage_name}': {e}")
            # Keep failed task for next stage
            remaining_tasks.append(task_properties)

    # Re-add all remaining UDF tasks. Import hoisted out of the loop; kept
    # function-local to mirror the original lazy-import behavior.
    from nv_ingest_api.internal.primitives.control_message_task import ControlMessageTask

    for task_properties in remaining_tasks:
        task = ControlMessageTask(type="udf", id=task_properties.get("task_id", "unknown"), properties=task_properties)
        control_message.add_task(task)

    return control_message
239
+
240
+
241
def remove_task_by_id(control_message: IngestControlMessage, task_id: str) -> IngestControlMessage:
    """Best-effort removal of the task identified by *task_id*.

    A missing task is not treated as fatal: the failure is logged at warning
    level and the message is returned unchanged either way.
    """
    try:
        control_message.remove_task(task_id)
    except RuntimeError as e:
        logger.warning(f"Could not remove task {task_id}: {e}")
    return control_message
249
+
250
+
251
def udf_intercept_hook(stage_name: Optional[str] = None, enable_run_before: bool = True, enable_run_after: bool = True):
    """
    Decorator that executes UDFs targeted at this stage.

    This decorator integrates with the existing UDF system, providing full
    UDF compilation, caching, and execution capabilities. UDFs can target
    specific stages using run_before or run_after directives.

    Args:
        stage_name: Name of the stage (e.g., "image_dedup", "text_extract").
            If None, will attempt to use self.stage_name from the decorated method's instance.
        enable_run_before: Whether to execute UDFs with run_before=True (default: True)
        enable_run_after: Whether to execute UDFs with run_after=True (default: True)

    Examples:
        # Automatic stage name detection (recommended)
        @traceable("image_deduplication")
        @udf_intercept_hook()  # Uses self.stage_name automatically
        @filter_by_task(required_tasks=["dedup"])
        def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
            return control_message

        # Explicit stage name (fallback/override)
        @traceable("data_sink")
        @udf_intercept_hook("data_sink", enable_run_after=False)
        @filter_by_task(required_tasks=["store"])
        def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
            return control_message

        # Only run_after UDFs (e.g., for source stages)
        @traceable("data_source")
        @udf_intercept_hook(enable_run_before=False)  # Uses self.stage_name automatically
        def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
            return control_message
    """

    def decorator(func: Callable) -> Callable:
        @functools.wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            # Check if UDF processing is globally disabled.
            # NOTE(review): any non-empty value (including "0" or "false")
            # disables UDF processing here — confirm this is intended.
            if os.getenv("INGEST_DISABLE_UDF_PROCESSING"):
                logger.debug("UDF processing is disabled via INGEST_DISABLE_UDF_PROCESSING environment variable")
                return func(*args, **kwargs)

            # Determine the stage name to use: explicit argument wins.
            resolved_stage_name = stage_name

            # If no explicit stage_name provided, try to get it from self.stage_name
            # (args[0] is 'self' when the decorated callable is a bound method).
            if resolved_stage_name is None and len(args) >= 1:
                stage_instance = args[0]  # 'self' in method calls
                if hasattr(stage_instance, "stage_name") and stage_instance.stage_name:
                    resolved_stage_name = stage_instance.stage_name
                    logger.debug(f"Using auto-detected stage name: '{resolved_stage_name}'")
                else:
                    # Cannot resolve a stage name: run the wrapped function
                    # untouched rather than guessing a target.
                    logger.warning(
                        "No stage_name provided and could not auto-detect from instance. Skipping UDF intercept."
                    )
                    return func(*args, **kwargs)
            elif resolved_stage_name is None:
                logger.warning(
                    "No stage_name provided and no instance available for auto-detection. Skipping UDF intercept."
                )
                return func(*args, **kwargs)

            # Extract control_message from args (handle both self.method and
            # function cases). Duck-types on get_tasks() so this module does
            # not need to import IngestControlMessage here.
            control_message = None
            if len(args) >= 2 and hasattr(args[1], "get_tasks"):
                control_message = args[1]  # self.method case
                args_list = list(args)
            elif len(args) >= 1 and hasattr(args[0], "get_tasks"):
                control_message = args[0]  # function case
                args_list = list(args)

            if control_message:
                # Execute UDFs that should run before this stage (if enabled)
                if enable_run_before:
                    control_message = execute_targeted_udfs(control_message, resolved_stage_name, "run_before")
                    # Update args with modified control_message, at the same
                    # position it was extracted from above.
                    if len(args) >= 2 and hasattr(args[1], "get_tasks"):
                        args_list[1] = control_message
                    else:
                        args_list[0] = control_message

                # Execute the original stage logic
                result = func(*tuple(args_list), **kwargs)

                # Execute UDFs that should run after this stage (if enabled).
                # The duck-type check guards against stages returning something
                # other than a control message.
                if enable_run_after and hasattr(result, "get_tasks"):  # Result is control_message
                    result = execute_targeted_udfs(result, resolved_stage_name, "run_after")

                return result
            else:
                # No control message found in the arguments: nothing to
                # intercept, just delegate.
                return func(*args, **kwargs)

        return wrapper

    return decorator
348
+
349
+
350
def get_udf_cache_stats() -> Dict[str, Any]:
    """Return performance statistics for the process-wide UDF cache."""
    stats = _udf_cache.get_stats()
    return stats
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,198 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ """
6
+ Configuration loading and management functions for pipeline execution.
7
+
8
+ This module provides declarative functions for loading, validating, and applying
9
+ runtime overrides to pipeline configurations, replacing imperative inline logic.
10
+ """
11
+
12
+ import logging
13
+ import yaml
14
+ from typing import Optional
15
+
16
+ from nv_ingest.pipeline.pipeline_schema import PipelineConfigSchema
17
+ from nv_ingest.pipeline.default_pipeline_impl import DEFAULT_LIBMODE_PIPELINE_YAML
18
+ from nv_ingest.framework.orchestration.execution.options import PipelineRuntimeOverrides
19
+ from nv_ingest_api.util.string_processing.yaml import substitute_env_vars_in_yaml_content
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
def load_pipeline_config(config_path: str) -> PipelineConfigSchema:
    """
    Load a pipeline configuration file, substituting environment variables.

    Parameters
    ----------
    config_path : str
        The path to the YAML configuration file.

    Returns
    -------
    PipelineConfigSchema
        A validated PipelineConfigSchema object.

    Raises
    ------
    OSError
        If the configuration file cannot be opened or read.
    ValueError
        If the YAML file cannot be parsed after environment variable substitution.
    """
    logger.info(f"Loading pipeline configuration from: {config_path}")

    # Read the raw YAML file content. The explicit encoding keeps behavior
    # consistent across platforms instead of depending on the locale default.
    with open(config_path, "r", encoding="utf-8") as f:
        raw_content = f.read()

    # Substitute all environment variable placeholders using the utility function
    substituted_content = substitute_env_vars_in_yaml_content(raw_content)

    # Parse the substituted content with PyYAML, with error handling
    try:
        processed_config = yaml.safe_load(substituted_content)
    except yaml.YAMLError as e:
        # Include the substituted content so a bad substitution is debuggable.
        error_message = (
            f"Failed to parse YAML after environment variable substitution. "
            f"Error: {e}\n\n"
            f"--- Substituted Content ---\n{substituted_content}\n---------------------------"
        )
        raise ValueError(error_message) from e

    # Pydantic validates the clean, substituted data against the schema
    return PipelineConfigSchema(**processed_config)
65
+
66
+
67
def load_default_libmode_config() -> PipelineConfigSchema:
    """
    Load and validate the default libmode pipeline configuration.

    The embedded default libmode pipeline YAML is run through environment
    variable substitution, parsed, and validated against the schema.

    Returns
    -------
    PipelineConfigSchema
        Validated default libmode pipeline configuration.

    Raises
    ------
    ValueError
        If the default YAML cannot be parsed or validated.
    """
    logger.info("Loading default libmode pipeline configuration")

    # Resolve environment variable placeholders in the embedded YAML text.
    yaml_text = substitute_env_vars_in_yaml_content(DEFAULT_LIBMODE_PIPELINE_YAML)

    try:
        raw_config = yaml.safe_load(yaml_text)
    except yaml.YAMLError as e:
        raise ValueError(
            f"Failed to parse default libmode pipeline YAML after environment variable substitution. Error: {e}"
        ) from e

    # Pydantic performs schema validation on construction.
    return PipelineConfigSchema(**raw_config)
101
+
102
+
103
def apply_runtime_overrides(config: PipelineConfigSchema, overrides: PipelineRuntimeOverrides) -> PipelineConfigSchema:
    """
    Apply runtime parameter overrides to a pipeline configuration.

    A deep copy of *config* is made and each non-None field of *overrides*
    is written onto the copy's pipeline runtime settings; the original
    configuration object is never mutated.

    Parameters
    ----------
    config : PipelineConfigSchema
        Base pipeline configuration to modify.
    overrides : PipelineRuntimeOverrides
        Runtime overrides to apply. Only non-None values are applied.

    Returns
    -------
    PipelineConfigSchema
        Modified configuration with overrides applied.
    """
    # Work on a deep copy so the caller's object stays untouched.
    updated = config.model_copy(deep=True)

    if overrides.disable_dynamic_scaling is not None:
        updated.pipeline.disable_dynamic_scaling = overrides.disable_dynamic_scaling
        logger.debug(f"Applied dynamic scaling override: {overrides.disable_dynamic_scaling}")

    if overrides.dynamic_memory_threshold is not None:
        updated.pipeline.dynamic_memory_threshold = overrides.dynamic_memory_threshold
        logger.debug(f"Applied memory threshold override: {overrides.dynamic_memory_threshold}")

    return updated
135
+
136
+
137
def validate_pipeline_config(config: Optional[PipelineConfigSchema]) -> PipelineConfigSchema:
    """
    Validate and ensure a pipeline configuration is available.

    Returns the provided configuration when one is given (it was already
    validated by Pydantic at construction time); otherwise falls back to
    loading the default libmode configuration.

    Parameters
    ----------
    config : Optional[PipelineConfigSchema]
        Pipeline configuration to validate, or None to load default.

    Returns
    -------
    PipelineConfigSchema
        Validated pipeline configuration.

    Raises
    ------
    ValueError
        If config is None and default config cannot be loaded.
    """
    if config is not None:
        # Already schema-validated; pass straight through.
        return config
    return load_default_libmode_config()
164
+
165
+
166
def resolve_pipeline_config(provided_config: Optional[PipelineConfigSchema], libmode: bool) -> PipelineConfigSchema:
    """
    Resolve the final pipeline configuration from inputs.

    Resolution order:
    - an explicitly provided config always wins;
    - otherwise, libmode=True falls back to the default libmode config;
    - otherwise (libmode=False, no config) an error is raised.

    Parameters
    ----------
    provided_config : Optional[PipelineConfigSchema]
        User-provided pipeline configuration, or None.
    libmode : bool
        Whether to allow loading default libmode configuration.

    Returns
    -------
    PipelineConfigSchema
        Resolved and validated pipeline configuration.

    Raises
    ------
    ValueError
        If no config provided and libmode=False.
    """
    if provided_config is not None:
        return provided_config
    if not libmode:
        raise ValueError("pipeline_config must be provided when libmode is False")
    return load_default_libmode_config()