nv-ingest 2025.8.14.dev20250814__py3-none-any.whl → 2025.8.16.dev20250816__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (56)
  1. nv_ingest/framework/orchestration/execution/__init__.py +3 -0
  2. nv_ingest/framework/orchestration/execution/helpers.py +85 -0
  3. nv_ingest/framework/orchestration/execution/options.py +112 -0
  4. nv_ingest/framework/orchestration/process/__init__.py +3 -0
  5. nv_ingest/framework/orchestration/process/dependent_services.py +55 -0
  6. nv_ingest/framework/orchestration/process/execution.py +497 -0
  7. nv_ingest/framework/orchestration/process/lifecycle.py +122 -0
  8. nv_ingest/framework/orchestration/process/strategies.py +182 -0
  9. nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +1 -1
  10. nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +4 -4
  11. nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +23 -23
  12. nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +5 -5
  13. nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +8 -4
  14. nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +16 -16
  15. nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +9 -5
  16. nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +8 -4
  17. nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +10 -6
  18. nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +22 -10
  19. nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +18 -17
  20. nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +10 -5
  21. nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +14 -13
  22. nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +15 -13
  23. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +3 -0
  24. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +3 -3
  25. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +22 -13
  26. nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +10 -7
  27. nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +10 -8
  28. nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +4 -4
  29. nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +5 -2
  30. nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +71 -61
  31. nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +7 -5
  32. nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +8 -4
  33. nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +8 -4
  34. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +17 -7
  35. nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +7 -5
  36. nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +13 -14
  37. nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +18 -12
  38. nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +11 -3
  39. nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +1 -2
  40. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +33 -326
  41. nv_ingest/framework/orchestration/ray/util/pipeline/tools.py +13 -3
  42. nv_ingest/framework/util/flow_control/udf_intercept.py +352 -0
  43. nv_ingest/pipeline/__init__.py +3 -0
  44. nv_ingest/pipeline/config/__init__.py +3 -0
  45. nv_ingest/pipeline/config/loaders.py +198 -0
  46. nv_ingest/pipeline/config/replica_resolver.py +227 -0
  47. nv_ingest/pipeline/default_pipeline_impl.py +517 -0
  48. nv_ingest/pipeline/ingest_pipeline.py +389 -0
  49. nv_ingest/pipeline/pipeline_schema.py +398 -0
  50. {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.16.dev20250816.dist-info}/METADATA +1 -1
  51. {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.16.dev20250816.dist-info}/RECORD +54 -40
  52. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +0 -359
  53. nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +0 -649
  54. {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.16.dev20250816.dist-info}/WHEEL +0 -0
  55. {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.16.dev20250816.dist-info}/licenses/LICENSE +0 -0
  56. {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.16.dev20250816.dist-info}/top_level.txt +0 -0
nv_ingest/pipeline/ingest_pipeline.py
@@ -0,0 +1,389 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ import logging
+ import math
+ from typing import Dict, Optional, Type, List, Set
+ import os
+
+ from nv_ingest.framework.orchestration.ray.primitives.ray_pipeline import RayPipeline, ScalingConfig
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_sink_stage_base import RayActorSinkStage
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_source_stage_base import RayActorSourceStage
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
+ from nv_ingest.framework.orchestration.ray.util.pipeline.tools import wrap_callable_as_stage
+ from nv_ingest.pipeline.pipeline_schema import (
+     PipelineConfigSchema,
+     StageConfig,
+     StageType,
+     ReplicaStrategyConfig,
+ )
+ from nv_ingest_api.util.imports.callable_signatures import ingest_stage_callable_signature
+ from nv_ingest_api.util.imports.dynamic_resolvers import resolve_actor_class_from_path, resolve_callable_from_path
+ from nv_ingest_api.util.introspection.class_inspect import (
+     find_pydantic_config_schema,
+     find_pydantic_config_schema_unified,
+ )
+ from nv_ingest_api.util.system.hardware_info import SystemResourceProbe
+
+ logger = logging.getLogger(__name__)
+
+
+ class IngestPipelineBuilder:
+     """
+     A high-level builder for creating and configuring an ingestion pipeline.
+
+     This class translates a `PipelineConfigSchema` object into a runnable `RayPipeline`,
+     handling class resolution, configuration validation, replica calculation,
+     and stage/edge construction.
+
+     Attributes
+     ----------
+     _config : PipelineConfigSchema
+         The declarative configuration for the pipeline.
+     _pipeline : RayPipeline
+         The underlying RayPipeline instance being constructed.
+     _system_resource_probe : SystemResourceProbe
+         A utility to probe for available system resources like CPU cores.
+
+     """
+
+     def __init__(self, config: PipelineConfigSchema, system_resource_probe: Optional[SystemResourceProbe] = None):
+         """
+         Initializes the IngestPipelineBuilder.
+
+         Parameters
+         ----------
+         config : PipelineConfigSchema
+             The pipeline configuration object.
+         system_resource_probe : Optional[SystemResourceProbe], optional
+             A probe for system resources. If not provided, a default instance
+             will be created. Defaults to None.
+         """
+         logger.debug(f"Initializing IngestPipeline for '{config.name}'.")
+         self._config: PipelineConfigSchema = config
+         scaling_config = ScalingConfig(
+             dynamic_memory_scaling=not config.pipeline.disable_dynamic_scaling,
+             dynamic_memory_threshold=config.pipeline.dynamic_memory_threshold,
+             pid_kp=config.pipeline.pid_controller.kp,
+             pid_ki=config.pipeline.pid_controller.ki,
+             pid_ema_alpha=config.pipeline.pid_controller.ema_alpha,
+             pid_target_queue_depth=config.pipeline.pid_controller.target_queue_depth,
+             pid_penalty_factor=config.pipeline.pid_controller.penalty_factor,
+             pid_error_boost_factor=config.pipeline.pid_controller.error_boost_factor,
+             rcm_memory_safety_buffer_fraction=config.pipeline.pid_controller.rcm_memory_safety_buffer_fraction,
+         )
+         self._pipeline: RayPipeline = RayPipeline(scaling_config=scaling_config)
+         self._system_resource_probe: SystemResourceProbe = system_resource_probe or SystemResourceProbe()
+         self._is_built: bool = False
+         self._built_stages: Set[str] = set()
+
+     def build(self) -> None:
+         """
+         Builds the ingestion pipeline from the configuration.
+
+         This method constructs the RayPipeline by adding stages and edges as
+         defined in the pipeline configuration. It also validates dependencies
+         and ensures the pipeline is ready to be started.
+
+         Raises
+         ------
+         ValueError
+             If the pipeline configuration is invalid, such as containing
+             circular dependencies or references to non-existent stages.
+         """
+         if self._is_built:
+             logger.warning("Pipeline is already built. Skipping build.")
+             return
+
+         logger.info(f"Building pipeline '{self._config.name}'...")
+
+         # First, validate the overall structure and dependencies
+         self._validate_dependencies()
+
+         # Then, build the stages
+         total_cpus = os.cpu_count() or 1
+         for stage_config in self._config.stages:
+             if not stage_config.enabled:
+                 logger.info(f"Stage '{stage_config.name}' is disabled. Skipping.")
+                 continue
+             self._build_stage(stage_config, total_cpus)
+
+         # Finally, add the edges
+         for edge_config in self._config.edges:
+             if not (edge_config.from_stage in self._built_stages and edge_config.to_stage in self._built_stages):
+                 logger.warning(
+                     f"Skipping edge from '{edge_config.from_stage}' to '{edge_config.to_stage}' "
+                     f"because one or both stages are disabled or failed to build."
+                 )
+                 continue
+
+             self._pipeline.make_edge(
+                 from_stage=edge_config.from_stage,
+                 to_stage=edge_config.to_stage,
+                 queue_size=edge_config.queue_size,
+             )
+
+         self._pipeline.build()
+         self._is_built = True
+         logger.info(f"Pipeline '{self._config.name}' built successfully.")
+
+     def _build_stage(self, stage_config: StageConfig, total_cpus: int) -> None:
+         """Builds and adds a single stage to the pipeline."""
+         logger.debug(f"Building stage '{stage_config.name}'...")
+         stage_type_enum = StageType(stage_config.type)
+         expected_base_class: Optional[Type] = {
+             StageType.SOURCE: RayActorSourceStage,
+             StageType.SINK: RayActorSinkStage,
+             StageType.STAGE: RayActorStage,
+         }.get(stage_type_enum)
+
+         if not expected_base_class:
+             raise ValueError(f"Invalid stage type '{stage_config.type}' for stage '{stage_config.name}'")
+
+         # Handle callable vs actor stage configurations
+         if stage_config.callable:
+             # Handle callable stage
+             callable_fn = resolve_callable_from_path(
+                 stage_config.callable, signature_schema=ingest_stage_callable_signature
+             )
+             config_schema = find_pydantic_config_schema_unified(callable_fn, param_name="stage_config")
+
+             # For callable stages, we need a schema to wrap the callable
+             if not config_schema:
+                 raise ValueError(
+                     f"Callable stage '{stage_config.name}' must have a Pydantic schema in its stage_config parameter"
+                 )
+
+             # Wrap callable as a stage using wrap_callable_as_stage
+             actor_class = wrap_callable_as_stage(callable_fn, config_schema, required_tasks=stage_config.task_filters)
+
+             # For callable stages, the config instance is handled by wrap_callable_as_stage
+             config_instance = config_schema(**stage_config.config) if config_schema else None
+         else:
+             # Handle actor stage (existing logic)
+             actor_class = resolve_actor_class_from_path(stage_config.actor, expected_base_class)
+             config_schema = find_pydantic_config_schema(actor_class, expected_base_class)
+             config_instance = config_schema(**stage_config.config) if config_schema else None
+
+         add_method = getattr(self._pipeline, f"add_{stage_config.type.value}", None)
+         if not add_method:
+             raise AttributeError(f"Pipeline has no method 'add_{stage_config.type.value}'")
+
+         replicas = stage_config.replicas
+         min_replicas, max_replicas = 1, 1
+
+         # Check if dynamic scaling is disabled by checking pipeline config
+         dynamic_scaling_disabled = getattr(self._config.pipeline, "disable_dynamic_scaling", False)
+
+         if replicas and total_cpus:
+             # Handle new replica configuration format
+             if hasattr(replicas, "min_replicas") and replicas.min_replicas is not None:
+                 min_replicas = replicas.min_replicas
+             elif replicas.cpu_count_min is not None:  # Legacy support
+                 min_replicas = replicas.cpu_count_min
+             elif replicas.cpu_percent_min is not None:  # Legacy support
+                 min_replicas = math.floor(replicas.cpu_percent_min * total_cpus)
+
+             # For max_replicas, prioritize based on scaling mode
+             if dynamic_scaling_disabled:
+                 # Static scaling mode - use static_replicas if available
+                 if hasattr(replicas, "static_replicas") and replicas.static_replicas is not None:
+                     if isinstance(replicas.static_replicas, int):
+                         # Use resolved static replica count
+                         max_replicas = replicas.static_replicas
+                         min_replicas = replicas.static_replicas  # In static mode, min == max
+                         logger.debug(f"Stage '{stage_config.name}': Using resolved static replicas = {max_replicas}")
+                     else:
+                         # Should not happen after resolve_static_replicas, but fallback to legacy
+                         logger.warning(
+                             f"Stage '{stage_config.name}': "
+                             "static_replicas not resolved to int, using legacy calculation"
+                         )
+                         max_replicas = self._calculate_legacy_max_replicas(replicas, total_cpus)
+                 else:
+                     # No static_replicas defined, use legacy calculation
+                     max_replicas = self._calculate_legacy_max_replicas(replicas, total_cpus)
+             else:
+                 # Dynamic scaling mode - use max_replicas
+                 if hasattr(replicas, "max_replicas") and replicas.max_replicas is not None:
+                     if isinstance(replicas.max_replicas, int):
+                         max_replicas = replicas.max_replicas
+                     else:
+                         # ReplicaStrategyConfig - calculate based on strategy and system resources
+                         max_replicas = self._calculate_strategy_based_replicas(
+                             stage_config.name, replicas.max_replicas, total_cpus
+                         )
+                         logger.debug(
+                             f"Stage '{stage_config.name}': max_replicas calculated from strategy = {max_replicas}"
+                         )
+                 else:
+                     # Legacy calculation
+                     max_replicas = self._calculate_legacy_max_replicas(replicas, total_cpus)
+
+         # Ensure max_replicas is not less than min_replicas
+         max_replicas = max(min_replicas, max_replicas)
+
+         actor_kwarg = f"{stage_config.type.value}_actor"
+         add_method(
+             name=stage_config.name,
+             **{actor_kwarg: actor_class},
+             config=config_instance,
+             min_replicas=min_replicas,
+             max_replicas=max_replicas,
+         )
+         logger.debug(f"Added stage '{stage_config.name}' ({min_replicas}-{max_replicas} replicas) to the pipeline.")
+         self._built_stages.add(stage_config.name)
+
+     def _calculate_legacy_max_replicas(self, replicas, total_cpus):
+         if replicas.cpu_count_max is not None:
+             return replicas.cpu_count_max
+         elif replicas.cpu_percent_max is not None:
+             return math.ceil(replicas.cpu_percent_max * total_cpus)
+         else:
+             return 1
+
+     def _calculate_strategy_based_replicas(
+         self, stage_name: str, strategy_config: ReplicaStrategyConfig, total_cpus: int
+     ) -> int:
+         """
+         Calculate replica count based on ReplicaStrategyConfig for dynamic scaling.
+
+         Parameters
+         ----------
+         stage_name : str
+             Name of the stage for logging purposes.
+         strategy_config : ReplicaStrategyConfig
+             The replica strategy configuration.
+         total_cpus : int
+             Total available CPU cores.
+
+         Returns
+         -------
+         int
+             Calculated replica count.
+         """
+         from nv_ingest.pipeline.pipeline_schema import ReplicaCalculationStrategy
+
+         strategy = strategy_config.strategy
+
+         if strategy == ReplicaCalculationStrategy.STATIC:
+             return strategy_config.value or 1
+
+         elif strategy == ReplicaCalculationStrategy.CPU_PERCENTAGE:
+             cpu_percent = strategy_config.cpu_percent or 0.5
+             limit = strategy_config.limit or total_cpus
+             calculated = max(1, int(total_cpus * cpu_percent))
+             result = min(calculated, limit)
+             logger.debug(
+                 f"Stage '{stage_name}': CPU_PERCENTAGE strategy: {cpu_percent:.1%} of {total_cpus} "
+                 f"CPUs = {calculated}, limited to {result}"
+             )
+             return result
+
+         elif strategy == ReplicaCalculationStrategy.MEMORY_THRESHOLDING:
+             # For dynamic scaling, use a more aggressive memory allocation (80% vs 70% for static)
+             memory_per_replica_mb = strategy_config.memory_per_replica_mb or 1000
+             available_memory_mb = int(self._system_resource_probe.total_memory_mb * 0.8)
+             calculated = max(1, available_memory_mb // memory_per_replica_mb)
+             limit = strategy_config.limit or calculated
+             result = min(calculated, limit)
+             logger.debug(
+                 f"Stage '{stage_name}': MEMORY_THRESHOLDING strategy: {available_memory_mb}"
+                 f"MB / {memory_per_replica_mb}MB = {calculated}, limited to {result}"
+             )
+             return result
+
+         elif strategy == ReplicaCalculationStrategy.MEMORY_STATIC_GLOBAL_PERCENT:
+             # For dynamic scaling, this strategy behaves like memory_thresholding but with global threshold
+             memory_per_replica_mb = strategy_config.memory_per_replica_mb or 1000
+             # Use dynamic memory threshold from pipeline config, fallback to 0.8
+             dynamic_threshold = getattr(self._config.pipeline, "dynamic_memory_threshold", 0.8)
+             available_memory_mb = int(self._system_resource_probe.total_memory_mb * dynamic_threshold)
+             calculated = max(1, available_memory_mb // memory_per_replica_mb)
+             limit = strategy_config.limit or calculated
+             result = min(calculated, limit)
+             logger.debug(
+                 f"Stage '{stage_name}': MEMORY_STATIC_GLOBAL_PERCENT strategy (dynamic): "
+                 f"{available_memory_mb}MB / {memory_per_replica_mb}MB = {calculated}, limited to {result}"
+             )
+             return result
+
+         else:
+             logger.warning(f"Unknown replica strategy '{strategy}' for stage '{stage_name}', defaulting to 1 replica")
+             return 1
+
+     def _validate_dependencies(self) -> None:
+         """
+         Validates stage dependencies, checking for undefined stages and circular dependencies.
+
+         Raises
+         ------
+         ValueError
+             If a stage has an invalid dependency (points to a non-existent stage)
+             or if a circular dependency is detected among the stages.
+         """
+         all_stage_names = {s.name for s in self._config.stages}
+         dependency_graph = {s.name: s.runs_after for s in self._config.stages}
+
+         # First, check for dependencies on non-existent stages
+         for stage_name, deps in dependency_graph.items():
+             for dep_name in deps:
+                 if dep_name not in all_stage_names:
+                     raise ValueError(
+                         f"Stage '{stage_name}' has an invalid dependency: '{dep_name}' is not a defined stage."
+                     )
+
+         # Second, check for circular dependencies using DFS
+         visiting = set()  # For nodes currently in the recursion stack for DFS
+         visited = set()  # For nodes that have been completely visited
+
+         for stage_name in all_stage_names:
+             if stage_name not in visited:
+                 self._detect_cycle_util(stage_name, dependency_graph, visiting, visited)
+
+     def _detect_cycle_util(self, stage_name: str, graph: Dict[str, List[str]], visiting: set, visited: set) -> None:
+         """Utility function to detect cycles using DFS."""
+         visiting.add(stage_name)
+
+         for dependency in graph.get(stage_name, []):
+             if dependency in visiting:
+                 raise ValueError(f"Circular dependency detected involving stage '{stage_name}' and '{dependency}'.")
+             if dependency not in visited:
+                 self._detect_cycle_util(dependency, graph, visiting, visited)
+
+         visiting.remove(stage_name)
+         visited.add(stage_name)
+
+     def start(self) -> None:
+         """
+         Starts the underlying RayPipeline, making it ready to process data.
+
+         Raises
+         ------
+         RuntimeError
+             If the pipeline has not been built by calling `build()` first.
+         """
+         if not self._is_built:
+             raise RuntimeError("Pipeline has not been built yet. Call build() before start().")
+         logger.info("Starting the ingestion pipeline...")
+         self._pipeline.start()
+
+     def stop(self) -> None:
+         """
+         Stops the underlying RayPipeline gracefully.
+         """
+         if self._pipeline:
+             logger.info("Stopping the ingestion pipeline...")
+             self._pipeline.stop()
+
+     def get_pipeline(self) -> RayPipeline:
+         """
+         Returns the underlying RayPipeline instance.
+
+         Returns
+         -------
+         RayPipeline
+             The raw RayPipeline object.
+         """
+         return self._pipeline
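
For reference, the lifecycle implied by the new IngestPipelineBuilder above is: construct it with a PipelineConfigSchema, then call build(), start(), and eventually stop(). The sketch below illustrates that flow under stated assumptions: it presumes a PipelineConfigSchema instance is already available (for example from the new loaders added in nv_ingest/pipeline/config/loaders.py, whose exact API is not shown in this diff), and the run_pipeline wrapper is hypothetical, not part of the package.

    # Minimal usage sketch (assumptions noted above); not taken verbatim from the package.
    from nv_ingest.pipeline.ingest_pipeline import IngestPipelineBuilder
    from nv_ingest.pipeline.pipeline_schema import PipelineConfigSchema


    def run_pipeline(config: PipelineConfigSchema) -> None:
        # build() validates stage dependencies, resolves actor classes or callables,
        # computes min/max replica bounds, and wires the configured edges.
        builder = IngestPipelineBuilder(config)
        builder.build()

        builder.start()
        try:
            # Work is submitted through the configured source stage
            # (e.g., the message broker task source listed in this release).
            ...
        finally:
            # stop() shuts the underlying RayPipeline down gracefully.
            builder.stop()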