nv-ingest 2025.8.14.dev20250814__py3-none-any.whl → 2025.8.16.dev20250816__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest might be problematic. Click here for more details.

Files changed (56)
  1. nv_ingest/framework/orchestration/execution/__init__.py +3 -0
  2. nv_ingest/framework/orchestration/execution/helpers.py +85 -0
  3. nv_ingest/framework/orchestration/execution/options.py +112 -0
  4. nv_ingest/framework/orchestration/process/__init__.py +3 -0
  5. nv_ingest/framework/orchestration/process/dependent_services.py +55 -0
  6. nv_ingest/framework/orchestration/process/execution.py +497 -0
  7. nv_ingest/framework/orchestration/process/lifecycle.py +122 -0
  8. nv_ingest/framework/orchestration/process/strategies.py +182 -0
  9. nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +1 -1
  10. nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +4 -4
  11. nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +23 -23
  12. nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +5 -5
  13. nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +8 -4
  14. nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +16 -16
  15. nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +9 -5
  16. nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +8 -4
  17. nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +10 -6
  18. nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +22 -10
  19. nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +18 -17
  20. nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +10 -5
  21. nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +14 -13
  22. nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +15 -13
  23. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +3 -0
  24. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +3 -3
  25. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +22 -13
  26. nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +10 -7
  27. nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +10 -8
  28. nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +4 -4
  29. nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +5 -2
  30. nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +71 -61
  31. nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +7 -5
  32. nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +8 -4
  33. nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +8 -4
  34. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +17 -7
  35. nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +7 -5
  36. nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +13 -14
  37. nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +18 -12
  38. nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +11 -3
  39. nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +1 -2
  40. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +33 -326
  41. nv_ingest/framework/orchestration/ray/util/pipeline/tools.py +13 -3
  42. nv_ingest/framework/util/flow_control/udf_intercept.py +352 -0
  43. nv_ingest/pipeline/__init__.py +3 -0
  44. nv_ingest/pipeline/config/__init__.py +3 -0
  45. nv_ingest/pipeline/config/loaders.py +198 -0
  46. nv_ingest/pipeline/config/replica_resolver.py +227 -0
  47. nv_ingest/pipeline/default_pipeline_impl.py +517 -0
  48. nv_ingest/pipeline/ingest_pipeline.py +389 -0
  49. nv_ingest/pipeline/pipeline_schema.py +398 -0
  50. {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.16.dev20250816.dist-info}/METADATA +1 -1
  51. {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.16.dev20250816.dist-info}/RECORD +54 -40
  52. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +0 -359
  53. nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +0 -649
  54. {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.16.dev20250816.dist-info}/WHEEL +0 -0
  55. {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.16.dev20250816.dist-info}/licenses/LICENSE +0 -0
  56. {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.16.dev20250816.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,227 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ """
6
+ Runtime replica resolution for static scaling mode.
7
+
8
+ This module provides functionality to resolve replica counts for stages using
9
+ non-static strategies when dynamic scaling is disabled, ensuring total memory
10
+ consumption stays within the static_memory_threshold.
11
+ """
12
+
13
+ import logging
14
+ from typing import List
15
+ from copy import deepcopy
16
+
17
+ from nv_ingest.pipeline.pipeline_schema import (
18
+ PipelineConfigSchema,
19
+ StageConfig,
20
+ ReplicaCalculationStrategy,
21
+ ReplicaStrategyConfig,
22
+ )
23
+ from nv_ingest_api.util.system.hardware_info import SystemResourceProbe
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
def resolve_static_replicas(pipeline_config: PipelineConfigSchema) -> PipelineConfigSchema:
    """
    Resolve static replica counts for all stages when dynamic scaling is disabled.

    Every stage whose ``replicas.static_replicas`` is a ``ReplicaStrategyConfig``
    gets a baseline replica count computed from its strategy. The memory demand
    of all such stages (baseline replicas x ``memory_per_replica_mb``) is summed
    and compared against ``total system memory x static_memory_threshold``. If
    the demand exceeds that budget, every such stage is scaled down by the same
    factor, with a floor of 1 replica per stage. Because of that floor, the
    final allocation may still exceed the budget.

    Parameters
    ----------
    pipeline_config : PipelineConfigSchema
        The pipeline configuration with potentially unresolved replica strategies.

    Returns
    -------
    PipelineConfigSchema
        When dynamic scaling is enabled, the input object itself, untouched.
        Otherwise a deep copy in which each strategy-based ``static_replicas``
        entry has been replaced by a concrete integer replica count.
    """
    # Only resolve if dynamic scaling is disabled
    if not pipeline_config.pipeline.disable_dynamic_scaling:
        logger.debug("Dynamic scaling enabled, skipping static replica resolution")
        return pipeline_config

    logger.info("Resolving static replica counts for disabled dynamic scaling mode")

    # Work on a deep copy so the caller's configuration is never mutated
    resolved_config = deepcopy(pipeline_config)

    # Derive the memory budget for static replicas from the probed system memory
    system_probe = SystemResourceProbe()
    total_memory_mb = system_probe.total_memory_mb
    static_memory_threshold = resolved_config.pipeline.static_memory_threshold
    available_memory_mb = int(total_memory_mb * static_memory_threshold)

    logger.info(
        f"System memory: {total_memory_mb}MB, available for static replicas: {available_memory_mb}MB "
        f"(threshold: {static_memory_threshold:.1%})"
    )

    # Collect (stage, baseline_replicas, memory_per_replica_mb) for every
    # strategy-driven stage and accumulate the total baseline memory demand.
    non_static_stages = []
    total_memory_demand_mb = 0

    for stage in resolved_config.stages:
        if not (stage.replicas and stage.replicas.static_replicas):
            continue
        strategy_config = stage.replicas.static_replicas
        if not isinstance(strategy_config, ReplicaStrategyConfig):
            continue

        baseline_replicas = _calculate_baseline_static_replicas(
            stage, strategy_config, system_probe, static_memory_threshold
        )

        # NOTE(review): a missing memory_per_replica_mb counts as 0MB of demand
        # here, while the baseline calculation falls back to 1000MB for the
        # memory-based strategies - confirm this asymmetry is intended.
        memory_per_replica_mb = strategy_config.memory_per_replica_mb or 0
        stage_memory_demand = baseline_replicas * memory_per_replica_mb

        non_static_stages.append((stage, baseline_replicas, memory_per_replica_mb))
        total_memory_demand_mb += stage_memory_demand

        logger.debug(
            f"Stage '{stage.name}': {baseline_replicas} replicas × "
            f"{memory_per_replica_mb}MB = {stage_memory_demand}MB"
        )

    if not non_static_stages:
        logger.info("No stages with non-static strategies found")
        return resolved_config

    logger.info(f"Total baseline memory demand: {total_memory_demand_mb}MB from {len(non_static_stages)} stages")

    # Decide whether the baseline counts fit within the memory budget
    if total_memory_demand_mb <= available_memory_mb:
        logger.info("Memory demand within threshold, applying baseline replica counts")
        scaling_factor = 1.0
    elif available_memory_mb > 0:
        # Calculate scaling factor to fit within memory threshold
        scaling_factor = available_memory_mb / total_memory_demand_mb
        logger.warning(
            f"Memory demand exceeds threshold by {((total_memory_demand_mb / available_memory_mb) - 1) * 100:.1f}%, "
            f"scaling down by factor of {scaling_factor:.3f}"
        )
    else:
        # A zero (or negative) budget makes the overage percentage undefined;
        # computing it would raise ZeroDivisionError. Fall back to the minimum
        # of one replica per stage, which is what a factor of 0 yields below.
        scaling_factor = 0.0
        logger.warning(
            f"No memory budget available for static replicas ({available_memory_mb}MB), "
            f"scaling all non-static stages down to 1 replica"
        )

    # Apply the resolved replica counts
    total_actual_memory_mb = 0
    for stage, baseline_replicas, memory_per_replica_mb in non_static_stages:
        # Truncate toward zero, but never drop a stage below one replica
        scaled_replicas = max(1, int(baseline_replicas * scaling_factor))
        actual_memory_mb = scaled_replicas * memory_per_replica_mb
        total_actual_memory_mb += actual_memory_mb

        # Replace the strategy config with a static replica count
        stage.replicas.static_replicas = scaled_replicas

        logger.info(
            f"Stage '{stage.name}': {baseline_replicas} → {scaled_replicas} replicas " f"({actual_memory_mb}MB)"
        )

    if total_memory_mb > 0:
        logger.info(
            f"Total actual memory allocation: {total_actual_memory_mb}MB "
            f"({(total_actual_memory_mb / total_memory_mb) * 100:.1f}% of system memory)"
        )
    else:
        # Avoid dividing by a zero total when the probe reports no memory
        logger.info(f"Total actual memory allocation: {total_actual_memory_mb}MB")

    return resolved_config
142
+
143
+
144
def _calculate_baseline_static_replicas(
    stage: StageConfig,
    strategy_config: ReplicaStrategyConfig,
    system_probe: SystemResourceProbe,
    static_memory_threshold: float = 0.75,
) -> int:
    """
    Compute the unscaled ("baseline") replica count implied by a stage's strategy.

    Parameters
    ----------
    stage : StageConfig
        The stage being resolved; only its name is used, for the warning
        emitted on an unrecognized strategy.
    strategy_config : ReplicaStrategyConfig
        The replica strategy configuration to evaluate.
    system_probe : SystemResourceProbe
        Source of ``cpu_count`` and ``total_memory_mb``.
    static_memory_threshold : float, optional
        Fraction of total memory used by the MEMORY_STATIC_GLOBAL_PERCENT
        strategy (default: 0.75).

    Returns
    -------
    int
        The calculated baseline replica count; 1 for an unrecognized strategy.
    """
    kind = strategy_config.strategy

    # Fixed count: take the configured value, falling back to a single replica
    if kind == ReplicaCalculationStrategy.STATIC:
        return strategy_config.value or 1

    # CPU-proportional count, capped by the configured limit (or the CPU count)
    if kind == ReplicaCalculationStrategy.CPU_PERCENTAGE:
        cpu_fraction = strategy_config.cpu_percent or 0.5
        cap = strategy_config.limit or system_probe.cpu_count
        by_cpu = max(1, int(system_probe.cpu_count * cpu_fraction))
        return min(by_cpu, cap)

    # The two memory strategies differ only in the fraction of total memory used
    if kind == ReplicaCalculationStrategy.MEMORY_THRESHOLDING:
        memory_fraction = 0.7  # Conservative 70%
    elif kind == ReplicaCalculationStrategy.MEMORY_STATIC_GLOBAL_PERCENT:
        memory_fraction = static_memory_threshold
    else:
        logger.warning(f"Unknown replica strategy '{strategy}' for stage '{stage.name}', defaulting to 1 replica".replace("{strategy}", "{strategy}")) if False else logger.warning(
            "Unknown replica strategy '{}' for stage '{}', defaulting to 1 replica".format(kind, stage.name)
        )
        return 1

    per_replica_mb = strategy_config.memory_per_replica_mb or 1000
    budget_mb = int(system_probe.total_memory_mb * memory_fraction)
    by_memory = max(1, budget_mb // per_replica_mb)
    cap = strategy_config.limit or by_memory
    return min(by_memory, cap)
199
+
200
+
201
def get_memory_intensive_stages(pipeline_config: PipelineConfigSchema) -> List[str]:
    """
    Return the names of stages whose replica strategy is memory-intensive.

    A stage qualifies when its ``replicas.static_replicas`` is a
    ``ReplicaStrategyConfig`` whose ``memory_per_replica_mb`` is greater than
    5000. Stages with a plain replica count, or with no per-replica memory
    figure, are never reported.

    Parameters
    ----------
    pipeline_config : PipelineConfigSchema
        The pipeline configuration to inspect.

    Returns
    -------
    List[str]
        Names of the qualifying stages, in pipeline order.
    """
    heavy_stage_names = []

    for stage in pipeline_config.stages:
        # Skip stages without any static replica setting
        if not (stage.replicas and stage.replicas.static_replicas):
            continue

        # Plain integer counts carry no per-replica memory information
        if not isinstance(stage.replicas.static_replicas, ReplicaStrategyConfig):
            continue

        replica_strategy = stage.replicas.static_replicas
        per_replica_mb = replica_strategy.memory_per_replica_mb or 0

        # Consider stages using >5GB per replica as memory-intensive
        if per_replica_mb > 5000:
            heavy_stage_names.append(stage.name)

    return heavy_stage_names