nv-ingest 2025.8.4.dev20250804__py3-none-any.whl → 2025.12.10.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nv_ingest/api/__init__.py +6 -0
- nv_ingest/api/main.py +2 -0
- nv_ingest/api/tracing.py +82 -0
- nv_ingest/api/v2/README.md +203 -0
- nv_ingest/api/v2/__init__.py +3 -0
- nv_ingest/api/v2/ingest.py +1300 -0
- nv_ingest/framework/orchestration/execution/__init__.py +3 -0
- nv_ingest/framework/orchestration/execution/helpers.py +85 -0
- nv_ingest/framework/orchestration/execution/options.py +112 -0
- nv_ingest/framework/orchestration/process/__init__.py +3 -0
- nv_ingest/framework/orchestration/process/dependent_services.py +84 -0
- nv_ingest/framework/orchestration/process/execution.py +495 -0
- nv_ingest/framework/orchestration/process/lifecycle.py +214 -0
- nv_ingest/framework/orchestration/process/strategies.py +218 -0
- nv_ingest/framework/orchestration/process/termination.py +147 -0
- nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +3 -3
- nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +4 -4
- nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +32 -38
- nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +5 -5
- nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +10 -7
- nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +17 -14
- nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +11 -6
- nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +10 -5
- nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +12 -7
- nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +22 -10
- nv_ingest/framework/orchestration/ray/stages/extractors/ocr_extractor.py +71 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +19 -15
- nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +10 -5
- nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +16 -14
- nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +16 -13
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +3 -3
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +92 -4
- nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +12 -8
- nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +12 -9
- nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +4 -4
- nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +5 -2
- nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +116 -69
- nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +79 -11
- nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +10 -5
- nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +8 -4
- nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +17 -7
- nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +12 -6
- nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +17 -18
- nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +21 -14
- nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +11 -3
- nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +1 -2
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +33 -326
- nv_ingest/framework/orchestration/ray/util/pipeline/tools.py +13 -3
- nv_ingest/framework/util/flow_control/udf_intercept.py +352 -0
- nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +215 -11
- nv_ingest/pipeline/__init__.py +3 -0
- nv_ingest/pipeline/config/__init__.py +3 -0
- nv_ingest/pipeline/config/loaders.py +229 -0
- nv_ingest/pipeline/config/replica_resolver.py +237 -0
- nv_ingest/pipeline/default_libmode_pipeline_impl.py +528 -0
- nv_ingest/pipeline/default_pipeline_impl.py +557 -0
- nv_ingest/pipeline/ingest_pipeline.py +389 -0
- nv_ingest/pipeline/pipeline_schema.py +398 -0
- {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/METADATA +6 -3
- {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/RECORD +64 -43
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +0 -359
- nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +0 -649
- {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/WHEEL +0 -0
- {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
"""
|
|
6
|
+
Configuration loading and management functions for pipeline execution.
|
|
7
|
+
|
|
8
|
+
This module provides declarative functions for loading, validating, and applying
|
|
9
|
+
runtime overrides to pipeline configurations, replacing imperative inline logic.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import logging
|
|
13
|
+
import yaml
|
|
14
|
+
from typing import Optional
|
|
15
|
+
|
|
16
|
+
from nv_ingest.pipeline.pipeline_schema import PipelineConfigSchema
|
|
17
|
+
from nv_ingest.pipeline.default_libmode_pipeline_impl import DEFAULT_LIBMODE_PIPELINE_YAML
|
|
18
|
+
from nv_ingest.pipeline.default_pipeline_impl import DEFAULT_PIPELINE_YAML
|
|
19
|
+
from nv_ingest.framework.orchestration.execution.options import PipelineRuntimeOverrides
|
|
20
|
+
from nv_ingest_api.util.string_processing.yaml import substitute_env_vars_in_yaml_content
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def load_pipeline_config(config_path: str) -> PipelineConfigSchema:
    """
    Load a pipeline configuration file, substituting environment variables.

    The file is read as UTF-8 text, environment variable placeholders are expanded
    with ``substitute_env_vars_in_yaml_content``, the result is parsed with
    ``yaml.safe_load`` and the resulting mapping is passed to ``PipelineConfigSchema``.

    Parameters
    ----------
    config_path : str
        The path to the YAML configuration file.

    Returns
    -------
    PipelineConfigSchema
        A validated PipelineConfigSchema object.

    Raises
    ------
    ValueError
        If the YAML file cannot be parsed after environment variable substitution,
        or if the parsed document is not a mapping at the top level (for example,
        an empty file, which ``yaml.safe_load`` turns into ``None``).
    """
    logger.info(f"Loading pipeline configuration from: {config_path}")

    # Read the raw YAML file content. The encoding is pinned so the result does not
    # depend on the locale of the host the pipeline happens to run on.
    with open(config_path, "r", encoding="utf-8") as f:
        raw_content = f.read()

    # Substitute all environment variable placeholders using the utility function
    substituted_content = substitute_env_vars_in_yaml_content(raw_content)

    # Parse the substituted content with PyYAML, with error handling
    try:
        processed_config = yaml.safe_load(substituted_content)
    except yaml.YAMLError as e:
        # NOTE(review): the message embeds the fully substituted content, which may
        # include values pulled from environment variables - confirm that is acceptable
        # wherever this exception ends up being logged.
        error_message = (
            f"Failed to parse YAML after environment variable substitution. "
            f"Error: {e}\n\n"
            f"--- Substituted Content ---\n{substituted_content}\n---------------------------"
        )
        raise ValueError(error_message) from e

    # An empty or non-mapping document cannot be unpacked into keyword arguments;
    # report it as a configuration error instead of an opaque TypeError.
    if not isinstance(processed_config, dict):
        raise ValueError(
            f"Pipeline configuration in '{config_path}' must be a YAML mapping at the top level, "
            f"got {type(processed_config).__name__}."
        )

    # Pydantic validates the clean, substituted data against the schema
    return PipelineConfigSchema(**processed_config)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def load_default_pipeline_config() -> PipelineConfigSchema:
    """
    Build the embedded default (non-libmode) pipeline configuration.

    ``DEFAULT_PIPELINE_YAML`` is run through environment variable substitution,
    parsed with ``yaml.safe_load`` and used to construct a ``PipelineConfigSchema``.

    Returns
    -------
    PipelineConfigSchema
        Default pipeline configuration built from the embedded YAML.

    Raises
    ------
    ValueError
        If the substituted default YAML cannot be parsed. Errors raised by
        ``PipelineConfigSchema`` itself propagate unchanged.
    """
    logger.info("Loading embedded default pipeline configuration")

    yaml_text = substitute_env_vars_in_yaml_content(DEFAULT_PIPELINE_YAML)

    try:
        config_data = yaml.safe_load(yaml_text)
    except yaml.YAMLError as exc:
        # Re-raise as ValueError while keeping the YAML error as the cause
        raise ValueError(
            f"Failed to parse embedded default pipeline YAML after environment variable substitution. Error: {exc}"
        ) from exc

    return PipelineConfigSchema(**config_data)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def load_default_libmode_config() -> PipelineConfigSchema:
    """
    Build the default libmode pipeline configuration.

    ``DEFAULT_LIBMODE_PIPELINE_YAML`` is run through environment variable
    substitution, parsed with ``yaml.safe_load`` and used to construct a
    ``PipelineConfigSchema``.

    Returns
    -------
    PipelineConfigSchema
        Default libmode pipeline configuration built from the embedded YAML.

    Raises
    ------
    ValueError
        If the substituted default YAML cannot be parsed. Errors raised by
        ``PipelineConfigSchema`` itself propagate unchanged.
    """
    logger.info("Loading default libmode pipeline configuration")

    # Expand environment variable placeholders before handing the text to the parser
    yaml_text = substitute_env_vars_in_yaml_content(DEFAULT_LIBMODE_PIPELINE_YAML)

    try:
        config_data = yaml.safe_load(yaml_text)
    except yaml.YAMLError as exc:
        # Re-raise as ValueError while keeping the YAML error as the cause
        raise ValueError(
            f"Failed to parse default libmode pipeline YAML after environment variable substitution. Error: {exc}"
        ) from exc

    return PipelineConfigSchema(**config_data)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def apply_runtime_overrides(config: PipelineConfigSchema, overrides: PipelineRuntimeOverrides) -> PipelineConfigSchema:
    """
    Return a copy of ``config`` with runtime overrides applied.

    The input configuration is deep-copied via ``model_copy(deep=True)``; the copy's
    ``pipeline`` settings are then updated for each override that is not ``None``.

    Parameters
    ----------
    config : PipelineConfigSchema
        Base pipeline configuration. It is not modified.
    overrides : PipelineRuntimeOverrides
        Runtime overrides. ``disable_dynamic_scaling`` and
        ``dynamic_memory_threshold`` are consulted; ``None`` means "keep the
        configured value".

    Returns
    -------
    PipelineConfigSchema
        The copied configuration with overrides applied.
    """
    updated = config.model_copy(deep=True)

    scaling_override = overrides.disable_dynamic_scaling
    if scaling_override is not None:
        updated.pipeline.disable_dynamic_scaling = scaling_override
        logger.debug(f"Applied dynamic scaling override: {scaling_override}")

    threshold_override = overrides.dynamic_memory_threshold
    if threshold_override is not None:
        updated.pipeline.dynamic_memory_threshold = threshold_override
        logger.debug(f"Applied memory threshold override: {threshold_override}")

    return updated
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def validate_pipeline_config(config: Optional[PipelineConfigSchema]) -> PipelineConfigSchema:
    """
    Guarantee that a pipeline configuration object is available.

    Parameters
    ----------
    config : Optional[PipelineConfigSchema]
        A pipeline configuration, or None to fall back to the default libmode
        configuration.

    Returns
    -------
    PipelineConfigSchema
        ``config`` itself when it is not None, otherwise the result of
        ``load_default_libmode_config()``.

    Raises
    ------
    ValueError
        If config is None and the default config cannot be loaded.
    """
    # A supplied config is returned untouched; no re-validation is performed here
    return config if config is not None else load_default_libmode_config()
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def resolve_pipeline_config(provided_config: Optional[PipelineConfigSchema], libmode: bool) -> PipelineConfigSchema:
    """
    Resolve the final pipeline configuration from inputs.

    This function implements the configuration resolution logic:
    - If config provided: return it unchanged
    - If libmode=True and no config: load default libmode config
    - If libmode=False and no config: load the embedded default pipeline config

    Parameters
    ----------
    provided_config : Optional[PipelineConfigSchema]
        User-provided pipeline configuration, or None.
    libmode : bool
        Selects which embedded default is loaded when no configuration is
        provided. Ignored when ``provided_config`` is not None.

    Returns
    -------
    PipelineConfigSchema
        The provided configuration, or the selected default configuration.

    Raises
    ------
    ValueError
        If a default configuration has to be loaded and its YAML cannot be parsed.
    """
    if provided_config is not None:
        return provided_config

    if libmode:
        return load_default_libmode_config()

    # For non-libmode, fall back to embedded default pipeline implementation
    return load_default_pipeline_config()
|
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
"""
|
|
6
|
+
Runtime replica resolution for static scaling mode.
|
|
7
|
+
|
|
8
|
+
This module provides functionality to resolve replica counts for stages using
|
|
9
|
+
non-static strategies when dynamic scaling is disabled, ensuring total memory
|
|
10
|
+
consumption stays within the static_memory_threshold.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import logging
|
|
14
|
+
import os
|
|
15
|
+
from typing import List
|
|
16
|
+
from copy import deepcopy
|
|
17
|
+
|
|
18
|
+
from nv_ingest.pipeline.pipeline_schema import (
|
|
19
|
+
PipelineConfigSchema,
|
|
20
|
+
StageConfig,
|
|
21
|
+
ReplicaCalculationStrategy,
|
|
22
|
+
ReplicaStrategyConfig,
|
|
23
|
+
)
|
|
24
|
+
from nv_ingest_api.util.system.hardware_info import SystemResourceProbe
|
|
25
|
+
|
|
26
|
+
logger = logging.getLogger(__name__)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def resolve_static_replicas(pipeline_config: PipelineConfigSchema) -> PipelineConfigSchema:
    """
    Resolve static replica counts for all stages when dynamic scaling is disabled.

    Every stage whose ``replicas.static_replicas`` is a ``ReplicaStrategyConfig`` gets
    a baseline replica count from ``_calculate_baseline_static_replicas``. The summed
    memory demand of those baselines is compared with
    ``total_memory_mb * static_memory_threshold``; if it exceeds that budget, all such
    stages are scaled down by the same factor (minimum 1 replica each). The strategy
    object on each stage is then replaced by the resulting integer.

    Setting the environment variable ``NV_INGEST_BYPASS_STATIC_MEMORY_SCALE_DOWN`` to
    ``1``, ``true``, ``yes`` or ``on`` (case-insensitive) skips the scale-down step.

    Parameters
    ----------
    pipeline_config : PipelineConfigSchema
        The pipeline configuration with potentially unresolved replica strategies.

    Returns
    -------
    PipelineConfigSchema
        The input object itself when dynamic scaling is enabled; otherwise a deep copy
        in which every strategy-based ``static_replicas`` entry has been replaced by
        an integer replica count.
    """
    # Only resolve if dynamic scaling is disabled
    if not pipeline_config.pipeline.disable_dynamic_scaling:
        logger.debug("Dynamic scaling enabled, skipping static replica resolution")
        return pipeline_config

    logger.info("Resolving static replica counts for disabled dynamic scaling mode")

    # Create a deep copy to avoid modifying the original config
    resolved_config = deepcopy(pipeline_config)

    # Get system resource information
    system_probe = SystemResourceProbe()
    total_memory_mb = system_probe.total_memory_mb
    static_memory_threshold = resolved_config.pipeline.static_memory_threshold
    available_memory_mb = int(total_memory_mb * static_memory_threshold)

    logger.info(
        f"System memory: {total_memory_mb}MB, available for static replicas: {available_memory_mb}MB "
        f"(threshold: {static_memory_threshold:.1%})"
    )

    # Collect (stage, baseline_replicas, memory_per_replica_mb) for every stage that
    # still carries a strategy object instead of a plain replica count
    non_static_stages = []
    total_memory_demand_mb = 0

    for stage in resolved_config.stages:
        if not (stage.replicas and stage.replicas.static_replicas):
            continue
        if not isinstance(stage.replicas.static_replicas, ReplicaStrategyConfig):
            continue

        strategy_config = stage.replicas.static_replicas
        baseline_replicas = _calculate_baseline_static_replicas(
            stage, strategy_config, system_probe, static_memory_threshold
        )

        # NOTE(review): an unset memory_per_replica_mb counts as 0MB of demand here,
        # whereas the memory-based strategies in _calculate_baseline_static_replicas
        # assume 1000MB when it is unset - confirm this difference is intended.
        memory_per_replica_mb = strategy_config.memory_per_replica_mb or 0
        stage_memory_demand = baseline_replicas * memory_per_replica_mb

        non_static_stages.append((stage, baseline_replicas, memory_per_replica_mb))
        total_memory_demand_mb += stage_memory_demand

        logger.debug(
            f"Stage '{stage.name}': {baseline_replicas} replicas × "
            f"{memory_per_replica_mb}MB = {stage_memory_demand}MB"
        )

    if not non_static_stages:
        logger.info("No stages with non-static strategies found")
        return resolved_config

    logger.info(f"Total baseline memory demand: {total_memory_demand_mb}MB from {len(non_static_stages)} stages")

    # Optional bypass of global memory-based scale down via environment variable
    bypass_env = os.getenv("NV_INGEST_BYPASS_STATIC_MEMORY_SCALE_DOWN", "").strip().lower()
    bypass_scale_down = bypass_env in ("1", "true", "yes", "on")

    # Check if we need to scale down (unless bypassed)
    if bypass_scale_down:
        logger.warning(
            "Bypassing static memory-based replica scale-down due to NV_INGEST_BYPASS_STATIC_MEMORY_SCALE_DOWN"
        )
        scaling_factor = 1.0
    elif total_memory_demand_mb <= available_memory_mb:
        logger.info("Memory demand within threshold, applying baseline replica counts")
        scaling_factor = 1.0
    else:
        # Calculate scaling factor to fit within memory threshold. The demand is
        # strictly positive on this branch, so this division is always defined.
        scaling_factor = available_memory_mb / total_memory_demand_mb
        # The budget can truncate to 0MB; keep the log line from raising in that case
        if available_memory_mb > 0:
            overage_pct = ((total_memory_demand_mb / available_memory_mb) - 1) * 100
        else:
            overage_pct = float("inf")
        logger.warning(
            f"Memory demand exceeds threshold by {overage_pct:.1f}%, "
            f"scaling down by factor of {scaling_factor:.3f}"
        )

    # Apply the resolved replica counts
    total_actual_memory_mb = 0
    for stage, baseline_replicas, memory_per_replica_mb in non_static_stages:
        # Calculate scaled replica count (minimum 1)
        scaled_replicas = max(1, int(baseline_replicas * scaling_factor))
        actual_memory_mb = scaled_replicas * memory_per_replica_mb
        total_actual_memory_mb += actual_memory_mb

        # Replace the strategy config with a static replica count
        stage.replicas.static_replicas = scaled_replicas

        logger.info(f"Stage '{stage.name}': {baseline_replicas} → {scaled_replicas} replicas ({actual_memory_mb}MB)")

    # Guard the summary percentage against a reported total of 0MB
    memory_share_pct = (total_actual_memory_mb / total_memory_mb) * 100 if total_memory_mb else 0.0
    logger.info(
        f"Total actual memory allocation: {total_actual_memory_mb}MB "
        f"({memory_share_pct:.1f}% of system memory)"
    )

    return resolved_config
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _calculate_baseline_static_replicas(
    stage: StageConfig,
    strategy_config: ReplicaStrategyConfig,
    system_probe: SystemResourceProbe,
    static_memory_threshold: float = 0.75,
) -> int:
    """
    Derive the baseline static replica count for one stage from its strategy.

    Parameters
    ----------
    stage : StageConfig
        The stage configuration. Only its name is used, in the warning emitted for
        an unrecognized strategy.
    strategy_config : ReplicaStrategyConfig
        The replica strategy configuration.
    system_probe : SystemResourceProbe
        Source of ``cpu_count`` and ``total_memory_mb``.
    static_memory_threshold : float, optional
        Fraction of total memory used as the budget for the
        MEMORY_STATIC_GLOBAL_PERCENT strategy (default: 0.75).

    Returns
    -------
    int
        The calculated baseline replica count. Unrecognized strategies yield 1.
    """
    kind = strategy_config.strategy

    if kind == ReplicaCalculationStrategy.STATIC:
        # A missing or zero value falls back to a single replica
        return strategy_config.value or 1

    if kind == ReplicaCalculationStrategy.CPU_PERCENTAGE:
        cpu_fraction = strategy_config.cpu_percent or 0.5
        cap = strategy_config.limit or system_probe.cpu_count
        by_cpu = max(1, int(system_probe.cpu_count * cpu_fraction))
        return min(by_cpu, cap)

    # The two memory-based strategies share one formula and differ only in the
    # fraction of total memory they are allowed to budget against.
    if kind == ReplicaCalculationStrategy.MEMORY_THRESHOLDING:
        memory_fraction = 0.7  # Conservative 70% for static mode
    elif kind == ReplicaCalculationStrategy.MEMORY_STATIC_GLOBAL_PERCENT:
        memory_fraction = static_memory_threshold
    else:
        logger.warning(f"Unknown replica strategy '{strategy}' for stage '{stage.name}', defaulting to 1 replica".replace("{strategy}", "{strategy}") if False else f"Unknown replica strategy '{kind}' for stage '{stage.name}', defaulting to 1 replica")
        return 1

    per_replica_mb = strategy_config.memory_per_replica_mb or 1000
    budget_mb = int(system_probe.total_memory_mb * memory_fraction)
    by_memory = max(1, budget_mb // per_replica_mb)
    cap = strategy_config.limit or by_memory
    return min(by_memory, cap)
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def get_memory_intensive_stages(pipeline_config: PipelineConfigSchema) -> List[str]:
    """
    List the names of stages whose replica strategy declares a large memory footprint.

    A stage qualifies when its ``replicas.static_replicas`` is a
    ``ReplicaStrategyConfig`` whose ``memory_per_replica_mb`` is greater than 5000.
    An unset ``memory_per_replica_mb`` is treated as 0.

    Parameters
    ----------
    pipeline_config : PipelineConfigSchema
        The pipeline configuration.

    Returns
    -------
    List[str]
        Names of the qualifying stages, in configuration order.
    """
    # Stages using >5000MB per replica are considered memory-intensive
    return [
        stage.name
        for stage in pipeline_config.stages
        if stage.replicas
        and stage.replicas.static_replicas
        and isinstance(stage.replicas.static_replicas, ReplicaStrategyConfig)
        and (stage.replicas.static_replicas.memory_per_replica_mb or 0) > 5000
    ]
|