nv-ingest 25.7.7.dev20250707__py3-none-any.whl → 25.8.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest might be problematic. Click here for more details.
- nv_ingest/api/v1/health.py +1 -1
- nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +8 -7
- nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +65 -303
- nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +438 -163
- nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +30 -3
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +159 -230
- nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +32 -11
- nv_ingest/framework/orchestration/ray/util/env_config.py +75 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +7 -72
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +161 -2
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +25 -12
- nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +55 -28
- {nv_ingest-25.7.7.dev20250707.dist-info → nv_ingest-25.8.0rc2.dist-info}/METADATA +2 -5
- {nv_ingest-25.7.7.dev20250707.dist-info → nv_ingest-25.8.0rc2.dist-info}/RECORD +17 -16
- {nv_ingest-25.7.7.dev20250707.dist-info → nv_ingest-25.8.0rc2.dist-info}/WHEEL +0 -0
- {nv_ingest-25.7.7.dev20250707.dist-info → nv_ingest-25.8.0rc2.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest-25.7.7.dev20250707.dist-info → nv_ingest-25.8.0rc2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import logging
|
|
3
|
+
|
|
4
|
+
logger = logging.getLogger(__name__)
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def str_to_bool(value: str) -> bool:
|
|
8
|
+
"""
|
|
9
|
+
Convert string to boolean value.
|
|
10
|
+
|
|
11
|
+
Parameters
|
|
12
|
+
----------
|
|
13
|
+
value : str
|
|
14
|
+
String value to convert
|
|
15
|
+
|
|
16
|
+
Returns
|
|
17
|
+
-------
|
|
18
|
+
bool
|
|
19
|
+
Boolean representation of the string
|
|
20
|
+
"""
|
|
21
|
+
return value.strip().lower() in {"1", "true", "yes", "on"}
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def get_env_var(name: str, default, var_type=None):
|
|
25
|
+
"""
|
|
26
|
+
Get environment variable with type conversion and default value.
|
|
27
|
+
|
|
28
|
+
Parameters
|
|
29
|
+
----------
|
|
30
|
+
name : str
|
|
31
|
+
Environment variable name
|
|
32
|
+
default : Any
|
|
33
|
+
Default value if environment variable is not set
|
|
34
|
+
var_type : type, optional
|
|
35
|
+
Type to convert to. If None, infers from default value type
|
|
36
|
+
|
|
37
|
+
Returns
|
|
38
|
+
-------
|
|
39
|
+
Any
|
|
40
|
+
Environment variable value converted to the appropriate type
|
|
41
|
+
"""
|
|
42
|
+
value = os.environ.get(name)
|
|
43
|
+
if value is None:
|
|
44
|
+
return default
|
|
45
|
+
|
|
46
|
+
# Determine type from default if not explicitly provided
|
|
47
|
+
target_type = var_type or type(default)
|
|
48
|
+
|
|
49
|
+
# Handle boolean conversion specially
|
|
50
|
+
if target_type is bool:
|
|
51
|
+
return str_to_bool(value)
|
|
52
|
+
|
|
53
|
+
# For other types, use direct conversion
|
|
54
|
+
try:
|
|
55
|
+
return target_type(value)
|
|
56
|
+
except (ValueError, TypeError) as e:
|
|
57
|
+
logger.warning(
|
|
58
|
+
f"Failed to convert environment variable {name}='{value}' to \
|
|
59
|
+
{target_type.__name__}. Using default: {default}, error: {e}"
|
|
60
|
+
)
|
|
61
|
+
return default
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# Dynamic Memory Scaling Configuration
|
|
65
|
+
DISABLE_DYNAMIC_SCALING = get_env_var("INGEST_DISABLE_DYNAMIC_SCALING", False, bool)
|
|
66
|
+
DYNAMIC_MEMORY_THRESHOLD = get_env_var("INGEST_DYNAMIC_MEMORY_THRESHOLD", 0.75, float)
|
|
67
|
+
DYNAMIC_MEMORY_KP = get_env_var("INGEST_DYNAMIC_MEMORY_KP", 0.2, float)
|
|
68
|
+
DYNAMIC_MEMORY_KI = get_env_var("INGEST_DYNAMIC_MEMORY_KI", 0.01, float)
|
|
69
|
+
DYNAMIC_MEMORY_EMA_ALPHA = get_env_var("INGEST_DYNAMIC_MEMORY_EMA_ALPHA", 0.1, float)
|
|
70
|
+
DYNAMIC_MEMORY_TARGET_QUEUE_DEPTH = get_env_var("INGEST_DYNAMIC_MEMORY_TARGET_QUEUE_DEPTH", 0, int)
|
|
71
|
+
DYNAMIC_MEMORY_PENALTY_FACTOR = get_env_var("INGEST_DYNAMIC_MEMORY_PENALTY_FACTOR", 0.1, float)
|
|
72
|
+
DYNAMIC_MEMORY_ERROR_BOOST_FACTOR = get_env_var("INGEST_DYNAMIC_MEMORY_ERROR_BOOST_FACTOR", 1.5, float)
|
|
73
|
+
DYNAMIC_MEMORY_RCM_MEMORY_SAFETY_BUFFER_FRACTION = get_env_var(
|
|
74
|
+
"INGEST_DYNAMIC_MEMORY_RCM_MEMORY_SAFETY_BUFFER_FRACTION", 0.15, float
|
|
75
|
+
)
|
|
@@ -6,9 +6,7 @@ import logging
|
|
|
6
6
|
import math
|
|
7
7
|
from dataclasses import dataclass
|
|
8
8
|
|
|
9
|
-
import numpy as np
|
|
10
|
-
from collections import deque
|
|
11
|
-
from typing import Dict, Any, Deque, List, Tuple, Optional
|
|
9
|
+
from typing import Dict, Any, List, Tuple, Optional
|
|
12
10
|
|
|
13
11
|
from nv_ingest_api.util.system.hardware_info import SystemResourceProbe
|
|
14
12
|
|
|
@@ -16,7 +14,7 @@ logging.basicConfig(level=logging.INFO)
|
|
|
16
14
|
logger = logging.getLogger(__name__)
|
|
17
15
|
|
|
18
16
|
# --- Constants ---
|
|
19
|
-
DEFAULT_STAGE_COST_MB =
|
|
17
|
+
DEFAULT_STAGE_COST_MB = 5_000.0 # Fallback memory cost
|
|
20
18
|
|
|
21
19
|
|
|
22
20
|
@dataclass
|
|
@@ -46,9 +44,7 @@ class PIDController:
|
|
|
46
44
|
kp: float,
|
|
47
45
|
ki: float,
|
|
48
46
|
kd: float, # Currently unused in delta calculation
|
|
49
|
-
stage_cost_estimates: Dict[str, int], # Static estimates (MB)
|
|
50
47
|
target_queue_depth: int = 0,
|
|
51
|
-
window_size: int = 10,
|
|
52
48
|
penalty_factor: float = 0.0005,
|
|
53
49
|
error_boost_factor: float = 1.5,
|
|
54
50
|
):
|
|
@@ -64,16 +60,10 @@ class PIDController:
|
|
|
64
60
|
kd : float
|
|
65
61
|
Derivative gain. Reacts to the rate of change of the error.
|
|
66
62
|
(Currently set to 0 in internal calculations).
|
|
67
|
-
stage_cost_estimates : Dict[str, int]
|
|
68
|
-
Static estimated memory cost (in MB) per replica for each stage.
|
|
69
|
-
Used as a fallback and minimum for dynamic estimates.
|
|
70
63
|
target_queue_depth : int, optional
|
|
71
64
|
Default target queue depth for stages if not specified in metrics,
|
|
72
65
|
by default 0. The PID loop tries to drive the queue depth towards
|
|
73
66
|
this value.
|
|
74
|
-
window_size : int, optional
|
|
75
|
-
Number of recent samples used for dynamic memory cost estimation
|
|
76
|
-
per replica, by default 10.
|
|
77
67
|
penalty_factor : float, optional
|
|
78
68
|
Multiplier applied to the number of consecutive idle cycles for a
|
|
79
69
|
stage. The resulting penalty effectively lowers the target queue
|
|
@@ -90,16 +80,11 @@ class PIDController:
|
|
|
90
80
|
self.error_boost_factor = error_boost_factor
|
|
91
81
|
|
|
92
82
|
# Per-Stage State
|
|
93
|
-
self.stage_cost_estimates = {
|
|
94
|
-
name: float(max(cost, 1.0)) for name, cost in stage_cost_estimates.items() # Ensure float and min 1MB
|
|
95
|
-
}
|
|
96
83
|
self.integral_error: Dict[str, float] = {}
|
|
97
84
|
self.prev_error: Dict[str, float] = {}
|
|
98
|
-
self.memory_history: Dict[str, Deque[float]] = {} # Per-replica memory history (MB)
|
|
99
85
|
self.idle_cycles: Dict[str, int] = {}
|
|
100
86
|
|
|
101
87
|
# Per-Stage Config
|
|
102
|
-
self.window_size = window_size
|
|
103
88
|
self.penalty_factor = penalty_factor
|
|
104
89
|
|
|
105
90
|
# --- Private Methods ---
|
|
@@ -110,48 +95,7 @@ class PIDController:
|
|
|
110
95
|
logger.debug(f"[PID-{stage}] Initializing state.")
|
|
111
96
|
self.integral_error[stage] = 0.0
|
|
112
97
|
self.prev_error[stage] = 0.0
|
|
113
|
-
self.memory_history[stage] = deque(maxlen=self.window_size)
|
|
114
98
|
self.idle_cycles[stage] = 0
|
|
115
|
-
# Ensure static cost estimate exists, provide default if missing
|
|
116
|
-
if stage not in self.stage_cost_estimates:
|
|
117
|
-
logger.warning(f"[PID-{stage}] Missing static cost estimate. Using default {DEFAULT_STAGE_COST_MB}MB.")
|
|
118
|
-
self.stage_cost_estimates[stage] = DEFAULT_STAGE_COST_MB
|
|
119
|
-
|
|
120
|
-
def _get_conservative_cost_estimate(self, stage: str) -> float:
|
|
121
|
-
"""
|
|
122
|
-
Estimates dynamic memory cost, using static estimate as a floor/max.
|
|
123
|
-
|
|
124
|
-
Returns the maximum of the recent average dynamic cost per replica
|
|
125
|
-
and the static estimate provided during initialization. This provides
|
|
126
|
-
a conservative value for resource projection.
|
|
127
|
-
|
|
128
|
-
Parameters
|
|
129
|
-
----------
|
|
130
|
-
stage : str
|
|
131
|
-
The name of the stage.
|
|
132
|
-
|
|
133
|
-
Returns
|
|
134
|
-
-------
|
|
135
|
-
float
|
|
136
|
-
The conservative memory cost estimate in MB per replica.
|
|
137
|
-
"""
|
|
138
|
-
static_cost = self.stage_cost_estimates.get(stage, DEFAULT_STAGE_COST_MB)
|
|
139
|
-
memory_samples = self.memory_history.get(stage)
|
|
140
|
-
|
|
141
|
-
# Use numpy.mean if samples exist, otherwise fallback to static
|
|
142
|
-
if memory_samples and len(memory_samples) > 0:
|
|
143
|
-
try:
|
|
144
|
-
dynamic_avg = float(np.mean(memory_samples))
|
|
145
|
-
# Use max(dynamic, static) for projection, enforce min 1MB
|
|
146
|
-
cost = max(dynamic_avg, static_cost, 1.0)
|
|
147
|
-
return cost
|
|
148
|
-
except Exception as e:
|
|
149
|
-
logger.error(
|
|
150
|
-
f"[PID-{stage}] Error calculating mean of memory samples: {e}. Falling back to static cost.",
|
|
151
|
-
exc_info=False,
|
|
152
|
-
)
|
|
153
|
-
return max(static_cost, 1.0) # Fallback safely
|
|
154
|
-
return max(static_cost, 1.0) # Fallback to static estimate if no history
|
|
155
99
|
|
|
156
100
|
# --- Public Method ---
|
|
157
101
|
|
|
@@ -167,8 +111,8 @@ class PIDController:
|
|
|
167
111
|
----------
|
|
168
112
|
stage_metrics : Dict[str, Dict[str, Any]]
|
|
169
113
|
Dictionary mapping stage names to their current metrics. Expected keys
|
|
170
|
-
per stage: 'replicas', 'queue_depth'
|
|
171
|
-
'target_queue_depth', 'processing', 'min_replicas', 'max_replicas'.
|
|
114
|
+
per stage: 'replicas', 'queue_depth', 'ema_memory_per_replica'.
|
|
115
|
+
Optional: 'target_queue_depth', 'processing', 'min_replicas', 'max_replicas'.
|
|
172
116
|
|
|
173
117
|
Returns
|
|
174
118
|
-------
|
|
@@ -185,16 +129,9 @@ class PIDController:
|
|
|
185
129
|
|
|
186
130
|
# --- Extract data and calculate current memory state ---
|
|
187
131
|
replicas = metrics.get("replicas", 0)
|
|
188
|
-
#
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
# Calculate memory per replica safely (avoid division by zero)
|
|
192
|
-
current_memory_per_replica = memory_usage / max(replicas, 1.0)
|
|
193
|
-
|
|
194
|
-
# Update memory history *before* calculating the conservative cost for *this* cycle's proposal
|
|
195
|
-
self.memory_history[stage].append(current_memory_per_replica)
|
|
196
|
-
# Recalculate conservative cost *after* updating history for the proposal
|
|
197
|
-
conservative_cost = self._get_conservative_cost_estimate(stage)
|
|
132
|
+
# The conservative cost is now the EMA memory passed in from the stats collector.
|
|
133
|
+
# Fallback to a default if not present.
|
|
134
|
+
conservative_cost = metrics.get("ema_memory_per_replica", DEFAULT_STAGE_COST_MB)
|
|
198
135
|
|
|
199
136
|
# --- PID Calculation ---
|
|
200
137
|
queue_depth = metrics.get("queue_depth", 0)
|
|
@@ -296,7 +233,6 @@ class ResourceConstraintManager:
|
|
|
296
233
|
self,
|
|
297
234
|
max_replicas: int,
|
|
298
235
|
memory_threshold: int,
|
|
299
|
-
estimated_edge_cost_mb: int,
|
|
300
236
|
memory_safety_buffer_fraction: float,
|
|
301
237
|
):
|
|
302
238
|
"""
|
|
@@ -309,7 +245,6 @@ class ResourceConstraintManager:
|
|
|
309
245
|
|
|
310
246
|
self.max_replicas = max_replicas
|
|
311
247
|
self.memory_threshold_mb = memory_threshold
|
|
312
|
-
self.estimated_edge_cost_mb = estimated_edge_cost_mb # Keep track, though unused
|
|
313
248
|
self.memory_safety_buffer_fraction = memory_safety_buffer_fraction # Unused
|
|
314
249
|
self.effective_memory_limit_mb = self.memory_threshold_mb
|
|
315
250
|
|
|
@@ -9,6 +9,7 @@ import os
|
|
|
9
9
|
from typing import Dict, Any
|
|
10
10
|
|
|
11
11
|
import ray
|
|
12
|
+
from ray import LoggingConfig
|
|
12
13
|
from pydantic import BaseModel
|
|
13
14
|
|
|
14
15
|
from nv_ingest.framework.orchestration.ray.primitives.ray_pipeline import RayPipeline
|
|
@@ -47,16 +48,174 @@ def export_config_to_env(ingest_config: Any) -> None:
|
|
|
47
48
|
os.environ.update({key.upper(): val for key, val in ingest_config.items()})
|
|
48
49
|
|
|
49
50
|
|
|
51
|
+
def build_logging_config_from_env() -> LoggingConfig:
|
|
52
|
+
"""
|
|
53
|
+
Build Ray LoggingConfig from environment variables.
|
|
54
|
+
|
|
55
|
+
Package-level preset (sets all defaults):
|
|
56
|
+
- INGEST_RAY_LOG_LEVEL: PRODUCTION, DEVELOPMENT, DEBUG. Default: DEVELOPMENT
|
|
57
|
+
|
|
58
|
+
Individual environment variables (override preset defaults):
|
|
59
|
+
- RAY_LOGGING_LEVEL: Log level (DEBUG, INFO, WARNING, ERROR, CRITICAL). Default: INFO
|
|
60
|
+
- RAY_LOGGING_ENCODING: Log encoding format (TEXT, JSON). Default: TEXT
|
|
61
|
+
- RAY_LOGGING_ADDITIONAL_ATTRS: Comma-separated list of additional standard logger attributes
|
|
62
|
+
- RAY_DEDUP_LOGS: Enable/disable log deduplication (0/1). Default: 1 (enabled)
|
|
63
|
+
- RAY_LOG_TO_DRIVER: Enable/disable logging to driver (true/false). Default: true
|
|
64
|
+
- RAY_LOGGING_ROTATE_BYTES: Maximum log file size before rotation (bytes). Default: 1GB
|
|
65
|
+
- RAY_LOGGING_ROTATE_BACKUP_COUNT: Number of backup log files to keep. Default: 19
|
|
66
|
+
- RAY_DISABLE_IMPORT_WARNING: Disable Ray import warnings (0/1). Default: 0
|
|
67
|
+
- RAY_USAGE_STATS_ENABLED: Enable/disable usage stats collection (0/1). Default: 1
|
|
68
|
+
"""
|
|
69
|
+
|
|
70
|
+
# Apply package-level preset defaults first
|
|
71
|
+
preset_level = os.environ.get("INGEST_RAY_LOG_LEVEL", "DEVELOPMENT").upper()
|
|
72
|
+
|
|
73
|
+
# Define preset configurations
|
|
74
|
+
presets = {
|
|
75
|
+
"PRODUCTION": {
|
|
76
|
+
"RAY_LOGGING_LEVEL": "ERROR",
|
|
77
|
+
"RAY_LOGGING_ENCODING": "TEXT",
|
|
78
|
+
"RAY_LOGGING_ADDITIONAL_ATTRS": "",
|
|
79
|
+
"RAY_DEDUP_LOGS": "1",
|
|
80
|
+
"RAY_LOG_TO_DRIVER": "0", # false
|
|
81
|
+
"RAY_LOGGING_ROTATE_BYTES": "1073741824", # 1GB
|
|
82
|
+
"RAY_LOGGING_ROTATE_BACKUP_COUNT": "9", # 10GB total
|
|
83
|
+
"RAY_DISABLE_IMPORT_WARNING": "1",
|
|
84
|
+
"RAY_USAGE_STATS_ENABLED": "0",
|
|
85
|
+
},
|
|
86
|
+
"DEVELOPMENT": {
|
|
87
|
+
"RAY_LOGGING_LEVEL": "INFO",
|
|
88
|
+
"RAY_LOGGING_ENCODING": "TEXT",
|
|
89
|
+
"RAY_LOGGING_ADDITIONAL_ATTRS": "",
|
|
90
|
+
"RAY_DEDUP_LOGS": "1",
|
|
91
|
+
"RAY_LOG_TO_DRIVER": "1", # true
|
|
92
|
+
"RAY_LOGGING_ROTATE_BYTES": "1073741824", # 1GB
|
|
93
|
+
"RAY_LOGGING_ROTATE_BACKUP_COUNT": "19", # 20GB total
|
|
94
|
+
"RAY_DISABLE_IMPORT_WARNING": "0",
|
|
95
|
+
"RAY_USAGE_STATS_ENABLED": "1",
|
|
96
|
+
},
|
|
97
|
+
"DEBUG": {
|
|
98
|
+
"RAY_LOGGING_LEVEL": "DEBUG",
|
|
99
|
+
"RAY_LOGGING_ENCODING": "JSON",
|
|
100
|
+
"RAY_LOGGING_ADDITIONAL_ATTRS": "name,funcName,lineno",
|
|
101
|
+
"RAY_DEDUP_LOGS": "0",
|
|
102
|
+
"RAY_LOG_TO_DRIVER": "1", # true
|
|
103
|
+
"RAY_LOGGING_ROTATE_BYTES": "536870912", # 512MB
|
|
104
|
+
"RAY_LOGGING_ROTATE_BACKUP_COUNT": "39", # 20GB total
|
|
105
|
+
"RAY_DISABLE_IMPORT_WARNING": "0",
|
|
106
|
+
"RAY_USAGE_STATS_ENABLED": "1",
|
|
107
|
+
},
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
# Validate preset level
|
|
111
|
+
if preset_level not in presets:
|
|
112
|
+
logger.warning(
|
|
113
|
+
f"Invalid INGEST_RAY_LOG_LEVEL '{preset_level}', using DEVELOPMENT. "
|
|
114
|
+
f"Valid presets: {list(presets.keys())}"
|
|
115
|
+
)
|
|
116
|
+
preset_level = "DEVELOPMENT"
|
|
117
|
+
|
|
118
|
+
# Apply preset defaults (only if env var not already set)
|
|
119
|
+
preset_config = presets[preset_level]
|
|
120
|
+
for key, default_value in preset_config.items():
|
|
121
|
+
if key not in os.environ:
|
|
122
|
+
os.environ[key] = default_value
|
|
123
|
+
|
|
124
|
+
logger.info(f"Applied Ray logging preset: {preset_level}")
|
|
125
|
+
|
|
126
|
+
# Get log level from environment, default to INFO
|
|
127
|
+
log_level = os.environ.get("RAY_LOGGING_LEVEL", "INFO").upper()
|
|
128
|
+
|
|
129
|
+
# Validate log level
|
|
130
|
+
valid_levels = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]
|
|
131
|
+
if log_level not in valid_levels:
|
|
132
|
+
logger.warning(f"Invalid RAY_LOGGING_LEVEL '{log_level}', using INFO. Valid levels: {valid_levels}")
|
|
133
|
+
log_level = "INFO"
|
|
134
|
+
|
|
135
|
+
# Get encoding format from environment, default to TEXT
|
|
136
|
+
encoding = os.environ.get("RAY_LOGGING_ENCODING", "TEXT").upper()
|
|
137
|
+
|
|
138
|
+
# Validate encoding
|
|
139
|
+
valid_encodings = ["TEXT", "JSON"]
|
|
140
|
+
if encoding not in valid_encodings:
|
|
141
|
+
logger.warning(f"Invalid RAY_LOGGING_ENCODING '{encoding}', using TEXT. Valid encodings: {valid_encodings}")
|
|
142
|
+
encoding = "TEXT"
|
|
143
|
+
|
|
144
|
+
# Get additional standard logger attributes
|
|
145
|
+
additional_attrs_str = os.environ.get("RAY_LOGGING_ADDITIONAL_ATTRS", "")
|
|
146
|
+
additional_log_standard_attrs = []
|
|
147
|
+
if additional_attrs_str:
|
|
148
|
+
additional_log_standard_attrs = [attr.strip() for attr in additional_attrs_str.split(",") if attr.strip()]
|
|
149
|
+
|
|
150
|
+
# Set log deduplication environment variable if specified
|
|
151
|
+
dedup_logs = os.environ.get("RAY_DEDUP_LOGS", "1")
|
|
152
|
+
if dedup_logs is not None:
|
|
153
|
+
os.environ["RAY_DEDUP_LOGS"] = str(dedup_logs)
|
|
154
|
+
|
|
155
|
+
# Set log to driver environment variable if specified
|
|
156
|
+
log_to_driver = os.environ.get("RAY_LOG_TO_DRIVER", "1")
|
|
157
|
+
if log_to_driver is not None:
|
|
158
|
+
os.environ["RAY_LOG_TO_DRIVER"] = str(log_to_driver).lower()
|
|
159
|
+
|
|
160
|
+
# Configure log rotation settings
|
|
161
|
+
rotate_bytes = os.environ.get("RAY_LOGGING_ROTATE_BYTES", "1073741824") # Default: 1GB per file
|
|
162
|
+
if rotate_bytes is not None:
|
|
163
|
+
try:
|
|
164
|
+
rotate_bytes_int = int(rotate_bytes)
|
|
165
|
+
os.environ["RAY_LOGGING_ROTATE_BYTES"] = str(rotate_bytes_int)
|
|
166
|
+
except ValueError:
|
|
167
|
+
logger.warning(f"Invalid RAY_LOGGING_ROTATE_BYTES '{rotate_bytes}', using default (1GB)")
|
|
168
|
+
os.environ["RAY_LOGGING_ROTATE_BYTES"] = "1073741824"
|
|
169
|
+
|
|
170
|
+
rotate_backup_count = os.environ.get("RAY_LOGGING_ROTATE_BACKUP_COUNT", "19") # Default: 19 backups (20GB Max)
|
|
171
|
+
if rotate_backup_count is not None:
|
|
172
|
+
try:
|
|
173
|
+
backup_count_int = int(rotate_backup_count)
|
|
174
|
+
os.environ["RAY_LOGGING_ROTATE_BACKUP_COUNT"] = str(backup_count_int)
|
|
175
|
+
except ValueError:
|
|
176
|
+
logger.warning(f"Invalid RAY_LOGGING_ROTATE_BACKUP_COUNT '{rotate_backup_count}', using default (19)")
|
|
177
|
+
os.environ["RAY_LOGGING_ROTATE_BACKUP_COUNT"] = "19"
|
|
178
|
+
|
|
179
|
+
# Configure Ray internal logging verbosity
|
|
180
|
+
disable_import_warning = os.environ.get("RAY_DISABLE_IMPORT_WARNING", "0")
|
|
181
|
+
if disable_import_warning is not None:
|
|
182
|
+
os.environ["RAY_DISABLE_IMPORT_WARNING"] = str(disable_import_warning)
|
|
183
|
+
|
|
184
|
+
# Configure usage stats collection
|
|
185
|
+
usage_stats_enabled = os.environ.get("RAY_USAGE_STATS_ENABLED", "1")
|
|
186
|
+
if usage_stats_enabled is not None:
|
|
187
|
+
os.environ["RAY_USAGE_STATS_ENABLED"] = str(usage_stats_enabled)
|
|
188
|
+
|
|
189
|
+
# Create LoggingConfig with validated parameters
|
|
190
|
+
logging_config = LoggingConfig(
|
|
191
|
+
encoding=encoding,
|
|
192
|
+
log_level=log_level,
|
|
193
|
+
additional_log_standard_attrs=additional_log_standard_attrs,
|
|
194
|
+
)
|
|
195
|
+
|
|
196
|
+
logger.info(
|
|
197
|
+
f"Ray logging configured: preset={preset_level}, level={log_level}, encoding={encoding}, "
|
|
198
|
+
f"additional_attrs={additional_log_standard_attrs}, "
|
|
199
|
+
f"dedup_logs={os.environ.get('RAY_DEDUP_LOGS', '1')}, "
|
|
200
|
+
f"log_to_driver={os.environ.get('RAY_LOG_TO_DRIVER', 'true')}, "
|
|
201
|
+
f"rotate_bytes={os.environ.get('RAY_LOGGING_ROTATE_BYTES', '1073741824')}, "
|
|
202
|
+
f"rotate_backup_count={os.environ.get('RAY_LOGGING_ROTATE_BACKUP_COUNT', '19')}"
|
|
203
|
+
)
|
|
204
|
+
|
|
205
|
+
return logging_config
|
|
206
|
+
|
|
207
|
+
|
|
50
208
|
def setup_ingestion_pipeline(pipeline: RayPipeline, ingest_config: Dict[str, Any] = None):
|
|
51
209
|
# Initialize the pipeline with the configuration
|
|
52
210
|
if ingest_config:
|
|
53
211
|
# Export the config to environment variables
|
|
54
212
|
export_config_to_env(ingest_config)
|
|
55
213
|
|
|
56
|
-
|
|
214
|
+
_ = logging.getLogger().getEffectiveLevel()
|
|
215
|
+
logging_config = build_logging_config_from_env()
|
|
57
216
|
ray_context = ray.init(
|
|
58
217
|
namespace="nv_ingest_ray",
|
|
59
|
-
|
|
218
|
+
logging_config=logging_config,
|
|
60
219
|
ignore_reinit_error=True,
|
|
61
220
|
dashboard_host="0.0.0.0",
|
|
62
221
|
dashboard_port=8265,
|
|
@@ -23,18 +23,21 @@ from nv_ingest.framework.orchestration.ray.primitives.ray_pipeline import (
|
|
|
23
23
|
RayPipelineInterface,
|
|
24
24
|
)
|
|
25
25
|
from nv_ingest.framework.orchestration.ray.util.pipeline.pipeline_builders import setup_ingestion_pipeline
|
|
26
|
+
from nv_ingest.framework.orchestration.ray.util.env_config import (
|
|
27
|
+
DISABLE_DYNAMIC_SCALING,
|
|
28
|
+
DYNAMIC_MEMORY_THRESHOLD,
|
|
29
|
+
DYNAMIC_MEMORY_KP,
|
|
30
|
+
DYNAMIC_MEMORY_KI,
|
|
31
|
+
DYNAMIC_MEMORY_EMA_ALPHA,
|
|
32
|
+
DYNAMIC_MEMORY_TARGET_QUEUE_DEPTH,
|
|
33
|
+
DYNAMIC_MEMORY_PENALTY_FACTOR,
|
|
34
|
+
DYNAMIC_MEMORY_ERROR_BOOST_FACTOR,
|
|
35
|
+
DYNAMIC_MEMORY_RCM_MEMORY_SAFETY_BUFFER_FRACTION,
|
|
36
|
+
)
|
|
26
37
|
|
|
27
38
|
logger = logging.getLogger(__name__)
|
|
28
39
|
|
|
29
40
|
|
|
30
|
-
def str_to_bool(value: str) -> bool:
|
|
31
|
-
return value.strip().lower() in {"1", "true", "yes", "on"}
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
DISABLE_DYNAMIC_SCALING = str_to_bool(os.environ.get("INGEST_DISABLE_DYNAMIC_SCALING", "false"))
|
|
35
|
-
DYNAMIC_MEMORY_THRESHOLD = float(os.environ.get("INGEST_DYNAMIC_MEMORY_THRESHOLD", 0.75))
|
|
36
|
-
|
|
37
|
-
|
|
38
41
|
class PipelineCreationSchema(BaseModel):
|
|
39
42
|
"""
|
|
40
43
|
Schema for pipeline creation configuration.
|
|
@@ -78,15 +81,17 @@ class PipelineCreationSchema(BaseModel):
|
|
|
78
81
|
otel_exporter_otlp_endpoint: str = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "localhost:4317")
|
|
79
82
|
|
|
80
83
|
# OCR settings
|
|
81
|
-
|
|
82
|
-
|
|
84
|
+
ocr_http_endpoint: str = os.getenv("OCR_HTTP_ENDPOINT", "https://ai.api.nvidia.com/v1/cv/baidu/paddleocr")
|
|
85
|
+
ocr_infer_protocol: str = os.getenv("OCR_INFER_PROTOCOL", "http")
|
|
86
|
+
ocr_model_name: str = os.getenv("OCR_MODEL_NAME", "paddle")
|
|
83
87
|
|
|
84
88
|
# Task queue settings
|
|
85
89
|
REDIS_INGEST_TASK_QUEUE: str = "ingest_task_queue"
|
|
86
90
|
|
|
87
91
|
# Vision language model settings
|
|
88
92
|
vlm_caption_endpoint: str = os.getenv(
|
|
89
|
-
"VLM_CAPTION_ENDPOINT",
|
|
93
|
+
"VLM_CAPTION_ENDPOINT",
|
|
94
|
+
"https://integrate.api.nvidia.com/v1/chat/completions",
|
|
90
95
|
)
|
|
91
96
|
vlm_caption_model_name: str = os.getenv("VLM_CAPTION_MODEL_NAME", "nvidia/llama-3.1-nemotron-nano-vl-8b-v1")
|
|
92
97
|
|
|
@@ -233,7 +238,15 @@ def _launch_pipeline(
|
|
|
233
238
|
dynamic_memory_threshold = dynamic_memory_threshold if dynamic_memory_threshold else DYNAMIC_MEMORY_THRESHOLD
|
|
234
239
|
|
|
235
240
|
scaling_config = ScalingConfig(
|
|
236
|
-
dynamic_memory_scaling=dynamic_memory_scaling,
|
|
241
|
+
dynamic_memory_scaling=dynamic_memory_scaling,
|
|
242
|
+
dynamic_memory_threshold=dynamic_memory_threshold,
|
|
243
|
+
pid_kp=DYNAMIC_MEMORY_KP,
|
|
244
|
+
pid_ki=DYNAMIC_MEMORY_KI,
|
|
245
|
+
pid_ema_alpha=DYNAMIC_MEMORY_EMA_ALPHA,
|
|
246
|
+
pid_target_queue_depth=DYNAMIC_MEMORY_TARGET_QUEUE_DEPTH,
|
|
247
|
+
pid_penalty_factor=DYNAMIC_MEMORY_PENALTY_FACTOR,
|
|
248
|
+
pid_error_boost_factor=DYNAMIC_MEMORY_ERROR_BOOST_FACTOR,
|
|
249
|
+
rcm_memory_safety_buffer_fraction=DYNAMIC_MEMORY_RCM_MEMORY_SAFETY_BUFFER_FRACTION,
|
|
237
250
|
)
|
|
238
251
|
|
|
239
252
|
pipeline = RayPipeline(scaling_config=scaling_config)
|