hud-python 0.4.28__py3-none-any.whl → 0.4.29__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of hud-python might be problematic.
- hud/__init__.py +2 -1
- hud/agents/base.py +73 -45
- hud/agents/claude.py +8 -4
- hud/agents/openai_chat_generic.py +65 -40
- hud/agents/tests/test_base.py +0 -4
- hud/agents/tests/test_openai.py +1 -1
- hud/cli/__init__.py +182 -52
- hud/cli/dev.py +8 -9
- hud/cli/eval.py +317 -119
- hud/cli/flows/__init__.py +0 -0
- hud/cli/flows/tasks.py +0 -0
- hud/cli/get.py +160 -0
- hud/cli/rl/__init__.py +563 -71
- hud/cli/rl/config.py +94 -0
- hud/cli/rl/display.py +133 -0
- hud/cli/rl/gpu.py +63 -0
- hud/cli/rl/gpu_utils.py +318 -0
- hud/cli/rl/presets.py +96 -0
- hud/cli/rl/remote_runner.py +348 -0
- hud/cli/rl/rl_api.py +150 -0
- hud/cli/rl/vllm.py +177 -0
- hud/cli/tests/test_analyze_metadata.py +0 -1
- hud/cli/utils/tasks.py +26 -0
- hud/clients/base.py +21 -23
- hud/clients/mcp_use.py +36 -44
- hud/clients/tests/test_mcp_use_retry.py +10 -10
- hud/datasets/__init__.py +4 -3
- hud/datasets/{execution/parallel.py → parallel.py} +1 -1
- hud/datasets/{execution/runner.py → runner.py} +1 -1
- hud/datasets/utils.py +1 -1
- hud/native/tests/test_native_init.py +1 -1
- hud/otel/config.py +1 -1
- hud/otel/instrumentation.py +35 -0
- hud/rl/README.md +31 -0
- hud/rl/__init__.py +1 -0
- hud/rl/actor.py +174 -0
- hud/rl/buffer.py +371 -0
- hud/rl/chat_template.jinja +101 -0
- hud/rl/config.py +184 -0
- hud/rl/distributed.py +95 -0
- hud/rl/learner.py +586 -0
- hud/rl/tests/__init__.py +1 -0
- hud/rl/tests/test_learner.py +171 -0
- hud/rl/train.py +354 -0
- hud/rl/types.py +101 -0
- hud/rl/utils/start_vllm_server.sh +30 -0
- hud/rl/utils.py +524 -0
- hud/rl/vllm_adapter.py +125 -0
- hud/settings.py +6 -0
- hud/telemetry/__init__.py +2 -1
- hud/telemetry/job.py +46 -3
- hud/telemetry/tests/test_trace.py +3 -3
- hud/telemetry/trace.py +85 -13
- hud/tools/tests/test_computer.py +3 -3
- hud/tools/tests/test_computer_actions.py +1 -1
- hud/types.py +123 -2
- hud/utils/group_eval.py +223 -0
- hud/utils/hud_console.py +113 -13
- hud/utils/tasks.py +119 -0
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.4.28.dist-info → hud_python-0.4.29.dist-info}/METADATA +20 -2
- {hud_python-0.4.28.dist-info → hud_python-0.4.29.dist-info}/RECORD +66 -46
- hud/cli/hf.py +0 -406
- hud/cli/rl/README.md +0 -243
- hud/cli/rl/init.py +0 -370
- hud/cli/rl/pod.py +0 -501
- hud/cli/rl/ssh.py +0 -322
- hud/cli/rl/train.py +0 -562
- hud/cli/rl/utils.py +0 -165
- hud/datasets/execution/__init__.py +0 -13
- hud/datasets/task.py +0 -116
- {hud_python-0.4.28.dist-info → hud_python-0.4.29.dist-info}/WHEEL +0 -0
- {hud_python-0.4.28.dist-info → hud_python-0.4.29.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.28.dist-info → hud_python-0.4.29.dist-info}/licenses/LICENSE +0 -0
hud/cli/rl/config.py
ADDED
@@ -0,0 +1,94 @@
```python
"""Configuration generation and management for RL training."""

from __future__ import annotations

import json
from typing import TYPE_CHECKING, Any

from rich.console import Console

from hud.rl.config import Config, validate_vl_model
from hud.utils.hud_console import hud_console

from .display import display_preset_table
from .presets import estimate_memory_usage

if TYPE_CHECKING:
    from pathlib import Path
console = Console()


def generate_config_interactive(
    model_name: str,
    presets: list[dict[str, Any]],
) -> tuple[Config, float]:
    """Generate RL training configuration interactively."""
    # Validate model is a VL model
    validate_vl_model(model_name)

    # Display preset options
    display_preset_table(presets, 80.0)  # Assuming A100 80GB

    # Let user select preset
    preset_choice = hud_console.select(
        "Select a training configuration preset:",
        choices=[{"name": p["name"], "value": i} for i, p in enumerate(presets)],
        default=1 if len(presets) > 1 else 0,  # Default to "Balanced" if available
    )

    selected_preset = presets[preset_choice]  # type: ignore

    # Use preset values directly
    max_steps_per_episode = selected_preset["max_steps_per_episode"]

    # Calculate memory estimate
    max_pixels = 256 * 28 * 28
    estimated_memory = estimate_memory_usage(
        selected_preset["mini_batch_size"],
        max_steps_per_episode,
        selected_preset["max_new_tokens"],
        max_pixels,
    )

    config_adds = {
        "actor": {
            "max_new_tokens": selected_preset["max_new_tokens"],
            "max_parallel_episodes": selected_preset["batch_size"],
            "max_steps_per_episode": selected_preset["max_steps_per_episode"],
            "force_tool_choice": True,
        },
        "training": {
            "mini_batch_size": selected_preset["mini_batch_size"],
            "group_size": selected_preset["group_size"],
            "batch_size": selected_preset["batch_size"],
            "lr": selected_preset["lr"],
            "epochs": selected_preset["epochs"],
        },
        "verbose": True,
    }

    # Create config
    config = Config.from_dict(config_adds)

    return config, estimated_memory


def save_config(config: Config, path: Path) -> None:
    """Save configuration to a JSON file."""
    config_dict = config.to_dict()

    with open(path, "w") as f:
        json.dump(config_dict, f, indent=2)
        f.write("\n")  # Add newline at end of file

    if not path.name.startswith("."):  # Don't show message for temp files
        console.print(f"[green]✅ Configuration saved to {path}[/green]")


def load_config(path: Path) -> Config:
    """Load configuration from a JSON file."""
    with open(path) as f:
        data = json.load(f)

    # Use Config.from_dict which handles missing fields gracefully
    return Config.from_dict(data)
```
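For orientation, a minimal usage sketch of the three helpers above, assuming they are imported from `hud.cli.rl.config`. The preset dictionary mirrors the keys read by `generate_config_interactive`, but the concrete values and the model name are illustrative, not the presets shipped with the package.

```python
# Hypothetical wiring of the helpers above; preset values and the model name are
# illustrative only. generate_config_interactive prompts via hud_console.select.
from pathlib import Path

from hud.cli.rl.config import generate_config_interactive, load_config, save_config

presets = [
    {
        "name": "Balanced",            # shown in the selection prompt
        "max_steps_per_episode": 8,
        "max_new_tokens": 512,
        "mini_batch_size": 2,
        "group_size": 4,
        "batch_size": 8,
        "lr": 1e-5,
        "epochs": 1,
    },
]

# Interactive: validates the VL model, shows the preset table, asks for a choice.
config, estimated_gb = generate_config_interactive("Qwen/Qwen2.5-VL-3B-Instruct", presets)

save_config(config, Path("rl_config.json"))     # writes JSON plus trailing newline
restored = load_config(Path("rl_config.json"))  # round-trips via Config.from_dict
```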
hud/cli/rl/display.py
ADDED
@@ -0,0 +1,133 @@
```python
"""Display utilities for RL training configuration."""

from __future__ import annotations

from typing import TYPE_CHECKING, Any

from rich.console import Console
from rich.table import Table

if TYPE_CHECKING:
    from hud.rl.config import Config

console = Console()


def display_gpu_info(gpu_info: dict[str, Any]) -> None:
    """Display GPU information in a table."""
    if not gpu_info["available"]:
        console.print(f"[red]❌ CUDA not available: {gpu_info.get('error', 'Unknown error')}[/red]")
        return

    gpu_table = Table(title="🖥️ Available GPUs", title_style="bold cyan")
    gpu_table.add_column("Index", style="yellow")
    gpu_table.add_column("Name", style="cyan")
    gpu_table.add_column("Memory", style="green")

    for device in gpu_info["devices"]:
        gpu_table.add_row(f"GPU {device['index']}", device["name"], f"{device['memory_gb']:.1f} GB")

    console.print(gpu_table)


def display_preset_table(presets: list[dict[str, Any]], gpu_memory_gb: float) -> None:
    """Display training configuration presets in a table."""
    preset_table = Table(title="📊 Training Configuration Presets", title_style="bold cyan")
    preset_table.add_column("Option", style="yellow")
    preset_table.add_column("Steps", style="cyan")
    preset_table.add_column("Mini-batch", style="cyan")
    preset_table.add_column("Group", style="cyan")
    preset_table.add_column("Episodes/batch", style="cyan")

    # Add time columns for A100
    if gpu_memory_gb >= 40:
        preset_table.add_column("Tasks/hour", style="green")
        preset_table.add_column("Steps/hour", style="green")

    for i, preset in enumerate(presets):
        row = [
            f"{i + 1}. {preset['name']}",
            str(preset["max_steps_per_episode"]),
            str(preset["mini_batch_size"]),
            str(preset["group_size"]),
            str(preset["batch_size"]),
        ]
        if "tasks_per_hour" in preset:
            row.extend(
                [
                    str(preset["tasks_per_hour"]),
                    str(preset["steps_per_hour"]),
                ]
            )
        preset_table.add_row(*row)

    console.print("\n")
    console.print(preset_table)
    console.print("\n")


def display_config_summary(
    config: Config, tasks_count: int, gpu_info: dict[str, Any], estimated_memory: float
) -> None:
    """Display comprehensive configuration summary for review."""
    console.print("\n[bold cyan]📋 RL Training Configuration Summary[/bold cyan]\n")

    # GPU Information
    if gpu_info["available"]:
        gpu_table = Table(title="🖥️ GPU Information", title_style="bold yellow")
        gpu_table.add_column("Property", style="cyan")
        gpu_table.add_column("Value", style="green")

        device = gpu_info["devices"][0]  # Primary GPU
        gpu_table.add_row("GPU 0", device["name"])
        gpu_table.add_row("Memory", f"{device['memory_gb']:.1f} GB")
        gpu_table.add_row("Compute Capability", "8.0")  # Assuming A100

        console.print(gpu_table)

    # Model Configuration
    model_table = Table(title="🤖 Model Configuration", title_style="bold yellow")
    model_table.add_column("Parameter", style="cyan")
    model_table.add_column("Value", style="green")

    model_table.add_row("Base Model", config.model.base_model)
    model_table.add_row("LoRA Rank (r)", str(config.model.lora_r))
    model_table.add_row("LoRA Alpha", str(config.model.lora_alpha))
    model_table.add_row("LoRA Dropout", str(config.model.lora_dropout))

    console.print(model_table)

    # Training Configuration
    training_table = Table(title="🎯 Training Configuration", title_style="bold yellow")
    training_table.add_column("Parameter", style="cyan")
    training_table.add_column("Value", style="green")

    training_table.add_row("Tasks Count", str(tasks_count))
    training_table.add_row("Learning Rate", f"{config.training.lr:.1e}")
    training_table.add_row("Epochs", str(config.training.epochs))
    training_table.add_row("Mini Batch Size", str(config.training.mini_batch_size))
    training_table.add_row("Batch Size", str(config.training.batch_size))
    training_table.add_row("Group Size", str(config.training.group_size))
    training_table.add_row("Training Steps", str(config.training.training_steps))
    training_table.add_row("Max Parallel Episodes", str(config.actor.max_parallel_episodes))

    console.print(training_table)

    # Memory Estimation
    memory_table = Table(title="💾 Memory Estimation", title_style="bold yellow")
    memory_table.add_column("Metric", style="cyan")
    memory_table.add_column("Value", style="green")

    memory_table.add_row("Estimated GPU Memory", f"{estimated_memory:.1f} GB")
    if gpu_info["available"]:
        available_memory = gpu_info["devices"][0]["memory_gb"]
        memory_table.add_row("Available GPU Memory", f"{available_memory:.1f} GB")

        if estimated_memory > available_memory:
            status = "[red]⚠️ May exceed available memory[/red]"
        else:
            status = "[green]✅ Within memory limits[/green]"
        memory_table.add_row("Status", status)

    console.print(memory_table)
    console.print("\n")
```
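A small sketch of how the two table helpers might be driven; the dictionary shapes follow what the functions above read, while the device name and throughput numbers are invented for illustration.

```python
# Illustrative input shapes for the display helpers above; all values are invented.
from hud.cli.rl.display import display_gpu_info, display_preset_table

gpu_info = {
    "available": True,
    "devices": [{"index": 0, "name": "NVIDIA A100 80GB", "memory_gb": 80.0}],
}
display_gpu_info(gpu_info)

presets = [
    {
        "name": "Fast",
        "max_steps_per_episode": 4,
        "mini_batch_size": 1,
        "group_size": 2,
        "batch_size": 4,
        "tasks_per_hour": 120,   # optional: only rendered when gpu_memory_gb >= 40
        "steps_per_hour": 480,
    },
]
display_preset_table(presets, gpu_memory_gb=80.0)
```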
hud/cli/rl/gpu.py
ADDED
@@ -0,0 +1,63 @@
```python
"""GPU detection and validation utilities for RL training."""

from __future__ import annotations

import subprocess
from typing import Any


def detect_cuda_devices() -> dict[str, Any]:
    """Detect available CUDA devices and their properties."""
    try:
        # Check if CUDA is available
        result = subprocess.run(
            ["nvidia-smi", "--query-gpu=index,name,memory.total", "--format=csv,noheader,nounits"],  # noqa: S607
            capture_output=True,
            text=True,
            check=True,
        )

        if result.returncode != 0:
            return {"available": False, "error": "nvidia-smi command failed"}

        devices = []
        for line in result.stdout.strip().split("\n"):
            parts = line.split(", ")
            if len(parts) >= 3:
                devices.append(
                    {
                        "index": int(parts[0]),
                        "name": parts[1],
                        "memory_gb": float(parts[2]) / 1024,  # Convert MB to GB
                    }
                )

        return {"available": True, "devices": devices}

    except FileNotFoundError:
        return {
            "available": False,
            "error": "nvidia-smi not found - CUDA drivers may not be installed",
        }
    except Exception as e:
        return {"available": False, "error": str(e)}


def select_gpu_for_vllm(devices: list[dict[str, Any]]) -> int:
    """Select the best GPU for vLLM server (typically GPU 1 if available)."""
    if len(devices) > 1:
        # Prefer GPU 1 for vLLM to leave GPU 0 for other processes
        return 1
    return 0


def validate_gpu_memory(gpu_memory_gb: float, model_size: str = "3B") -> bool:
    """Validate if GPU has sufficient memory for the model."""
    min_memory_requirements = {
        "3B": 12.0,  # Minimum for Qwen 2.5 VL 3B
        "7B": 24.0,
        "14B": 40.0,
    }

    min_required = min_memory_requirements.get(model_size, 12.0)
    return gpu_memory_gb >= min_required
```
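A short sketch of a pre-flight check built from the helpers above; the model-size key follows the requirements table in `validate_gpu_memory`.

```python
# Sketch: pre-flight GPU check using the helpers above.
from hud.cli.rl.gpu import detect_cuda_devices, select_gpu_for_vllm, validate_gpu_memory

info = detect_cuda_devices()
if not info["available"]:
    raise SystemExit(f"CUDA not available: {info['error']}")

for device in info["devices"]:
    ok = validate_gpu_memory(device["memory_gb"], model_size="3B")
    print(f"GPU {device['index']} ({device['memory_gb']:.1f} GB): {'ok' if ok else 'below minimum'}")

vllm_gpu = select_gpu_for_vllm(info["devices"])  # GPU 1 when more than one GPU exists
print(f"vLLM server would be pinned to GPU {vllm_gpu}")
```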
hud/cli/rl/gpu_utils.py
ADDED
@@ -0,0 +1,318 @@
```python
"""GPU utilities for DDP training."""

from __future__ import annotations

import logging
import subprocess
import time
from typing import TYPE_CHECKING, Any

import torch

from hud.utils.hud_console import HUDConsole

if TYPE_CHECKING:
    from hud.rl.config import Config
hud_console = HUDConsole(logging.getLogger(__name__))


def get_gpu_memory_info() -> dict[int, dict[str, Any]]:
    """Get memory usage information for all GPUs."""

    gpu_memory = {}
    try:
        # Get memory info for all GPUs
        cmd = [
            "nvidia-smi",
            "--query-gpu=index,memory.used,memory.total,memory.free",
            "--format=csv,noheader,nounits",
        ]
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)  # noqa: S603

        for line in result.stdout.strip().split("\n"):
            if not line:
                continue
            parts = line.split(", ")
            if len(parts) >= 4:
                gpu_idx = int(parts[0])
                memory_used = float(parts[1])
                memory_total = float(parts[2])
                memory_free = float(parts[3])
                gpu_memory[gpu_idx] = {
                    "used_mb": memory_used,
                    "total_mb": memory_total,
                    "free_mb": memory_free,
                    "used_pct": (memory_used / memory_total) * 100,
                }

        # Get process information per GPU
        for gpu_idx in gpu_memory:  # noqa: PLC0206
            cmd = [
                "nvidia-smi",
                "-i",
                str(gpu_idx),
                "--query-compute-apps=pid,used_memory",
                "--format=csv,noheader,nounits",
            ]
            try:
                result = subprocess.run(cmd, capture_output=True, text=True, check=True)  # noqa: S603
                processes = []
                for line in result.stdout.strip().split("\n"):
                    if not line:
                        continue
                    parts = line.split(", ")
                    if len(parts) >= 2:
                        pid = int(parts[0])
                        memory_mb = float(parts[1])
                        processes.append({"pid": pid, "memory_mb": memory_mb})
                gpu_memory[gpu_idx]["processes"] = processes
            except Exception as e:
                hud_console.error(f"Failed to get process info for GPU {gpu_idx}: {e}")
                gpu_memory[gpu_idx]["processes"] = []

    except Exception as e:
        hud_console.error(f"Failed to get GPU memory info {e}")
        return {}

    return gpu_memory


def health_check_gpus(gpu_indices: list[int]) -> dict[str, Any]:
    """Perform health check on specified GPUs including memory status.

    Returns:
        Dict with:
        - healthy_gpus: List of healthy GPU indices
        - unhealthy_gpus: Dict of unhealthy GPU index -> error message
        - all_healthy: Boolean indicating if all GPUs are healthy
        - memory_issues: Boolean indicating if there are memory issues
    """
    from rich.console import Console
    from rich.table import Table

    console = Console()

    console.print("\n[bold cyan]🏥 GPU Health Check[/bold cyan]")

    # First get memory info
    memory_info = get_gpu_memory_info()

    healthy_gpus = []
    unhealthy_gpus = {}
    memory_issues = []

    # Create a table for results
    table = Table(title="GPU Health Status")
    table.add_column("GPU", style="cyan")
    table.add_column("Memory Usage", style="yellow")
    table.add_column("Status", style="green")
    table.add_column("Details", style="yellow")

    for gpu_idx in gpu_indices:
        # Memory info
        mem_str = "Unknown"
        if gpu_idx in memory_info:
            mem = memory_info[gpu_idx]
            used_gb = mem["used_mb"] / 1024
            total_gb = mem["total_mb"] / 1024
            mem_str = f"{used_gb:.1f}/{total_gb:.1f} GB ({mem['used_pct']:.0f}%)"

            # Check for high memory usage
            if mem["used_pct"] > 70:
                memory_issues.append(gpu_idx)
                proc_info = f" ({len(mem['processes'])} processes)" if mem["processes"] else ""
                unhealthy_gpus[gpu_idx] = f"High memory usage{proc_info}"
                table.add_row(
                    f"GPU {gpu_idx}", mem_str, "❌ Unhealthy", f"High memory usage{proc_info}"
                )
                continue

        # If no severe memory issue, do accessibility test
        try:
            # Try to allocate a small tensor on the GPU
            torch.cuda.set_device(gpu_idx)
            device = torch.device(f"cuda:{gpu_idx}")

            # Test basic allocation
            test_tensor = torch.zeros(100, 100, device=device)

            # Test computation
            result = torch.matmul(test_tensor, test_tensor)

            # Force synchronization
            torch.cuda.synchronize(device)

            # Clean up
            del test_tensor, result
            torch.cuda.empty_cache()

            healthy_gpus.append(gpu_idx)
            table.add_row(f"GPU {gpu_idx}", mem_str, "✅ Healthy", "Passed all tests")

        except Exception as e:
            error_msg = str(e)
            if "busy or unavailable" in error_msg:
                short_msg = "Device busy or unavailable"
            elif "out of memory" in error_msg:
                short_msg = "Insufficient memory"
            else:
                short_msg = error_msg[:50] + "..." if len(error_msg) > 50 else error_msg

            unhealthy_gpus[gpu_idx] = short_msg
            table.add_row(f"GPU {gpu_idx}", mem_str, "❌ Unhealthy", short_msg)

        # Small delay between GPU checks
        time.sleep(0.1)

    console.print(table)

    return {
        "healthy_gpus": healthy_gpus,
        "unhealthy_gpus": unhealthy_gpus,
        "all_healthy": len(unhealthy_gpus) == 0,
        "memory_issues": memory_issues,
    }


def calculate_optimal_gpu_allocation(gpu_info: dict[str, Any], config: Config) -> dict[str, Any]:
    """Calculate optimal GPU allocation for DDP GRPO training.

    Key insight: In GRPO, we want to process groups in parallel.
    Optimal case: num_gpus = num_groups (each GPU processes 1 group).
    """
    devices = gpu_info["devices"]
    available_gpus = [device["index"] for device in devices]

    # Need at least 2 GPUs (1 for training, 1 for vLLM)
    if len(available_gpus) < 2:
        return {"use_ddp": False, "reason": "Need at least 2 GPUs"}

    # Reserve last GPU for vLLM
    vllm_gpu = available_gpus[-1]
    training_gpus = available_gpus[:-1]

    # Calculate number of groups
    batch_size = config.training.batch_size
    group_size = config.training.group_size
    num_groups = batch_size // group_size

    if num_groups == 0:
        num_groups = 1

    # Optimal: Use exactly num_groups GPUs (each processes 1 group in parallel)
    # But cap at available training GPUs
    optimal_gpu_count = min(len(training_gpus), num_groups)

    # Only use DDP if we have more than 1 group and more than 1 GPU
    use_ddp = optimal_gpu_count > 1 and num_groups > 1

    if not use_ddp:
        # Single GPU training
        return {
            "use_ddp": False,
            "reason": f"Single GPU sufficient for {num_groups} group(s)",
            "training_gpus": [training_gpus[0]],
            "vllm_gpu": vllm_gpu,
            "num_groups": num_groups,
        }

    # Use optimal number of GPUs for DDP
    training_gpus = training_gpus[:optimal_gpu_count]

    return {
        "use_ddp": True,
        "training_gpus": training_gpus,
        "vllm_gpu": vllm_gpu,
        "num_groups": num_groups,
        "groups_per_gpu": num_groups / len(training_gpus),
        "parallel_efficiency": min(
            1.0, num_groups / len(training_gpus)
        ),  # 1.0 = perfect load balance
    }


def adjust_config_for_ddp(config: Config, num_gpus: int) -> Config:
    """Adjust configuration for optimal DDP performance.

    Scaling rule:
    - For 1 GPU: batch_size = 2 * group_size
    - For N GPUs (N > 1): batch_size = N * group_size

    This ensures each GPU processes exactly 1 group in parallel for optimal performance.
    """
    group_size = config.training.group_size

    # Apply scaling rule
    if num_gpus == 1:
        # Special case: 2 groups for single GPU
        config.training.batch_size = 2 * group_size
    else:
        # Multi-GPU: each GPU processes 1 group
        config.training.batch_size = num_gpus * group_size

    # Update max_parallel_episodes to match
    config.actor.max_parallel_episodes = config.training.batch_size

    # Log the adjustment
    from rich.console import Console

    console = Console()
    console.print(
        f"\n[cyan]📊 Adjusted batch_size to {config.training.batch_size} ({config.training.batch_size // group_size} groups)[/cyan]"  # noqa: E501
    )
    console.print(
        f"[cyan] Each of the {num_gpus} GPU(s) will process {config.training.batch_size // group_size // num_gpus} group(s) in parallel[/cyan]"  # noqa: E501
    )

    return config


def kill_high_memory_processes(memory_threshold: float = 70.0) -> int:
    """Kill all GPU processes using more than threshold% memory.

    Returns:
        Number of processes killed
    """
    from rich.console import Console

    console = Console()

    memory_info = get_gpu_memory_info()
    killed_count = 0

    for gpu_idx, info in memory_info.items():
        if info["used_pct"] > memory_threshold:
            for proc in info.get("processes", []):
                pid = proc["pid"]
                try:
                    # Try graceful termination first
                    subprocess.run(["kill", "-TERM", str(pid)], check=False, capture_output=True)  # noqa: S603, S607
                    killed_count += 1
                    console.print(
                        f"[yellow]Terminating PID {pid} on GPU {gpu_idx} ({proc['memory_mb'] / 1024:.1f} GB)[/yellow]"  # noqa: E501
                    )
                except Exception as e:
                    console.print(f"[red]Failed to kill PID {pid}: {e}[/red]")

    if killed_count > 0:
        console.print(f"\n[yellow]Sent termination signal to {killed_count} processes...[/yellow]")
        time.sleep(3)

        # Force kill any remaining
        for info in memory_info.values():
            for proc in info.get("processes", []):
                pid = proc["pid"]
                try:
                    # Check if still running
                    subprocess.run(  # noqa: S603
                        ["kill", "-0", str(pid)],  # noqa: S607
                        check=True,
                        capture_output=True,
                    )
                    # If no error, process is still running, force kill
                    subprocess.run(["kill", "-KILL", str(pid)], check=False)  # noqa: S603, S607
                    console.print(f"[red]Force killed PID {pid}[/red]")
                except Exception:
                    hud_console.error(f"Failed to kill PID {pid}")

    return killed_count
```
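A sketch of how the DDP helpers above might be chained. It assumes `Config.from_dict` accepts the same partial nested dict used in `hud/cli/rl/config.py`, and the batch/group sizes are placeholders rather than recommended values.

```python
# Illustrative chaining of the DDP helpers above; sizes are placeholders and the
# partial-dict Config construction mirrors the pattern in hud/cli/rl/config.py.
from hud.cli.rl.gpu import detect_cuda_devices
from hud.cli.rl.gpu_utils import (
    adjust_config_for_ddp,
    calculate_optimal_gpu_allocation,
    health_check_gpus,
)
from hud.rl.config import Config

gpu_info = detect_cuda_devices()
if gpu_info["available"]:
    config = Config.from_dict({"training": {"batch_size": 16, "group_size": 4}})
    allocation = calculate_optimal_gpu_allocation(gpu_info, config)

    if allocation.get("use_ddp"):
        health = health_check_gpus(allocation["training_gpus"])
        if health["all_healthy"]:
            config = adjust_config_for_ddp(config, num_gpus=len(allocation["training_gpus"]))
```

When the health check flags GPUs stuck above the 70% memory threshold, `kill_high_memory_processes` is the escape hatch that terminates the offending processes before retrying.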