hud-python 0.4.28-py3-none-any.whl → 0.4.29-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic.

Files changed (75)
  1. hud/__init__.py +2 -1
  2. hud/agents/base.py +73 -45
  3. hud/agents/claude.py +8 -4
  4. hud/agents/openai_chat_generic.py +65 -40
  5. hud/agents/tests/test_base.py +0 -4
  6. hud/agents/tests/test_openai.py +1 -1
  7. hud/cli/__init__.py +182 -52
  8. hud/cli/dev.py +8 -9
  9. hud/cli/eval.py +317 -119
  10. hud/cli/flows/__init__.py +0 -0
  11. hud/cli/flows/tasks.py +0 -0
  12. hud/cli/get.py +160 -0
  13. hud/cli/rl/__init__.py +563 -71
  14. hud/cli/rl/config.py +94 -0
  15. hud/cli/rl/display.py +133 -0
  16. hud/cli/rl/gpu.py +63 -0
  17. hud/cli/rl/gpu_utils.py +318 -0
  18. hud/cli/rl/presets.py +96 -0
  19. hud/cli/rl/remote_runner.py +348 -0
  20. hud/cli/rl/rl_api.py +150 -0
  21. hud/cli/rl/vllm.py +177 -0
  22. hud/cli/tests/test_analyze_metadata.py +0 -1
  23. hud/cli/utils/tasks.py +26 -0
  24. hud/clients/base.py +21 -23
  25. hud/clients/mcp_use.py +36 -44
  26. hud/clients/tests/test_mcp_use_retry.py +10 -10
  27. hud/datasets/__init__.py +4 -3
  28. hud/datasets/{execution/parallel.py → parallel.py} +1 -1
  29. hud/datasets/{execution/runner.py → runner.py} +1 -1
  30. hud/datasets/utils.py +1 -1
  31. hud/native/tests/test_native_init.py +1 -1
  32. hud/otel/config.py +1 -1
  33. hud/otel/instrumentation.py +35 -0
  34. hud/rl/README.md +31 -0
  35. hud/rl/__init__.py +1 -0
  36. hud/rl/actor.py +174 -0
  37. hud/rl/buffer.py +371 -0
  38. hud/rl/chat_template.jinja +101 -0
  39. hud/rl/config.py +184 -0
  40. hud/rl/distributed.py +95 -0
  41. hud/rl/learner.py +586 -0
  42. hud/rl/tests/__init__.py +1 -0
  43. hud/rl/tests/test_learner.py +171 -0
  44. hud/rl/train.py +354 -0
  45. hud/rl/types.py +101 -0
  46. hud/rl/utils/start_vllm_server.sh +30 -0
  47. hud/rl/utils.py +524 -0
  48. hud/rl/vllm_adapter.py +125 -0
  49. hud/settings.py +6 -0
  50. hud/telemetry/__init__.py +2 -1
  51. hud/telemetry/job.py +46 -3
  52. hud/telemetry/tests/test_trace.py +3 -3
  53. hud/telemetry/trace.py +85 -13
  54. hud/tools/tests/test_computer.py +3 -3
  55. hud/tools/tests/test_computer_actions.py +1 -1
  56. hud/types.py +123 -2
  57. hud/utils/group_eval.py +223 -0
  58. hud/utils/hud_console.py +113 -13
  59. hud/utils/tasks.py +119 -0
  60. hud/utils/tests/test_version.py +1 -1
  61. hud/version.py +1 -1
  62. {hud_python-0.4.28.dist-info → hud_python-0.4.29.dist-info}/METADATA +20 -2
  63. {hud_python-0.4.28.dist-info → hud_python-0.4.29.dist-info}/RECORD +66 -46
  64. hud/cli/hf.py +0 -406
  65. hud/cli/rl/README.md +0 -243
  66. hud/cli/rl/init.py +0 -370
  67. hud/cli/rl/pod.py +0 -501
  68. hud/cli/rl/ssh.py +0 -322
  69. hud/cli/rl/train.py +0 -562
  70. hud/cli/rl/utils.py +0 -165
  71. hud/datasets/execution/__init__.py +0 -13
  72. hud/datasets/task.py +0 -116
  73. {hud_python-0.4.28.dist-info → hud_python-0.4.29.dist-info}/WHEEL +0 -0
  74. {hud_python-0.4.28.dist-info → hud_python-0.4.29.dist-info}/entry_points.txt +0 -0
  75. {hud_python-0.4.28.dist-info → hud_python-0.4.29.dist-info}/licenses/LICENSE +0 -0
hud/cli/rl/config.py ADDED
@@ -0,0 +1,94 @@
+"""Configuration generation and management for RL training."""
+
+from __future__ import annotations
+
+import json
+from typing import TYPE_CHECKING, Any
+
+from rich.console import Console
+
+from hud.rl.config import Config, validate_vl_model
+from hud.utils.hud_console import hud_console
+
+from .display import display_preset_table
+from .presets import estimate_memory_usage
+
+if TYPE_CHECKING:
+    from pathlib import Path
+console = Console()
+
+
+def generate_config_interactive(
+    model_name: str,
+    presets: list[dict[str, Any]],
+) -> tuple[Config, float]:
+    """Generate RL training configuration interactively."""
+    # Validate model is a VL model
+    validate_vl_model(model_name)
+
+    # Display preset options
+    display_preset_table(presets, 80.0)  # Assuming A100 80GB
+
+    # Let user select preset
+    preset_choice = hud_console.select(
+        "Select a training configuration preset:",
+        choices=[{"name": p["name"], "value": i} for i, p in enumerate(presets)],
+        default=1 if len(presets) > 1 else 0,  # Default to "Balanced" if available
+    )
+
+    selected_preset = presets[preset_choice]  # type: ignore
+
+    # Use preset values directly
+    max_steps_per_episode = selected_preset["max_steps_per_episode"]
+
+    # Calculate memory estimate
+    max_pixels = 256 * 28 * 28
+    estimated_memory = estimate_memory_usage(
+        selected_preset["mini_batch_size"],
+        max_steps_per_episode,
+        selected_preset["max_new_tokens"],
+        max_pixels,
+    )
+
+    config_adds = {
+        "actor": {
+            "max_new_tokens": selected_preset["max_new_tokens"],
+            "max_parallel_episodes": selected_preset["batch_size"],
+            "max_steps_per_episode": selected_preset["max_steps_per_episode"],
+            "force_tool_choice": True,
+        },
+        "training": {
+            "mini_batch_size": selected_preset["mini_batch_size"],
+            "group_size": selected_preset["group_size"],
+            "batch_size": selected_preset["batch_size"],
+            "lr": selected_preset["lr"],
+            "epochs": selected_preset["epochs"],
+        },
+        "verbose": True,
+    }
+
+    # Create config
+    config = Config.from_dict(config_adds)
+
+    return config, estimated_memory
+
+
+def save_config(config: Config, path: Path) -> None:
+    """Save configuration to a JSON file."""
+    config_dict = config.to_dict()
+
+    with open(path, "w") as f:
+        json.dump(config_dict, f, indent=2)
+        f.write("\n")  # Add newline at end of file
+
+    if not path.name.startswith("."):  # Don't show message for temp files
+        console.print(f"[green]✅ Configuration saved to {path}[/green]")
+
+
+def load_config(path: Path) -> Config:
+    """Load configuration from a JSON file."""
+    with open(path) as f:
+        data = json.load(f)
+
+    # Use Config.from_dict which handles missing fields gracefully
+    return Config.from_dict(data)
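For orientation, a minimal usage sketch of the helpers above. The preset dict and model name are illustrative assumptions, not values shipped in this release; the real presets come from hud/cli/rl/presets.py, and the preset keys match what generate_config_interactive reads.

from pathlib import Path

from hud.cli.rl.config import generate_config_interactive, load_config, save_config

# Hypothetical preset for illustration only.
presets = [{
    "name": "Balanced",
    "max_steps_per_episode": 10,
    "mini_batch_size": 2,
    "group_size": 8,
    "batch_size": 16,
    "max_new_tokens": 512,
    "lr": 1e-5,
    "epochs": 1,
}]

config, estimated_gb = generate_config_interactive("Qwen/Qwen2.5-VL-3B-Instruct", presets)
save_config(config, Path("rl_config.json"))   # pretty-printed JSON with a trailing newline
config = load_config(Path("rl_config.json"))  # round-trips through Config.from_dict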
hud/cli/rl/display.py ADDED
@@ -0,0 +1,133 @@
+"""Display utilities for RL training configuration."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+from rich.console import Console
+from rich.table import Table
+
+if TYPE_CHECKING:
+    from hud.rl.config import Config
+
+console = Console()
+
+
+def display_gpu_info(gpu_info: dict[str, Any]) -> None:
+    """Display GPU information in a table."""
+    if not gpu_info["available"]:
+        console.print(f"[red]❌ CUDA not available: {gpu_info.get('error', 'Unknown error')}[/red]")
+        return
+
+    gpu_table = Table(title="🖥️ Available GPUs", title_style="bold cyan")
+    gpu_table.add_column("Index", style="yellow")
+    gpu_table.add_column("Name", style="cyan")
+    gpu_table.add_column("Memory", style="green")
+
+    for device in gpu_info["devices"]:
+        gpu_table.add_row(f"GPU {device['index']}", device["name"], f"{device['memory_gb']:.1f} GB")
+
+    console.print(gpu_table)
+
+
+def display_preset_table(presets: list[dict[str, Any]], gpu_memory_gb: float) -> None:
+    """Display training configuration presets in a table."""
+    preset_table = Table(title="📊 Training Configuration Presets", title_style="bold cyan")
+    preset_table.add_column("Option", style="yellow")
+    preset_table.add_column("Steps", style="cyan")
+    preset_table.add_column("Mini-batch", style="cyan")
+    preset_table.add_column("Group", style="cyan")
+    preset_table.add_column("Episodes/batch", style="cyan")
+
+    # Add time columns for A100
+    if gpu_memory_gb >= 40:
+        preset_table.add_column("Tasks/hour", style="green")
+        preset_table.add_column("Steps/hour", style="green")
+
+    for i, preset in enumerate(presets):
+        row = [
+            f"{i + 1}. {preset['name']}",
+            str(preset["max_steps_per_episode"]),
+            str(preset["mini_batch_size"]),
+            str(preset["group_size"]),
+            str(preset["batch_size"]),
+        ]
+        if "tasks_per_hour" in preset:
+            row.extend(
+                [
+                    str(preset["tasks_per_hour"]),
+                    str(preset["steps_per_hour"]),
+                ]
+            )
+        preset_table.add_row(*row)
+
+    console.print("\n")
+    console.print(preset_table)
+    console.print("\n")
+
+
+def display_config_summary(
+    config: Config, tasks_count: int, gpu_info: dict[str, Any], estimated_memory: float
+) -> None:
+    """Display comprehensive configuration summary for review."""
+    console.print("\n[bold cyan]📋 RL Training Configuration Summary[/bold cyan]\n")
+
+    # GPU Information
+    if gpu_info["available"]:
+        gpu_table = Table(title="🖥️ GPU Information", title_style="bold yellow")
+        gpu_table.add_column("Property", style="cyan")
+        gpu_table.add_column("Value", style="green")
+
+        device = gpu_info["devices"][0]  # Primary GPU
+        gpu_table.add_row("GPU 0", device["name"])
+        gpu_table.add_row("Memory", f"{device['memory_gb']:.1f} GB")
+        gpu_table.add_row("Compute Capability", "8.0")  # Assuming A100
+
+        console.print(gpu_table)
+
+    # Model Configuration
+    model_table = Table(title="🤖 Model Configuration", title_style="bold yellow")
+    model_table.add_column("Parameter", style="cyan")
+    model_table.add_column("Value", style="green")
+
+    model_table.add_row("Base Model", config.model.base_model)
+    model_table.add_row("LoRA Rank (r)", str(config.model.lora_r))
+    model_table.add_row("LoRA Alpha", str(config.model.lora_alpha))
+    model_table.add_row("LoRA Dropout", str(config.model.lora_dropout))
+
+    console.print(model_table)
+
+    # Training Configuration
+    training_table = Table(title="🎯 Training Configuration", title_style="bold yellow")
+    training_table.add_column("Parameter", style="cyan")
+    training_table.add_column("Value", style="green")
+
+    training_table.add_row("Tasks Count", str(tasks_count))
+    training_table.add_row("Learning Rate", f"{config.training.lr:.1e}")
+    training_table.add_row("Epochs", str(config.training.epochs))
+    training_table.add_row("Mini Batch Size", str(config.training.mini_batch_size))
+    training_table.add_row("Batch Size", str(config.training.batch_size))
+    training_table.add_row("Group Size", str(config.training.group_size))
+    training_table.add_row("Training Steps", str(config.training.training_steps))
+    training_table.add_row("Max Parallel Episodes", str(config.actor.max_parallel_episodes))
+
+    console.print(training_table)
+
+    # Memory Estimation
+    memory_table = Table(title="💾 Memory Estimation", title_style="bold yellow")
+    memory_table.add_column("Metric", style="cyan")
+    memory_table.add_column("Value", style="green")
+
+    memory_table.add_row("Estimated GPU Memory", f"{estimated_memory:.1f} GB")
+    if gpu_info["available"]:
+        available_memory = gpu_info["devices"][0]["memory_gb"]
+        memory_table.add_row("Available GPU Memory", f"{available_memory:.1f} GB")
+
+        if estimated_memory > available_memory:
+            status = "[red]⚠️ May exceed available memory[/red]"
+        else:
+            status = "[green]✅ Within memory limits[/green]"
+        memory_table.add_row("Status", status)
+
+    console.print(memory_table)
+    console.print("\n")
hud/cli/rl/gpu.py ADDED
@@ -0,0 +1,63 @@
+"""GPU detection and validation utilities for RL training."""
+
+from __future__ import annotations
+
+import subprocess
+from typing import Any
+
+
+def detect_cuda_devices() -> dict[str, Any]:
+    """Detect available CUDA devices and their properties."""
+    try:
+        # Check if CUDA is available
+        result = subprocess.run(
+            ["nvidia-smi", "--query-gpu=index,name,memory.total", "--format=csv,noheader,nounits"],  # noqa: S607
+            capture_output=True,
+            text=True,
+            check=True,
+        )
+
+        if result.returncode != 0:
+            return {"available": False, "error": "nvidia-smi command failed"}
+
+        devices = []
+        for line in result.stdout.strip().split("\n"):
+            parts = line.split(", ")
+            if len(parts) >= 3:
+                devices.append(
+                    {
+                        "index": int(parts[0]),
+                        "name": parts[1],
+                        "memory_gb": float(parts[2]) / 1024,  # Convert MB to GB
+                    }
+                )
+
+        return {"available": True, "devices": devices}
+
+    except FileNotFoundError:
+        return {
+            "available": False,
+            "error": "nvidia-smi not found - CUDA drivers may not be installed",
+        }
+    except Exception as e:
+        return {"available": False, "error": str(e)}
+
+
+def select_gpu_for_vllm(devices: list[dict[str, Any]]) -> int:
+    """Select the best GPU for vLLM server (typically GPU 1 if available)."""
+    if len(devices) > 1:
+        # Prefer GPU 1 for vLLM to leave GPU 0 for other processes
+        return 1
+    return 0
+
+
+def validate_gpu_memory(gpu_memory_gb: float, model_size: str = "3B") -> bool:
+    """Validate if GPU has sufficient memory for the model."""
+    min_memory_requirements = {
+        "3B": 12.0,  # Minimum for Qwen 2.5 VL 3B
+        "7B": 24.0,
+        "14B": 40.0,
+    }
+
+    min_required = min_memory_requirements.get(model_size, 12.0)
+    return gpu_memory_gb >= min_required
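A minimal sketch of how these detection helpers might be combined. The model size string is an assumption; gpu.py only defines thresholds for "3B", "7B", and "14B".

from hud.cli.rl.gpu import detect_cuda_devices, select_gpu_for_vllm, validate_gpu_memory

info = detect_cuda_devices()
if not info["available"]:
    raise SystemExit(info["error"])

vllm_idx = select_gpu_for_vllm(info["devices"])       # prefers GPU 1 when more than one GPU exists
mem_gb = info["devices"][vllm_idx]["memory_gb"]
if not validate_gpu_memory(mem_gb, model_size="3B"):  # the 3B threshold is 12 GB in this module
    raise SystemExit(f"GPU {vllm_idx} has only {mem_gb:.1f} GB")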
hud/cli/rl/gpu_utils.py ADDED
@@ -0,0 +1,318 @@
+"""GPU utilities for DDP training."""
+
+from __future__ import annotations
+
+import logging
+import subprocess
+import time
+from typing import TYPE_CHECKING, Any
+
+import torch
+
+from hud.utils.hud_console import HUDConsole
+
+if TYPE_CHECKING:
+    from hud.rl.config import Config
+hud_console = HUDConsole(logging.getLogger(__name__))
+
+
+def get_gpu_memory_info() -> dict[int, dict[str, Any]]:
+    """Get memory usage information for all GPUs."""
+
+    gpu_memory = {}
+    try:
+        # Get memory info for all GPUs
+        cmd = [
+            "nvidia-smi",
+            "--query-gpu=index,memory.used,memory.total,memory.free",
+            "--format=csv,noheader,nounits",
+        ]
+        result = subprocess.run(cmd, capture_output=True, text=True, check=True)  # noqa: S603
+
+        for line in result.stdout.strip().split("\n"):
+            if not line:
+                continue
+            parts = line.split(", ")
+            if len(parts) >= 4:
+                gpu_idx = int(parts[0])
+                memory_used = float(parts[1])
+                memory_total = float(parts[2])
+                memory_free = float(parts[3])
+                gpu_memory[gpu_idx] = {
+                    "used_mb": memory_used,
+                    "total_mb": memory_total,
+                    "free_mb": memory_free,
+                    "used_pct": (memory_used / memory_total) * 100,
+                }
+
+        # Get process information per GPU
+        for gpu_idx in gpu_memory:  # noqa: PLC0206
+            cmd = [
+                "nvidia-smi",
+                "-i",
+                str(gpu_idx),
+                "--query-compute-apps=pid,used_memory",
+                "--format=csv,noheader,nounits",
+            ]
+            try:
+                result = subprocess.run(cmd, capture_output=True, text=True, check=True)  # noqa: S603
+                processes = []
+                for line in result.stdout.strip().split("\n"):
+                    if not line:
+                        continue
+                    parts = line.split(", ")
+                    if len(parts) >= 2:
+                        pid = int(parts[0])
+                        memory_mb = float(parts[1])
+                        processes.append({"pid": pid, "memory_mb": memory_mb})
+                gpu_memory[gpu_idx]["processes"] = processes
+            except Exception as e:
+                hud_console.error(f"Failed to get process info for GPU {gpu_idx}: {e}")
+                gpu_memory[gpu_idx]["processes"] = []
+
+    except Exception as e:
+        hud_console.error(f"Failed to get GPU memory info {e}")
+        return {}
+
+    return gpu_memory
+
+
+def health_check_gpus(gpu_indices: list[int]) -> dict[str, Any]:
+    """Perform health check on specified GPUs including memory status.
+
+    Returns:
+        Dict with:
+        - healthy_gpus: List of healthy GPU indices
+        - unhealthy_gpus: Dict of unhealthy GPU index -> error message
+        - all_healthy: Boolean indicating if all GPUs are healthy
+        - memory_issues: Boolean indicating if there are memory issues
+    """
+    from rich.console import Console
+    from rich.table import Table
+
+    console = Console()
+
+    console.print("\n[bold cyan]🏥 GPU Health Check[/bold cyan]")
+
+    # First get memory info
+    memory_info = get_gpu_memory_info()
+
+    healthy_gpus = []
+    unhealthy_gpus = {}
+    memory_issues = []
+
+    # Create a table for results
+    table = Table(title="GPU Health Status")
+    table.add_column("GPU", style="cyan")
+    table.add_column("Memory Usage", style="yellow")
+    table.add_column("Status", style="green")
+    table.add_column("Details", style="yellow")
+
+    for gpu_idx in gpu_indices:
+        # Memory info
+        mem_str = "Unknown"
+        if gpu_idx in memory_info:
+            mem = memory_info[gpu_idx]
+            used_gb = mem["used_mb"] / 1024
+            total_gb = mem["total_mb"] / 1024
+            mem_str = f"{used_gb:.1f}/{total_gb:.1f} GB ({mem['used_pct']:.0f}%)"
+
+            # Check for high memory usage
+            if mem["used_pct"] > 70:
+                memory_issues.append(gpu_idx)
+                proc_info = f" ({len(mem['processes'])} processes)" if mem["processes"] else ""
+                unhealthy_gpus[gpu_idx] = f"High memory usage{proc_info}"
+                table.add_row(
+                    f"GPU {gpu_idx}", mem_str, "❌ Unhealthy", f"High memory usage{proc_info}"
+                )
+                continue
+
+        # If no severe memory issue, do accessibility test
+        try:
+            # Try to allocate a small tensor on the GPU
+            torch.cuda.set_device(gpu_idx)
+            device = torch.device(f"cuda:{gpu_idx}")
+
+            # Test basic allocation
+            test_tensor = torch.zeros(100, 100, device=device)
+
+            # Test computation
+            result = torch.matmul(test_tensor, test_tensor)
+
+            # Force synchronization
+            torch.cuda.synchronize(device)
+
+            # Clean up
+            del test_tensor, result
+            torch.cuda.empty_cache()
+
+            healthy_gpus.append(gpu_idx)
+            table.add_row(f"GPU {gpu_idx}", mem_str, "✅ Healthy", "Passed all tests")
+
+        except Exception as e:
+            error_msg = str(e)
+            if "busy or unavailable" in error_msg:
+                short_msg = "Device busy or unavailable"
+            elif "out of memory" in error_msg:
+                short_msg = "Insufficient memory"
+            else:
+                short_msg = error_msg[:50] + "..." if len(error_msg) > 50 else error_msg
+
+            unhealthy_gpus[gpu_idx] = short_msg
+            table.add_row(f"GPU {gpu_idx}", mem_str, "❌ Unhealthy", short_msg)
+
+        # Small delay between GPU checks
+        time.sleep(0.1)
+
+    console.print(table)
+
+    return {
+        "healthy_gpus": healthy_gpus,
+        "unhealthy_gpus": unhealthy_gpus,
+        "all_healthy": len(unhealthy_gpus) == 0,
+        "memory_issues": memory_issues,
+    }
+
+
+def calculate_optimal_gpu_allocation(gpu_info: dict[str, Any], config: Config) -> dict[str, Any]:
+    """Calculate optimal GPU allocation for DDP GRPO training.
+
+    Key insight: In GRPO, we want to process groups in parallel.
+    Optimal case: num_gpus = num_groups (each GPU processes 1 group).
+    """
+    devices = gpu_info["devices"]
+    available_gpus = [device["index"] for device in devices]
+
+    # Need at least 2 GPUs (1 for training, 1 for vLLM)
+    if len(available_gpus) < 2:
+        return {"use_ddp": False, "reason": "Need at least 2 GPUs"}
+
+    # Reserve last GPU for vLLM
+    vllm_gpu = available_gpus[-1]
+    training_gpus = available_gpus[:-1]
+
+    # Calculate number of groups
+    batch_size = config.training.batch_size
+    group_size = config.training.group_size
+    num_groups = batch_size // group_size
+
+    if num_groups == 0:
+        num_groups = 1
+
+    # Optimal: Use exactly num_groups GPUs (each processes 1 group in parallel)
+    # But cap at available training GPUs
+    optimal_gpu_count = min(len(training_gpus), num_groups)
+
+    # Only use DDP if we have more than 1 group and more than 1 GPU
+    use_ddp = optimal_gpu_count > 1 and num_groups > 1
+
+    if not use_ddp:
+        # Single GPU training
+        return {
+            "use_ddp": False,
+            "reason": f"Single GPU sufficient for {num_groups} group(s)",
+            "training_gpus": [training_gpus[0]],
+            "vllm_gpu": vllm_gpu,
+            "num_groups": num_groups,
+        }
+
+    # Use optimal number of GPUs for DDP
+    training_gpus = training_gpus[:optimal_gpu_count]
+
+    return {
+        "use_ddp": True,
+        "training_gpus": training_gpus,
+        "vllm_gpu": vllm_gpu,
+        "num_groups": num_groups,
+        "groups_per_gpu": num_groups / len(training_gpus),
+        "parallel_efficiency": min(
+            1.0, num_groups / len(training_gpus)
+        ),  # 1.0 = perfect load balance
+    }
+
+
+def adjust_config_for_ddp(config: Config, num_gpus: int) -> Config:
+    """Adjust configuration for optimal DDP performance.
+
+    Scaling rule:
+    - For 1 GPU: batch_size = 2 * group_size
+    - For N GPUs (N > 1): batch_size = N * group_size
+
+    This ensures each GPU processes exactly 1 group in parallel for optimal performance.
+    """
+    group_size = config.training.group_size
+
+    # Apply scaling rule
+    if num_gpus == 1:
+        # Special case: 2 groups for single GPU
+        config.training.batch_size = 2 * group_size
+    else:
+        # Multi-GPU: each GPU processes 1 group
+        config.training.batch_size = num_gpus * group_size
+
+    # Update max_parallel_episodes to match
+    config.actor.max_parallel_episodes = config.training.batch_size
+
+    # Log the adjustment
+    from rich.console import Console
+
+    console = Console()
+    console.print(
+        f"\n[cyan]📊 Adjusted batch_size to {config.training.batch_size} ({config.training.batch_size // group_size} groups)[/cyan]"  # noqa: E501
+    )
+    console.print(
+        f"[cyan]   Each of the {num_gpus} GPU(s) will process {config.training.batch_size // group_size // num_gpus} group(s) in parallel[/cyan]"  # noqa: E501
+    )
+
+    return config
+
+
+def kill_high_memory_processes(memory_threshold: float = 70.0) -> int:
+    """Kill all GPU processes using more than threshold% memory.
+
+    Returns:
+        Number of processes killed
+    """
+    from rich.console import Console
+
+    console = Console()
+
+    memory_info = get_gpu_memory_info()
+    killed_count = 0
+
+    for gpu_idx, info in memory_info.items():
+        if info["used_pct"] > memory_threshold:
+            for proc in info.get("processes", []):
+                pid = proc["pid"]
+                try:
+                    # Try graceful termination first
+                    subprocess.run(["kill", "-TERM", str(pid)], check=False, capture_output=True)  # noqa: S603, S607
+                    killed_count += 1
+                    console.print(
+                        f"[yellow]Terminating PID {pid} on GPU {gpu_idx} ({proc['memory_mb'] / 1024:.1f} GB)[/yellow]"  # noqa: E501
+                    )
+                except Exception as e:
+                    console.print(f"[red]Failed to kill PID {pid}: {e}[/red]")
+
+    if killed_count > 0:
+        console.print(f"\n[yellow]Sent termination signal to {killed_count} processes...[/yellow]")
+        time.sleep(3)
+
+        # Force kill any remaining
+        for info in memory_info.values():
+            for proc in info.get("processes", []):
+                pid = proc["pid"]
+                try:
+                    # Check if still running
+                    subprocess.run(  # noqa: S603
+                        ["kill", "-0", str(pid)],  # noqa: S607
+                        check=True,
+                        capture_output=True,
+                    )
+                    # If no error, process is still running, force kill
+                    subprocess.run(["kill", "-KILL", str(pid)], check=False)  # noqa: S603, S607
+                    console.print(f"[red]Force killed PID {pid}[/red]")
+                except Exception:
+                    hud_console.error(f"Failed to kill PID {pid}")
+
+    return killed_count
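To make the allocation and scaling rules above concrete, a worked sketch with assumed numbers (4 visible GPUs, group_size 8); the nested dict passed to Config.from_dict follows the shape used in config.py.

from hud.cli.rl.gpu_utils import adjust_config_for_ddp, calculate_optimal_gpu_allocation
from hud.rl.config import Config

config = Config.from_dict({"training": {"batch_size": 24, "group_size": 8}})
gpu_info = {
    "available": True,
    "devices": [{"index": i, "name": "NVIDIA A100 80GB", "memory_gb": 80.0} for i in range(4)],
}

alloc = calculate_optimal_gpu_allocation(gpu_info, config)
# 24 // 8 = 3 groups; GPU 3 is reserved for vLLM, so GPUs 0-2 each take one group:
# {"use_ddp": True, "training_gpus": [0, 1, 2], "vllm_gpu": 3, "num_groups": 3, ...}

config = adjust_config_for_ddp(config, num_gpus=len(alloc["training_gpus"]))
# With 3 training GPUs: batch_size = 3 * 8 = 24, and max_parallel_episodes is set to match.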