hud-python 0.4.27__py3-none-any.whl → 0.4.29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic.

Files changed (76)
  1. hud/__init__.py +2 -1
  2. hud/agents/base.py +73 -45
  3. hud/agents/claude.py +8 -4
  4. hud/agents/openai_chat_generic.py +65 -40
  5. hud/agents/tests/test_base.py +0 -4
  6. hud/agents/tests/test_openai.py +1 -1
  7. hud/cli/__init__.py +182 -52
  8. hud/cli/dev.py +8 -9
  9. hud/cli/eval.py +317 -119
  10. hud/cli/flows/__init__.py +0 -0
  11. hud/cli/flows/tasks.py +0 -0
  12. hud/cli/get.py +160 -0
  13. hud/cli/rl/__init__.py +563 -71
  14. hud/cli/rl/config.py +94 -0
  15. hud/cli/rl/display.py +133 -0
  16. hud/cli/rl/gpu.py +63 -0
  17. hud/cli/rl/gpu_utils.py +318 -0
  18. hud/cli/rl/presets.py +96 -0
  19. hud/cli/rl/remote_runner.py +348 -0
  20. hud/cli/rl/rl_api.py +150 -0
  21. hud/cli/rl/vllm.py +177 -0
  22. hud/cli/tests/test_analyze_metadata.py +0 -1
  23. hud/cli/utils/tasks.py +26 -0
  24. hud/clients/base.py +21 -23
  25. hud/clients/mcp_use.py +36 -44
  26. hud/clients/tests/test_mcp_use_retry.py +10 -10
  27. hud/datasets/__init__.py +4 -3
  28. hud/datasets/{execution/parallel.py → parallel.py} +1 -1
  29. hud/datasets/{execution/runner.py → runner.py} +1 -1
  30. hud/datasets/utils.py +1 -1
  31. hud/native/tests/test_native_init.py +1 -1
  32. hud/otel/config.py +1 -1
  33. hud/otel/instrumentation.py +35 -0
  34. hud/rl/README.md +31 -0
  35. hud/rl/__init__.py +1 -0
  36. hud/rl/actor.py +174 -0
  37. hud/rl/buffer.py +371 -0
  38. hud/rl/chat_template.jinja +101 -0
  39. hud/rl/config.py +184 -0
  40. hud/rl/distributed.py +95 -0
  41. hud/rl/learner.py +586 -0
  42. hud/rl/tests/__init__.py +1 -0
  43. hud/rl/tests/test_learner.py +171 -0
  44. hud/rl/train.py +354 -0
  45. hud/rl/types.py +101 -0
  46. hud/rl/utils/start_vllm_server.sh +30 -0
  47. hud/rl/utils.py +524 -0
  48. hud/rl/vllm_adapter.py +125 -0
  49. hud/settings.py +6 -0
  50. hud/telemetry/__init__.py +2 -1
  51. hud/telemetry/job.py +46 -3
  52. hud/telemetry/tests/test_trace.py +3 -3
  53. hud/telemetry/trace.py +85 -13
  54. hud/tools/computer/hud.py +4 -4
  55. hud/tools/tests/test_computer.py +3 -3
  56. hud/tools/tests/test_computer_actions.py +1 -1
  57. hud/types.py +123 -2
  58. hud/utils/group_eval.py +223 -0
  59. hud/utils/hud_console.py +113 -13
  60. hud/utils/tasks.py +119 -0
  61. hud/utils/tests/test_version.py +1 -1
  62. hud/version.py +1 -1
  63. {hud_python-0.4.27.dist-info → hud_python-0.4.29.dist-info}/METADATA +20 -2
  64. {hud_python-0.4.27.dist-info → hud_python-0.4.29.dist-info}/RECORD +67 -47
  65. hud/cli/hf.py +0 -406
  66. hud/cli/rl/README.md +0 -243
  67. hud/cli/rl/init.py +0 -370
  68. hud/cli/rl/pod.py +0 -501
  69. hud/cli/rl/ssh.py +0 -322
  70. hud/cli/rl/train.py +0 -562
  71. hud/cli/rl/utils.py +0 -165
  72. hud/datasets/execution/__init__.py +0 -13
  73. hud/datasets/task.py +0 -116
  74. {hud_python-0.4.27.dist-info → hud_python-0.4.29.dist-info}/WHEEL +0 -0
  75. {hud_python-0.4.27.dist-info → hud_python-0.4.29.dist-info}/entry_points.txt +0 -0
  76. {hud_python-0.4.27.dist-info → hud_python-0.4.29.dist-info}/licenses/LICENSE +0 -0
hud/rl/tests/test_learner.py ADDED
@@ -0,0 +1,171 @@
+ from __future__ import annotations
+
+ import pytest
+ import torch
+
+ from hud.rl.config import Config
+ from hud.rl.learner import GRPOLearner
+ from hud.rl.types import TrainingSample
+
+
+ @pytest.fixture()
+ def learner_stub(monkeypatch):
+     cfg = Config()
+     # Speed up: tiny settings
+     cfg.training.epochs = 1
+     cfg.training.group_size = 1
+     cfg.training.mini_batch_size = 1
+     cfg.training.use_8bit_optimizer = False
+
+     # Stub _load_models to avoid heavy model init
+     def _stub_load_models(self):
+         class DummyPolicy(torch.nn.Module):
+             def __init__(self):
+                 super().__init__()
+                 self.w = torch.nn.Parameter(torch.zeros(1))
+
+         dummy_policy = DummyPolicy()
+         dummy_opt = torch.optim.SGD(dummy_policy.parameters(), lr=0.1)
+         return None, dummy_policy, None, dummy_opt
+
+     monkeypatch.setattr(GRPOLearner, "_load_models", _stub_load_models, raising=True)
+     return GRPOLearner(cfg)
+
+
+ def make_sample(
+     pol_logp_tok: torch.Tensor,
+     old_logp_tok: torch.Tensor,
+     ref_logp_tok: torch.Tensor,
+     advantage: float,
+ ):
+     # Minimal object with required attributes for compute_loss
+     # inputs only needed for metrics token count
+     Tm1 = pol_logp_tok.size(-1)
+     inputs = {"input_ids": torch.zeros(1, Tm1 + 1, dtype=torch.long)}
+     return TrainingSample(
+         inputs=inputs,
+         old_logprobs=old_logp_tok,
+         ref_logprobs=ref_logp_tok,
+         advantage=torch.tensor(advantage, dtype=torch.float32),
+     )
+
+
+ def patch_compute_logprobs(
+     monkeypatch, learner: GRPOLearner, pol_logp_tok: torch.Tensor, pol_entropy_tok: torch.Tensor
+ ):
+     # Return (pol_logp, pol_entropy) as expected by compute_loss
+     def _stub_compute_logprobs(self, model, inputs):
+         return pol_logp_tok.to(inputs["input_ids"].device), pol_entropy_tok.to(
+             inputs["input_ids"].device
+         )
+
+     monkeypatch.setattr(GRPOLearner, "compute_logprobs", _stub_compute_logprobs, raising=True)
+
+
+ def test_per_token_mean_vs_sum(monkeypatch, learner_stub: GRPOLearner):
+     # Setup
+     _, Tm1 = 1, 4
+     pol = torch.tensor([[-1.0, -1.0, -1.0, -1.0]], dtype=torch.float32)  # logp
+     old = torch.tensor([[-1.2, -0.8, -1.0, -1.1]], dtype=torch.float32)
+     ref = torch.tensor([[-1.0, -1.0, -1.0, -1.0]], dtype=torch.float32)
+     ent = torch.zeros_like(pol)
+     patch_compute_logprobs(monkeypatch, learner_stub, pol, ent)
+
+     # Common config
+     learner_stub.config.training.kl_beta = 0.0
+     learner_stub.config.training.entropy_beta = 0.0
+     learner_stub.config.training.top_eps = 0.2
+     learner_stub.config.training.bottom_eps = 0.1
+
+     sample = make_sample(pol, old, ref, advantage=1.0)
+
+     # token_agg=mean
+     learner_stub.config.training.ppo_mode = "per_token"
+     learner_stub.config.training.token_agg = "mean"
+     loss_mean = learner_stub.compute_loss(sample).item()
+
+     # token_agg=sum
+     learner_stub.config.training.token_agg = "sum"
+     loss_sum = learner_stub.compute_loss(sample).item()
+
+     # Expect sum ≈ mean * num_tokens
+     assert pytest.approx(loss_sum, rel=1e-5) == loss_mean * Tm1
+
+
+ def test_per_trace_vs_per_token(monkeypatch, learner_stub: GRPOLearner):
+     # Equal per-token deltas -> per_trace matches per_token(mean)
+     pol = torch.tensor([[-1.0, -1.0, -1.0]], dtype=torch.float32)
+     old = torch.tensor([[-1.2, -1.2, -1.2]], dtype=torch.float32)
+     ref = torch.tensor([[-1.1, -1.1, -1.1]], dtype=torch.float32)
+     ent = torch.zeros_like(pol)
+     patch_compute_logprobs(monkeypatch, learner_stub, pol, ent)
+
+     learner_stub.config.training.kl_beta = 0.0
+     learner_stub.config.training.entropy_beta = 0.0
+     learner_stub.config.training.top_eps = 0.2
+     learner_stub.config.training.bottom_eps = 0.1
+
+     sample = make_sample(pol, old, ref, advantage=1.0)
+
+     learner_stub.config.training.ppo_mode = "per_token"
+     learner_stub.config.training.token_agg = "mean"
+     ltok = learner_stub.compute_loss(sample).item()
+
+     learner_stub.config.training.ppo_mode = "per_trace"
+     ltraj = learner_stub.compute_loss(sample).item()
+
+     assert pytest.approx(ltraj, rel=1e-6) == ltok
+
+
+ def test_entropy_beta_effect(monkeypatch, learner_stub: GRPOLearner):
+     pol = torch.tensor([[-1.0, -1.1]], dtype=torch.float32)
+     old = torch.tensor([[-1.0, -1.1]], dtype=torch.float32)
+     ref = torch.tensor([[-1.0, -1.1]], dtype=torch.float32)
+     ent = torch.tensor([[0.5, 1.5]], dtype=torch.float32)
+     patch_compute_logprobs(monkeypatch, learner_stub, pol, ent)
+
+     # No policy/kl effect, only entropy
+     learner_stub.config.training.ppo_mode = "per_token"
+     learner_stub.config.training.token_agg = "mean"
+     learner_stub.config.training.kl_beta = 0.0
+
+     sample = make_sample(pol, old, ref, advantage=0.0)
+
+     learner_stub.config.training.entropy_beta = 0.0
+     l0 = learner_stub.compute_loss(sample).item()
+
+     learner_stub.config.training.entropy_beta = 2.0
+     l1 = learner_stub.compute_loss(sample).item()
+
+     # Mean entropy = (0.5+1.5)/2 = 1.0, scaled by beta=2.0 -> +2.0
+     assert pytest.approx(l1 - l0, rel=1e-6) == 2.0
+
+
+ def test_skip_update_when_zero_adv(monkeypatch, learner_stub: GRPOLearner):
+     # Patch prepare_groups to yield a single group with a minibatch-like object
+     class MiniBatch:
+         def __init__(self):
+             self.advantage = torch.zeros(1)
+
+         def to_device(self, device: torch.device) -> MiniBatch:
+             return self
+
+     def _stub_prepare_groups(self, samples: list[TrainingSample]) -> list[list[MiniBatch]]:
+         return [[MiniBatch(), MiniBatch()]]
+
+     monkeypatch.setattr(GRPOLearner, "prepare_groups", _stub_prepare_groups, raising=True)
+
+     # Count optimizer.step calls
+     steps = {"n": 0}
+     # orig_step = learner_stub.optimizer.step
+
+     def _count_step():
+         steps["n"] += 1
+
+     monkeypatch.setattr(learner_stub.optimizer, "step", _count_step, raising=False)
+
+     # Ensure dummy backward can touch a parameter
+     assert any(p.requires_grad for p in learner_stub.policy.parameters())
+
+     learner_stub.update([])
+     assert steps["n"] == 0
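For context on the assertions above: a minimal, self-contained sketch of the per-token clipped objective these tests exercise, assuming GRPOLearner.compute_loss follows the standard PPO clipping form with top_eps/bottom_eps bounds. The clipped_loss function below is illustrative and is not part of hud-python.

import torch


def clipped_loss(
    pol_logp: torch.Tensor,   # [1, T] new policy log-probs per token
    old_logp: torch.Tensor,   # [1, T] behaviour policy log-probs per token
    advantage: torch.Tensor,  # scalar advantage for the whole trace
    top_eps: float = 0.2,
    bottom_eps: float = 0.1,
    token_agg: str = "mean",
) -> torch.Tensor:
    ratio = torch.exp(pol_logp - old_logp)                         # importance ratio per token
    clipped = torch.clamp(ratio, 1.0 - bottom_eps, 1.0 + top_eps)  # asymmetric clipping
    per_token = -torch.minimum(ratio * advantage, clipped * advantage)
    # "mean" averages the per-token terms, "sum" adds them, so for a fixed set of terms
    # sum == mean * num_tokens, which is what test_per_token_mean_vs_sum checks.
    return per_token.mean() if token_agg == "mean" else per_token.sum()


if __name__ == "__main__":
    pol = torch.tensor([[-1.0, -1.0, -1.0, -1.0]])
    old = torch.tensor([[-1.2, -0.8, -1.0, -1.1]])
    adv = torch.tensor(1.0)
    loss_mean = clipped_loss(pol, old, adv, token_agg="mean")
    loss_sum = clipped_loss(pol, old, adv, token_agg="sum")
    assert torch.isclose(loss_sum, loss_mean * pol.size(-1))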
hud/rl/train.py ADDED
@@ -0,0 +1,354 @@
+ """Main training loop for GRPO RL."""
+
+ from __future__ import annotations
+
+ import os
+
+ # Disable tokenizer parallelism warnings
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+ import argparse
+ import asyncio
+ import json
+ import logging
+ from datetime import datetime
+ from pathlib import Path
+ from typing import TYPE_CHECKING
+
+ import hud
+ from hud.rl.actor import Actor
+ from hud.rl.buffer import DatasetBuffer, ReplayBuffer
+ from hud.rl.config import Config
+ from hud.rl.distributed import (
+     broadcast_object,
+     cleanup_distributed,
+     get_global_rank,
+     get_world_size,
+     is_main_process,
+     setup_distributed,
+     synchronize,
+ )
+ from hud.rl.learner import GRPOLearner
+ from hud.rl.utils import (
+     aggregate_metrics_across_ranks,
+     ensure_dir,
+     preprocess_advantages,
+     set_seed,
+ )
+ from hud.rl.vllm_adapter import VLLMAdapter
+ from hud.utils.hud_console import HUDConsole
+ from hud.utils.tasks import load_tasks
+
+ if TYPE_CHECKING:
+     from hud.types import Task
+ hud_console = HUDConsole(logging.getLogger(__name__))
+
+
+ async def train(config: Config, tasks: list[Task]) -> None:
+     """Main training loop."""
+     # Setup distributed environment
+     setup_distributed()
+
+     # Initialize components
+     set_seed(config.seed + get_global_rank())  # Different seed per rank
+     ensure_dir(config.out_dir)
+     if config.verbose:
+         logging.basicConfig(level=logging.INFO)
+         # Remove httpx logger
+         logging.getLogger("httpx").setLevel(logging.WARNING)
+
+     if is_main_process():
+         hud_console.header("Starting GRPO Training")
+         hud_console.section_title(
+             f"\n[1/3] Initializing components (world_size={get_world_size()})..."
+         )
+
+     num_gpus = get_world_size()
+
+     # Actor is responsible for running tasks and collecting episodes
+     actor = Actor(config) if is_main_process() else None
+
+     # Learner is responsible for updating the policy
+     learner = GRPOLearner(config)
+
+     # Dataset buffer is responsible for storing tasks
+     dataset_buffer = DatasetBuffer(tasks, config)
+     if is_main_process():
+         hud_console.key_value_table(dataset_buffer.info)
+
+     if dataset_buffer.groups_per_batch % num_gpus != 0:
+         hud_console.warning(
+             f"Groups per batch {dataset_buffer.groups_per_batch} is not divisible by number of GPUs {num_gpus}"  # noqa: E501
+         )
+         exit(1)
+
+     # Replay buffer is responsible for storing episodes for training
+     trace_buffer = ReplayBuffer(config)
+
+     # VLLM adapter is responsible for loading and unloading adapters (only on main process)
+     vllm = (
+         VLLMAdapter(config.actor.vllm_base_url, config.actor.vllm_api_key)
+         if is_main_process()
+         else None
+     )
+
+     # Training state
+     step = 0
+     last_metrics = None  # Store last successful metrics for error recovery
+
+     if is_main_process():
+         hud_console.section_title("\n[2/3] Running training loop...")
+
+     # Create job on main process and distribute ID across GPUs
+     if is_main_process():
+         hud_console.info(f"Creating job with config.job_id: {config.job_id}")
+         job_obj = hud.create_job(
+             job_id=config.job_id, name=config.job_name, metadata={"config": config.to_dict()}
+         )
+         hud_console.info(f"Created job with job_obj.id: {job_obj.id}")
+         job_obj.update_status_sync("running")
+         job_id = job_obj.id
+     else:
+         job_obj = None
+         job_id = None
+
+     # Broadcast job ID to all ranks
+     job_id = broadcast_object(job_id, src=0)
+
+     try:
+         while len(dataset_buffer) > 0:
+             if is_main_process():
+                 hud_console.section_title(f"Step {step + 1}/{dataset_buffer.training_steps}")
+                 hud_console.info(f"{len(dataset_buffer)} tasks remaining")
+             # Get batch of tasks (all ranks need same tasks)
+             tasks = dataset_buffer.get_tasks()
+
+             # Initialize variables on all ranks
+             global_reward_stats = None
+             global_advantage_stats = None
+
+             # Only rank 0 runs tasks and collects traces
+             if is_main_process() and actor is not None:
+                 import time
+
+                 episode_start_time = time.time()
+                 traces = await actor.run_tasks(tasks, job_id=job_id)
+                 episode_time = time.time() - episode_start_time
+                 hud_console.info(f"Sampled {len(traces)} traces in {episode_time:.1f}s")
+                 trace_buffer.add(traces)
+                 global_reward_stats = [trace.reward for trace in traces]
+
+                 # Get all traces from buffer for distribution
+                 all_traces = trace_buffer.sample_traces()
+
+                 assert len(traces) == len(all_traces)  # noqa: S101
+
+                 # Preprocess traces to training samples
+                 preprocessed_traces = preprocess_advantages(all_traces, config)
+
+                 # Store these for later use in metrics
+                 global_advantage_stats = [sample.advantage for sample in preprocessed_traces]
+
+                 # Distribute preprocessed samples in groups across ranks
+                 gpu_batch_size = len(preprocessed_traces) // num_gpus
+                 rank_samples = [
+                     preprocessed_traces[i : i + gpu_batch_size]
+                     for i in range(0, len(preprocessed_traces), gpu_batch_size)
+                 ]
+
+                 # Log distribution info
+                 hud_console.info(
+                     f"Distributing {len(preprocessed_traces)} samples as {gpu_batch_size} sized batches across {num_gpus} GPUs"  # noqa: E501
+                 )
+                 for rank in range(num_gpus):
+                     n_samples = len(rank_samples[rank])
+                     hud_console.info(f" Rank {rank}: {n_samples} samples")
+
+                 hud_console.section_title(f"Training on {len(all_traces)} traces")
+                 episode_time_value = episode_time
+             else:
+                 rank_samples = None
+                 episode_time_value = None
+
+             # Broadcast each rank's samples and episode time
+             rank_samples = broadcast_object(rank_samples, src=0)
+             episode_time_value = broadcast_object(episode_time_value, src=0)
+             my_samples = rank_samples[get_global_rank()] if rank_samples else []
+
+             # Process only assigned samples
+             last_metrics = learner.update(my_samples)
+
+             # Add episode time (same for all ranks since episodes run on rank 0)
+             if episode_time_value is not None:
+                 last_metrics.update(
+                     {
+                         "episode_time": episode_time_value,
+                     }
+                 )
+
+             # Aggregate metrics across all GPUs for proper statistics
+             aggregate_metrics_across_ranks(last_metrics)
+
+             if is_main_process() and job_obj is not None:
+                 # Use the global statistics we collected before distribution
+                 if global_reward_stats is not None and global_advantage_stats is not None:
+                     last_metrics.update(
+                         {
+                             "advantage": global_advantage_stats,
+                             "reward": global_reward_stats,
+                         }
+                     )
+                 else:
+                     # Fallback: use only this rank's data
+                     hud_console.warning("Global statistics not available, using partial data")
+                     last_metrics.update(
+                         {
+                             "advantage": [sample.advantage for sample in my_samples]
+                             if my_samples
+                             else [],
+                             "reward": [sample.reward for sample in my_samples]
+                             if my_samples
+                             else [],
+                         }
+                     )
+
+                 job_obj.log_sync(last_metrics.to_dict())
+
+                 if step % config.stats_interval == 0:
+                     hud_console.key_value_table(last_metrics.to_dict())
+
+             # Increment step counter on all processes
+             step += 1
+
+             # Save checkpoint and update vLLM (only on main process)
+             if step % config.training.save_every_batches == 0:
+                 if is_main_process() and vllm is not None and actor is not None:
+                     hud_console.section_title("Saving checkpoint and updating vLLM")
+                     # get date and time
+                     now = datetime.now()
+                     checkpoint_id = now.strftime("%Y%m%d_%H%M%S") + f"-{get_global_rank()}"
+                     checkpoint_path = (
+                         Path(config.out_dir) / f"{config.adapter_prefix}-{checkpoint_id}"
+                     )
+                     learner.save(str(checkpoint_path))
+
+                     adapter_name = f"{config.adapter_prefix}-{checkpoint_id}"
+                     if vllm.load_adapter(adapter_name, str(checkpoint_path)):
+                         actor.update_adapter(adapter_name)
+                         hud_console.info(f"✓ Checkpoint saved and loaded: {adapter_name}")
+                     else:
+                         hud_console.warning(f"Failed to hot-load adapter {adapter_name}")
+
+                 # Ensure all processes wait for checkpoint operations to complete
+                 synchronize()
+
+         if is_main_process():
+             hud_console.section_title("\n[3/3] Training completed!")
+             # Update job status to completed
+             if job_obj:
+                 job_obj.update_status_sync("completed")
+     except Exception as e:
+         # Log error and any available metrics before failing
+         hud_console.error(f"Training failed on rank {get_global_rank()}: {e}")
+
+         if is_main_process():
+             # Log final metrics if we have any
+             if last_metrics and job_obj:
+                 try:
+                     job_obj.log_sync(last_metrics.to_dict())
+                 except Exception:
+                     hud_console.warning("Failed to log final metrics")
+
+             # Update job status to failed
+             if job_obj:
+                 job_obj.update_status_sync("failed")
+
+         # Don't re-raise immediately to allow cleanup
+         raise
+
+     finally:
+         # Try to sync one last time, but don't fail if it doesn't work
+         try:
+             synchronize()
+         except Exception:
+             hud_console.warning("Failed to synchronize during cleanup")
+
+         # Clean up distributed environment
+         cleanup_distributed()
+
+
+ async def main() -> None:
+     parser = argparse.ArgumentParser(description="GRPO RL Training")
+     parser.add_argument("--config", type=str, help="Path to config JSON file")
+     parser.add_argument("--test", action="store_true", help="Run in test mode")
+     parser.add_argument("--debug", action="store_true", help="Enable debug mode")
+     parser.add_argument("--verbose", action="store_true", help="Enable verbose mode")
+     # Task input arguments
+     parser.add_argument(
+         "--tasks", type=str, help="Path to tasks JSONL file or HuggingFace dataset name"
+     )
+     parser.add_argument("--tasks-json", type=json.loads, help="Tasks as JSON list string")
+
+     args = parser.parse_args()
+
+     # Load config
+     if args.config:
+         with open(args.config) as f:  # noqa: ASYNC230
+             config_dict = json.load(f)
+         config = Config.from_dict(config_dict)
+     else:
+         config = Config()
+
+     # Apply test mode settings
+     if args.test:
+         hud_console.info("[TEST MODE] Using minimal configuration")
+         eps = 6
+         config.training.batch_size = eps
+         config.actor.max_parallel_episodes = 12
+         config.training.group_size = eps
+         config.training.mini_batch_size = 3
+         config.training.training_steps = 4
+         config.actor.max_steps_per_episode = 4
+
+     # Calculate the memory usage
+     INITIAL_MEMORY = 8.0
+     SCALING_FACTOR = 4 / (28 * 28 * 256 * 1024)
+     token_estimate = (
+         config.training.mini_batch_size
+         * config.actor.max_steps_per_episode
+         * config.actor.max_new_tokens
+     )
+     hud_console.info(f"Estimated tokens per forward pass: {token_estimate}")
+     image_estimate = config.model.max_pixels
+     total_memory = INITIAL_MEMORY + SCALING_FACTOR * token_estimate * image_estimate
+     hud_console.info(f"Estimated memory peak: {total_memory:.2f} GB")
+     if total_memory > 75.0:
+         hud_console.warning(
+             "Potential memory usage is too high, decrease either training steps or mini batch size"
+         )
+         exit(1)
+
+     # Load tasks
+     if args.tasks_json:
+         # Tasks provided as JSON list via command line
+         tasks = load_tasks(args.tasks_json)
+     elif args.tasks:
+         # Tasks provided as file path or HuggingFace dataset
+         tasks = load_tasks(args.tasks)
+     else:
+         # Default to browser_2048_tasks.jsonl if it exists
+         default_tasks_path = "browser_2048_tasks.jsonl"
+         if Path(default_tasks_path).exists():
+             hud_console.info(f"No tasks specified, using default: {default_tasks_path}")
+             tasks = load_tasks(default_tasks_path)
+         else:
+             raise ValueError(
+                 "No tasks specified. Use --tasks, --tasks-json, or specify tasks_file in config"
+             )
+
+     # Run training
+     await train(config, tasks)
+
+
+ if __name__ == "__main__":
+     asyncio.run(main())
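The test-mode path in main() above gates start-up on a rough peak-memory heuristic: total_memory = INITIAL_MEMORY + SCALING_FACTOR * token_estimate * image_estimate. A worked example of that arithmetic follows; the values for max_new_tokens and max_pixels are assumptions for illustration (both come from the RL config, which is not shown in this hunk).

# Worked example of the memory heuristic in main(); assumed values are marked.
INITIAL_MEMORY = 8.0                        # GB baseline
SCALING_FACTOR = 4 / (28 * 28 * 256 * 1024)

mini_batch_size = 3                         # test-mode value set above
max_steps_per_episode = 4                   # test-mode value set above
max_new_tokens = 512                        # assumed
max_pixels = 28 * 28 * 256                  # assumed

token_estimate = mini_batch_size * max_steps_per_episode * max_new_tokens  # 6144
total_memory = INITIAL_MEMORY + SCALING_FACTOR * token_estimate * max_pixels
print(f"{total_memory:.2f} GB")             # 8 + 4 * 6144 / 1024 = 32.00 GB, well under the 75 GB cutoff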
hud/rl/types.py ADDED
@@ -0,0 +1,101 @@
+ """Shared types for RL training."""
+
+ from __future__ import annotations
+
+ import math
+ from typing import Any
+
+ from pydantic import ConfigDict, Field
+ from pydantic.dataclasses import dataclass
+
+ from hud.types import Trace
+
+ try:
+     import torch
+ except ImportError:
+     raise ImportError("uv tool install hud-python[rl] to use this module") from None
+
+
+ class TrainingSample(Trace):
+     """A single training sample for GRPO."""
+
+     model_config = ConfigDict(arbitrary_types_allowed=True)
+     # Tokenized inputs to the model (model.forward(*inputs))
+     # This includes the input tokens, logit mask, etc.
+     inputs: dict[str, torch.Tensor] = Field(default_factory=dict)
+     old_logprobs: torch.Tensor | None = Field(default=None)
+     ref_logprobs: torch.Tensor | None = Field(default=None)
+
+     # Weighted advantage of group calculation
+     advantage: torch.Tensor | None = Field(default=None)
+
+     def to_device(self, device: torch.device) -> TrainingSample:
+         """Move sample to device."""
+         self.inputs = {
+             k: (t.to(device, non_blocking=True) if hasattr(t, "to") else t)
+             for k, t in self.inputs.items()
+         }
+         self.advantage = self.advantage.to(device) if self.advantage is not None else None
+         self.old_logprobs = self.old_logprobs.to(device) if self.old_logprobs is not None else None
+         self.ref_logprobs = self.ref_logprobs.to(device) if self.ref_logprobs is not None else None
+         return self
+
+
+ @dataclass
+ class Metric:
+     """A tuple for metrics."""
+
+     name: str = Field(default="")
+     mean: float = Field(default=0.0)
+     std: float = Field(default=0.0)
+     values: list[float] = Field(default_factory=list)
+
+     def update(
+         self, value: float | torch.Tensor | list[float] | list[int] | list[torch.Tensor]
+     ) -> None:
+         """Update metric."""
+         if isinstance(value, list):
+             self.values.extend(value.item() if isinstance(value, torch.Tensor) else value)  # type: ignore
+         else:
+             self.values.append(value.item() if isinstance(value, torch.Tensor) else value)  # type: ignore
+         mean_val = sum(self.values) / len(self.values)
+         self.mean = mean_val.item() if isinstance(mean_val, torch.Tensor) else float(mean_val)  # type: ignore
+         variance = sum((x - self.mean) ** 2 for x in self.values) / len(self.values)
+         variance_val = variance.item() if isinstance(variance, torch.Tensor) else float(variance)  # type: ignore
+         self.std = math.sqrt(variance_val)
+
+
+ @dataclass
+ class TrainingMetrics:
+     """Metrics for GRPO training (per training step)."""
+
+     # Learner metrics
+     grad_norm: Metric = Field(default=Metric())
+     loss: Metric = Field(default=Metric())
+     kl: Metric = Field(default=Metric())
+     reward: Metric = Field(default=Metric())
+     advantage: Metric = Field(default=Metric())
+     policy_ratio: Metric = Field(default=Metric())
+     tokens: Metric = Field(default=Metric())
+     entropy: Metric = Field(default=Metric())
+
+     # Computation metrics
+     gpu_util: Metric = Field(default=Metric())  # GPU utilization percentage
+     gpu_memory: Metric = Field(default=Metric())  # GPU memory usage in GB
+     episode_time: Metric = Field(default=Metric())  # Time to run episodes (actor)
+     training_time: Metric = Field(default=Metric())  # Time for gradient updates (learner)
+     samples_per_second: Metric = Field(default=Metric())  # Training throughput
+
+     def update(self, metrics: dict[str, Any]) -> None:
+         """Update metrics."""
+         for key, value in metrics.items():
+             if key in self.__dataclass_fields__:
+                 getattr(self, key).update(value)
+
+     def to_dict(self) -> dict[str, Any]:
+         """Convert metrics to dictionary."""
+         final_metrics = {}
+         for key in self.__dataclass_fields__:
+             final_metrics[f"{key}_mean"] = getattr(self, key).mean
+             final_metrics[f"{key}_std"] = getattr(self, key).std
+         return final_metrics
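A minimal usage sketch of the metric containers defined above, assuming the module imports as shown in this diff; the expected numbers follow directly from Metric.update's running mean and population standard deviation.

from hud.rl.types import TrainingMetrics

metrics = TrainingMetrics()
metrics.update({"loss": 0.5, "reward": [1.0, 0.0, 1.0]})  # scalars or lists per key
metrics.update({"loss": 0.3})
print(metrics.to_dict()["loss_mean"])   # (0.5 + 0.3) / 2 = 0.4
print(metrics.to_dict()["reward_std"])  # population std of [1.0, 0.0, 1.0] ≈ 0.471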
hud/rl/utils/start_vllm_server.sh ADDED
@@ -0,0 +1,30 @@
+ #!/bin/bash
+ # Start vLLM server with OpenAI-compatible API
+
+ echo "Starting vLLM server for Qwen2.5-VL-3B-Instruct..."
+
+ # Enable runtime LoRA adapter loading
+ export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True
+
+ export TOKENIZERS_PARALLELISM=false
+ export VLLM_LOGGING_LEVEL=DEBUG
+ export CUDA_LAUNCH_BLOCKING=1  # Better error messages for CUDA errors
+
+ # Common vLLM server command
+ # Using CUDA_VISIBLE_DEVICES to put vLLM on GPU 1
+ CUDA_VISIBLE_DEVICES=1 uv run vllm serve \
+     Qwen/Qwen2.5-VL-3B-Instruct \
+     --api-key token-abc123 \
+     --host 0.0.0.0 \
+     --port 8000 \
+     --tensor-parallel-size 1 \
+     --trust-remote-code \
+     --max-model-len 16384 \
+     --enable-lora \
+     --max-lora-rank 64 \
+     --max-cpu-loras 4 \
+     --enable-auto-tool-choice \
+     --tool-call-parser hermes \
+     --chat-template chat_template.jinja \
+     --enable-log-requests \
+     --uvicorn-log-level=debug 2>&1 | tee vllm_debug.log
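Once the script is up, the server exposes an OpenAI-compatible endpoint on port 8000 with the API key set above. A quick connectivity check, assuming the openai Python client is installed; the base URL, key, and model name simply mirror the flags in the script.

from openai import OpenAI

# Point the standard OpenAI client at the local vLLM server started by the script.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="token-abc123")
resp = client.chat.completions.create(
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    messages=[{"role": "user", "content": "Reply with the word 'ready'."}],
    max_tokens=8,
)
print(resp.choices[0].message.content)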