hud-python 0.4.27__py3-none-any.whl → 0.4.29__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- hud/__init__.py +2 -1
- hud/agents/base.py +73 -45
- hud/agents/claude.py +8 -4
- hud/agents/openai_chat_generic.py +65 -40
- hud/agents/tests/test_base.py +0 -4
- hud/agents/tests/test_openai.py +1 -1
- hud/cli/__init__.py +182 -52
- hud/cli/dev.py +8 -9
- hud/cli/eval.py +317 -119
- hud/cli/flows/__init__.py +0 -0
- hud/cli/flows/tasks.py +0 -0
- hud/cli/get.py +160 -0
- hud/cli/rl/__init__.py +563 -71
- hud/cli/rl/config.py +94 -0
- hud/cli/rl/display.py +133 -0
- hud/cli/rl/gpu.py +63 -0
- hud/cli/rl/gpu_utils.py +318 -0
- hud/cli/rl/presets.py +96 -0
- hud/cli/rl/remote_runner.py +348 -0
- hud/cli/rl/rl_api.py +150 -0
- hud/cli/rl/vllm.py +177 -0
- hud/cli/tests/test_analyze_metadata.py +0 -1
- hud/cli/utils/tasks.py +26 -0
- hud/clients/base.py +21 -23
- hud/clients/mcp_use.py +36 -44
- hud/clients/tests/test_mcp_use_retry.py +10 -10
- hud/datasets/__init__.py +4 -3
- hud/datasets/{execution/parallel.py → parallel.py} +1 -1
- hud/datasets/{execution/runner.py → runner.py} +1 -1
- hud/datasets/utils.py +1 -1
- hud/native/tests/test_native_init.py +1 -1
- hud/otel/config.py +1 -1
- hud/otel/instrumentation.py +35 -0
- hud/rl/README.md +31 -0
- hud/rl/__init__.py +1 -0
- hud/rl/actor.py +174 -0
- hud/rl/buffer.py +371 -0
- hud/rl/chat_template.jinja +101 -0
- hud/rl/config.py +184 -0
- hud/rl/distributed.py +95 -0
- hud/rl/learner.py +586 -0
- hud/rl/tests/__init__.py +1 -0
- hud/rl/tests/test_learner.py +171 -0
- hud/rl/train.py +354 -0
- hud/rl/types.py +101 -0
- hud/rl/utils/start_vllm_server.sh +30 -0
- hud/rl/utils.py +524 -0
- hud/rl/vllm_adapter.py +125 -0
- hud/settings.py +6 -0
- hud/telemetry/__init__.py +2 -1
- hud/telemetry/job.py +46 -3
- hud/telemetry/tests/test_trace.py +3 -3
- hud/telemetry/trace.py +85 -13
- hud/tools/computer/hud.py +4 -4
- hud/tools/tests/test_computer.py +3 -3
- hud/tools/tests/test_computer_actions.py +1 -1
- hud/types.py +123 -2
- hud/utils/group_eval.py +223 -0
- hud/utils/hud_console.py +113 -13
- hud/utils/tasks.py +119 -0
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.4.27.dist-info → hud_python-0.4.29.dist-info}/METADATA +20 -2
- {hud_python-0.4.27.dist-info → hud_python-0.4.29.dist-info}/RECORD +67 -47
- hud/cli/hf.py +0 -406
- hud/cli/rl/README.md +0 -243
- hud/cli/rl/init.py +0 -370
- hud/cli/rl/pod.py +0 -501
- hud/cli/rl/ssh.py +0 -322
- hud/cli/rl/train.py +0 -562
- hud/cli/rl/utils.py +0 -165
- hud/datasets/execution/__init__.py +0 -13
- hud/datasets/task.py +0 -116
- {hud_python-0.4.27.dist-info → hud_python-0.4.29.dist-info}/WHEEL +0 -0
- {hud_python-0.4.27.dist-info → hud_python-0.4.29.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.27.dist-info → hud_python-0.4.29.dist-info}/licenses/LICENSE +0 -0
hud/cli/rl/ssh.py
DELETED
|
@@ -1,322 +0,0 @@
|
|
|
1
|
-
"""SSH key configuration and connection utilities for Prime Intellect."""
|
|
2
|
-
|
|
3
|
-
from __future__ import annotations
|
|
4
|
-
|
|
5
|
-
import os
|
|
6
|
-
import subprocess
|
|
7
|
-
from pathlib import Path
|
|
8
|
-
|
|
9
|
-
import typer
|
|
10
|
-
|
|
11
|
-
from hud.settings import settings
|
|
12
|
-
from hud.utils.hud_console import HUDConsole
|
|
13
|
-
|
|
14
|
-
# Module-level console used by the functions in this module for user-facing output.
hud_console = HUDConsole()
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
def _parse_ssh_key_path(config_output: str) -> str | None:
    """Return the SSH key path found in ``prime config view`` output, if any.

    Two row layouts are recognised:

    * table rows:  ``| SSH Key Path | /home/me/.ssh/key.pem |``
    * simple rows: ``SSH Key Path: /path/to/key``

    Empty values and the literal string ``"None"`` count as "not configured",
    in which case later lines are still examined.
    """
    for line in config_output.splitlines():
        if "SSH Key Path" not in line:
            continue
        if "|" in line:
            # Checked before ":" because a Windows path inside a table row
            # (``C:\...``) also contains a colon.
            cells = line.split("|")
            candidate = cells[2].strip() if len(cells) >= 3 else ""
        elif ":" in line:
            candidate = line.split(":", 1)[1].strip()
        else:
            continue
        if candidate and candidate != "None":
            return candidate
    return None


async def check_and_configure_ssh_key() -> bool:
    """Check if an SSH key is configured for the Prime CLI, prompting for one if not.

    The current key is read from ``prime config view``.  If a key is configured
    and the file exists, nothing else happens.  Otherwise the user is prompted
    for a private-key path, the file is made read-only for the owner (non-Windows
    only) and the path is stored with ``prime config set-ssh-key-path``.

    Returns:
        True when a usable key is configured (already or as a result of this
        call), False when the Prime CLI is missing, the entered file does not
        exist, or the CLI refuses the new path.
    """
    # Check current SSH key configuration
    try:
        view = subprocess.run(  # noqa: ASYNC221
            ["prime", "config", "view"],  # noqa: S607
            capture_output=True,
            text=True,
        )
    except FileNotFoundError:
        # Without the CLI neither the lookup nor the later "set" can work.
        hud_console.error("Prime CLI not found. Make sure `prime` is installed and in your PATH")
        return False

    configured_path = _parse_ssh_key_path(view.stdout) if view.returncode == 0 else None

    # If SSH key is configured, verify it exists
    if configured_path:
        if Path(configured_path).expanduser().exists():
            hud_console.info(f"Using configured SSH key: {configured_path}")
            return True
        hud_console.warning(f"Configured SSH key not found: {configured_path}")

    # Prompt for SSH key
    hud_console.section_title("🔑 SSH Key Configuration")
    hud_console.info("Prime Intellect requires an SSH key for pod access.")
    hud_console.info("")
    hud_console.info("If you don't have a key:")
    hud_console.info("1. Visit https://app.primeintellect.ai/dashboard/profile")
    hud_console.info("2. Generate or upload your SSH key")
    hud_console.info("3. Download the private key file")
    hud_console.info("")

    entered = typer.prompt("Enter path to your Prime SSH private key (e.g., ~/.ssh/prime-key.pem)")
    key_path = Path(entered).expanduser()

    if not key_path.exists():
        hud_console.error(f"File not found: {key_path}")
        return False

    # Set permissions if not Windows.  Only report success when the mode change
    # actually happened; a failure is not fatal because ssh itself will complain
    # later if the permissions are too open.
    if os.name != "nt":
        try:
            key_path.chmod(0o400)
        except OSError as e:
            hud_console.warning(f"Could not set permissions on key file: {e}")
        else:
            hud_console.success("Set proper permissions on key file")

    # Configure the SSH key globally
    result = subprocess.run(  # noqa: S603, ASYNC221
        ["prime", "config", "set-ssh-key-path", str(key_path)],  # noqa: S607
        capture_output=True,
        text=True,
    )

    if result.returncode == 0:
        hud_console.success("SSH key configured successfully")
        return True

    hud_console.error("Failed to configure SSH key")
    if result.stderr:
        hud_console.error(f"Error: {result.stderr}")
    return False
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
def _ssh_key_path_from_config(config_output: str) -> str | None:
    """Extract the SSH key path from ``prime config view`` output.

    Handles table rows (``| SSH Key Path | <path> |``) and simple rows
    (``SSH Key Path: <path>``).  Empty values and the literal ``"None"`` are
    treated as "not configured".
    """
    for line in config_output.splitlines():
        if "SSH Key Path" not in line:
            continue
        if "|" in line:
            # Table layout takes precedence: Windows paths also contain ":".
            cells = line.split("|")
            candidate = cells[2].strip() if len(cells) >= 3 else ""
        elif ":" in line:
            candidate = line.split(":", 1)[1].strip()
        else:
            continue
        if candidate and candidate != "None":
            return candidate
    return None


def _scp_upload_cmd(key_path: Path, port: str, local_path: str, remote_target: str) -> list[str]:
    """Build the ``scp`` argv that uploads ``local_path`` to ``remote_target``."""
    return [
        "scp",
        "-i",
        str(key_path),
        "-P",
        port,
        # NOTE(review): host key verification is disabled for these connections.
        "-o",
        "StrictHostKeyChecking=no",
        "-o",
        "UserKnownHostsFile=/dev/null",
        local_path,
        remote_target,
    ]


async def connect_and_train(
    pod_id: str,
    ssh_info: str,
    model: str,
    dataset: str,
    config: Path,
    output_dir: Path,
    image: str,
    dataset_size: int | None = None,
    is_json_file: bool = False,
) -> None:
    """Connect to the pod via SSH and run training commands.

    Copies the config (and, for JSON task files, the dataset) to the pod with
    ``scp``, then runs one remote shell command over ``ssh`` that installs the
    tooling, writes ``/root/train_hud_rl.py``, starts a vLLM server in tmux on
    GPU 0 and runs the training script on GPU 1.

    Args:
        pod_id: Pod identifier; its first 8 characters become part of the run name
            and it is shown in the reconnect hints.
        ssh_info: Connection string such as ``"root@65.108.33.78 -p 1234"``.
        model: Model name passed to vLLM and to the training script.
        dataset: Dataset identifier, or a local JSON file path when
            ``is_json_file`` is true.
        config: Local config file uploaded to ``/root/config.yaml``.
        output_dir: Value assigned to ``args.output_dir`` in the training script.
        image: Unused by this function.
        dataset_size: Passed as ``num_tasks`` to the environment loader.
        is_json_file: Whether ``dataset`` is a local file that must be uploaded.

    Raises:
        typer.Exit: With code 1 when the SSH details or key are unusable, a
            required executable is missing, or any scp/ssh command fails.
    """
    # Function-scope imports: only this function needs them.
    import json
    import shlex

    hud_console.section_title("🚀 Starting Remote Training")

    # Parse SSH info to get host and port
    # Format is like "root@65.108.33.78 -p 1234"
    ssh_parts = ssh_info.split()
    if not ssh_parts:
        hud_console.error("SSH connection info is empty")
        raise typer.Exit(1)
    ssh_user_host = ssh_parts[0]  # root@65.108.33.78
    ssh_port = ssh_parts[2] if len(ssh_parts) > 2 else "22"  # 1234 or default 22

    # Get SSH key path from Prime config
    try:
        view = subprocess.run(  # noqa: ASYNC221
            ["prime", "config", "view"],  # noqa: S607
            capture_output=True,
            text=True,
        )
    except FileNotFoundError as e:
        hud_console.error("Prime CLI not found. Make sure `prime` is installed and in your PATH")
        raise typer.Exit(1) from e

    configured_key = _ssh_key_path_from_config(view.stdout) if view.returncode == 0 else None
    if not configured_key:
        hud_console.error("SSH key path not configured")
        raise typer.Exit(1)

    # Verify SSH key exists
    ssh_key_path = Path(configured_key).expanduser()
    if not ssh_key_path.exists():
        hud_console.error(f"SSH key not found: {ssh_key_path}")
        raise typer.Exit(1)

    hud_console.info(f"Using SSH key: {ssh_key_path}")

    # First, copy the config file to the pod using scp
    hud_console.info("Copying config file to pod...")
    try:
        # On Windows, we need to ensure proper path formatting
        config_path = str(config).replace("\\", "/")
        scp_cmd = _scp_upload_cmd(
            ssh_key_path, ssh_port, config_path, f"{ssh_user_host}:/root/config.yaml"
        )
        hud_console.debug(f"Running: {' '.join(scp_cmd)}")
        subprocess.run(scp_cmd, check=True)  # noqa: S603, ASYNC221
        hud_console.success("Config file copied")
    except (subprocess.CalledProcessError, FileNotFoundError) as e:
        # FileNotFoundError is what a missing scp executable actually raises,
        # so it must land here for the installation hints to be shown.
        hud_console.error(f"Failed to copy config file: {e}")
        if os.name == "nt":  # Windows
            hud_console.info("Make sure OpenSSH is installed. On Windows 10+, it's built-in.")
            hud_console.info(
                "If using older Windows, install Git for Windows which includes SSH/SCP."
            )
        else:
            hud_console.info("Make sure scp is installed and in your PATH")
        raise typer.Exit(1) from e

    # If dataset is a JSON file, copy it too
    remote_dataset = dataset  # Default to unchanged
    if is_json_file:
        hud_console.info("Copying task file to pod...")
        try:
            # On Windows, we need to ensure proper path formatting
            dataset_path = str(dataset).replace("\\", "/")
            # Extract just the filename for the remote path
            dataset_filename = os.path.basename(dataset)
            remote_dataset = f"/root/{dataset_filename}"

            scp_cmd = _scp_upload_cmd(
                ssh_key_path, ssh_port, dataset_path, f"{ssh_user_host}:{remote_dataset}"
            )
            hud_console.debug(f"Running: {' '.join(scp_cmd)}")
            subprocess.run(scp_cmd, check=True)  # noqa: S603, ASYNC221
            hud_console.success(f"Task file copied to {remote_dataset}")
        except (subprocess.CalledProcessError, FileNotFoundError) as e:
            hud_console.error(f"Failed to copy task file: {e}")
            raise typer.Exit(1) from e

    hud_console.info("Setting up environment and starting training...")
    hud_console.info("This will take a few minutes for initial setup, then training will begin.")
    hud_console.info("")

    # Build environment exports.  Values are shell-quoted so that keys containing
    # shell metacharacters cannot alter the remote command.
    # NOTE(review): the keys still travel as part of the ssh command line.
    env_exports = []
    wandb_key = getattr(settings, "wandb_api_key", None)
    if wandb_key:
        env_exports.append(f"export WANDB_API_KEY={shlex.quote(str(wandb_key))}")
    if settings.api_key:  # HUD API key
        env_exports.append(f"export HUD_API_KEY={shlex.quote(str(settings.api_key))}")
    env_export_cmd = " && ".join(env_exports) + " && " if env_exports else ""

    def py_str(value: str) -> str:
        """Render ``value`` as a double-quoted string literal for the generated script."""
        # JSON string escapes (\" \\ \n \t \uXXXX ...) are all valid Python escapes.
        return json.dumps(value, ensure_ascii=False)

    run_name = f"hud-rl-{pod_id[:8]}"
    # The script runs on the (POSIX) pod, so never emit backslash separators
    # even when this client is Windows.
    remote_output_dir = Path(output_dir).as_posix()

    # Create the training script content line by line.
    # This is more reliable than heredoc through SSH
    training_script_lines = [
        "import verifiers as vf",
        "",
        "# Load environment",
        "env = vf.load_environment(",
        '    env_id="hud-vf-gym",',
        f"    taskset={py_str(remote_dataset)},",
        '    config_path="/root/config.yaml",',
        f"    num_tasks={dataset_size},",
        ")",
        "",
        'print(f"Loaded environment with {len(env.dataset)} tasks")',
        "",
        "# Load model and tokenizer",
        f"model, tokenizer = vf.get_model_and_tokenizer({py_str(model)})",
        "",
        "# Get default training args",
        f"args = vf.grpo_defaults(run_name={py_str(run_name)})",
        f"args.output_dir = {py_str(remote_output_dir)}",
        'args.wandb_project = "hud-rl"',
        "args.logging_steps = 1",
        "",
        "# Create trainer",
        "trainer = vf.GRPOTrainer(",
        "    model=model,",
        "    processing_class=tokenizer,",
        "    env=env,",
        "    args=args,",
        "    peft_config=vf.lora_defaults(),",
        ")",
        "",
        "# Train",
        'print("Starting training...")',
        "trainer.train()",
    ]

    # One append command per line; first remove any existing file.
    # shlex.quote (not repr) produces shell quoting, and printf '%s\n' writes the
    # text verbatim regardless of how the remote shell's echo treats backslashes.
    training_script = "rm -f /root/train_hud_rl.py && " + " && ".join(
        f"printf '%s\\n' {shlex.quote(line)} >> /root/train_hud_rl.py"
        for line in training_script_lines
    )

    # The vLLM command is itself one argument to tmux, so quote the model for the
    # inner shell and then the whole command for the outer one.
    vllm_cmd = (
        f"CUDA_VISIBLE_DEVICES=0 vf-vllm --model {shlex.quote(model)} "
        "--enforce-eager --disable-log-requests"
    )

    # Build the full setup and training command
    full_command = (
        # Install uv
        "curl -LsSf https://astral.sh/uv/install.sh | sh && "
        'source "$HOME/.local/bin/env" && '
        # Install prime CLI and create venv
        "uv tool install prime && "
        "uv venv --python 3.12 && "
        "source .venv/bin/activate && "
        # Install packages
        "prime env install hud/hud-vf-gym@0.1.1 && "
        "uv pip install 'verifiers[train]' && "
        "uv pip install flash-attn --no-build-isolation && "
        # Set environment variables
        f"{env_export_cmd}"
        # Create the training script
        f"{training_script} && "
        "echo '✓ Training script created' && "
        # Start vLLM server in tmux (on GPU 0)
        f"tmux new-session -d -s vllm-server {shlex.quote(vllm_cmd)} && "
        "echo '✓ vLLM server started in tmux' && "
        # Wait a bit for server to start
        "echo 'Waiting for vLLM server to initialize...' && "
        "sleep 10 && "
        # Run training on GPU 1
        "echo 'Starting training on GPU 1...' && "
        "CUDA_VISIBLE_DEVICES=1 python /root/train_hud_rl.py"
    )

    try:
        # Execute the full command via SSH
        ssh_cmd = [
            "ssh",
            "-i",
            str(ssh_key_path),
            "-p",
            ssh_port,
            "-o",
            "StrictHostKeyChecking=no",
            "-o",
            "UserKnownHostsFile=/dev/null",
            ssh_user_host,
            full_command,
        ]
        subprocess.run(ssh_cmd, check=True)  # noqa: S603, ASYNC221

    except (subprocess.CalledProcessError, FileNotFoundError) as e:
        hud_console.error(f"Training failed: {e}")
        raise typer.Exit(1) from e
    except KeyboardInterrupt:
        # Ctrl-C only stops the local ssh client; tell the user how to get back.
        hud_console.warning("Training interrupted by user")
        hud_console.info(f"To reconnect: prime pods ssh {pod_id}")
        hud_console.info(f"To check status: prime pods status {pod_id}")
        hud_console.info(f"To terminate: prime pods terminate {pod_id}")
|