hud-python 0.4.14__py3-none-any.whl → 0.4.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (42) hide show
  1. hud/agents/base.py +118 -33
  2. hud/agents/claude.py +1 -1
  3. hud/agents/openai.py +5 -16
  4. hud/agents/tests/test_openai.py +24 -79
  5. hud/cli/__init__.py +137 -15
  6. hud/cli/analyze.py +2 -4
  7. hud/cli/build.py +6 -2
  8. hud/cli/dev.py +67 -0
  9. hud/cli/eval.py +90 -35
  10. hud/cli/hf.py +406 -0
  11. hud/cli/init.py +38 -19
  12. hud/cli/rl/README.md +243 -0
  13. hud/cli/rl/__init__.py +82 -0
  14. hud/cli/rl/init.py +370 -0
  15. hud/cli/rl/pod.py +491 -0
  16. hud/cli/rl/ssh.py +288 -0
  17. hud/cli/rl/train.py +421 -0
  18. hud/cli/rl/utils.py +165 -0
  19. hud/cli/tests/test_mcp_server.py +1 -4
  20. hud/clients/base.py +2 -0
  21. hud/clients/fastmcp.py +7 -2
  22. hud/clients/mcp_use.py +3 -1
  23. hud/clients/utils/retry_transport.py +34 -8
  24. hud/datasets/__init__.py +32 -0
  25. hud/datasets/execution/__init__.py +13 -0
  26. hud/datasets/execution/parallel.py +592 -0
  27. hud/datasets/execution/runner.py +123 -0
  28. hud/datasets/task.py +107 -0
  29. hud/datasets/utils.py +118 -0
  30. hud/otel/instrumentation.py +2 -1
  31. hud/server/server.py +58 -21
  32. hud/settings.py +12 -0
  33. hud/types.py +31 -10
  34. hud/utils/design.py +168 -2
  35. hud/utils/tests/test_version.py +1 -1
  36. hud/version.py +1 -1
  37. {hud_python-0.4.14.dist-info → hud_python-0.4.16.dist-info}/METADATA +4 -3
  38. {hud_python-0.4.14.dist-info → hud_python-0.4.16.dist-info}/RECORD +41 -28
  39. hud/datasets.py +0 -327
  40. {hud_python-0.4.14.dist-info → hud_python-0.4.16.dist-info}/WHEEL +0 -0
  41. {hud_python-0.4.14.dist-info → hud_python-0.4.16.dist-info}/entry_points.txt +0 -0
  42. {hud_python-0.4.14.dist-info → hud_python-0.4.16.dist-info}/licenses/LICENSE +0 -0
hud/cli/rl/ssh.py ADDED
@@ -0,0 +1,288 @@
1
+ """SSH key configuration and connection utilities for Prime Intellect."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import subprocess
7
+ from pathlib import Path
8
+
9
+ import typer
10
+
11
+ from hud.settings import settings
12
+ from hud.utils.design import HUDDesign
13
+
14
+ design = HUDDesign()
15
+
16
+
17
def _parse_ssh_key_path(config_output: str) -> str | None:
    """Extract the SSH key path from the text printed by ``prime config view``.

    Two layouts are recognised on the line mentioning ``SSH Key Path``:

    * table row:  ``| SSH Key Path | /path/to/key |``
    * plain pair: ``SSH Key Path: /path/to/key``

    Args:
        config_output: Raw stdout of ``prime config view``.

    Returns:
        The configured path, or ``None`` when no line carries a usable value
        (missing, empty, or the literal string ``"None"``).
    """
    for line in config_output.splitlines():
        if "SSH Key Path" not in line:
            continue
        if "|" in line:
            cells = line.split("|")
            # cells[0] is the text before the first "|", cells[1] the label.
            value = cells[2].strip() if len(cells) >= 3 else ""
        else:
            # Split on the first ":" only so that Windows drive letters
            # ("C:\\...") in the value are kept intact.
            value = line.partition(":")[2].strip()
        if value and value != "None":
            return value
    return None


async def check_and_configure_ssh_key() -> bool:
    """Check if an SSH key is configured for the Prime CLI, prompting for one if not.

    The function first asks ``prime config view`` for the configured key. If a
    key is configured and the file exists, nothing else happens. Otherwise the
    user is prompted for a private key path, the file is made read-only for
    the owner (non-Windows only), and the path is stored with
    ``prime config set-ssh-key-path``.

    Returns:
        ``True`` when a usable key is configured (already or newly), ``False``
        when the Prime CLI is missing, the given file does not exist, or the
        Prime CLI refuses the new path.
    """
    try:
        view = subprocess.run(  # noqa: ASYNC221
            ["prime", "config", "view"],  # noqa: S607
            capture_output=True,
            text=True,
        )
    except FileNotFoundError:
        # Without the CLI neither the lookup nor the later "set" can work.
        design.error("Prime CLI not found. Install it and make sure `prime` is on your PATH.")
        return False

    configured_path = _parse_ssh_key_path(view.stdout) if view.returncode == 0 else None

    # If an SSH key is configured, verify it exists
    if configured_path:
        if Path(configured_path).expanduser().exists():
            design.info(f"Using configured SSH key: {configured_path}")
            return True
        design.warning(f"Configured SSH key not found: {configured_path}")

    # Prompt for SSH key
    design.section_title("🔑 SSH Key Configuration")
    design.info("Prime Intellect requires an SSH key for pod access.")
    design.info("")
    design.info("If you don't have a key:")
    design.info("1. Visit https://app.primeintellect.ai/dashboard/profile")
    design.info("2. Generate or upload your SSH key")
    design.info("3. Download the private key file")
    design.info("")

    entered = typer.prompt("Enter path to your Prime SSH private key (e.g., ~/.ssh/prime-key.pem)")
    key_path = Path(entered).expanduser()

    if not key_path.exists():
        design.error(f"File not found: {key_path}")
        return False

    # Restrict the key to owner read-only (not applicable on Windows).
    # Only report success when the permission change actually succeeded.
    if os.name != "nt":
        try:
            key_path.chmod(0o400)
        except OSError as e:
            design.warning(f"Could not set permissions on key file: {e}")
        else:
            design.success("Set proper permissions on key file")

    # Configure the SSH key globally
    result = subprocess.run(  # noqa: S603, ASYNC221
        ["prime", "config", "set-ssh-key-path", str(key_path)],  # noqa: S607
        capture_output=True,
        text=True,
    )

    if result.returncode == 0:
        design.success("SSH key configured successfully")
        return True

    design.error("Failed to configure SSH key")
    if result.stderr:
        design.error(f"Error: {result.stderr}")
    return False
93
+
94
+
95
def _parse_ssh_target(ssh_info: str) -> tuple[str, str] | None:
    """Split an SSH descriptor such as ``"root@65.108.33.78 -p 1234"``.

    Returns:
        ``(user_host, port)`` with the port defaulting to ``"22"``, or ``None``
        when ``ssh_info`` contains no tokens at all.
    """
    tokens = ssh_info.split()
    if not tokens:
        return None
    port = "22"
    if "-p" in tokens[1:-1]:
        # Prefer the value that follows an explicit "-p" flag.
        port = tokens[tokens.index("-p", 1) + 1]
    elif len(tokens) > 2:
        # Fall back to the third token, as the descriptor format implies.
        port = tokens[2]
    return tokens[0], port


def _lookup_prime_ssh_key() -> str | None:
    """Return the SSH key path reported by ``prime config view``, if any.

    Recognises both ``| SSH Key Path | <path> |`` table rows and
    ``SSH Key Path: <path>`` lines. Empty values and the literal ``"None"``
    are treated as "not configured".

    Raises:
        FileNotFoundError: If the ``prime`` executable cannot be started.
    """
    view = subprocess.run(  # noqa: ASYNC221
        ["prime", "config", "view"],  # noqa: S607
        capture_output=True,
        text=True,
    )
    if view.returncode != 0:
        return None
    for line in view.stdout.splitlines():
        if "SSH Key Path" not in line:
            continue
        if "|" in line:
            cells = line.split("|")
            value = cells[2].strip() if len(cells) >= 3 else ""
        else:
            # First ":" only, so Windows drive letters in the value survive.
            value = line.partition(":")[2].strip()
        if value and value != "None":
            return value
    return None


def _training_script_lines(
    pod_id: str,
    model: str,
    dataset: str,
    output_dir: Path,
    dataset_size: int | None,
) -> list[str]:
    """Build the source lines of the remote ``train_hud_rl.py`` script.

    Caller-supplied values are embedded with ``repr`` so that quotes or
    backslashes in them cannot break the generated Python source.
    ``output_dir`` is rendered with forward slashes because the script is
    written to a POSIX path on the pod.
    """
    run_name = f"hud-rl-{pod_id[:8]}"
    remote_output_dir = Path(output_dir).as_posix()
    return [
        "import verifiers as vf",
        "",
        "# Load environment",
        "env = vf.load_environment(",
        '    env_id="hud-vf-gym",',
        f"    taskset={dataset!r},",
        '    config_path="/root/config.yaml",',
        f"    num_tasks={dataset_size!r},",
        ")",
        "",
        'print(f"Loaded environment with {len(env.dataset)} tasks")',
        "",
        "# Load model and tokenizer",
        f"model, tokenizer = vf.get_model_and_tokenizer({model!r})",
        "",
        "# Get default training args",
        f"args = vf.grpo_defaults(run_name={run_name!r})",
        f"args.output_dir = {remote_output_dir!r}",
        'args.wandb_project = "hud-rl"',
        "args.logging_steps = 1",
        "",
        "# Create trainer",
        "trainer = vf.GRPOTrainer(",
        "    model=model,",
        "    processing_class=tokenizer,",
        "    env=env,",
        "    args=args,",
        "    peft_config=vf.lora_defaults(),",
        ")",
        "",
        "# Train",
        'print("Starting training...")',
        "trainer.train()",
    ]


def _build_remote_command(model: str, script_lines: list[str]) -> str:
    """Assemble the single shell command line executed on the pod.

    Every interpolated value is passed through ``shlex.quote`` so that API
    keys, model names, and script lines reach the remote shell verbatim.
    """
    import shlex

    steps = [
        # Install uv
        "curl -LsSf https://astral.sh/uv/install.sh | sh",
        'source "$HOME/.local/bin/env"',
        # Install prime CLI and create venv
        "uv tool install prime",
        "uv venv --python 3.12",
        "source .venv/bin/activate",
        # Install packages
        "prime env install hud/hud-vf-gym@0.1.0",
        "uv pip install 'verifiers[train]'",
        "uv pip install flash-attn --no-build-isolation",
    ]

    # Set environment variables.
    # NOTE(review): these secrets travel as part of the ssh argument list and
    # are therefore visible in the local process table while ssh runs.
    wandb_key = getattr(settings, "wandb_api_key", None)
    if wandb_key:
        steps.append(f"export WANDB_API_KEY={shlex.quote(str(wandb_key))}")
    if settings.api_key:  # HUD API key
        steps.append(f"export HUD_API_KEY={shlex.quote(str(settings.api_key))}")

    # Create the training script: printf reuses the '%s\n' format for every
    # argument, so each quoted line becomes one line of the file, and the
    # single ">" redirect replaces any previous script.
    quoted_lines = " ".join(shlex.quote(line) for line in script_lines)
    steps.append(f"printf '%s\\n' {quoted_lines} > /root/train_hud_rl.py")
    steps.append("echo '✓ Training script created'")

    # Start vLLM server in tmux (on GPU 0). The inner command is quoted as a
    # whole so tmux receives it as one argument.
    vllm_cmd = (
        f"CUDA_VISIBLE_DEVICES=0 vf-vllm --model {shlex.quote(model)} "
        "--enforce-eager --disable-log-requests"
    )
    steps.append(f"tmux new-session -d -s vllm-server {shlex.quote(vllm_cmd)}")
    steps.append("echo '✓ vLLM server started in tmux'")

    # Wait a bit for server to start
    steps.append("echo 'Waiting for vLLM server to initialize...'")
    steps.append("sleep 10")

    # Run training on GPU 1
    steps.append("echo 'Starting training on GPU 1...'")
    steps.append("CUDA_VISIBLE_DEVICES=1 python /root/train_hud_rl.py")

    return " && ".join(steps)


async def connect_and_train(
    pod_id: str,
    ssh_info: str,
    model: str,
    dataset: str,
    config: Path,
    output_dir: Path,
    image: str,
    dataset_size: int | None = None,
) -> None:
    """Connect to the pod via SSH and run training commands.

    Copies ``config`` to ``/root/config.yaml`` on the pod with ``scp``, then
    runs one remote command over ``ssh`` that installs the tooling, writes
    ``/root/train_hud_rl.py``, starts a vLLM server in tmux on GPU 0, and runs
    the training script on GPU 1.

    Args:
        pod_id: Pod identifier; its first 8 characters name the training run
            and the full id appears in the reconnect hints.
        ssh_info: SSH descriptor, e.g. ``"root@65.108.33.78 -p 1234"``.
        model: Model name passed to the vLLM server and the training script.
        dataset: Value passed as ``taskset`` to the environment loader.
        config: Local config file to upload.
        output_dir: Output directory written into the training arguments.
        image: Not used by this function.
        dataset_size: Optional ``num_tasks`` value for the environment loader.

    Raises:
        typer.Exit: With code 1 when the SSH descriptor is empty, the Prime
            CLI or SSH key is unavailable, or ``scp``/``ssh`` fails or cannot
            be started.
    """
    import shlex

    design.section_title("🚀 Starting Remote Training")

    # Parse SSH info to get host and port
    target = _parse_ssh_target(ssh_info)
    if target is None:
        design.error("SSH connection info is empty")
        raise typer.Exit(1)
    ssh_user_host, ssh_port = target

    # Get SSH key path from Prime config
    try:
        configured_key = _lookup_prime_ssh_key()
    except FileNotFoundError as e:
        design.error("Prime CLI not found. Install it and make sure `prime` is on your PATH.")
        raise typer.Exit(1) from e

    if not configured_key:
        design.error("SSH key path not configured")
        raise typer.Exit(1)

    # Verify SSH key exists
    ssh_key_path = Path(configured_key).expanduser()
    if not ssh_key_path.exists():
        design.error(f"SSH key not found: {ssh_key_path}")
        raise typer.Exit(1)

    design.info(f"Using SSH key: {ssh_key_path}")

    # Options shared by scp and ssh. Host key checking is disabled and no
    # known_hosts file is kept for these connections.
    connection_opts = [
        "-o",
        "StrictHostKeyChecking=no",
        "-o",
        "UserKnownHostsFile=/dev/null",
    ]

    # First, copy the config file to the pod using scp
    design.info("Copying config file to pod...")
    # On Windows, we need to ensure proper path formatting
    config_path = str(config).replace("\\", "/")
    scp_cmd = [
        "scp",
        "-i",
        str(ssh_key_path),
        "-P",
        ssh_port,
        *connection_opts,
        config_path,
        f"{ssh_user_host}:/root/config.yaml",
    ]
    design.debug(f"Running: {shlex.join(scp_cmd)}")
    try:
        subprocess.run(scp_cmd, check=True)  # noqa: S603, ASYNC221
    except (subprocess.CalledProcessError, FileNotFoundError) as e:
        # FileNotFoundError is what a missing scp binary actually raises, so
        # it must be caught for the installation hints below to be shown.
        design.error(f"Failed to copy config file: {e}")
        if os.name == "nt":  # Windows
            design.info("Make sure OpenSSH is installed. On Windows 10+, it's built-in.")
            design.info("If using older Windows, install Git for Windows which includes SSH/SCP.")
        else:
            design.info("Make sure scp is installed and in your PATH")
        raise typer.Exit(1) from e
    design.success("Config file copied")

    design.info("Setting up environment and starting training...")
    design.info("This will take a few minutes for initial setup, then training will begin.")
    design.info("")

    script_lines = _training_script_lines(pod_id, model, dataset, output_dir, dataset_size)
    full_command = _build_remote_command(model, script_lines)

    # Execute the full command via SSH
    ssh_cmd = [
        "ssh",
        "-i",
        str(ssh_key_path),
        "-p",
        ssh_port,
        *connection_opts,
        ssh_user_host,
        full_command,
    ]
    try:
        subprocess.run(ssh_cmd, check=True)  # noqa: S603, ASYNC221
    except (subprocess.CalledProcessError, FileNotFoundError) as e:
        design.error(f"Training failed: {e}")
        raise typer.Exit(1) from e
    except KeyboardInterrupt:
        # Ctrl+C only ends the local ssh session; tell the user how to get
        # back to (or clean up) the pod.
        design.warning("Training interrupted by user")
        design.info(f"To reconnect: prime pods ssh {pod_id}")
        design.info(f"To check status: prime pods status {pod_id}")
        design.info(f"To terminate: prime pods terminate {pod_id}")