hud-python 0.4.28__py3-none-any.whl → 0.4.29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (75) hide show
  1. hud/__init__.py +2 -1
  2. hud/agents/base.py +73 -45
  3. hud/agents/claude.py +8 -4
  4. hud/agents/openai_chat_generic.py +65 -40
  5. hud/agents/tests/test_base.py +0 -4
  6. hud/agents/tests/test_openai.py +1 -1
  7. hud/cli/__init__.py +182 -52
  8. hud/cli/dev.py +8 -9
  9. hud/cli/eval.py +317 -119
  10. hud/cli/flows/__init__.py +0 -0
  11. hud/cli/flows/tasks.py +0 -0
  12. hud/cli/get.py +160 -0
  13. hud/cli/rl/__init__.py +563 -71
  14. hud/cli/rl/config.py +94 -0
  15. hud/cli/rl/display.py +133 -0
  16. hud/cli/rl/gpu.py +63 -0
  17. hud/cli/rl/gpu_utils.py +318 -0
  18. hud/cli/rl/presets.py +96 -0
  19. hud/cli/rl/remote_runner.py +348 -0
  20. hud/cli/rl/rl_api.py +150 -0
  21. hud/cli/rl/vllm.py +177 -0
  22. hud/cli/tests/test_analyze_metadata.py +0 -1
  23. hud/cli/utils/tasks.py +26 -0
  24. hud/clients/base.py +21 -23
  25. hud/clients/mcp_use.py +36 -44
  26. hud/clients/tests/test_mcp_use_retry.py +10 -10
  27. hud/datasets/__init__.py +4 -3
  28. hud/datasets/{execution/parallel.py → parallel.py} +1 -1
  29. hud/datasets/{execution/runner.py → runner.py} +1 -1
  30. hud/datasets/utils.py +1 -1
  31. hud/native/tests/test_native_init.py +1 -1
  32. hud/otel/config.py +1 -1
  33. hud/otel/instrumentation.py +35 -0
  34. hud/rl/README.md +31 -0
  35. hud/rl/__init__.py +1 -0
  36. hud/rl/actor.py +174 -0
  37. hud/rl/buffer.py +371 -0
  38. hud/rl/chat_template.jinja +101 -0
  39. hud/rl/config.py +184 -0
  40. hud/rl/distributed.py +95 -0
  41. hud/rl/learner.py +586 -0
  42. hud/rl/tests/__init__.py +1 -0
  43. hud/rl/tests/test_learner.py +171 -0
  44. hud/rl/train.py +354 -0
  45. hud/rl/types.py +101 -0
  46. hud/rl/utils/start_vllm_server.sh +30 -0
  47. hud/rl/utils.py +524 -0
  48. hud/rl/vllm_adapter.py +125 -0
  49. hud/settings.py +6 -0
  50. hud/telemetry/__init__.py +2 -1
  51. hud/telemetry/job.py +46 -3
  52. hud/telemetry/tests/test_trace.py +3 -3
  53. hud/telemetry/trace.py +85 -13
  54. hud/tools/tests/test_computer.py +3 -3
  55. hud/tools/tests/test_computer_actions.py +1 -1
  56. hud/types.py +123 -2
  57. hud/utils/group_eval.py +223 -0
  58. hud/utils/hud_console.py +113 -13
  59. hud/utils/tasks.py +119 -0
  60. hud/utils/tests/test_version.py +1 -1
  61. hud/version.py +1 -1
  62. {hud_python-0.4.28.dist-info → hud_python-0.4.29.dist-info}/METADATA +20 -2
  63. {hud_python-0.4.28.dist-info → hud_python-0.4.29.dist-info}/RECORD +66 -46
  64. hud/cli/hf.py +0 -406
  65. hud/cli/rl/README.md +0 -243
  66. hud/cli/rl/init.py +0 -370
  67. hud/cli/rl/pod.py +0 -501
  68. hud/cli/rl/ssh.py +0 -322
  69. hud/cli/rl/train.py +0 -562
  70. hud/cli/rl/utils.py +0 -165
  71. hud/datasets/execution/__init__.py +0 -13
  72. hud/datasets/task.py +0 -116
  73. {hud_python-0.4.28.dist-info → hud_python-0.4.29.dist-info}/WHEEL +0 -0
  74. {hud_python-0.4.28.dist-info → hud_python-0.4.29.dist-info}/entry_points.txt +0 -0
  75. {hud_python-0.4.28.dist-info → hud_python-0.4.29.dist-info}/licenses/LICENSE +0 -0
hud/cli/rl/ssh.py DELETED
@@ -1,322 +0,0 @@
1
- """SSH key configuration and connection utilities for Prime Intellect."""
2
-
3
- from __future__ import annotations
4
-
5
- import os
6
- import subprocess
7
- from pathlib import Path
8
-
9
- import typer
10
-
11
- from hud.settings import settings
12
- from hud.utils.hud_console import HUDConsole
13
-
14
- hud_console = HUDConsole()  # module-wide console; every user-facing message in this file goes through it
15
-
16
-
17
def _parse_ssh_key_path(config_output: str) -> str | None:
    """Extract the SSH key path from the text printed by ``prime config view``.

    Two layouts are recognised:

    * table rows such as ``| SSH Key Path | /path/to/key |``
    * simple lines such as ``SSH Key Path: /path/to/key``

    Args:
        config_output: Captured stdout of ``prime config view``.

    Returns:
        The configured path, or ``None`` when no row carries a usable value
        (row missing, empty value, or the literal string ``"None"``).
    """
    for line in config_output.splitlines():
        if "SSH Key Path" not in line:
            continue
        if "|" in line:
            cells = line.split("|")
            value = cells[2].strip() if len(cells) >= 3 else ""
        elif ":" in line:
            # Split once only so Windows drive letters ("C:\\...") stay intact.
            value = line.split(":", 1)[1].strip()
        else:
            continue
        if value and value != "None":
            return value
    return None


def _get_configured_ssh_key_path() -> str | None:
    """Return the SSH key path stored in the Prime CLI config, if any.

    Returns:
        The configured path, or ``None`` when the ``prime`` CLI cannot be run,
        exits with a non-zero status, or reports no key path.
    """
    try:
        result = subprocess.run(
            ["prime", "config", "view"],  # noqa: S607
            capture_output=True,
            text=True,
        )
    except OSError as e:
        # Raised (as FileNotFoundError) when the `prime` executable is missing.
        hud_console.warning(f"Could not run the prime CLI: {e}")
        return None

    if result.returncode != 0:
        return None
    return _parse_ssh_key_path(result.stdout)


async def check_and_configure_ssh_key() -> bool:
    """Check if an SSH key is configured for Prime, prompting for one if not.

    Returns:
        ``True`` when a usable key is already configured or was configured
        successfully; ``False`` when the supplied key file does not exist or
        ``prime config set-ssh-key-path`` fails.
    """
    configured_path = _get_configured_ssh_key_path()

    # If an SSH key is configured, verify that it actually exists on disk.
    if configured_path:
        if Path(configured_path).expanduser().exists():
            hud_console.info(f"Using configured SSH key: {configured_path}")
            return True
        hud_console.warning(f"Configured SSH key not found: {configured_path}")

    # Prompt for SSH key
    hud_console.section_title("🔑 SSH Key Configuration")
    hud_console.info("Prime Intellect requires an SSH key for pod access.")
    hud_console.info("")
    hud_console.info("If you don't have a key:")
    hud_console.info("1. Visit https://app.primeintellect.ai/dashboard/profile")
    hud_console.info("2. Generate or upload your SSH key")
    hud_console.info("3. Download the private key file")
    hud_console.info("")

    key_path = typer.prompt("Enter path to your Prime SSH private key (e.g., ~/.ssh/prime-key.pem)")
    key_path = Path(key_path).expanduser()

    if not key_path.exists():
        hud_console.error(f"File not found: {key_path}")
        return False

    # Restrict permissions (POSIX only). This is best-effort: a failure is
    # reported but does not abort configuration, and success is only
    # announced when the mode change actually happened.
    if os.name != "nt":
        try:
            key_path.chmod(0o400)
        except OSError as e:
            hud_console.warning(f"Could not set permissions on key file: {e}")
        else:
            hud_console.success("Set proper permissions on key file")

    # Configure the SSH key globally
    try:
        result = subprocess.run(  # noqa: S603, ASYNC221
            ["prime", "config", "set-ssh-key-path", str(key_path)],  # noqa: S607
            capture_output=True,
            text=True,
        )
    except OSError as e:
        # e.g. the `prime` executable is not installed / not on PATH.
        hud_console.error("Failed to configure SSH key")
        hud_console.error(f"Error: {e}")
        return False

    if result.returncode == 0:
        hud_console.success("SSH key configured successfully")
        return True

    hud_console.error("Failed to configure SSH key")
    if result.stderr:
        hud_console.error(f"Error: {result.stderr}")
    return False


def _parse_ssh_info(ssh_info: str) -> tuple[str, str]:
    """Split an SSH connection string into ``(user@host, port)``.

    The expected format is ``"root@65.108.33.78 -p 1234"``; the port defaults
    to ``"22"`` when none is given.

    Raises:
        ValueError: If ``ssh_info`` is empty or whitespace only.
    """
    tokens = ssh_info.split()
    if not tokens:
        raise ValueError("SSH connection info is empty")

    port = "22"
    if "-p" in tokens:
        flag_index = tokens.index("-p")
        if flag_index + 1 < len(tokens):
            port = tokens[flag_index + 1]
    elif len(tokens) > 2:
        # No explicit -p flag: fall back to the positional third token.
        port = tokens[2]
    return tokens[0], port


def _scp_to_pod(
    local_path: str,
    remote_path: str,
    *,
    key_file: Path,
    port: str,
    user_host: str,
) -> None:
    """Copy one local file to the pod with ``scp``.

    Raises:
        subprocess.CalledProcessError: If ``scp`` exits with a non-zero status.
        OSError: If the ``scp`` executable cannot be started.
    """
    # NOTE(review): host-key checking is disabled here, as in the ssh call in
    # connect_and_train. Presumably acceptable because pods are ephemeral;
    # confirm this trade-off is intended.
    scp_cmd = [
        "scp",
        "-i",
        str(key_file),
        "-P",
        port,
        "-o",
        "StrictHostKeyChecking=no",
        "-o",
        "UserKnownHostsFile=/dev/null",
        local_path,
        f"{user_host}:{remote_path}",
    ]
    hud_console.debug(f"Running: {' '.join(scp_cmd)}")
    subprocess.run(scp_cmd, check=True)  # noqa: S603


def _build_training_script_lines(
    *,
    remote_dataset: str,
    dataset_size: int | None,
    model: str,
    run_name: str,
    output_dir: str,
) -> list[str]:
    """Return the lines of the Python training script executed on the pod.

    String values are embedded with ``repr()`` so that quotes or backslashes
    in them cannot break the generated source.
    """
    return [
        "import verifiers as vf",
        "",
        "# Load environment",
        "env = vf.load_environment(",
        '    env_id="hud-vf-gym",',
        f"    taskset={remote_dataset!r},",
        '    config_path="/root/config.yaml",',
        f"    num_tasks={dataset_size},",
        ")",
        "",
        'print(f"Loaded environment with {len(env.dataset)} tasks")',
        "",
        "# Load model and tokenizer",
        f"model, tokenizer = vf.get_model_and_tokenizer({model!r})",
        "",
        "# Get default training args",
        f"args = vf.grpo_defaults(run_name={run_name!r})",
        f"args.output_dir = {output_dir!r}",
        'args.wandb_project = "hud-rl"',
        "args.logging_steps = 1",
        "",
        "# Create trainer",
        "trainer = vf.GRPOTrainer(",
        "    model=model,",
        "    processing_class=tokenizer,",
        "    env=env,",
        "    args=args,",
        "    peft_config=vf.lora_defaults(),",
        ")",
        "",
        "# Train",
        'print("Starting training...")',
        "trainer.train()",
    ]


def _build_remote_command(
    *,
    model: str,
    script_lines: list[str],
    env_vars: dict[str, str],
) -> str:
    """Build the single shell command line that sets up the pod and trains.

    Every interpolated value is passed through ``shlex.quote`` so that it
    reaches the remote shell as one literal word.
    """
    import shlex

    exports = "".join(
        f"export {name}={shlex.quote(value)} && " for name, value in env_vars.items()
    )

    # One printf writes the whole file ('%s\n' is re-applied to each argument)
    # and the '>' redirect truncates any previous copy of the script.
    write_script = (
        "printf '%s\\n' "
        + " ".join(shlex.quote(line) for line in script_lines)
        + " > /root/train_hud_rl.py"
    )

    vllm_cmd = (
        f"CUDA_VISIBLE_DEVICES=0 vf-vllm --model {shlex.quote(model)} "
        "--enforce-eager --disable-log-requests"
    )

    return (
        # Install uv
        "curl -LsSf https://astral.sh/uv/install.sh | sh && "
        'source "$HOME/.local/bin/env" && '
        # Install prime CLI and create venv
        "uv tool install prime && "
        "uv venv --python 3.12 && "
        "source .venv/bin/activate && "
        # Install packages
        "prime env install hud/hud-vf-gym@0.1.1 && "
        "uv pip install 'verifiers[train]' && "
        "uv pip install flash-attn --no-build-isolation && "
        # Set environment variables
        f"{exports}"
        # Create the training script
        f"{write_script} && "
        "echo '✓ Training script created' && "
        # Start vLLM server in tmux (on GPU 0)
        f"tmux new-session -d -s vllm-server {shlex.quote(vllm_cmd)} && "
        "echo '✓ vLLM server started in tmux' && "
        # Wait a bit for server to start
        "echo 'Waiting for vLLM server to initialize...' && "
        "sleep 10 && "
        # Run training on GPU 1
        "echo 'Starting training on GPU 1...' && "
        "CUDA_VISIBLE_DEVICES=1 python /root/train_hud_rl.py"
    )


async def connect_and_train(
    pod_id: str,
    ssh_info: str,
    model: str,
    dataset: str,
    config: Path,
    output_dir: Path,
    image: str,
    dataset_size: int | None = None,
    is_json_file: bool = False,
) -> None:
    """Connect to the pod via SSH and run training commands.

    Args:
        pod_id: Pod identifier; its first 8 characters name the training run
            and the full id is shown in the reconnect hints.
        ssh_info: Connection string such as ``"root@65.108.33.78 -p 1234"``.
        model: Model name passed to the vLLM server and the training script.
        dataset: Taskset identifier, or a local JSON file path when
            ``is_json_file`` is true.
        config: Local config file, copied to ``/root/config.yaml`` on the pod.
        output_dir: Output directory written into the training arguments.
        image: Not used by this function; kept for interface compatibility.
        dataset_size: Value passed as ``num_tasks`` (``None`` is passed through).
        is_json_file: When true, ``dataset`` is copied to ``/root`` on the pod
            and the remote copy is used as the taskset.

    Raises:
        typer.Exit: With code 1 when the SSH info is unusable, no SSH key is
            configured or found, a file copy fails, or remote training fails.
    """
    hud_console.section_title("🚀 Starting Remote Training")

    # Parse SSH info to get host and port
    try:
        ssh_user_host, ssh_port = _parse_ssh_info(ssh_info)
    except ValueError as e:
        hud_console.error(f"Invalid SSH connection info: {e}")
        raise typer.Exit(1) from e

    # Get SSH key path from Prime config
    configured_path = _get_configured_ssh_key_path()
    if not configured_path:
        hud_console.error("SSH key path not configured")
        raise typer.Exit(1)

    # Verify SSH key exists
    ssh_key_path = Path(configured_path).expanduser()
    if not ssh_key_path.exists():
        hud_console.error(f"SSH key not found: {ssh_key_path}")
        raise typer.Exit(1)

    hud_console.info(f"Using SSH key: {ssh_key_path}")

    # First, copy the config file to the pod using scp
    hud_console.info("Copying config file to pod...")
    try:
        # On Windows, we need to ensure proper path formatting
        _scp_to_pod(
            str(config).replace("\\", "/"),
            "/root/config.yaml",
            key_file=ssh_key_path,
            port=ssh_port,
            user_host=ssh_user_host,
        )
        hud_console.success("Config file copied")
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing scp binary, which is the case the hints
        # below are written for.
        hud_console.error(f"Failed to copy config file: {e}")
        if os.name == "nt":  # Windows
            hud_console.info("Make sure OpenSSH is installed. On Windows 10+, it's built-in.")
            hud_console.info(
                "If using older Windows, install Git for Windows which includes SSH/SCP."
            )
        else:
            hud_console.info("Make sure scp is installed and in your PATH")
        raise typer.Exit(1) from e

    # If dataset is a JSON file, copy it too
    remote_dataset = dataset  # Default to unchanged
    if is_json_file:
        hud_console.info("Copying task file to pod...")
        # Extract just the filename for the remote path
        remote_dataset = f"/root/{os.path.basename(dataset)}"
        try:
            _scp_to_pod(
                str(dataset).replace("\\", "/"),
                remote_dataset,
                key_file=ssh_key_path,
                port=ssh_port,
                user_host=ssh_user_host,
            )
            hud_console.success(f"Task file copied to {remote_dataset}")
        except (subprocess.CalledProcessError, OSError) as e:
            hud_console.error(f"Failed to copy task file: {e}")
            raise typer.Exit(1) from e

    hud_console.info("Setting up environment and starting training...")
    hud_console.info("This will take a few minutes for initial setup, then training will begin.")
    hud_console.info("")

    # Build environment exports
    # NOTE(review): these secrets become part of the ssh command line and are
    # therefore visible in the local process list while the command runs.
    env_vars: dict[str, str] = {}
    wandb_key = getattr(settings, "wandb_api_key", None)
    if wandb_key:
        env_vars["WANDB_API_KEY"] = str(wandb_key)
    if settings.api_key:  # HUD API key
        env_vars["HUD_API_KEY"] = str(settings.api_key)

    script_lines = _build_training_script_lines(
        remote_dataset=remote_dataset,
        dataset_size=dataset_size,
        model=model,
        run_name=f"hud-rl-{pod_id[:8]}",
        # The script runs on the pod, so use forward slashes even if the
        # local path was built on Windows.
        output_dir=Path(output_dir).as_posix(),
    )
    full_command = _build_remote_command(
        model=model,
        script_lines=script_lines,
        env_vars=env_vars,
    )

    try:
        # Execute the full command via SSH
        ssh_cmd = [
            "ssh",
            "-i",
            str(ssh_key_path),
            "-p",
            ssh_port,
            "-o",
            "StrictHostKeyChecking=no",
            "-o",
            "UserKnownHostsFile=/dev/null",
            ssh_user_host,
            full_command,
        ]
        subprocess.run(ssh_cmd, check=True)  # noqa: S603, ASYNC221

    except (subprocess.CalledProcessError, OSError) as e:
        hud_console.error(f"Training failed: {e}")
        raise typer.Exit(1) from e
    except KeyboardInterrupt:
        hud_console.warning("Training interrupted by user")
        hud_console.info(f"To reconnect: prime pods ssh {pod_id}")
        hud_console.info(f"To check status: prime pods status {pod_id}")
        hud_console.info(f"To terminate: prime pods terminate {pod_id}")