hud-python 0.4.14__py3-none-any.whl → 0.4.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (42) hide show
  1. hud/agents/base.py +118 -33
  2. hud/agents/claude.py +1 -1
  3. hud/agents/openai.py +5 -16
  4. hud/agents/tests/test_openai.py +24 -79
  5. hud/cli/__init__.py +137 -15
  6. hud/cli/analyze.py +2 -4
  7. hud/cli/build.py +6 -2
  8. hud/cli/dev.py +67 -0
  9. hud/cli/eval.py +90 -35
  10. hud/cli/hf.py +406 -0
  11. hud/cli/init.py +38 -19
  12. hud/cli/rl/README.md +243 -0
  13. hud/cli/rl/__init__.py +82 -0
  14. hud/cli/rl/init.py +370 -0
  15. hud/cli/rl/pod.py +491 -0
  16. hud/cli/rl/ssh.py +288 -0
  17. hud/cli/rl/train.py +421 -0
  18. hud/cli/rl/utils.py +165 -0
  19. hud/cli/tests/test_mcp_server.py +1 -4
  20. hud/clients/base.py +2 -0
  21. hud/clients/fastmcp.py +7 -2
  22. hud/clients/mcp_use.py +3 -1
  23. hud/clients/utils/retry_transport.py +34 -8
  24. hud/datasets/__init__.py +32 -0
  25. hud/datasets/execution/__init__.py +13 -0
  26. hud/datasets/execution/parallel.py +592 -0
  27. hud/datasets/execution/runner.py +123 -0
  28. hud/datasets/task.py +107 -0
  29. hud/datasets/utils.py +118 -0
  30. hud/otel/instrumentation.py +2 -1
  31. hud/server/server.py +58 -21
  32. hud/settings.py +12 -0
  33. hud/types.py +31 -10
  34. hud/utils/design.py +168 -2
  35. hud/utils/tests/test_version.py +1 -1
  36. hud/version.py +1 -1
  37. {hud_python-0.4.14.dist-info → hud_python-0.4.16.dist-info}/METADATA +4 -3
  38. {hud_python-0.4.14.dist-info → hud_python-0.4.16.dist-info}/RECORD +41 -28
  39. hud/datasets.py +0 -327
  40. {hud_python-0.4.14.dist-info → hud_python-0.4.16.dist-info}/WHEEL +0 -0
  41. {hud_python-0.4.14.dist-info → hud_python-0.4.16.dist-info}/entry_points.txt +0 -0
  42. {hud_python-0.4.14.dist-info → hud_python-0.4.16.dist-info}/licenses/LICENSE +0 -0
hud/cli/hf.py ADDED
@@ -0,0 +1,406 @@
1
+ """HuggingFace dataset conversion command for HUD tasks."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from datetime import datetime
7
+ from pathlib import Path
8
+
9
+ import typer
10
+
11
+ from hud.cli.rl.utils import get_mcp_config_from_lock, read_lock_file, write_lock_file
12
+ from hud.utils.design import HUDDesign
13
+
14
+ design = HUDDesign()
15
+
16
+
17
def hf_command(
    tasks_file: Path | None = None,
    name: str | None = None,
    push: bool = True,
    private: bool = False,
    update_lock: bool = True,
    token: str | None = None,
) -> None:
    """📊 Convert tasks to HuggingFace dataset format.

    Automatically detects task files if not specified.
    Suggests dataset name based on environment if not provided.
    Converts a JSON file containing HUD tasks into a HuggingFace dataset
    and optionally pushes it to the Hub. Also updates hud.lock.yaml with
    the primary dataset reference.

    Examples:
        hud hf                              # Auto-detect tasks and suggest name
        hud hf tasks.json                   # Use specific file, suggest name
        hud hf --name my-org/my-tasks       # Auto-detect tasks, use name
        hud hf tasks.json --name hud-evals/web-tasks --private
        hud hf tasks.json --name local-dataset --no-push

    Args:
        tasks_file: JSON file holding one task object or a list of them.
            When omitted, the current directory is searched for one.
        name: Dataset name. When omitted, the user picks from suggestions
            or types one. Must contain "/" when ``push`` is true.
        push: Push the dataset to the HuggingFace Hub; otherwise write a
            local ``<name>_dataset.json`` file.
        private: Passed through to ``Dataset.push_to_hub``.
        update_lock: Record the dataset in hud.lock.yaml when done.
        token: HuggingFace token; used for the username lookup and exported
            as ``HF_TOKEN`` before pushing.

    Raises:
        typer.Exit: With code 1 on any validation, I/O or push failure.
    """
    design.header("HuggingFace Dataset Converter", icon="📊")

    # Auto-detect task file if not provided
    if tasks_file is None:
        tasks_file = _detect_tasks_file()

    # Validate inputs
    if not tasks_file.exists():
        design.error(f"Tasks file not found: {tasks_file}")
        raise typer.Exit(1)

    # Suggest dataset name if not provided
    if name is None:
        name = _choose_dataset_name(token)

    # Validate dataset name format
    if push and name and "/" not in name:
        design.error("Dataset name must include organization (e.g., 'my-org/my-dataset')")
        design.info("For local-only datasets, use --no-push")
        raise typer.Exit(1)

    valid_tasks = _validate_tasks(_load_tasks(tasks_file))
    _pad_for_training(valid_tasks)
    _offer_remote_conversion(valid_tasks)
    dataset_dict = _build_dataset_dict(valid_tasks)

    if push:
        _push_to_hub(dataset_dict, name, private, token)
    else:
        _save_locally(dataset_dict, name)

    # Update hud.lock.yaml if requested
    if update_lock:
        update_lock_file(name, len(valid_tasks))


# File name patterns probed (in the current directory) when no tasks file is given.
_TASK_FILE_PATTERNS = (
    "tasks.json",
    "task.json",
    "*_tasks.json",
    "eval*.json",
    "evaluation*.json",
)

# Below this many tasks the user is offered duplication for RL training.
_MIN_TRAINING_TASKS = 4


def _detect_tasks_file() -> Path:
    """Find a tasks file in the current directory, asking the user if several match."""
    design.info("Looking for task files...")

    # A set removes files matched by more than one pattern; sorting keeps the menu stable.
    candidates = sorted(
        {match for pattern in _TASK_FILE_PATTERNS for match in Path(".").glob(pattern)}
    )

    if not candidates:
        design.error("No task files found in current directory")
        design.info("Create a task JSON file (e.g., tasks.json) or specify the file path")
        raise typer.Exit(1)

    if len(candidates) == 1:
        design.info(f"Found task file: {candidates[0]}")
        return candidates[0]

    # Multiple files found, let user choose
    design.info("Multiple task files found:")
    file_choice = design.select(
        "Select a task file to convert:",
        choices=[str(f) for f in candidates],
    )
    chosen = Path(file_choice)
    design.success(f"Selected: {chosen}")
    return chosen


def _guess_hf_username(token: str | None) -> str | None:
    """Best-effort username lookup: HuggingFace ``whoami`` first, then ``git config``."""
    try:
        from huggingface_hub import HfApi

        return HfApi(token=token).whoami().get("name", None)
    except Exception:
        # Deliberately broad: a missing package, missing login or network error
        # all just mean "fall back to git".
        try:
            import subprocess

            result = subprocess.run(
                ["git", "config", "user.name"],  # noqa: S607
                capture_output=True,
                text=True,
                check=False,
            )
            git_name = result.stdout.strip()
            if result.returncode == 0 and git_name:
                return git_name.lower().replace(" ", "-")
        except Exception:
            design.warning("Failed to get HF username from git config")
    return None


def _env_name_from_image(image: str) -> str:
    """Return the bare repository name of an image reference.

    The digest is dropped first, then the registry/organisation path, then the
    tag, so a registry port ("localhost:5000/env:tag") is not mistaken for a tag.
    """
    return image.split("@", 1)[0].rsplit("/", 1)[-1].split(":", 1)[0]


def _guess_env_name() -> str:
    """Return the image name from hud.lock.yaml, else the current directory name."""
    env_name = Path.cwd().name

    lock_path = Path("hud.lock.yaml")
    if not lock_path.exists():
        return env_name

    try:
        import yaml

        with open(lock_path, encoding="utf-8") as f:
            lock_data = yaml.safe_load(f)

        # safe_load returns None for an empty file, so check the shape first.
        if isinstance(lock_data, dict) and isinstance(lock_data.get("image"), str):
            # Extract name from image like "test:dev@sha256:..."
            image_name = _env_name_from_image(lock_data["image"])
            if image_name and image_name != "local":
                env_name = image_name
    except Exception as e:
        # Best-effort only: the directory name is still a usable suggestion.
        design.warning(f"Failed to get environment name from lock file: {e}")

    return env_name


def _choose_dataset_name(token: str | None) -> str:
    """Offer dataset name suggestions and return the one the user picks or types."""
    design.info("Generating dataset name suggestion...")

    hf_username = _guess_hf_username(token)
    env_name = _guess_env_name()

    suggestions = []
    if hf_username:
        suggestions.append(f"{hf_username}/{env_name}-tasks")
        suggestions.append(f"{hf_username}/{env_name}-dataset")
    suggestions.append(f"my-org/{env_name}-tasks")
    suggestions.append(f"hud-evals/{env_name}-tasks")

    # Let user choose or enter custom
    design.info("Dataset name suggestions:")
    suggestions.append("Enter custom name...")

    choice = design.select("Select or enter a dataset name:", choices=suggestions)
    if choice == "Enter custom name...":
        choice = typer.prompt("Enter dataset name (e.g., 'my-org/my-dataset')")

    design.success(f"Using dataset name: {choice}")
    return choice


def _load_tasks(tasks_file: Path) -> list:
    """Read the tasks file and return its content as a list of raw entries."""
    design.info(f"Loading tasks from: {tasks_file}")
    try:
        with open(tasks_file, encoding="utf-8") as f:
            tasks_data = json.load(f)
    except json.JSONDecodeError as e:
        design.error(f"Invalid JSON file: {e}")
        raise typer.Exit(1) from e
    except (OSError, UnicodeDecodeError) as e:
        design.error(f"Failed to read tasks file: {e}")
        raise typer.Exit(1) from e

    # Handle both single task and list of tasks
    if isinstance(tasks_data, dict):
        design.info("Found 1 task")
        return [tasks_data]
    if isinstance(tasks_data, list):
        design.info(f"Found {len(tasks_data)} tasks")
        return tasks_data

    design.error("Tasks file must contain a JSON object or array")
    raise typer.Exit(1)


def _validate_tasks(tasks: list) -> list[dict]:
    """Keep well-formed tasks, filling in a default id and mcp_config where missing."""
    import copy

    valid_tasks: list[dict] = []
    lock_mcp_config = None
    lock_checked = False

    for i, task in enumerate(tasks):
        if not isinstance(task, dict):
            design.warning(f"Skipping task {i}: not a JSON object")
            continue

        # Required fields
        if "prompt" not in task:
            design.warning(f"Skipping task {i}: missing 'prompt' field")
            continue

        if "evaluate_tool" not in task:
            design.warning(f"Skipping task {i}: missing 'evaluate_tool' field")
            continue

        # Add default values
        if "id" not in task:
            task["id"] = f"task-{i:04d}"

        if "mcp_config" not in task:
            # Try to infer from hud.lock.yaml; look it up once, not once per task.
            if not lock_checked:
                lock_mcp_config = get_mcp_config_from_lock()
                lock_checked = True
            if not lock_mcp_config:
                design.warning(f"Task {task['id']}: missing 'mcp_config' field")
                continue
            # Each task gets its own copy so later edits cannot leak between tasks.
            task["mcp_config"] = copy.deepcopy(lock_mcp_config)

        valid_tasks.append(task)

    if not valid_tasks:
        design.error("No valid tasks found")
        raise typer.Exit(1)

    design.success(f"Validated {len(valid_tasks)} tasks")
    return valid_tasks


def _pad_for_training(valid_tasks: list[dict]) -> None:
    """Offer to duplicate tasks in place until the RL training minimum is reached."""
    original_count = len(valid_tasks)
    if original_count >= _MIN_TRAINING_TASKS:
        return

    design.warning(
        f"Dataset has only {len(valid_tasks)} task(s). RL training typically requires at least 4 tasks."  # noqa: E501
    )
    use_for_training = design.select(
        "Will this dataset be used for RL training?",
        ["Yes, duplicate tasks to reach 4", "No, keep as is"],
    )
    if use_for_training != "Yes, duplicate tasks to reach 4":
        return

    # Cycle through the original tasks; each copy gets a distinct "_dupN" id.
    for n in range(_MIN_TRAINING_TASKS - original_count):
        source = valid_tasks[n % original_count]
        duplicated_task = source.copy()
        duplicated_task["id"] = f"{source['id']}_dup{n + 1}"
        valid_tasks.append(duplicated_task)

    design.info(f"Duplicated tasks: {original_count} → {len(valid_tasks)}")


def _detect_config_type(sample_mcp_config: object) -> tuple[str, str | None]:
    """Classify an MCP config as "remote", "local" or "unknown".

    Returns the type and, for a remote config, its ``Mcp-Image`` header value
    when one is present.
    """
    if isinstance(sample_mcp_config, str):
        try:
            sample_mcp_config = json.loads(sample_mcp_config)
        except json.JSONDecodeError as e:
            design.warning(f"Could not parse 'mcp_config' as JSON: {e}")
            return "unknown", None
    if not isinstance(sample_mcp_config, dict):
        return "unknown", None

    config_type = "unknown"
    # Check all server configs (could be named anything, not just "hud")
    for server_config in sample_mcp_config.values():
        if not (isinstance(server_config, dict) and "url" in server_config):
            continue
        if "mcp.hud.so" in server_config.get("url", ""):
            # Extract image from Mcp-Image header if present
            headers = server_config.get("headers") or {}
            return "remote", headers.get("Mcp-Image", "") or None
        # Any non-mcp.hud.so URL means local config
        config_type = "local"

    return config_type, None


def _offer_remote_conversion(valid_tasks: list[dict]) -> None:
    """If the tasks use local MCP configs, offer to rewrite them to the remote endpoint."""
    config_type, remote_image = _detect_config_type(valid_tasks[0].get("mcp_config", {}))

    if config_type == "remote" and remote_image:
        design.info(f"Tasks already use remote MCP configs with image: {remote_image}")

    if config_type != "local":
        return

    convert_to_remote = design.select(
        "Tasks use local MCP configs. Convert to remote configs for training?",
        ["Yes, convert to remote (requires public image)", "No, keep local configs"],
    )
    if convert_to_remote != "Yes, convert to remote (requires public image)":
        return

    # Get the image name from lock file
    from hud.cli.rl.utils import get_image_from_lock

    image = get_image_from_lock()
    if not image:
        design.error("No image found in hud.lock.yaml")
        design.hint("Run 'hud build' first")
        raise typer.Exit(1)

    # Check if image contains registry prefix (indicates it's pushed)
    if "/" not in image or image.startswith("local/"):
        # Clean up image name for display (remove SHA if present)
        display_image = image.split("@")[0]
        design.warning(f"Image '{display_image}' appears to be local only")
        push_image = design.select(
            "Would you like to push the image to make it publicly available?",
            ["Yes, push image", "No, cancel"],
        )
        if push_image != "Yes, push image":
            design.info("Keeping local MCP configs")
            return

        design.info("Running 'hud push' to publish image...")
        # Import here to avoid circular imports
        from hud.cli.push import push_command

        # Run push command (it's synchronous)
        push_command(directory=".", yes=True)
        design.success("Image pushed successfully")

        # Re-read the image name as it may have changed
        image = get_image_from_lock()
        if not image:
            design.warning("No image found in hud.lock.yaml after push; keeping local MCP configs")
            return

    # Convert all task configs to remote
    design.info(f"Converting MCP configs to use remote image: {image}")
    for task in valid_tasks:
        task["mcp_config"] = {
            "hud": {
                "url": "https://mcp.hud.so/v3/mcp",
                "headers": {
                    "Authorization": "Bearer $HUD_API_KEY",
                    "Mcp-Image": image,
                },
            }
        }
    design.success("✓ Converted all tasks to use remote MCP configs")


def _build_dataset_dict(valid_tasks: list[dict]) -> dict[str, list[str]]:
    """Convert tasks to the column-oriented layout, with structured fields as JSON strings."""
    dataset_dict: dict[str, list[str]] = {
        "id": [],
        "prompt": [],
        "mcp_config": [],
        "setup_tool": [],
        "evaluate_tool": [],
        "metadata": [],
    }

    for task in valid_tasks:
        mcp_config = task["mcp_config"]
        dataset_dict["id"].append(task["id"])
        dataset_dict["prompt"].append(task["prompt"])
        # An mcp_config that is already a JSON string must not be encoded twice.
        dataset_dict["mcp_config"].append(
            mcp_config if isinstance(mcp_config, str) else json.dumps(mcp_config)
        )
        dataset_dict["setup_tool"].append(json.dumps(task.get("setup_tool", {})))
        dataset_dict["evaluate_tool"].append(json.dumps(task["evaluate_tool"]))
        dataset_dict["metadata"].append(json.dumps(task.get("metadata", {})))

    return dataset_dict


def _push_to_hub(
    dataset_dict: dict[str, list[str]], name: str, private: bool, token: str | None
) -> None:
    """Build a ``datasets.Dataset`` from the columns and push it to the Hub."""
    try:
        from datasets import Dataset
    except ImportError as e:
        design.error("datasets library not installed")
        design.info("Install with: pip install datasets")
        raise typer.Exit(1) from e

    design.info(f"Creating HuggingFace dataset: {name}")
    dataset = Dataset.from_dict(dataset_dict)

    # Set up HF token
    if token:
        import os

        os.environ["HF_TOKEN"] = token

    design.info(f"Pushing to Hub (private={private})...")
    try:
        dataset.push_to_hub(name, private=private)
    except Exception as e:
        design.error(f"Failed to push to Hub: {e}")
        design.hint("Make sure you're logged in: huggingface-cli login")
        raise typer.Exit(1) from e
    design.success(f"Dataset published: https://huggingface.co/datasets/{name}")


def _save_locally(dataset_dict: dict[str, list[str]], name: str) -> None:
    """Write the columns to ``<name>_dataset.json`` ("/" in the name becomes "_")."""
    output_file = Path(f"{name.replace('/', '_')}_dataset.json")
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(dataset_dict, f, indent=2)
    design.success(f"Dataset saved locally: {output_file}")
390
+
391
+
392
def update_lock_file(dataset_name: str, task_count: int) -> None:
    """Record the primary dataset in the lock data and write it back.

    The data returned by ``read_lock_file`` gets its ``primary_dataset`` entry
    replaced with the dataset name, the task count and the current local time
    in ISO format. A success message is shown only when ``write_lock_file``
    returns a truthy value.
    """
    contents = read_lock_file()

    entry = {"name": dataset_name, "task_count": task_count}
    entry["updated_at"] = datetime.now().isoformat()
    contents["primary_dataset"] = entry

    written = write_lock_file(contents)
    if not written:
        return
    design.success(f"Updated hud.lock.yaml with dataset: {dataset_name}")
hud/cli/init.py CHANGED
@@ -139,7 +139,7 @@ if __name__ == "__main__":
139
139
  mcp.run()
140
140
  '''
141
141
 
142
- TASKS_JSON_TEMPLATE = '''[
142
+ TASKS_JSON_TEMPLATE = """[
143
143
  {{
144
144
  "prompt": "Increment the counter to reach 10",
145
145
  "mcp_config": {{
@@ -159,12 +159,12 @@ TASKS_JSON_TEMPLATE = '''[
159
159
  }}
160
160
  }}
161
161
  ]
162
- '''
162
+ """
163
163
 
164
164
  TEST_TASK_TEMPLATE = '''#!/usr/bin/env python
165
165
  """Simple example of running tasks from tasks.json.
166
166
 
167
- Make sure to run 'hud dev --build' in another terminal first!
167
+ Make sure to run 'hud dev --build' in another terminal first, and install hud-python[agents]
168
168
  """
169
169
 
170
170
  import asyncio
@@ -208,14 +208,16 @@ async def main():
208
208
 
209
209
  if __name__ == "__main__":
210
210
  asyncio.run(main())
211
- '''
211
+ ''' # noqa: E501
212
212
 
213
- NOTEBOOK_TEMPLATE = '''{{
213
+ NOTEBOOK_TEMPLATE = """{{
214
214
  "cells": [
215
215
  {{
216
216
  "cell_type": "markdown",
217
217
  "metadata": {{}},
218
218
  "source": [
219
+ "Make sure to `pip install hud-python[agents]` before running this notebook\\n",
220
+ "\\n",
219
221
  "### Step 1: Create a Task\\n",
220
222
  "\\n",
221
223
  "A Task combines:\\n",
@@ -427,9 +429,18 @@ NOTEBOOK_TEMPLATE = '''{{
427
429
  "nbformat": 4,
428
430
  "nbformat_minor": 4
429
431
  }}
430
- '''
432
+ """ # noqa: E501
431
433
 
432
- README_TEMPLATE = '''# {title}
434
+ ENV_FILE_TEMPLATE = """# HUD API Configuration
435
+ # Get your API key from https://app.hud.so/account
436
+ HUD_API_KEY=your_hud_api_key_here
437
+
438
+ # Anthropic API Configuration (optional)
439
+ # Required for using Claude agents - get from https://console.anthropic.com/
440
+ ANTHROPIC_API_KEY=your_anthropic_api_key_here
441
+ """
442
+
443
+ README_TEMPLATE = """# {title}
433
444
 
434
445
  A minimal HUD environment demonstrating the Task pattern with a simple counter.
435
446
 
@@ -437,21 +448,23 @@ A minimal HUD environment demonstrating the Task pattern with a simple counter.
437
448
 
438
449
  ### Interactive Development
439
450
  ```bash
440
- # 1. Start the environment (optional: with inspector)
451
+ # 1. Configure your API keys (optional - only needed for evaluation)
452
+ # Edit .env file to add your HUD_API_KEY and ANTHROPIC_API_KEY
453
+
454
+ # 2. Start the environment (optional: with inspector)
441
455
  hud dev --build --inspector
442
456
 
443
- # 2. Choose your preferred way to test:
457
+ # 3. Choose your preferred way to test:
444
458
 
445
- # Option A: Interactive notebook test_env.ipynb (great for learning!)
459
+ # Option A: Run the task with Claude (requires ANTHROPIC_API_KEY)
460
+ hud eval tasks.json --agent claude
446
461
 
447
- # Option B: Simple Python script (runs all tasks from tasks.json)
448
- python test_task.py
449
- ```
462
+ # Option B: Interactive notebook test_env.ipynb (great for learning!)
463
+ # Requires installation:
464
+ pip install hud-python[agents]
450
465
 
451
- ### Run with an Agent
452
- ```bash
453
- # Run the task with Claude
454
- hud eval tasks.json --agent claude
466
+ # Option C: Simple Python script (runs all tasks from tasks.json)
467
+ python test_task.py
455
468
  ```
456
469
 
457
470
  ## How HUD Environments Work
@@ -471,7 +484,7 @@ Once your environment is ready, you can share it with the community:
471
484
 
472
485
  ### 1. Push to Registry
473
486
  ```bash
474
- # Build and push your environment (this requires docker hub login and hud api key)
487
+ # Build and push your environment (requires docker hub login and hud api key)
475
488
  hud build
476
489
  hud push
477
490
  ```
@@ -510,7 +523,7 @@ hud eval "your-org/your-dataset" --agent claude
510
523
  **Note**: Only public HuggingFace datasets appear as leaderboards!
511
524
 
512
525
  📚 Learn more: [Creating Benchmarks](https://docs.hud.so/evaluate-agents/create-benchmarks) | [Leaderboards](https://docs.hud.so/evaluate-agents/leaderboards)
513
- '''
526
+ """ # noqa: E501
514
527
 
515
528
 
516
529
  def sanitize_name(name: str) -> str:
@@ -613,6 +626,12 @@ def create_environment(name: str | None, directory: str, force: bool) -> None:
613
626
  notebook_path.write_text(notebook_content, encoding="utf-8")
614
627
  files_created.append("test_env.ipynb")
615
628
 
629
+ # .env file
630
+ env_file_path = target_dir / ".env"
631
+ env_file_content = ENV_FILE_TEMPLATE.strip() + "\n"
632
+ env_file_path.write_text(env_file_content, encoding="utf-8")
633
+ files_created.append(".env")
634
+
616
635
  # Success message
617
636
  design.header(f"Created HUD Environment: {name}")
618
637