hud-python 0.4.14__py3-none-any.whl → 0.4.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- hud/agents/base.py +118 -33
- hud/agents/claude.py +1 -1
- hud/agents/openai.py +5 -16
- hud/agents/tests/test_openai.py +24 -79
- hud/cli/__init__.py +137 -15
- hud/cli/analyze.py +2 -4
- hud/cli/build.py +6 -2
- hud/cli/dev.py +67 -0
- hud/cli/eval.py +90 -35
- hud/cli/hf.py +406 -0
- hud/cli/init.py +38 -19
- hud/cli/rl/README.md +243 -0
- hud/cli/rl/__init__.py +82 -0
- hud/cli/rl/init.py +370 -0
- hud/cli/rl/pod.py +491 -0
- hud/cli/rl/ssh.py +288 -0
- hud/cli/rl/train.py +421 -0
- hud/cli/rl/utils.py +165 -0
- hud/cli/tests/test_mcp_server.py +1 -4
- hud/clients/base.py +2 -0
- hud/clients/fastmcp.py +7 -2
- hud/clients/mcp_use.py +3 -1
- hud/clients/utils/retry_transport.py +34 -8
- hud/datasets/__init__.py +32 -0
- hud/datasets/execution/__init__.py +13 -0
- hud/datasets/execution/parallel.py +592 -0
- hud/datasets/execution/runner.py +123 -0
- hud/datasets/task.py +107 -0
- hud/datasets/utils.py +118 -0
- hud/otel/instrumentation.py +2 -1
- hud/server/server.py +58 -21
- hud/settings.py +12 -0
- hud/types.py +31 -10
- hud/utils/design.py +168 -2
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.4.14.dist-info → hud_python-0.4.16.dist-info}/METADATA +4 -3
- {hud_python-0.4.14.dist-info → hud_python-0.4.16.dist-info}/RECORD +41 -28
- hud/datasets.py +0 -327
- {hud_python-0.4.14.dist-info → hud_python-0.4.16.dist-info}/WHEEL +0 -0
- {hud_python-0.4.14.dist-info → hud_python-0.4.16.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.14.dist-info → hud_python-0.4.16.dist-info}/licenses/LICENSE +0 -0
hud/cli/hf.py
ADDED
|
@@ -0,0 +1,406 @@
|
|
|
1
|
+
"""HuggingFace dataset conversion command for HUD tasks."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from datetime import datetime
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
import typer
|
|
10
|
+
|
|
11
|
+
from hud.cli.rl.utils import get_mcp_config_from_lock, read_lock_file, write_lock_file
|
|
12
|
+
from hud.utils.design import HUDDesign
|
|
13
|
+
|
|
14
|
+
design = HUDDesign()
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# Menu labels are compared against the value returned by ``design.select``,
# so each one is defined once and reused for both display and comparison.
_MIN_TRAINING_TASKS = 4
_CUSTOM_NAME_CHOICE = "Enter custom name..."
_DUPLICATE_CHOICE = "Yes, duplicate tasks to reach 4"
_CONVERT_CHOICE = "Yes, convert to remote (requires public image)"
_PUSH_IMAGE_CHOICE = "Yes, push image"


def _find_tasks_file() -> Path:
    """Locate a task JSON file in the current directory, prompting if several match."""
    design.info("Looking for task files...")

    # Common task file patterns
    patterns = ("tasks.json", "task.json", "*_tasks.json", "eval*.json", "evaluation*.json")

    # A set drops files matched by more than one pattern; sorting keeps the menu stable.
    candidates = sorted({path for pattern in patterns for path in Path(".").glob(pattern)})

    if not candidates:
        design.error("No task files found in current directory")
        design.info("Create a task JSON file (e.g., tasks.json) or specify the file path")
        raise typer.Exit(1)

    if len(candidates) == 1:
        design.info(f"Found task file: {candidates[0]}")
        return candidates[0]

    # Multiple files found, let user choose
    design.info("Multiple task files found:")
    file_choice = design.select(
        "Select a task file to convert:",
        choices=[str(f) for f in candidates],
    )
    selected = Path(file_choice)
    design.success(f"Selected: {selected}")
    return selected


def _guess_hf_username(token: str | None) -> str | None:
    """Best-effort lookup of a username: HuggingFace account first, then git config."""
    try:
        from huggingface_hub import HfApi

        user_info = HfApi(token=token).whoami()
        return user_info.get("name", None)
    except Exception:
        # huggingface_hub missing or not logged in: fall back to the git user name.
        try:
            import subprocess

            result = subprocess.run(
                ["git", "config", "user.name"],  # noqa: S607
                capture_output=True,
                text=True,
                check=False,
            )
            if result.returncode == 0 and result.stdout.strip():
                return result.stdout.strip().lower().replace(" ", "-")
        except Exception:
            design.warning("Failed to get HF username from git config")
    return None


def _guess_env_name() -> str:
    """Return the environment name: the lock file's image name, else the cwd name."""
    env_name = Path.cwd().name

    lock_path = Path("hud.lock.yaml")
    if not lock_path.exists():
        return env_name

    try:
        import yaml

        with open(lock_path, encoding="utf-8") as f:
            # safe_load returns None for an empty document.
            lock_data = yaml.safe_load(f) or {}

        image = lock_data.get("image") if isinstance(lock_data, dict) else None
        if isinstance(image, str):
            # Extract name from image like "registry:5000/org/test:dev@sha256:...".
            # Drop the digest, take the last path component, then drop the tag, so a
            # registry port is never mistaken for the tag separator.
            image_name = image.split("@")[0].split("/")[-1].split(":")[0]
            if image_name and image_name != "local":
                env_name = image_name
    except Exception as e:
        design.warning(f"Failed to get environment name from lock file: {e}")

    return env_name


def _choose_dataset_name(token: str | None) -> str:
    """Offer dataset-name suggestions and return the user's pick or custom entry."""
    design.info("Generating dataset name suggestion...")

    hf_username = _guess_hf_username(token)
    env_name = _guess_env_name()

    suggestions = []
    if hf_username:
        suggestions.append(f"{hf_username}/{env_name}-tasks")
        suggestions.append(f"{hf_username}/{env_name}-dataset")
    suggestions.append(f"my-org/{env_name}-tasks")
    suggestions.append(f"hud-evals/{env_name}-tasks")

    # Let user choose or enter custom
    design.info("Dataset name suggestions:")
    suggestions.append(_CUSTOM_NAME_CHOICE)

    choice = design.select("Select or enter a dataset name:", choices=suggestions)
    if choice == _CUSTOM_NAME_CHOICE:
        chosen = typer.prompt("Enter dataset name (e.g., 'my-org/my-dataset')")
    else:
        chosen = choice

    design.success(f"Using dataset name: {chosen}")
    return chosen


def _load_tasks(tasks_file: Path) -> list:
    """Read the tasks JSON file and normalise it to a list (a lone object becomes [obj])."""
    design.info(f"Loading tasks from: {tasks_file}")
    try:
        with open(tasks_file, encoding="utf-8") as f:
            tasks_data = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        design.error(f"Invalid JSON file: {e}")
        raise typer.Exit(1) from e
    except OSError as e:
        # e.g. the path is a directory or is not readable
        design.error(f"Could not read tasks file: {e}")
        raise typer.Exit(1) from e

    # Handle both single task and list of tasks
    if isinstance(tasks_data, dict):
        design.info("Found 1 task")
        return [tasks_data]
    if isinstance(tasks_data, list):
        design.info(f"Found {len(tasks_data)} tasks")
        return tasks_data

    design.error("Tasks file must contain a JSON object or array")
    raise typer.Exit(1)


def _validate_tasks(tasks: list) -> list[dict]:
    """Return the tasks that have the required fields, filling in default id/mcp_config.

    Tasks are modified in place: a missing ``id`` becomes ``task-NNNN`` (NNNN is the
    task's index in the file) and a missing ``mcp_config`` is taken from hud.lock.yaml.
    """
    valid_tasks = []

    # The lock file is read at most once, and only if some task needs it. All tasks
    # that fall back to it share one config object, which is safe because later
    # steps only replace or serialise ``mcp_config`` and never mutate it in place.
    lock_config_loaded = False
    lock_mcp_config = None

    for i, task in enumerate(tasks):
        if not isinstance(task, dict):
            design.warning(f"Skipping task {i}: not a JSON object")
            continue

        # Required fields
        if "prompt" not in task:
            design.warning(f"Skipping task {i}: missing 'prompt' field")
            continue

        if "evaluate_tool" not in task:
            design.warning(f"Skipping task {i}: missing 'evaluate_tool' field")
            continue

        # Add default values
        if "id" not in task:
            task["id"] = f"task-{i:04d}"

        if "mcp_config" not in task:
            # Try to infer from hud.lock.yaml
            if not lock_config_loaded:
                lock_mcp_config = get_mcp_config_from_lock()
                lock_config_loaded = True

            if not lock_mcp_config:
                design.warning(f"Task {task['id']}: missing 'mcp_config' field")
                continue
            task["mcp_config"] = lock_mcp_config

        valid_tasks.append(task)

    return valid_tasks


def _offer_duplication(valid_tasks: list[dict]) -> None:
    """Warn about a small dataset and, if the user agrees, pad it in place to 4 tasks."""
    design.warning(
        f"Dataset has only {len(valid_tasks)} task(s). "
        "RL training typically requires at least 4 tasks."
    )
    use_for_training = design.select(
        "Will this dataset be used for RL training?",
        [_DUPLICATE_CHOICE, "No, keep as is"],
    )
    if use_for_training != _DUPLICATE_CHOICE:
        return

    # Cycle through the original tasks until the minimum is reached; each copy gets
    # a distinct id suffix numbered by how many duplicates exist so far.
    original_count = len(valid_tasks)
    while len(valid_tasks) < _MIN_TRAINING_TASKS:
        for task in valid_tasks[:original_count]:
            if len(valid_tasks) >= _MIN_TRAINING_TASKS:
                break
            duplicated_task = task.copy()
            duplicated_task["id"] = f"{task['id']}_dup{len(valid_tasks) - original_count + 1}"
            valid_tasks.append(duplicated_task)

    design.info(f"Duplicated tasks: {original_count} → {len(valid_tasks)}")


def _classify_mcp_config(mcp_config: object) -> tuple[str, str | None]:
    """Classify an MCP config as "remote", "local" or "unknown" from its server URLs.

    Returns ``(config_type, remote_image)`` where ``remote_image`` is the value of the
    ``Mcp-Image`` header of an mcp.hud.so server, if one was found.
    """
    if isinstance(mcp_config, str):
        try:
            mcp_config = json.loads(mcp_config)
        except json.JSONDecodeError:
            return "unknown", None
    if not isinstance(mcp_config, dict):
        return "unknown", None

    config_type = "unknown"
    remote_image = None

    # Check all server configs (could be named anything, not just "hud")
    for server_config in mcp_config.values():
        if not isinstance(server_config, dict) or "url" not in server_config:
            continue

        url = server_config.get("url") or ""
        if "mcp.hud.so" in str(url):
            config_type = "remote"
            # Extract image from Mcp-Image header if present
            headers = server_config.get("headers") or {}
            found_image = headers.get("Mcp-Image", "") if isinstance(headers, dict) else ""
            if found_image:
                remote_image = found_image
                break
        else:
            # Any non-mcp.hud.so URL means local config
            config_type = "local"

    return config_type, remote_image


def _offer_remote_conversion(valid_tasks: list[dict]) -> None:
    """Ask whether to rewrite every task's MCP config to the hosted endpoint, and do so.

    The image reference comes from hud.lock.yaml. If it looks local-only, the user is
    offered ``hud push`` first; declining leaves the tasks untouched.
    """
    convert_to_remote = design.select(
        "Tasks use local MCP configs. Convert to remote configs for training?",
        [_CONVERT_CHOICE, "No, keep local configs"],
    )
    if convert_to_remote != _CONVERT_CHOICE:
        return

    # Get the image name from lock file
    from hud.cli.rl.utils import get_image_from_lock

    image = get_image_from_lock()
    if not image:
        design.error("No image found in hud.lock.yaml")
        design.hint("Run 'hud build' first")
        raise typer.Exit(1)

    # Check if image contains registry prefix (indicates it's pushed)
    if "/" not in image or image.startswith("local/"):
        # Clean up image name for display (remove SHA if present)
        display_image = image.split("@")[0]
        design.warning(f"Image '{display_image}' appears to be local only")
        push_image = design.select(
            "Would you like to push the image to make it publicly available?",
            [_PUSH_IMAGE_CHOICE, "No, cancel"],
        )
        if push_image != _PUSH_IMAGE_CHOICE:
            design.info("Keeping local MCP configs")
            return

        design.info("Running 'hud push' to publish image...")
        # Import here to avoid circular imports
        from hud.cli.push import push_command

        # Run push command (it's synchronous)
        push_command(directory=".", yes=True)
        design.success("Image pushed successfully")

        # Re-read the image name as it may have changed
        image = get_image_from_lock()
        if not image:
            design.warning("No image found in hud.lock.yaml after push; keeping local MCP configs")
            return

    # Convert all task configs to remote
    design.info(f"Converting MCP configs to use remote image: {image}")
    for task in valid_tasks:
        # A fresh dict per task so no two tasks share a config object.
        task["mcp_config"] = {
            "hud": {
                "url": "https://mcp.hud.so/v3/mcp",
                "headers": {
                    "Authorization": "Bearer $HUD_API_KEY",
                    "Mcp-Image": image,
                },
            }
        }

    design.success("✓ Converted all tasks to use remote MCP configs")


def _build_dataset_dict(valid_tasks: list[dict]) -> dict[str, list]:
    """Build the column-oriented dataset; structured fields are stored as JSON strings."""
    return {
        "id": [task["id"] for task in valid_tasks],
        "prompt": [task["prompt"] for task in valid_tasks],
        "mcp_config": [json.dumps(task["mcp_config"]) for task in valid_tasks],
        "setup_tool": [json.dumps(task.get("setup_tool", {})) for task in valid_tasks],
        "evaluate_tool": [json.dumps(task["evaluate_tool"]) for task in valid_tasks],
        "metadata": [json.dumps(task.get("metadata", {})) for task in valid_tasks],
    }


def _push_dataset(dataset_dict: dict[str, list], name: str, private: bool, token: str | None) -> None:  # noqa: E501
    """Create a HuggingFace ``Dataset`` from the columns and push it to the Hub."""
    try:
        from datasets import Dataset
    except ImportError as e:
        design.error("datasets library not installed")
        design.info("Install with: pip install datasets")
        raise typer.Exit(1) from e

    design.info(f"Creating HuggingFace dataset: {name}")
    dataset = Dataset.from_dict(dataset_dict)

    # Set up HF token
    if token:
        import os

        os.environ["HF_TOKEN"] = token

    design.info(f"Pushing to Hub (private={private})...")
    try:
        dataset.push_to_hub(name, private=private)
        design.success(f"Dataset published: https://huggingface.co/datasets/{name}")
    except Exception as e:
        design.error(f"Failed to push to Hub: {e}")
        design.hint("Make sure you're logged in: huggingface-cli login")
        raise typer.Exit(1) from e


def _save_dataset_locally(dataset_dict: dict[str, list], name: str) -> None:
    """Write the dataset columns to ``<name with '/' replaced by '_'>_dataset.json``."""
    output_file = Path(f"{name.replace('/', '_')}_dataset.json")
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(dataset_dict, f, indent=2)
    design.success(f"Dataset saved locally: {output_file}")


def hf_command(
    tasks_file: Path | None = None,
    name: str | None = None,
    push: bool = True,
    private: bool = False,
    update_lock: bool = True,
    token: str | None = None,
) -> None:
    """📊 Convert tasks to HuggingFace dataset format.

    Automatically detects task files if not specified.
    Suggests dataset name based on environment if not provided.
    Converts a JSON file containing HUD tasks into a HuggingFace dataset
    and optionally pushes it to the Hub. Also updates hud.lock.yaml with
    the primary dataset reference.

    Examples:
        hud hf                      # Auto-detect tasks and suggest name
        hud hf tasks.json           # Use specific file, suggest name
        hud hf --name my-org/my-tasks  # Auto-detect tasks, use name
        hud hf tasks.json --name hud-evals/web-tasks --private
        hud hf tasks.json --name local-dataset --no-push
    """
    # Parameters (kept out of the docstring, which may be shown as CLI help text):
    #   tasks_file   JSON file holding one task object or an array of them;
    #                auto-detected in the current directory when omitted.
    #   name         dataset name; chosen interactively when omitted or empty.
    #                Must contain "/" when pushing.
    #   push         push to the Hub (True) or write a local JSON file (False).
    #   private      passed to ``Dataset.push_to_hub``.
    #   update_lock  record the dataset in hud.lock.yaml afterwards.
    #   token        HuggingFace token; used for the username lookup and exported
    #                as HF_TOKEN before pushing.
    # Every failure path reports through ``design`` and raises ``typer.Exit(1)``.
    design.header("HuggingFace Dataset Converter", icon="📊")

    # Auto-detect task file if not provided
    if tasks_file is None:
        tasks_file = _find_tasks_file()

    # Validate inputs
    if not tasks_file.exists():
        design.error(f"Tasks file not found: {tasks_file}")
        raise typer.Exit(1)

    # Suggest dataset name if not provided
    if not name:
        name = _choose_dataset_name(token)

    # Validate dataset name format
    if push and "/" not in name:
        design.error("Dataset name must include organization (e.g., 'my-org/my-dataset')")
        design.info("For local-only datasets, use --no-push")
        raise typer.Exit(1)

    # Load and validate tasks
    tasks = _load_tasks(tasks_file)
    valid_tasks = _validate_tasks(tasks)
    if not valid_tasks:
        design.error("No valid tasks found")
        raise typer.Exit(1)

    design.success(f"Validated {len(valid_tasks)} tasks")

    # Check if dataset is suitable for training
    if len(valid_tasks) < _MIN_TRAINING_TASKS:
        _offer_duplication(valid_tasks)

    # Check if MCP configs should be converted to remote; the first task is
    # taken as representative of the whole file.
    config_type, remote_image = _classify_mcp_config(valid_tasks[0].get("mcp_config", {}))

    if config_type == "remote" and remote_image:
        design.info(f"Tasks already use remote MCP configs with image: {remote_image}")

    if config_type == "local":
        _offer_remote_conversion(valid_tasks)

    # Convert to HuggingFace format
    dataset_dict = _build_dataset_dict(valid_tasks)

    # Push to HuggingFace Hub if requested, otherwise save locally
    if push:
        _push_dataset(dataset_dict, name, private, token)
    else:
        _save_dataset_locally(dataset_dict, name)

    # Update hud.lock.yaml if requested
    if update_lock:
        update_lock_file(name, len(valid_tasks))
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
def update_lock_file(dataset_name: str, task_count: int) -> None:
    """Record the given dataset as ``primary_dataset`` in hud.lock.yaml.

    The lock data is obtained from ``read_lock_file()``, its ``primary_dataset``
    entry is replaced with the dataset name, the task count and the current
    local time in ISO format, and the result is handed to ``write_lock_file()``.
    A success message is shown only when that call returns a truthy value.
    """
    contents = read_lock_file()

    # Overwrite any previous primary dataset entry.
    contents["primary_dataset"] = dict(
        name=dataset_name,
        task_count=task_count,
        updated_at=datetime.now().isoformat(),
    )

    written = write_lock_file(contents)
    if not written:
        return
    design.success(f"Updated hud.lock.yaml with dataset: {dataset_name}")
|
hud/cli/init.py
CHANGED
|
@@ -139,7 +139,7 @@ if __name__ == "__main__":
|
|
|
139
139
|
mcp.run()
|
|
140
140
|
'''
|
|
141
141
|
|
|
142
|
-
TASKS_JSON_TEMPLATE =
|
|
142
|
+
TASKS_JSON_TEMPLATE = """[
|
|
143
143
|
{{
|
|
144
144
|
"prompt": "Increment the counter to reach 10",
|
|
145
145
|
"mcp_config": {{
|
|
@@ -159,12 +159,12 @@ TASKS_JSON_TEMPLATE = '''[
|
|
|
159
159
|
}}
|
|
160
160
|
}}
|
|
161
161
|
]
|
|
162
|
-
|
|
162
|
+
"""
|
|
163
163
|
|
|
164
164
|
TEST_TASK_TEMPLATE = '''#!/usr/bin/env python
|
|
165
165
|
"""Simple example of running tasks from tasks.json.
|
|
166
166
|
|
|
167
|
-
Make sure to run 'hud dev --build' in another terminal first
|
|
167
|
+
Make sure to run 'hud dev --build' in another terminal first, and install hud-python[agents]
|
|
168
168
|
"""
|
|
169
169
|
|
|
170
170
|
import asyncio
|
|
@@ -208,14 +208,16 @@ async def main():
|
|
|
208
208
|
|
|
209
209
|
if __name__ == "__main__":
|
|
210
210
|
asyncio.run(main())
|
|
211
|
-
'''
|
|
211
|
+
''' # noqa: E501
|
|
212
212
|
|
|
213
|
-
NOTEBOOK_TEMPLATE =
|
|
213
|
+
NOTEBOOK_TEMPLATE = """{{
|
|
214
214
|
"cells": [
|
|
215
215
|
{{
|
|
216
216
|
"cell_type": "markdown",
|
|
217
217
|
"metadata": {{}},
|
|
218
218
|
"source": [
|
|
219
|
+
"Make sure to `pip install hud-python[agents]` before running this notebook\\n",
|
|
220
|
+
"\\n",
|
|
219
221
|
"### Step 1: Create a Task\\n",
|
|
220
222
|
"\\n",
|
|
221
223
|
"A Task combines:\\n",
|
|
@@ -427,9 +429,18 @@ NOTEBOOK_TEMPLATE = '''{{
|
|
|
427
429
|
"nbformat": 4,
|
|
428
430
|
"nbformat_minor": 4
|
|
429
431
|
}}
|
|
430
|
-
|
|
432
|
+
""" # noqa: E501
|
|
431
433
|
|
|
432
|
-
|
|
434
|
+
ENV_FILE_TEMPLATE = """# HUD API Configuration
|
|
435
|
+
# Get your API key from https://app.hud.so/account
|
|
436
|
+
HUD_API_KEY=your_hud_api_key_here
|
|
437
|
+
|
|
438
|
+
# Anthropic API Configuration (optional)
|
|
439
|
+
# Required for using Claude agents - get from https://console.anthropic.com/
|
|
440
|
+
ANTHROPIC_API_KEY=your_anthropic_api_key_here
|
|
441
|
+
"""
|
|
442
|
+
|
|
443
|
+
README_TEMPLATE = """# {title}
|
|
433
444
|
|
|
434
445
|
A minimal HUD environment demonstrating the Task pattern with a simple counter.
|
|
435
446
|
|
|
@@ -437,21 +448,23 @@ A minimal HUD environment demonstrating the Task pattern with a simple counter.
|
|
|
437
448
|
|
|
438
449
|
### Interactive Development
|
|
439
450
|
```bash
|
|
440
|
-
# 1.
|
|
451
|
+
# 1. Configure your API keys (optional - only needed for evaluation)
|
|
452
|
+
# Edit .env file to add your HUD_API_KEY and ANTHROPIC_API_KEY
|
|
453
|
+
|
|
454
|
+
# 2. Start the environment (optional: with inspector)
|
|
441
455
|
hud dev --build --inspector
|
|
442
456
|
|
|
443
|
-
#
|
|
457
|
+
# 3. Choose your preferred way to test:
|
|
444
458
|
|
|
445
|
-
# Option A:
|
|
459
|
+
# Option A: Run the task with Claude (requires ANTHROPIC_API_KEY)
|
|
460
|
+
hud eval tasks.json --agent claude
|
|
446
461
|
|
|
447
|
-
# Option B:
|
|
448
|
-
|
|
449
|
-
|
|
462
|
+
# Option B: Interactive notebook test_env.ipynb (great for learning!)
|
|
463
|
+
# Requires installation:
|
|
464
|
+
pip install hud-python[agents]
|
|
450
465
|
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
# Run the task with Claude
|
|
454
|
-
hud eval tasks.json --agent claude
|
|
466
|
+
# Option C: Simple Python script (runs all tasks from tasks.json)
|
|
467
|
+
python test_task.py
|
|
455
468
|
```
|
|
456
469
|
|
|
457
470
|
## How HUD Environments Work
|
|
@@ -471,7 +484,7 @@ Once your environment is ready, you can share it with the community:
|
|
|
471
484
|
|
|
472
485
|
### 1. Push to Registry
|
|
473
486
|
```bash
|
|
474
|
-
# Build and push your environment (
|
|
487
|
+
# Build and push your environment (requires docker hub login and hud api key)
|
|
475
488
|
hud build
|
|
476
489
|
hud push
|
|
477
490
|
```
|
|
@@ -510,7 +523,7 @@ hud eval "your-org/your-dataset" --agent claude
|
|
|
510
523
|
**Note**: Only public HuggingFace datasets appear as leaderboards!
|
|
511
524
|
|
|
512
525
|
📚 Learn more: [Creating Benchmarks](https://docs.hud.so/evaluate-agents/create-benchmarks) | [Leaderboards](https://docs.hud.so/evaluate-agents/leaderboards)
|
|
513
|
-
|
|
526
|
+
""" # noqa: E501
|
|
514
527
|
|
|
515
528
|
|
|
516
529
|
def sanitize_name(name: str) -> str:
|
|
@@ -613,6 +626,12 @@ def create_environment(name: str | None, directory: str, force: bool) -> None:
|
|
|
613
626
|
notebook_path.write_text(notebook_content, encoding="utf-8")
|
|
614
627
|
files_created.append("test_env.ipynb")
|
|
615
628
|
|
|
629
|
+
# .env file
|
|
630
|
+
env_file_path = target_dir / ".env"
|
|
631
|
+
env_file_content = ENV_FILE_TEMPLATE.strip() + "\n"
|
|
632
|
+
env_file_path.write_text(env_file_content, encoding="utf-8")
|
|
633
|
+
files_created.append(".env")
|
|
634
|
+
|
|
616
635
|
# Success message
|
|
617
636
|
design.header(f"Created HUD Environment: {name}")
|
|
618
637
|
|