hud-python 0.4.35__py3-none-any.whl → 0.4.37__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- hud/agents/__init__.py +2 -0
- hud/agents/lite_llm.py +72 -0
- hud/agents/openai_chat_generic.py +21 -7
- hud/agents/tests/test_claude.py +32 -7
- hud/agents/tests/test_openai.py +29 -6
- hud/cli/__init__.py +228 -79
- hud/cli/build.py +26 -6
- hud/cli/dev.py +21 -40
- hud/cli/eval.py +96 -15
- hud/cli/flows/tasks.py +198 -65
- hud/cli/init.py +222 -629
- hud/cli/pull.py +6 -0
- hud/cli/push.py +11 -1
- hud/cli/rl/__init__.py +14 -4
- hud/cli/rl/celebrate.py +187 -0
- hud/cli/rl/config.py +15 -8
- hud/cli/rl/local_runner.py +44 -20
- hud/cli/rl/remote_runner.py +166 -87
- hud/cli/rl/viewer.py +141 -0
- hud/cli/rl/wait_utils.py +89 -0
- hud/cli/tests/test_build.py +3 -27
- hud/cli/tests/test_mcp_server.py +1 -12
- hud/cli/utils/config.py +85 -0
- hud/cli/utils/docker.py +21 -39
- hud/cli/utils/env_check.py +196 -0
- hud/cli/utils/environment.py +4 -3
- hud/cli/utils/interactive.py +2 -1
- hud/cli/utils/local_runner.py +204 -0
- hud/cli/utils/metadata.py +3 -1
- hud/cli/utils/package_runner.py +292 -0
- hud/cli/utils/remote_runner.py +4 -1
- hud/cli/utils/source_hash.py +108 -0
- hud/clients/base.py +1 -1
- hud/clients/fastmcp.py +1 -1
- hud/clients/mcp_use.py +30 -7
- hud/datasets/parallel.py +3 -1
- hud/datasets/runner.py +4 -1
- hud/otel/config.py +1 -1
- hud/otel/context.py +40 -6
- hud/rl/buffer.py +3 -0
- hud/rl/tests/test_learner.py +1 -1
- hud/rl/vllm_adapter.py +1 -1
- hud/server/server.py +234 -7
- hud/server/tests/test_add_tool.py +60 -0
- hud/server/tests/test_context.py +128 -0
- hud/server/tests/test_mcp_server_handlers.py +44 -0
- hud/server/tests/test_mcp_server_integration.py +405 -0
- hud/server/tests/test_mcp_server_more.py +247 -0
- hud/server/tests/test_run_wrapper.py +53 -0
- hud/server/tests/test_server_extra.py +166 -0
- hud/server/tests/test_sigterm_runner.py +78 -0
- hud/settings.py +38 -0
- hud/shared/hints.py +2 -2
- hud/telemetry/job.py +2 -2
- hud/types.py +9 -2
- hud/utils/tasks.py +32 -24
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.4.35.dist-info → hud_python-0.4.37.dist-info}/METADATA +43 -23
- {hud_python-0.4.35.dist-info → hud_python-0.4.37.dist-info}/RECORD +63 -46
- {hud_python-0.4.35.dist-info → hud_python-0.4.37.dist-info}/WHEEL +0 -0
- {hud_python-0.4.35.dist-info → hud_python-0.4.37.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.35.dist-info → hud_python-0.4.37.dist-info}/licenses/LICENSE +0 -0
hud/cli/init.py
CHANGED
|
@@ -2,562 +2,198 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
+
import os
|
|
6
|
+
import tarfile
|
|
7
|
+
import tempfile
|
|
8
|
+
import time
|
|
5
9
|
from pathlib import Path
|
|
6
10
|
|
|
11
|
+
import httpx
|
|
12
|
+
import questionary
|
|
7
13
|
import typer
|
|
8
|
-
from rich.panel import Panel
|
|
9
|
-
from rich.syntax import Syntax
|
|
10
14
|
|
|
11
15
|
from hud.utils.hud_console import HUDConsole
|
|
12
16
|
|
|
13
|
-
#
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
[
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
[
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
# Calculate reward as progress towards target
|
|
129
|
-
reward = min(current_count / target, 1.0) if target > 0 else 0.0
|
|
130
|
-
done = current_count >= target
|
|
131
|
-
|
|
132
|
-
return EvaluationResult(
|
|
133
|
-
reward=reward,
|
|
134
|
-
done=done,
|
|
135
|
-
content=f"Counter at {{current_count}}/{{target}}"
|
|
136
|
-
)
|
|
137
|
-
|
|
138
|
-
if __name__ == "__main__":
|
|
139
|
-
mcp.run()
|
|
140
|
-
'''
|
|
141
|
-
|
|
142
|
-
TASKS_JSON_TEMPLATE = """[
|
|
143
|
-
{{
|
|
144
|
-
"prompt": "Increment the counter to reach 10",
|
|
145
|
-
"mcp_config": {{
|
|
146
|
-
"{name}": {{
|
|
147
|
-
"url": "http://localhost:8765/mcp"
|
|
148
|
-
}}
|
|
149
|
-
}},
|
|
150
|
-
"setup_tool": {{
|
|
151
|
-
"name": "setup",
|
|
152
|
-
"arguments": {{}}
|
|
153
|
-
}},
|
|
154
|
-
"evaluate_tool": {{
|
|
155
|
-
"name": "evaluate",
|
|
156
|
-
"arguments": {{
|
|
157
|
-
"target": 10
|
|
158
|
-
}}
|
|
159
|
-
}}
|
|
160
|
-
}}
|
|
161
|
-
]
|
|
162
|
-
"""
|
|
163
|
-
|
|
164
|
-
TEST_TASK_TEMPLATE = '''#!/usr/bin/env python
|
|
165
|
-
"""Simple example of running tasks from tasks.json.
|
|
166
|
-
|
|
167
|
-
Make sure to run 'hud dev --build' in another terminal first, and install hud-python[agents]
|
|
168
|
-
"""
|
|
169
|
-
|
|
170
|
-
import asyncio
|
|
171
|
-
import json
|
|
172
|
-
from hud.datasets import Task
|
|
173
|
-
from hud.clients import MCPClient
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
async def run_task(task_data: dict):
|
|
177
|
-
task = Task(**task_data)
|
|
178
|
-
client = MCPClient(mcp_config=task.mcp_config)
|
|
17
|
+
# Public SDK repository (owner / repo / branch) that the preset templates are fetched from.
GITHUB_OWNER = "hud-evals"
GITHUB_REPO = "hud-python"
GITHUB_BRANCH = "main"

# User-facing preset name -> environment folder name inside that repository.
PRESET_MAP: dict[str, str | None] = {
    "blank": "blank",
    "deep-research": "remote_browser",
    "browser": "browser",
}

# Directory names that are pruned when walking a downloaded template.
SKIP_DIR_NAMES = {
    "node_modules",
    "__pycache__",
    "dist",
    "build",
    ".next",
    ".git",
}

# Template files in which the environment-name placeholder is substituted.
PLACEHOLDER_FILES = {"pyproject.toml", "tasks.json", "src/controller/server.py", "test_env.ipynb", "README.md"}
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _replace_placeholders(target_dir: Path, env_name: str) -> list[str]:
    """Replace the template placeholder with the environment name in known files.

    Walks ``target_dir`` (pruning directories named in ``SKIP_DIR_NAMES``) and,
    in every file whose trailing path components equal an entry of
    ``PLACEHOLDER_FILES``, rewrites each occurrence of the literal ``test_test``
    with a normalized form of ``env_name``.

    Args:
        target_dir: Directory containing the downloaded template files.
        env_name: Environment name to substitute for the placeholder. Every
            character that is neither alphanumeric nor ``_`` becomes ``_``.

    Returns:
        Paths, relative to ``target_dir``, of the files that were rewritten.
    """
    placeholder = "test_test"

    # One pass covers "-" and " " as well as any other special character.
    normalized_name = "".join(c if c.isalnum() or c == "_" else "_" for c in env_name)

    # Match on whole path components rather than on a string suffix, so that
    # "README.md" selects ".../README.md" but not ".../OLD_README.md".
    wanted = [tuple(entry.split("/")) for entry in PLACEHOLDER_FILES]

    modified_files: list[str] = []
    for root, dirs, files in os.walk(target_dir):
        # Prune in place so os.walk does not descend into skipped directories.
        dirs[:] = [d for d in dirs if d not in SKIP_DIR_NAMES]

        for file in files:
            file_path = Path(root) / file
            rel_path = file_path.relative_to(target_dir)
            if not any(rel_path.parts[-len(parts) :] == parts for parts in wanted):
                continue

            try:
                content = file_path.read_text(encoding="utf-8")
                if placeholder not in content:
                    continue
                file_path.write_text(
                    content.replace(placeholder, normalized_name), encoding="utf-8"
                )
            except (OSError, UnicodeError):
                # Best effort: leave files that cannot be read or written as
                # UTF-8 text untouched instead of aborting the whole run.
                continue
            modified_files.append(str(rel_path))

    return modified_files
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _prompt_for_preset() -> str:
    """Interactively pick a preset; fall back to ``"blank"`` on any problem.

    Returns:
        The preset name matching the selected menu entry, or ``"blank"`` when
        nothing is selected, the selection is not recognized, or the prompt
        raises.
    """
    # Menu label -> preset name, in the order the entries are displayed.
    label_to_preset = {
        "blank │ minimal template": "blank",
        "deep-research │ remote browser preset": "deep-research",
        "browser │ local browser preset": "browser",
    }
    try:
        labels = list(label_to_preset)
        prompt = questionary.select("Choose a preset", choices=labels, default=labels[0])
        answer = prompt.ask()
        if not answer:
            return "blank"
        return label_to_preset.get(answer, "blank")
    except Exception:
        return "blank"
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _download_tarball_subdir(
|
|
107
|
+
owner: str, repo: str, ref: str, subdir: str, dest_dir: Path, files_created: list[str]
|
|
108
|
+
) -> None:
|
|
109
|
+
"""Download a GitHub tarball and extract only a subdirectory."""
|
|
110
|
+
tarball_url = f"https://codeload.github.com/{owner}/{repo}/tar.gz/{ref}"
|
|
111
|
+
|
|
112
|
+
token = os.getenv("GITHUB_TOKEN")
|
|
113
|
+
headers = {"Authorization": f"token {token}"} if token else {}
|
|
114
|
+
with (
|
|
115
|
+
tempfile.NamedTemporaryFile(delete=False) as tmp_file,
|
|
116
|
+
httpx.Client(timeout=60) as client,
|
|
117
|
+
client.stream(
|
|
118
|
+
"GET",
|
|
119
|
+
tarball_url,
|
|
120
|
+
headers=headers,
|
|
121
|
+
) as resp,
|
|
122
|
+
):
|
|
123
|
+
if resp.status_code != 200:
|
|
124
|
+
raise RuntimeError(
|
|
125
|
+
f"Failed to download tarball (HTTP {resp.status_code}) from {tarball_url}"
|
|
126
|
+
)
|
|
127
|
+
for chunk in resp.iter_bytes():
|
|
128
|
+
if chunk:
|
|
129
|
+
tmp_file.write(chunk)
|
|
130
|
+
tmp_path = Path(tmp_file.name)
|
|
179
131
|
|
|
180
132
|
try:
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
133
|
+
with tarfile.open(tmp_path, mode="r:gz") as tar:
|
|
134
|
+
members = tar.getmembers()
|
|
135
|
+
if not members:
|
|
136
|
+
return
|
|
137
|
+
top = members[0].name.split("/", 1)[0]
|
|
138
|
+
target_prefix = f"{top}/environments/{subdir.strip('/')}"
|
|
139
|
+
|
|
140
|
+
for member in members:
|
|
141
|
+
name = member.name
|
|
142
|
+
if not (name == target_prefix or name.startswith(target_prefix + "/")):
|
|
143
|
+
continue
|
|
144
|
+
|
|
145
|
+
rel_path = name[len(target_prefix) :].lstrip("/")
|
|
146
|
+
if not rel_path:
|
|
147
|
+
dest_dir.mkdir(parents=True, exist_ok=True)
|
|
148
|
+
continue
|
|
149
|
+
|
|
150
|
+
out_path = (dest_dir / rel_path).resolve()
|
|
151
|
+
dest_root = dest_dir.resolve()
|
|
152
|
+
if not str(out_path).startswith(str(dest_root)):
|
|
153
|
+
continue
|
|
154
|
+
|
|
155
|
+
if member.isdir():
|
|
156
|
+
out_path.mkdir(parents=True, exist_ok=True)
|
|
157
|
+
elif member.isreg():
|
|
158
|
+
out_path.parent.mkdir(parents=True, exist_ok=True)
|
|
159
|
+
extracted = tar.extractfile(member)
|
|
160
|
+
if extracted is None:
|
|
161
|
+
continue
|
|
162
|
+
with open(out_path, "wb") as f:
|
|
163
|
+
f.write(extracted.read())
|
|
164
|
+
# Use absolute dest_root for relative path computation to avoid Windows issues
|
|
165
|
+
files_created.append(str(out_path.relative_to(dest_root)))
|
|
201
166
|
finally:
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
async def main():
|
|
206
|
-
for task_data in json.load(open("tasks.json")):
|
|
207
|
-
await run_task(task_data)
|
|
208
|
-
|
|
209
|
-
if __name__ == "__main__":
|
|
210
|
-
asyncio.run(main())
|
|
211
|
-
''' # noqa: E501
|
|
212
|
-
|
|
213
|
-
NOTEBOOK_TEMPLATE = """{{
|
|
214
|
-
"cells": [
|
|
215
|
-
{{
|
|
216
|
-
"cell_type": "markdown",
|
|
217
|
-
"metadata": {{}},
|
|
218
|
-
"source": [
|
|
219
|
-
"Make sure to `pip install hud-python[agents]` before running this notebook\\n",
|
|
220
|
-
"\\n",
|
|
221
|
-
"### Step 1: Create a Task\\n",
|
|
222
|
-
"\\n",
|
|
223
|
-
"A Task combines:\\n",
|
|
224
|
-
"- **Prompt**: What we want an agent to accomplish\\n",
|
|
225
|
-
"- **MCP Config**: How to spawn the environment\\n",
|
|
226
|
-
"- **Setup Tool**: How to prepare the environment\\n",
|
|
227
|
-
"- **Evaluate Tool**: How to check if the task succeeded"
|
|
228
|
-
]
|
|
229
|
-
}},
|
|
230
|
-
{{
|
|
231
|
-
"cell_type": "code",
|
|
232
|
-
"execution_count": null,
|
|
233
|
-
"metadata": {{}},
|
|
234
|
-
"outputs": [],
|
|
235
|
-
"source": [
|
|
236
|
-
"from hud.datasets import Task\\n",
|
|
237
|
-
"from hud.types import MCPToolCall\\n",
|
|
238
|
-
"\\n",
|
|
239
|
-
"# Create a task that uses our {name} environment\\n",
|
|
240
|
-
"# See tasks.json for how to build a loadable task dataset\\n",
|
|
241
|
-
"task = Task(\\n",
|
|
242
|
-
" prompt=\\"Increment the counter to reach 10\\",\\n",
|
|
243
|
-
" mcp_config={{\\n",
|
|
244
|
-
" \\"{name}\\": {{\\n",
|
|
245
|
-
" \\"url\\": \\"http://localhost:8765/mcp\\"\\n",
|
|
246
|
-
" }},\\n",
|
|
247
|
-
" }},\\n",
|
|
248
|
-
" setup_tool=MCPToolCall(name=\\"setup\\", arguments={{}}),\\n",
|
|
249
|
-
" evaluate_tool=MCPToolCall(name=\\"evaluate\\", arguments={{\\"target\\": 10}}),\\n",
|
|
250
|
-
")"
|
|
251
|
-
]
|
|
252
|
-
}},
|
|
253
|
-
{{
|
|
254
|
-
"cell_type": "markdown",
|
|
255
|
-
"metadata": {{}},
|
|
256
|
-
"source": [
|
|
257
|
-
"### Step 2: Initialize MCP Client\\n",
|
|
258
|
-
"\\n",
|
|
259
|
-
"Run `hud dev --build` before this cell to intialize the server at `http://localhost:8765/mcp`"
|
|
260
|
-
]
|
|
261
|
-
}},
|
|
262
|
-
{{
|
|
263
|
-
"cell_type": "code",
|
|
264
|
-
"execution_count": null,
|
|
265
|
-
"metadata": {{}},
|
|
266
|
-
"outputs": [],
|
|
267
|
-
"source": [
|
|
268
|
-
"from hud.clients import MCPClient\\n",
|
|
269
|
-
"\\n",
|
|
270
|
-
"# Create the client\\n",
|
|
271
|
-
"client = MCPClient(mcp_config=task.mcp_config, auto_trace=False)\\n",
|
|
272
|
-
"\\n",
|
|
273
|
-
"# Initialize it (this connects to our dev server)\\n",
|
|
274
|
-
"await client.initialize()"
|
|
275
|
-
]
|
|
276
|
-
}},
|
|
277
|
-
{{
|
|
278
|
-
"cell_type": "markdown",
|
|
279
|
-
"metadata": {{}},
|
|
280
|
-
"source": [
|
|
281
|
-
"### Step 3: Run Setup\\n",
|
|
282
|
-
"\\n",
|
|
283
|
-
"Call the setup tool to prepare the environment according to the task."
|
|
284
|
-
]
|
|
285
|
-
}},
|
|
286
|
-
{{
|
|
287
|
-
"cell_type": "code",
|
|
288
|
-
"execution_count": null,
|
|
289
|
-
"metadata": {{}},
|
|
290
|
-
"outputs": [],
|
|
291
|
-
"source": [
|
|
292
|
-
"# Run the setup from our task\\n",
|
|
293
|
-
"setup_result = await client.call_tool(task.setup_tool) # type: ignore\\n",
|
|
294
|
-
"print(f\\"Setup result: {{setup_result}}\\")"
|
|
295
|
-
]
|
|
296
|
-
}},
|
|
297
|
-
{{
|
|
298
|
-
"cell_type": "markdown",
|
|
299
|
-
"metadata": {{}},
|
|
300
|
-
"source": [
|
|
301
|
-
"### Step 4: Perform Actions\\n",
|
|
302
|
-
"\\n",
|
|
303
|
-
"Now we'll manually perform actions to complete the task. In a real scenario, an AI agent would figure out what actions to take."
|
|
304
|
-
]
|
|
305
|
-
}},
|
|
306
|
-
{{
|
|
307
|
-
"cell_type": "code",
|
|
308
|
-
"execution_count": null,
|
|
309
|
-
"metadata": {{}},
|
|
310
|
-
"outputs": [],
|
|
311
|
-
"source": [
|
|
312
|
-
"# Increment the counter 10 times\\n",
|
|
313
|
-
"for i in range(10):\\n",
|
|
314
|
-
" result = await client.call_tool(name=\\"act\\", arguments={{}})\\n",
|
|
315
|
-
" print(f\\"Step {{i+1}}: {{result.content}}\\")"
|
|
316
|
-
]
|
|
317
|
-
}},
|
|
318
|
-
{{
|
|
319
|
-
"cell_type": "markdown",
|
|
320
|
-
"metadata": {{}},
|
|
321
|
-
"source": [
|
|
322
|
-
"## Step 5: Evaluate Success\\n",
|
|
323
|
-
"\\n",
|
|
324
|
-
"Check if we completed the task according to the evaluation criteria."
|
|
325
|
-
]
|
|
326
|
-
}},
|
|
327
|
-
{{
|
|
328
|
-
"cell_type": "code",
|
|
329
|
-
"execution_count": null,
|
|
330
|
-
"metadata": {{}},
|
|
331
|
-
"outputs": [],
|
|
332
|
-
"source": [
|
|
333
|
-
"# Run the evaluation from our task\\n",
|
|
334
|
-
"eval_result = await client.call_tool(task.evaluate_tool) # type: ignore\\n",
|
|
335
|
-
"\\n",
|
|
336
|
-
"# The result is a list with one TextContent item containing JSON\\n",
|
|
337
|
-
"print(eval_result)"
|
|
338
|
-
]
|
|
339
|
-
}},
|
|
340
|
-
{{
|
|
341
|
-
"cell_type": "markdown",
|
|
342
|
-
"metadata": {{}},
|
|
343
|
-
"source": [
|
|
344
|
-
"### Step 6: Cleanup\\n",
|
|
345
|
-
"\\n",
|
|
346
|
-
"Always shut down the client when done to stop the Docker container. Either stop hud dev in the terminal, or run this command:"
|
|
347
|
-
]
|
|
348
|
-
}},
|
|
349
|
-
{{
|
|
350
|
-
"cell_type": "code",
|
|
351
|
-
"execution_count": null,
|
|
352
|
-
"metadata": {{}},
|
|
353
|
-
"outputs": [],
|
|
354
|
-
"source": [
|
|
355
|
-
"await client.shutdown()"
|
|
356
|
-
]
|
|
357
|
-
}},
|
|
358
|
-
{{
|
|
359
|
-
"cell_type": "markdown",
|
|
360
|
-
"metadata": {{}},
|
|
361
|
-
"source": [
|
|
362
|
-
"### Bonus: Running with an AI Agent\\n",
|
|
363
|
-
"\\n",
|
|
364
|
-
"Instead of manually calling tools, you can have an AI agent solve the task automatically."
|
|
365
|
-
]
|
|
366
|
-
}},
|
|
367
|
-
{{
|
|
368
|
-
"cell_type": "code",
|
|
369
|
-
"execution_count": null,
|
|
370
|
-
"metadata": {{}},
|
|
371
|
-
"outputs": [],
|
|
372
|
-
"source": [
|
|
373
|
-
"# Uncomment to run with Claude (requires ANTHROPIC_API_KEY)\\n",
|
|
374
|
-
"from hud.agents import ClaudeAgent\\n",
|
|
375
|
-
"\\n",
|
|
376
|
-
"# Create an agent\\n",
|
|
377
|
-
"agent = ClaudeAgent(\\n",
|
|
378
|
-
" model=\\"claude-sonnet-4-20250514\\",\\n",
|
|
379
|
-
" allowed_tools=[\\"act\\"] # Only allow the act tool\\n",
|
|
380
|
-
")\\n",
|
|
381
|
-
"\\n",
|
|
382
|
-
"# Run the task\\n",
|
|
383
|
-
"result = await agent.run(task)\\n",
|
|
384
|
-
"print(f\\"Final reward: {{result.reward}}\\")"
|
|
385
|
-
]
|
|
386
|
-
}},
|
|
387
|
-
{{
|
|
388
|
-
"cell_type": "markdown",
|
|
389
|
-
"metadata": {{}},
|
|
390
|
-
"source": [
|
|
391
|
-
"### Next Steps\\n",
|
|
392
|
-
"\\n",
|
|
393
|
-
"1. **Create your own evaluators**: Add new evaluation functions to `server.py`\\n",
|
|
394
|
-
"2. **Build complex environments**: Replace the simple counter with your actual application\\n",
|
|
395
|
-
"3. **Test with agents**: Use different AI models to solve your tasks\\n",
|
|
396
|
-
"\\n",
|
|
397
|
-
"For more examples, check out:\\n",
|
|
398
|
-
"- `environments/text_2048/` - A complete 2048 game environment\\n",
|
|
399
|
-
"- `environments/browser/` - A full browser automation environment with GUI"
|
|
400
|
-
]
|
|
401
|
-
}},
|
|
402
|
-
{{
|
|
403
|
-
"cell_type": "code",
|
|
404
|
-
"execution_count": null,
|
|
405
|
-
"metadata": {{}},
|
|
406
|
-
"outputs": [],
|
|
407
|
-
"source": []
|
|
408
|
-
}}
|
|
409
|
-
],
|
|
410
|
-
"metadata": {{
|
|
411
|
-
"kernelspec": {{
|
|
412
|
-
"display_name": "Python 3",
|
|
413
|
-
"language": "python",
|
|
414
|
-
"name": "python3"
|
|
415
|
-
}},
|
|
416
|
-
"language_info": {{
|
|
417
|
-
"codemirror_mode": {{
|
|
418
|
-
"name": "ipython",
|
|
419
|
-
"version": 3
|
|
420
|
-
}},
|
|
421
|
-
"file_extension": ".py",
|
|
422
|
-
"mimetype": "text/x-python",
|
|
423
|
-
"name": "python",
|
|
424
|
-
"nbconvert_exporter": "python",
|
|
425
|
-
"pygments_lexer": "ipython3",
|
|
426
|
-
"version": "3.11.0"
|
|
427
|
-
}}
|
|
428
|
-
}},
|
|
429
|
-
"nbformat": 4,
|
|
430
|
-
"nbformat_minor": 4
|
|
431
|
-
}}
|
|
432
|
-
""" # noqa: E501
|
|
433
|
-
|
|
434
|
-
ENV_FILE_TEMPLATE = """# HUD API Configuration
|
|
435
|
-
# Get your API key from https://app.hud.so/account
|
|
436
|
-
HUD_API_KEY=""
|
|
437
|
-
|
|
438
|
-
# Anthropic API Configuration (optional)
|
|
439
|
-
# Required for using Claude agents - get from https://console.anthropic.com/
|
|
440
|
-
ANTHROPIC_API_KEY=""
|
|
441
|
-
"""
|
|
442
|
-
|
|
443
|
-
README_TEMPLATE = """# {title}
|
|
444
|
-
|
|
445
|
-
A minimal HUD environment demonstrating the Task pattern with a simple counter.
|
|
446
|
-
|
|
447
|
-
## Quick Start
|
|
448
|
-
|
|
449
|
-
### Interactive Development
|
|
450
|
-
```bash
|
|
451
|
-
# 1. Configure your API keys (optional - only needed for evaluation)
|
|
452
|
-
# Edit .env file to add your HUD_API_KEY and ANTHROPIC_API_KEY
|
|
453
|
-
|
|
454
|
-
# 2. Start the environment (optional: with inspector)
|
|
455
|
-
hud dev --build --inspector
|
|
456
|
-
|
|
457
|
-
# 3. Choose your preferred way to test:
|
|
458
|
-
|
|
459
|
-
# Option A: Run the task with Claude (requires ANTHROPIC_API_KEY)
|
|
460
|
-
hud eval tasks.json --agent claude
|
|
461
|
-
|
|
462
|
-
# Option B: Interactive notebook test_env.ipynb (great for learning!)
|
|
463
|
-
# Requires installation:
|
|
464
|
-
pip install hud-python[agents]
|
|
465
|
-
|
|
466
|
-
# Option C: Simple Python script (runs all tasks from tasks.json)
|
|
467
|
-
python test_task.py
|
|
468
|
-
```
|
|
469
|
-
|
|
470
|
-
## How HUD Environments Work
|
|
471
|
-
|
|
472
|
-
The environment is split into two components:
|
|
473
|
-
|
|
474
|
-
- **`env.py`** - Stateful logic that persists across reloads
|
|
475
|
-
- **`server.py`** - MCP server with tools (reloads on file changes)
|
|
476
|
-
|
|
477
|
-
This separation is crucial for `hud dev` - it allows you to modify the MCP tools and see changes immediately without losing the environment state. The environment runs as a separate process and communicates via socket, while the server can be restarted freely.
|
|
478
|
-
|
|
479
|
-
If you are ever seeing issues with the environment itself, running `hud dev --full-reload` will reload both the environment and the server.
|
|
480
|
-
|
|
481
|
-
## Publishing Your Environment
|
|
482
|
-
|
|
483
|
-
Once your environment is ready, you can share it with the community:
|
|
484
|
-
|
|
485
|
-
### 1. Push to Registry
|
|
486
|
-
```bash
|
|
487
|
-
# Build and push your environment (requires docker hub login and hud api key)
|
|
488
|
-
hud build
|
|
489
|
-
hud push
|
|
490
|
-
```
|
|
491
|
-
|
|
492
|
-
### 2. Create a Dataset
|
|
493
|
-
|
|
494
|
-
Create a dataset on HuggingFace with your tasks:
|
|
495
|
-
|
|
496
|
-
**Option A: Upload manually**
|
|
497
|
-
1. Upload your `tasks.json` to HuggingFace
|
|
498
|
-
2. Make sure it's **public** to appear on leaderboards
|
|
499
|
-
|
|
500
|
-
**Option B: Use the SDK**
|
|
501
|
-
```python
|
|
502
|
-
from hud.datasets import save_tasks
|
|
503
|
-
import json
|
|
504
|
-
|
|
505
|
-
# Load your tasks
|
|
506
|
-
with open("tasks.json") as f:
|
|
507
|
-
tasks = json.load(f)
|
|
508
|
-
|
|
509
|
-
# Push to HuggingFace
|
|
510
|
-
save_tasks(tasks, repo_id="your-org/your-dataset")
|
|
511
|
-
```
|
|
512
|
-
|
|
513
|
-
### 3. Run and Track Performance
|
|
514
|
-
|
|
515
|
-
```bash
|
|
516
|
-
# Run Claude on your benchmark
|
|
517
|
-
hud eval "your-org/your-dataset" --agent claude
|
|
518
|
-
|
|
519
|
-
# View results at:
|
|
520
|
-
# app.hud.so/leaderboards/your-org/your-dataset
|
|
521
|
-
```
|
|
522
|
-
|
|
523
|
-
**Note**: Only public HuggingFace datasets appear as leaderboards!
|
|
524
|
-
|
|
525
|
-
📚 Learn more: [Creating Benchmarks](https://docs.hud.so/evaluate-agents/create-benchmarks) | [Leaderboards](https://docs.hud.so/evaluate-agents/leaderboards)
|
|
526
|
-
""" # noqa: E501
|
|
167
|
+
from contextlib import suppress
|
|
527
168
|
|
|
169
|
+
with suppress(Exception):
|
|
170
|
+
os.remove(tmp_path)
|
|
528
171
|
|
|
529
|
-
def sanitize_name(name: str) -> str:
|
|
530
|
-
"""Convert a name to a valid Python package name."""
|
|
531
|
-
# Replace spaces and hyphens with underscores
|
|
532
|
-
name = name.replace(" ", "_").replace("-", "_")
|
|
533
|
-
# Remove any non-alphanumeric characters except underscores
|
|
534
|
-
name = "".join(c for c in name if c.isalnum() or c == "_")
|
|
535
|
-
# Ensure it doesn't start with a number
|
|
536
|
-
if name and name[0].isdigit():
|
|
537
|
-
name = f"env_{name}"
|
|
538
|
-
return name.lower()
|
|
539
172
|
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
173
|
+
def create_environment(
|
|
174
|
+
name: str | None, directory: str, force: bool, preset: str | None = None
|
|
175
|
+
) -> None:
|
|
176
|
+
"""Create a new HUD environment by downloading a preset from the repo."""
|
|
543
177
|
|
|
544
178
|
hud_console = HUDConsole()
|
|
545
179
|
|
|
546
|
-
# Determine environment name
|
|
180
|
+
# Determine environment name/target directory
|
|
547
181
|
if name is None:
|
|
548
|
-
# Use current directory name
|
|
549
182
|
current_dir = Path.cwd()
|
|
550
183
|
name = current_dir.name
|
|
551
184
|
target_dir = current_dir
|
|
552
185
|
hud_console.info(f"Using current directory name: {name}")
|
|
553
186
|
else:
|
|
554
|
-
# Create new directory
|
|
555
187
|
target_dir = Path(directory) / name
|
|
556
188
|
|
|
557
|
-
#
|
|
558
|
-
|
|
559
|
-
if
|
|
560
|
-
hud_console.warning(
|
|
189
|
+
# Choose preset
|
|
190
|
+
preset_normalized = (preset or "").strip().lower() if preset else _prompt_for_preset()
|
|
191
|
+
if preset_normalized not in PRESET_MAP:
|
|
192
|
+
hud_console.warning(
|
|
193
|
+
f"Unknown preset '{preset_normalized}', defaulting to 'blank' "
|
|
194
|
+
"(available: blank, deep-research, browser)"
|
|
195
|
+
)
|
|
196
|
+
preset_normalized = "blank"
|
|
561
197
|
|
|
562
198
|
# Check if directory exists
|
|
563
199
|
if target_dir.exists() and any(target_dir.iterdir()):
|
|
@@ -568,80 +204,59 @@ def create_environment(name: str | None, directory: str, force: bool) -> None:
|
|
|
568
204
|
else:
|
|
569
205
|
hud_console.warning(f"Overwriting existing files in {target_dir}")
|
|
570
206
|
|
|
571
|
-
#
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
pyproject_path = target_dir / "pyproject.toml"
|
|
585
|
-
pyproject_content = PYPROJECT_TEMPLATE.format(name=package_name).strip() + "\n"
|
|
586
|
-
pyproject_path.write_text(pyproject_content, encoding="utf-8")
|
|
587
|
-
files_created.append("pyproject.toml")
|
|
588
|
-
|
|
589
|
-
# README.md
|
|
590
|
-
readme_path = target_dir / "README.md"
|
|
591
|
-
readme_content = README_TEMPLATE.format(name=package_name, title=name).strip() + "\n"
|
|
592
|
-
readme_path.write_text(readme_content, encoding="utf-8")
|
|
593
|
-
files_created.append("README.md")
|
|
594
|
-
|
|
595
|
-
# Python files
|
|
596
|
-
# __init__.py
|
|
597
|
-
init_path = src_dir / "__init__.py"
|
|
598
|
-
init_path.write_text('"""Controller Package"""\n', encoding="utf-8")
|
|
599
|
-
files_created.append("src/controller/__init__.py")
|
|
600
|
-
|
|
601
|
-
# env.py
|
|
602
|
-
env_path = src_dir / "env.py"
|
|
603
|
-
env_path.write_text(ENV_TEMPLATE.strip() + "\n", encoding="utf-8")
|
|
604
|
-
files_created.append("src/controller/env.py")
|
|
605
|
-
|
|
606
|
-
# server.py (need to escape the double braces for .format())
|
|
607
|
-
server_path = src_dir / "server.py"
|
|
608
|
-
server_content = SERVER_TEMPLATE.format(name=package_name).strip() + "\n"
|
|
609
|
-
server_path.write_text(server_content, encoding="utf-8")
|
|
610
|
-
files_created.append("src/controller/server.py")
|
|
611
|
-
|
|
612
|
-
# tasks.json
|
|
613
|
-
tasks_path = target_dir / "tasks.json"
|
|
614
|
-
tasks_content = TASKS_JSON_TEMPLATE.format(name=package_name).strip() + "\n"
|
|
615
|
-
tasks_path.write_text(tasks_content, encoding="utf-8")
|
|
616
|
-
files_created.append("tasks.json")
|
|
617
|
-
|
|
618
|
-
# test_task.py
|
|
619
|
-
test_task_path = target_dir / "test_task.py"
|
|
620
|
-
test_task_path.write_text(TEST_TASK_TEMPLATE.strip() + "\n", encoding="utf-8")
|
|
621
|
-
files_created.append("test_task.py")
|
|
622
|
-
|
|
623
|
-
# notebook.ipynb
|
|
624
|
-
notebook_path = target_dir / "test_env.ipynb"
|
|
625
|
-
notebook_content = NOTEBOOK_TEMPLATE.format(name=package_name).strip() + "\n"
|
|
626
|
-
notebook_path.write_text(notebook_content, encoding="utf-8")
|
|
627
|
-
files_created.append("test_env.ipynb")
|
|
628
|
-
|
|
629
|
-
# .env file
|
|
630
|
-
env_file_path = target_dir / ".env"
|
|
631
|
-
env_file_content = ENV_FILE_TEMPLATE.strip() + "\n"
|
|
632
|
-
env_file_path.write_text(env_file_content, encoding="utf-8")
|
|
633
|
-
files_created.append(".env")
|
|
634
|
-
|
|
635
|
-
# Success message
|
|
636
|
-
hud_console.header(f"Created HUD Environment: {name}")
|
|
637
|
-
|
|
638
|
-
hud_console.section_title("Files created")
|
|
639
|
-
for file in files_created:
|
|
640
|
-
hud_console.status_item(file, "created")
|
|
207
|
+
# Download preset from GitHub
|
|
208
|
+
env_folder = PRESET_MAP[preset_normalized]
|
|
209
|
+
if env_folder is None:
|
|
210
|
+
hud_console.error("Internal error: preset mapping missing folder name")
|
|
211
|
+
raise typer.Exit(1)
|
|
212
|
+
|
|
213
|
+
hud_console.header(f"Initializing HUD Environment: {name} (preset: {preset_normalized})")
|
|
214
|
+
hud_console.section_title("Downloading template from public SDK")
|
|
215
|
+
source_url = (
|
|
216
|
+
f"https://github.com/{GITHUB_OWNER}/{GITHUB_REPO}/tree/"
|
|
217
|
+
f"{GITHUB_BRANCH}/environments/{env_folder}"
|
|
218
|
+
)
|
|
219
|
+
hud_console.info("Source: " + source_url)
|
|
641
220
|
|
|
642
|
-
|
|
221
|
+
target_dir.mkdir(parents=True, exist_ok=True)
|
|
222
|
+
|
|
223
|
+
started = time.time()
|
|
224
|
+
files_created_dl: list[str] = []
|
|
225
|
+
try:
|
|
226
|
+
_download_tarball_subdir(
|
|
227
|
+
owner=GITHUB_OWNER,
|
|
228
|
+
repo=GITHUB_REPO,
|
|
229
|
+
ref=GITHUB_BRANCH,
|
|
230
|
+
subdir=env_folder,
|
|
231
|
+
dest_dir=target_dir,
|
|
232
|
+
files_created=files_created_dl,
|
|
233
|
+
)
|
|
234
|
+
except Exception as e:
|
|
235
|
+
hud_console.error(f"Failed to download preset '{preset_normalized}': {e}")
|
|
236
|
+
raise typer.Exit(1) from None
|
|
643
237
|
|
|
644
|
-
|
|
238
|
+
duration_ms = int((time.time() - started) * 1000)
|
|
239
|
+
hud_console.success(
|
|
240
|
+
f"Downloaded {len(files_created_dl)} files in {duration_ms} ms into {target_dir}"
|
|
241
|
+
)
|
|
242
|
+
|
|
243
|
+
# Replace placeholders in template files
|
|
244
|
+
hud_console.section_title("Customizing template files")
|
|
245
|
+
modified_files = _replace_placeholders(target_dir, name)
|
|
246
|
+
if modified_files:
|
|
247
|
+
hud_console.success(f"Replaced placeholders in {len(modified_files)} files:")
|
|
248
|
+
for file in modified_files[:5]: # Show first 5 files
|
|
249
|
+
hud_console.status_item(file, "updated")
|
|
250
|
+
if len(modified_files) > 5:
|
|
251
|
+
hud_console.info(f"... and {len(modified_files) - 5} more files")
|
|
252
|
+
else:
|
|
253
|
+
hud_console.info("No placeholder replacements needed")
|
|
254
|
+
|
|
255
|
+
hud_console.section_title("Top-level files and folders")
|
|
256
|
+
for entry in sorted(os.listdir(target_dir)):
|
|
257
|
+
hud_console.status_item(entry, "added")
|
|
258
|
+
|
|
259
|
+
hud_console.section_title("Next steps")
|
|
645
260
|
if target_dir == Path.cwd():
|
|
646
261
|
hud_console.info("1. Start development server (with MCP inspector):")
|
|
647
262
|
hud_console.command_example("hud dev --inspector")
|
|
@@ -651,27 +266,5 @@ def create_environment(name: str | None, directory: str, force: bool) -> None:
|
|
|
651
266
|
hud_console.info("\n2. Start development server (with MCP inspector):")
|
|
652
267
|
hud_console.command_example("hud dev --inspector")
|
|
653
268
|
|
|
654
|
-
hud_console.info("\n3.
|
|
655
|
-
hud_console.info("
|
|
656
|
-
|
|
657
|
-
hud_console.info("\n4. Test your environment:")
|
|
658
|
-
hud_console.command_example("python test_task.py")
|
|
659
|
-
|
|
660
|
-
hud_console.info("\n5. Customize your environment:")
|
|
661
|
-
hud_console.info(" - Add tools to src/controller/server.py")
|
|
662
|
-
hud_console.info(" - Add state to src/controller/env.py")
|
|
663
|
-
hud_console.info(" - Modify tasks in tasks.json")
|
|
664
|
-
hud_console.info(" - Experiment in test_env.ipynb")
|
|
665
|
-
|
|
666
|
-
# Show a sample of the server code
|
|
667
|
-
hud_console.section_title("Your MCP server")
|
|
668
|
-
sample_code = '''@mcp.tool()
|
|
669
|
-
async def act() -> str:
|
|
670
|
-
"""Perform an action that changes the environment state."""
|
|
671
|
-
if env is None:
|
|
672
|
-
raise RuntimeError("Context not initialized")
|
|
673
|
-
count = env.act()
|
|
674
|
-
return f"Action #{count} performed. Current count: {count}"'''
|
|
675
|
-
|
|
676
|
-
syntax = Syntax(sample_code, "python", theme="monokai", line_numbers=False)
|
|
677
|
-
hud_console.console.print(Panel(syntax, border_style="dim"))
|
|
269
|
+
hud_console.info("\n3. Review the README in this preset for specific instructions.")
|
|
270
|
+
hud_console.info("\n4. Customize as needed.")
|