hud-python 0.4.11__py3-none-any.whl → 0.4.13__py3-none-any.whl
This diff compares publicly available package versions as published to their public registry and is provided for informational purposes only.
Potentially problematic release: this version of hud-python might be problematic.
- hud/__main__.py +8 -0
- hud/agents/base.py +7 -8
- hud/agents/langchain.py +2 -2
- hud/agents/tests/test_openai.py +3 -1
- hud/cli/__init__.py +114 -52
- hud/cli/build.py +121 -71
- hud/cli/debug.py +2 -2
- hud/cli/{mcp_server.py → dev.py} +101 -38
- hud/cli/eval.py +175 -90
- hud/cli/init.py +442 -64
- hud/cli/list_func.py +72 -71
- hud/cli/pull.py +1 -2
- hud/cli/push.py +35 -23
- hud/cli/remove.py +35 -41
- hud/cli/tests/test_analyze.py +2 -1
- hud/cli/tests/test_analyze_metadata.py +42 -49
- hud/cli/tests/test_build.py +28 -52
- hud/cli/tests/test_cursor.py +1 -1
- hud/cli/tests/test_debug.py +1 -1
- hud/cli/tests/test_list_func.py +75 -64
- hud/cli/tests/test_main_module.py +30 -0
- hud/cli/tests/test_mcp_server.py +3 -3
- hud/cli/tests/test_pull.py +30 -61
- hud/cli/tests/test_push.py +70 -89
- hud/cli/tests/test_registry.py +36 -38
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/utils/__init__.py +1 -0
- hud/cli/{docker_utils.py → utils/docker.py} +36 -0
- hud/cli/{env_utils.py → utils/environment.py} +7 -7
- hud/cli/{interactive.py → utils/interactive.py} +91 -19
- hud/cli/{analyze_metadata.py → utils/metadata.py} +12 -8
- hud/cli/{registry.py → utils/registry.py} +28 -30
- hud/cli/{remote_runner.py → utils/remote_runner.py} +1 -1
- hud/cli/utils/runner.py +134 -0
- hud/cli/utils/server.py +250 -0
- hud/clients/base.py +1 -1
- hud/clients/fastmcp.py +5 -13
- hud/clients/mcp_use.py +6 -10
- hud/server/server.py +35 -5
- hud/shared/exceptions.py +11 -0
- hud/shared/tests/test_exceptions.py +22 -0
- hud/telemetry/tests/__init__.py +0 -0
- hud/telemetry/tests/test_replay.py +40 -0
- hud/telemetry/tests/test_trace.py +63 -0
- hud/tools/base.py +20 -3
- hud/tools/computer/hud.py +15 -6
- hud/tools/executors/tests/test_base_executor.py +27 -0
- hud/tools/response.py +12 -8
- hud/tools/tests/test_response.py +60 -0
- hud/tools/tests/test_tools_init.py +49 -0
- hud/utils/design.py +19 -8
- hud/utils/mcp.py +17 -5
- hud/utils/tests/test_mcp.py +112 -0
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.4.11.dist-info → hud_python-0.4.13.dist-info}/METADATA +16 -13
- {hud_python-0.4.11.dist-info → hud_python-0.4.13.dist-info}/RECORD +62 -52
- hud/cli/runner.py +0 -160
- /hud/cli/{cursor.py → utils/cursor.py} +0 -0
- /hud/cli/{utils.py → utils/logging.py} +0 -0
- {hud_python-0.4.11.dist-info → hud_python-0.4.13.dist-info}/WHEEL +0 -0
- {hud_python-0.4.11.dist-info → hud_python-0.4.13.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.11.dist-info → hud_python-0.4.13.dist-info}/licenses/LICENSE +0 -0
hud/cli/init.py
CHANGED
@@ -15,19 +15,13 @@ DOCKERFILE_TEMPLATE = """FROM python:3.11-slim

 WORKDIR /app

-# Install git for hud-python dependency
-RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
-
 # Copy and install dependencies
 COPY pyproject.toml ./
 COPY src/ ./src/
 RUN pip install --no-cache-dir -e .

-# Set logging to stderr
-ENV HUD_LOG_STREAM=stderr
-
 # Start context server in background, then MCP server
-CMD ["sh", "-c", "python -m hud_controller.
+CMD ["sh", "-c", "python -m hud_controller.env & sleep 1 && exec python -m hud_controller.server"]
 """ # noqa: E501

 PYPROJECT_TEMPLATE = """[project]
@@ -53,109 +47,469 @@ allow-direct-references = true
 packages = ["src/hud_controller"]
 """

-
+ENV_TEMPLATE = '''"""Minimal environment that persists across hot-reloads."""
 from hud.server.context import run_context_server
 import asyncio

-class
+class Environment:
+    """Simple counter environment."""
+
     def __init__(self):
         self.count = 0

     def act(self):
+        """Increment the counter."""
         self.count += 1
         return self.count

     def get_count(self):
+        """Get current counter."""
         return self.count
+
+    def reset(self):
+        """Reset counter to zero."""
+        self.count = 0

 if __name__ == "__main__":
-    asyncio.run(run_context_server(
+    asyncio.run(run_context_server(Environment(), sock_path="/tmp/hud_ctx.sock"))
 '''

 SERVER_TEMPLATE = '''"""Minimal MCP server for HUD."""
+import sys
+import logging
 from hud.server import MCPServer
 from hud.server.context import attach_context
+from hud.tools.types import EvaluationResult
+
+# Configure logging to stderr
+logging.basicConfig(
+    stream=sys.stderr,
+    level=logging.INFO,
+    format='[%(levelname)s] %(asctime)s | %(name)s | %(message)s'
+)

 mcp = MCPServer(name="{name}")
-
+env = None

 @mcp.initialize
-async def init(
-    global
-
+async def init(ctx):
+    global env
+    env = attach_context("/tmp/hud_ctx.sock")
+    logging.info("Connected to context server")

 @mcp.shutdown
 async def cleanup():
-    global
-
+    global env
+    env = None

 @mcp.tool()
 async def act() -> str:
-    """Perform an action."""
-
+    """Perform an action that changes the environment state."""
+    if env is None:
+        raise RuntimeError("Context not initialized")
+    count = env.act()
+    return f"Action #{{count}} performed. Current count: {{count}}"

 @mcp.tool()
 async def setup() -> str:
-    """
-
+    """Reset the environment to initial state."""
+    if env is None:
+        raise RuntimeError("Context not initialized")
+    env.reset()
+    return "Counter reset to 0"

 @mcp.tool()
-async def evaluate() ->
-    """
-
+async def evaluate(target: int = 10) -> EvaluationResult:
+    """Check if the counter reached the target value."""
+    if env is None:
+        raise RuntimeError("Context not initialized")
+    current_count = env.get_count()
+
+    # Calculate reward as progress towards target
+    reward = min(current_count / target, 1.0) if target > 0 else 0.0
+    done = current_count >= target
+
+    return EvaluationResult(
+        reward=reward,
+        done=done,
+        content=f"Counter at {{current_count}}/{{target}}"
+    )

 if __name__ == "__main__":
     mcp.run()
 '''

+TASKS_JSON_TEMPLATE = '''[
+  {{
+    "prompt": "Increment the counter to reach 10",
+    "mcp_config": {{
+      "{name}": {{
+        "url": "http://localhost:8765/mcp"
+      }}
+    }},
+    "setup_tool": {{
+      "name": "setup",
+      "arguments": {{}}
+    }},
+    "evaluate_tool": {{
+      "name": "evaluate",
+      "arguments": {{
+        "target": 10
+      }}
+    }}
+  }}
+]
+'''
+
+TEST_TASK_TEMPLATE = '''#!/usr/bin/env python
+"""Simple example of running tasks from tasks.json.
+
+Make sure to run 'hud dev --build' in another terminal first!
+"""
+
+import asyncio
+import json
+from hud.datasets import Task
+from hud.clients import MCPClient
+
+
+async def run_task(task_data: dict):
+    task = Task(**task_data)
+    client = MCPClient(mcp_config=task.mcp_config)
+
+    try:
+        print("Initializing client...")
+        await client.initialize()
+
+        result = await client.call_tool(task.setup_tool) # type: ignore
+        print(f"✅ Setup: {{result.content}}")
+
+        print("\\n🔄 Performing actions:")
+        for _ in range(10):
+            result = await client.call_tool(name="act", arguments={{}})
+            print(f" {{result.content}}")
+
+        result = await client.call_tool(task.evaluate_tool) # type: ignore
+        print(f"\\n📊 Evaluation: {{result.content}}")
+
+        return result.content
+    except Exception as e:
+        if "connection" in str(e).lower():
+            print("❌ Could not connect. Make sure 'hud dev --build' is running in another terminal.")
+        else:
+            raise e
+    finally:
+        await client.shutdown()
+
+
+async def main():
+    for task_data in json.load(open("tasks.json")):
+        await run_task(task_data)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+'''
+
+NOTEBOOK_TEMPLATE = '''{{
+  "cells": [
+    {{
+      "cell_type": "markdown",
+      "metadata": {{}},
+      "source": [
+        "### Step 1: Create a Task\\n",
+        "\\n",
+        "A Task combines:\\n",
+        "- **Prompt**: What we want an agent to accomplish\\n",
+        "- **MCP Config**: How to spawn the environment\\n",
+        "- **Setup Tool**: How to prepare the environment\\n",
+        "- **Evaluate Tool**: How to check if the task succeeded"
+      ]
+    }},
+    {{
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {{}},
+      "outputs": [],
+      "source": [
+        "from hud.datasets import Task\\n",
+        "from hud.types import MCPToolCall\\n",
+        "\\n",
+        "# Create a task that uses our {name} environment\\n",
+        "# See tasks.json for how to build a loadable task dataset\\n",
+        "task = Task(\\n",
+        "    prompt=\\"Increment the counter to reach 10\\",\\n",
+        "    mcp_config={{\\n",
+        "        \\"{name}\\": {{\\n",
+        "            \\"url\\": \\"http://localhost:8765/mcp\\"\\n",
+        "        }},\\n",
+        "    }},\\n",
+        "    setup_tool=MCPToolCall(name=\\"setup\\", arguments={{}}),\\n",
+        "    evaluate_tool=MCPToolCall(name=\\"evaluate\\", arguments={{\\"target\\": 10}}),\\n",
+        ")"
+      ]
+    }},
+    {{
+      "cell_type": "markdown",
+      "metadata": {{}},
+      "source": [
+        "### Step 2: Initialize MCP Client\\n",
+        "\\n",
+        "Run `hud dev --build` before this cell to intialize the server at `http://localhost:8765/mcp`"
+      ]
+    }},
+    {{
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {{}},
+      "outputs": [],
+      "source": [
+        "from hud.clients import MCPClient\\n",
+        "\\n",
+        "# Create the client\\n",
+        "client = MCPClient(mcp_config=task.mcp_config, auto_trace=False)\\n",
+        "\\n",
+        "# Initialize it (this connects to our dev server)\\n",
+        "await client.initialize()"
+      ]
+    }},
+    {{
+      "cell_type": "markdown",
+      "metadata": {{}},
+      "source": [
+        "### Step 3: Run Setup\\n",
+        "\\n",
+        "Call the setup tool to prepare the environment according to the task."
+      ]
+    }},
+    {{
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {{}},
+      "outputs": [],
+      "source": [
+        "# Run the setup from our task\\n",
+        "setup_result = await client.call_tool(task.setup_tool) # type: ignore\\n",
+        "print(f\\"Setup result: {{setup_result}}\\")"
+      ]
+    }},
+    {{
+      "cell_type": "markdown",
+      "metadata": {{}},
+      "source": [
+        "### Step 4: Perform Actions\\n",
+        "\\n",
+        "Now we'll manually perform actions to complete the task. In a real scenario, an AI agent would figure out what actions to take."
+      ]
+    }},
+    {{
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {{}},
+      "outputs": [],
+      "source": [
+        "# Increment the counter 10 times\\n",
+        "for i in range(10):\\n",
+        "    result = await client.call_tool(name=\\"act\\", arguments={{}})\\n",
+        "    print(f\\"Step {{i+1}}: {{result.content}}\\")"
+      ]
+    }},
+    {{
+      "cell_type": "markdown",
+      "metadata": {{}},
+      "source": [
+        "## Step 5: Evaluate Success\\n",
+        "\\n",
+        "Check if we completed the task according to the evaluation criteria."
+      ]
+    }},
+    {{
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {{}},
+      "outputs": [],
+      "source": [
+        "# Run the evaluation from our task\\n",
+        "eval_result = await client.call_tool(task.evaluate_tool) # type: ignore\\n",
+        "\\n",
+        "# The result is a list with one TextContent item containing JSON\\n",
+        "print(eval_result)"
+      ]
+    }},
+    {{
+      "cell_type": "markdown",
+      "metadata": {{}},
+      "source": [
+        "### Step 6: Cleanup\\n",
+        "\\n",
+        "Always shut down the client when done to stop the Docker container. Either stop hud dev in the terminal, or run this command:"
+      ]
+    }},
+    {{
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {{}},
+      "outputs": [],
+      "source": [
+        "await client.shutdown()"
+      ]
+    }},
+    {{
+      "cell_type": "markdown",
+      "metadata": {{}},
+      "source": [
+        "### Bonus: Running with an AI Agent\\n",
+        "\\n",
+        "Instead of manually calling tools, you can have an AI agent solve the task automatically."
+      ]
+    }},
+    {{
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {{}},
+      "outputs": [],
+      "source": [
+        "# Uncomment to run with Claude (requires ANTHROPIC_API_KEY)\\n",
+        "from hud.agents import ClaudeAgent\\n",
+        "\\n",
+        "# Create an agent\\n",
+        "agent = ClaudeAgent(\\n",
+        "    model=\\"claude-sonnet-4-20250514\\",\\n",
+        "    allowed_tools=[\\"act\\"] # Only allow the act tool\\n",
+        ")\\n",
+        "\\n",
+        "# Run the task\\n",
+        "result = await agent.run(task)\\n",
+        "print(f\\"Final reward: {{result.reward}}\\")"
+      ]
+    }},
+    {{
+      "cell_type": "markdown",
+      "metadata": {{}},
+      "source": [
+        "### Next Steps\\n",
+        "\\n",
+        "1. **Create your own evaluators**: Add new evaluation functions to `server.py`\\n",
+        "2. **Build complex environments**: Replace the simple counter with your actual application\\n",
+        "3. **Test with agents**: Use different AI models to solve your tasks\\n",
+        "\\n",
+        "For more examples, check out:\\n",
+        "- `environments/text_2048/` - A complete 2048 game environment\\n",
+        "- `environments/browser/` - A full browser automation environment with GUI"
+      ]
+    }},
+    {{
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {{}},
+      "outputs": [],
+      "source": []
+    }}
+  ],
+  "metadata": {{
+    "kernelspec": {{
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    }},
+    "language_info": {{
+      "codemirror_mode": {{
+        "name": "ipython",
+        "version": 3
+      }},
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.11.0"
+    }}
+  }},
+  "nbformat": 4,
+  "nbformat_minor": 4
+}}
+'''
+
 README_TEMPLATE = '''# {title}

-A minimal HUD environment
+A minimal HUD environment demonstrating the Task pattern with a simple counter.

 ## Quick Start

+### Interactive Development
 ```bash
-#
-hud dev
+# 1. Start the environment (optional: with inspector)
+hud dev --build --inspector
+
+# 2. Choose your preferred way to test:

-#
-
-
+# Option A: Interactive notebook test_env.ipynb (great for learning!)
+
+# Option B: Simple Python script (runs all tasks from tasks.json)
+python test_task.py
+```
+
+### Run with an Agent
+```bash
+# Run the task with Claude
+hud eval tasks.json --agent claude
 ```

-##
+## How HUD Environments Work

-
-- `src/hud_controller/context.py` - Persistent state across hot-reloads
-- `Dockerfile` - Container configuration
-- `pyproject.toml` - Python dependencies
+The environment is split into two components:

-
+- **`env.py`** - Stateful logic that persists across reloads
+- **`server.py`** - MCP server with tools (reloads on file changes)

-
+This separation is crucial for `hud dev` - it allows you to modify the MCP tools and see changes immediately without losing the environment state. The environment runs as a separate process and communicates via socket, while the server can be restarted freely.

-
-
-
-
-
+If you are ever seeing issues with the environment itself, running `hud dev --full-reload` will reload both the environment and the server.
+
+## Publishing Your Environment
+
+Once your environment is ready, you can share it with the community:
+
+### 1. Push to Registry
+```bash
+# Build and push your environment (this requires docker hub login and hud api key)
+hud build
+hud push
 ```

-
+### 2. Create a Dataset
+
+Create a dataset on HuggingFace with your tasks:

-
+**Option A: Upload manually**
+1. Upload your `tasks.json` to HuggingFace
+2. Make sure it's **public** to appear on leaderboards

+**Option B: Use the SDK**
 ```python
-
-
-
-
+from hud.datasets import save_tasks
+import json
+
+# Load your tasks
+with open("tasks.json") as f:
+    tasks = json.load(f)
+
+# Push to HuggingFace
+save_tasks(tasks, repo_id="your-org/your-dataset")
+```
+
+### 3. Run and Track Performance
+
+```bash
+# Run Claude on your benchmark
+hud eval "your-org/your-dataset" --agent claude
+
+# View results at:
+# app.hud.so/leaderboards/your-org/your-dataset
 ```

-
+**Note**: Only public HuggingFace datasets appear as leaderboards!

-
-- [MCP Specification](https://modelcontextprotocol.io)
+📚 Learn more: [Creating Benchmarks](https://docs.hud.so/evaluate-agents/create-benchmarks) | [Leaderboards](https://docs.hud.so/evaluate-agents/leaderboards)
 '''


@@ -173,7 +527,6 @@ def sanitize_name(name: str) -> str:

 def create_environment(name: str | None, directory: str, force: bool) -> None:
     """Create a new HUD environment from templates."""
-    from hud.utils.design import HUDDesign

     design = HUDDesign()

@@ -211,38 +564,55 @@ def create_environment(name: str | None, directory: str, force: bool) -> None:

     # Dockerfile
     dockerfile_path = target_dir / "Dockerfile"
-    dockerfile_path.write_text(DOCKERFILE_TEMPLATE.strip() + "\n")
+    dockerfile_path.write_text(DOCKERFILE_TEMPLATE.strip() + "\n", encoding="utf-8")
     files_created.append("Dockerfile")

     # pyproject.toml
     pyproject_path = target_dir / "pyproject.toml"
     pyproject_content = PYPROJECT_TEMPLATE.format(name=package_name).strip() + "\n"
-    pyproject_path.write_text(pyproject_content)
+    pyproject_path.write_text(pyproject_content, encoding="utf-8")
     files_created.append("pyproject.toml")

     # README.md
     readme_path = target_dir / "README.md"
     readme_content = README_TEMPLATE.format(name=package_name, title=name).strip() + "\n"
-    readme_path.write_text(readme_content)
+    readme_path.write_text(readme_content, encoding="utf-8")
     files_created.append("README.md")

     # Python files
     # __init__.py
     init_path = src_dir / "__init__.py"
-    init_path.write_text('"""HUD Controller Package"""\n')
+    init_path.write_text('"""HUD Controller Package"""\n', encoding="utf-8")
     files_created.append("src/hud_controller/__init__.py")

-    #
-
-
-    files_created.append("src/hud_controller/
+    # env.py
+    env_path = src_dir / "env.py"
+    env_path.write_text(ENV_TEMPLATE.strip() + "\n", encoding="utf-8")
+    files_created.append("src/hud_controller/env.py")

     # server.py (need to escape the double braces for .format())
     server_path = src_dir / "server.py"
     server_content = SERVER_TEMPLATE.format(name=package_name).strip() + "\n"
-    server_path.write_text(server_content)
+    server_path.write_text(server_content, encoding="utf-8")
     files_created.append("src/hud_controller/server.py")

+    # tasks.json
+    tasks_path = target_dir / "tasks.json"
+    tasks_content = TASKS_JSON_TEMPLATE.format(name=package_name).strip() + "\n"
+    tasks_path.write_text(tasks_content, encoding="utf-8")
+    files_created.append("tasks.json")
+
+    # test_task.py
+    test_task_path = target_dir / "test_task.py"
+    test_task_path.write_text(TEST_TASK_TEMPLATE.strip() + "\n", encoding="utf-8")
+    files_created.append("test_task.py")
+
+    # notebook.ipynb
+    notebook_path = target_dir / "test_env.ipynb"
+    notebook_content = NOTEBOOK_TEMPLATE.format(name=package_name).strip() + "\n"
+    notebook_path.write_text(notebook_content, encoding="utf-8")
+    files_created.append("test_env.ipynb")
+
     # Success message
     design.header(f"Created HUD Environment: {name}")

@@ -265,16 +635,24 @@ def create_environment(name: str | None, directory: str, force: bool) -> None:
     design.info("\n3. Connect from Cursor or test via the MCP inspector:")
     design.info(" Follow the instructions shown by hud dev --inspector")

-    design.info("\n4.
+    design.info("\n4. Test your environment:")
+    design.command_example("python test_task.py")
+
+    design.info("\n5. Customize your environment:")
     design.info(" - Add tools to src/hud_controller/server.py")
-    design.info(" - Add state to src/hud_controller/
+    design.info(" - Add state to src/hud_controller/env.py")
+    design.info(" - Modify tasks in tasks.json")
+    design.info(" - Experiment in run_eval.ipynb")

     # Show a sample of the server code
     design.section_title("Your MCP server")
     sample_code = '''@mcp.tool()
 async def act() -> str:
-    """Perform an action."""
-
+    """Perform an action that changes the environment state."""
+    if env is None:
+        raise RuntimeError("Context not initialized")
+    count = env.act()
+    return f"Action #{count} performed. Current count: {count}"'''

     syntax = Syntax(sample_code, "python", theme="monokai", line_numbers=False)
     design.console.print(Panel(syntax, border_style="dim"))
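For orientation, the snippet below condenses the `TEST_TASK_TEMPLATE` added above into a minimal run against the scaffold that the new `hud init` generates: load the first entry of the generated `tasks.json`, connect an `MCPClient`, run setup, call `act` ten times, then evaluate. It is a sketch derived from the template, assuming a dev server started with `hud dev --build` and listening at `http://localhost:8765/mcp`; it is not part of the package itself.

```python
"""Sketch: run the first generated task against a running `hud dev --build` server."""
import asyncio
import json

from hud.clients import MCPClient
from hud.datasets import Task


async def main() -> None:
    # tasks.json is written by `hud init` from TASKS_JSON_TEMPLATE above
    with open("tasks.json") as f:
        task = Task(**json.load(f)[0])

    client = MCPClient(mcp_config=task.mcp_config)
    await client.initialize()
    try:
        await client.call_tool(task.setup_tool)                   # reset the counter
        for _ in range(10):
            await client.call_tool(name="act", arguments={})      # increment towards the target
        result = await client.call_tool(task.evaluate_tool)       # reward = count / target, capped at 1.0
        print(result.content)
    finally:
        await client.shutdown()


if __name__ == "__main__":
    asyncio.run(main())
```

These are the same steps that `hud eval tasks.json --agent claude` (from the generated README) automates with an agent choosing the tool calls.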