hud-python 0.4.11__py3-none-any.whl → 0.4.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (63)
  1. hud/__main__.py +8 -0
  2. hud/agents/base.py +7 -8
  3. hud/agents/langchain.py +2 -2
  4. hud/agents/tests/test_openai.py +3 -1
  5. hud/cli/__init__.py +114 -52
  6. hud/cli/build.py +121 -71
  7. hud/cli/debug.py +2 -2
  8. hud/cli/{mcp_server.py → dev.py} +101 -38
  9. hud/cli/eval.py +175 -90
  10. hud/cli/init.py +442 -64
  11. hud/cli/list_func.py +72 -71
  12. hud/cli/pull.py +1 -2
  13. hud/cli/push.py +35 -23
  14. hud/cli/remove.py +35 -41
  15. hud/cli/tests/test_analyze.py +2 -1
  16. hud/cli/tests/test_analyze_metadata.py +42 -49
  17. hud/cli/tests/test_build.py +28 -52
  18. hud/cli/tests/test_cursor.py +1 -1
  19. hud/cli/tests/test_debug.py +1 -1
  20. hud/cli/tests/test_list_func.py +75 -64
  21. hud/cli/tests/test_main_module.py +30 -0
  22. hud/cli/tests/test_mcp_server.py +3 -3
  23. hud/cli/tests/test_pull.py +30 -61
  24. hud/cli/tests/test_push.py +70 -89
  25. hud/cli/tests/test_registry.py +36 -38
  26. hud/cli/tests/test_utils.py +1 -1
  27. hud/cli/utils/__init__.py +1 -0
  28. hud/cli/{docker_utils.py → utils/docker.py} +36 -0
  29. hud/cli/{env_utils.py → utils/environment.py} +7 -7
  30. hud/cli/{interactive.py → utils/interactive.py} +91 -19
  31. hud/cli/{analyze_metadata.py → utils/metadata.py} +12 -8
  32. hud/cli/{registry.py → utils/registry.py} +28 -30
  33. hud/cli/{remote_runner.py → utils/remote_runner.py} +1 -1
  34. hud/cli/utils/runner.py +134 -0
  35. hud/cli/utils/server.py +250 -0
  36. hud/clients/base.py +1 -1
  37. hud/clients/fastmcp.py +5 -13
  38. hud/clients/mcp_use.py +6 -10
  39. hud/server/server.py +35 -5
  40. hud/shared/exceptions.py +11 -0
  41. hud/shared/tests/test_exceptions.py +22 -0
  42. hud/telemetry/tests/__init__.py +0 -0
  43. hud/telemetry/tests/test_replay.py +40 -0
  44. hud/telemetry/tests/test_trace.py +63 -0
  45. hud/tools/base.py +20 -3
  46. hud/tools/computer/hud.py +15 -6
  47. hud/tools/executors/tests/test_base_executor.py +27 -0
  48. hud/tools/response.py +12 -8
  49. hud/tools/tests/test_response.py +60 -0
  50. hud/tools/tests/test_tools_init.py +49 -0
  51. hud/utils/design.py +19 -8
  52. hud/utils/mcp.py +17 -5
  53. hud/utils/tests/test_mcp.py +112 -0
  54. hud/utils/tests/test_version.py +1 -1
  55. hud/version.py +1 -1
  56. {hud_python-0.4.11.dist-info → hud_python-0.4.13.dist-info}/METADATA +16 -13
  57. {hud_python-0.4.11.dist-info → hud_python-0.4.13.dist-info}/RECORD +62 -52
  58. hud/cli/runner.py +0 -160
  59. /hud/cli/{cursor.py → utils/cursor.py} +0 -0
  60. /hud/cli/{utils.py → utils/logging.py} +0 -0
  61. {hud_python-0.4.11.dist-info → hud_python-0.4.13.dist-info}/WHEEL +0 -0
  62. {hud_python-0.4.11.dist-info → hud_python-0.4.13.dist-info}/entry_points.txt +0 -0
  63. {hud_python-0.4.11.dist-info → hud_python-0.4.13.dist-info}/licenses/LICENSE +0 -0
hud/cli/init.py CHANGED
@@ -15,19 +15,13 @@ DOCKERFILE_TEMPLATE = """FROM python:3.11-slim
15
15
 
16
16
  WORKDIR /app
17
17
 
18
- # Install git for hud-python dependency
19
- RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
20
-
21
18
  # Copy and install dependencies
22
19
  COPY pyproject.toml ./
23
20
  COPY src/ ./src/
24
21
  RUN pip install --no-cache-dir -e .
25
22
 
26
- # Set logging to stderr
27
- ENV HUD_LOG_STREAM=stderr
28
-
29
23
  # Start context server in background, then MCP server
30
- CMD ["sh", "-c", "python -m hud_controller.context & sleep 1 && exec python -m hud_controller.server"]
24
+ CMD ["sh", "-c", "python -m hud_controller.env & sleep 1 && exec python -m hud_controller.server"]
31
25
  """ # noqa: E501
32
26
 
33
27
  PYPROJECT_TEMPLATE = """[project]
@@ -53,109 +47,469 @@ allow-direct-references = true
53
47
  packages = ["src/hud_controller"]
54
48
  """
55
49
 
56
- CONTEXT_TEMPLATE = '''"""Minimal context that persists across hot-reloads."""
50
+ ENV_TEMPLATE = '''"""Minimal environment that persists across hot-reloads."""
57
51
  from hud.server.context import run_context_server
58
52
  import asyncio
59
53
 
60
- class Context:
54
+ class Environment:
55
+ """Simple counter environment."""
56
+
61
57
  def __init__(self):
62
58
  self.count = 0
63
59
 
64
60
  def act(self):
61
+ """Increment the counter."""
65
62
  self.count += 1
66
63
  return self.count
67
64
 
68
65
  def get_count(self):
66
+ """Get current counter."""
69
67
  return self.count
68
+
69
+ def reset(self):
70
+ """Reset counter to zero."""
71
+ self.count = 0
70
72
 
71
73
  if __name__ == "__main__":
72
- asyncio.run(run_context_server(Context()))
74
+ asyncio.run(run_context_server(Environment(), sock_path="/tmp/hud_ctx.sock"))
73
75
  '''
74
76
 
75
77
  SERVER_TEMPLATE = '''"""Minimal MCP server for HUD."""
78
+ import sys
79
+ import logging
76
80
  from hud.server import MCPServer
77
81
  from hud.server.context import attach_context
82
+ from hud.tools.types import EvaluationResult
83
+
84
+ # Configure logging to stderr
85
+ logging.basicConfig(
86
+ stream=sys.stderr,
87
+ level=logging.INFO,
88
+ format='[%(levelname)s] %(asctime)s | %(name)s | %(message)s'
89
+ )
78
90
 
79
91
  mcp = MCPServer(name="{name}")
80
- ctx = None
92
+ env = None
81
93
 
82
94
  @mcp.initialize
83
- async def init(init_ctx):
84
- global ctx
85
- ctx = attach_context("/tmp/hud_ctx.sock")
95
+ async def init(ctx):
96
+ global env
97
+ env = attach_context("/tmp/hud_ctx.sock")
98
+ logging.info("Connected to context server")
86
99
 
87
100
  @mcp.shutdown
88
101
  async def cleanup():
89
- global ctx
90
- ctx = None
102
+ global env
103
+ env = None
91
104
 
92
105
  @mcp.tool()
93
106
  async def act() -> str:
94
- """Perform an action."""
95
- return f"Action #{{ctx.act()}}"
107
+ """Perform an action that changes the environment state."""
108
+ if env is None:
109
+ raise RuntimeError("Context not initialized")
110
+ count = env.act()
111
+ return f"Action #{{count}} performed. Current count: {{count}}"
96
112
 
97
113
  @mcp.tool()
98
114
  async def setup() -> str:
99
- """Required for HUD environments."""
100
- return "Ready"
115
+ """Reset the environment to initial state."""
116
+ if env is None:
117
+ raise RuntimeError("Context not initialized")
118
+ env.reset()
119
+ return "Counter reset to 0"
101
120
 
102
121
  @mcp.tool()
103
- async def evaluate() -> dict:
104
- """Required for HUD environments."""
105
- return {{"count": ctx.get_count()}}
122
+ async def evaluate(target: int = 10) -> EvaluationResult:
123
+ """Check if the counter reached the target value."""
124
+ if env is None:
125
+ raise RuntimeError("Context not initialized")
126
+ current_count = env.get_count()
127
+
128
+ # Calculate reward as progress towards target
129
+ reward = min(current_count / target, 1.0) if target > 0 else 0.0
130
+ done = current_count >= target
131
+
132
+ return EvaluationResult(
133
+ reward=reward,
134
+ done=done,
135
+ content=f"Counter at {{current_count}}/{{target}}"
136
+ )
106
137
 
107
138
  if __name__ == "__main__":
108
139
  mcp.run()
109
140
  '''
110
141
 
142
+ TASKS_JSON_TEMPLATE = '''[
143
+ {{
144
+ "prompt": "Increment the counter to reach 10",
145
+ "mcp_config": {{
146
+ "{name}": {{
147
+ "url": "http://localhost:8765/mcp"
148
+ }}
149
+ }},
150
+ "setup_tool": {{
151
+ "name": "setup",
152
+ "arguments": {{}}
153
+ }},
154
+ "evaluate_tool": {{
155
+ "name": "evaluate",
156
+ "arguments": {{
157
+ "target": 10
158
+ }}
159
+ }}
160
+ }}
161
+ ]
162
+ '''
163
+
164
+ TEST_TASK_TEMPLATE = '''#!/usr/bin/env python
165
+ """Simple example of running tasks from tasks.json.
166
+
167
+ Make sure to run 'hud dev --build' in another terminal first!
168
+ """
169
+
170
+ import asyncio
171
+ import json
172
+ from hud.datasets import Task
173
+ from hud.clients import MCPClient
174
+
175
+
176
+ async def run_task(task_data: dict):
177
+ task = Task(**task_data)
178
+ client = MCPClient(mcp_config=task.mcp_config)
179
+
180
+ try:
181
+ print("Initializing client...")
182
+ await client.initialize()
183
+
184
+ result = await client.call_tool(task.setup_tool) # type: ignore
185
+ print(f"✅ Setup: {{result.content}}")
186
+
187
+ print("\\n🔄 Performing actions:")
188
+ for _ in range(10):
189
+ result = await client.call_tool(name="act", arguments={{}})
190
+ print(f" {{result.content}}")
191
+
192
+ result = await client.call_tool(task.evaluate_tool) # type: ignore
193
+ print(f"\\n📊 Evaluation: {{result.content}}")
194
+
195
+ return result.content
196
+ except Exception as e:
197
+ if "connection" in str(e).lower():
198
+ print("❌ Could not connect. Make sure 'hud dev --build' is running in another terminal.")
199
+ else:
200
+ raise e
201
+ finally:
202
+ await client.shutdown()
203
+
204
+
205
+ async def main():
206
+ for task_data in json.load(open("tasks.json")):
207
+ await run_task(task_data)
208
+
209
+ if __name__ == "__main__":
210
+ asyncio.run(main())
211
+ '''
212
+
213
+ NOTEBOOK_TEMPLATE = '''{{
214
+ "cells": [
215
+ {{
216
+ "cell_type": "markdown",
217
+ "metadata": {{}},
218
+ "source": [
219
+ "### Step 1: Create a Task\\n",
220
+ "\\n",
221
+ "A Task combines:\\n",
222
+ "- **Prompt**: What we want an agent to accomplish\\n",
223
+ "- **MCP Config**: How to spawn the environment\\n",
224
+ "- **Setup Tool**: How to prepare the environment\\n",
225
+ "- **Evaluate Tool**: How to check if the task succeeded"
226
+ ]
227
+ }},
228
+ {{
229
+ "cell_type": "code",
230
+ "execution_count": null,
231
+ "metadata": {{}},
232
+ "outputs": [],
233
+ "source": [
234
+ "from hud.datasets import Task\\n",
235
+ "from hud.types import MCPToolCall\\n",
236
+ "\\n",
237
+ "# Create a task that uses our {name} environment\\n",
238
+ "# See tasks.json for how to build a loadable task dataset\\n",
239
+ "task = Task(\\n",
240
+ " prompt=\\"Increment the counter to reach 10\\",\\n",
241
+ " mcp_config={{\\n",
242
+ " \\"{name}\\": {{\\n",
243
+ " \\"url\\": \\"http://localhost:8765/mcp\\"\\n",
244
+ " }},\\n",
245
+ " }},\\n",
246
+ " setup_tool=MCPToolCall(name=\\"setup\\", arguments={{}}),\\n",
247
+ " evaluate_tool=MCPToolCall(name=\\"evaluate\\", arguments={{\\"target\\": 10}}),\\n",
248
+ ")"
249
+ ]
250
+ }},
251
+ {{
252
+ "cell_type": "markdown",
253
+ "metadata": {{}},
254
+ "source": [
255
+ "### Step 2: Initialize MCP Client\\n",
256
+ "\\n",
257
+ "Run `hud dev --build` before this cell to initialize the server at `http://localhost:8765/mcp`"
258
+ ]
259
+ }},
260
+ {{
261
+ "cell_type": "code",
262
+ "execution_count": null,
263
+ "metadata": {{}},
264
+ "outputs": [],
265
+ "source": [
266
+ "from hud.clients import MCPClient\\n",
267
+ "\\n",
268
+ "# Create the client\\n",
269
+ "client = MCPClient(mcp_config=task.mcp_config, auto_trace=False)\\n",
270
+ "\\n",
271
+ "# Initialize it (this connects to our dev server)\\n",
272
+ "await client.initialize()"
273
+ ]
274
+ }},
275
+ {{
276
+ "cell_type": "markdown",
277
+ "metadata": {{}},
278
+ "source": [
279
+ "### Step 3: Run Setup\\n",
280
+ "\\n",
281
+ "Call the setup tool to prepare the environment according to the task."
282
+ ]
283
+ }},
284
+ {{
285
+ "cell_type": "code",
286
+ "execution_count": null,
287
+ "metadata": {{}},
288
+ "outputs": [],
289
+ "source": [
290
+ "# Run the setup from our task\\n",
291
+ "setup_result = await client.call_tool(task.setup_tool) # type: ignore\\n",
292
+ "print(f\\"Setup result: {{setup_result}}\\")"
293
+ ]
294
+ }},
295
+ {{
296
+ "cell_type": "markdown",
297
+ "metadata": {{}},
298
+ "source": [
299
+ "### Step 4: Perform Actions\\n",
300
+ "\\n",
301
+ "Now we'll manually perform actions to complete the task. In a real scenario, an AI agent would figure out what actions to take."
302
+ ]
303
+ }},
304
+ {{
305
+ "cell_type": "code",
306
+ "execution_count": null,
307
+ "metadata": {{}},
308
+ "outputs": [],
309
+ "source": [
310
+ "# Increment the counter 10 times\\n",
311
+ "for i in range(10):\\n",
312
+ " result = await client.call_tool(name=\\"act\\", arguments={{}})\\n",
313
+ " print(f\\"Step {{i+1}}: {{result.content}}\\")"
314
+ ]
315
+ }},
316
+ {{
317
+ "cell_type": "markdown",
318
+ "metadata": {{}},
319
+ "source": [
320
+ "### Step 5: Evaluate Success\\n",
321
+ "\\n",
322
+ "Check if we completed the task according to the evaluation criteria."
323
+ ]
324
+ }},
325
+ {{
326
+ "cell_type": "code",
327
+ "execution_count": null,
328
+ "metadata": {{}},
329
+ "outputs": [],
330
+ "source": [
331
+ "# Run the evaluation from our task\\n",
332
+ "eval_result = await client.call_tool(task.evaluate_tool) # type: ignore\\n",
333
+ "\\n",
334
+ "# The result is a list with one TextContent item containing JSON\\n",
335
+ "print(eval_result)"
336
+ ]
337
+ }},
338
+ {{
339
+ "cell_type": "markdown",
340
+ "metadata": {{}},
341
+ "source": [
342
+ "### Step 6: Cleanup\\n",
343
+ "\\n",
344
+ "Always shut down the client when done to stop the Docker container. Either stop hud dev in the terminal, or run this command:"
345
+ ]
346
+ }},
347
+ {{
348
+ "cell_type": "code",
349
+ "execution_count": null,
350
+ "metadata": {{}},
351
+ "outputs": [],
352
+ "source": [
353
+ "await client.shutdown()"
354
+ ]
355
+ }},
356
+ {{
357
+ "cell_type": "markdown",
358
+ "metadata": {{}},
359
+ "source": [
360
+ "### Bonus: Running with an AI Agent\\n",
361
+ "\\n",
362
+ "Instead of manually calling tools, you can have an AI agent solve the task automatically."
363
+ ]
364
+ }},
365
+ {{
366
+ "cell_type": "code",
367
+ "execution_count": null,
368
+ "metadata": {{}},
369
+ "outputs": [],
370
+ "source": [
371
+ "# Uncomment to run with Claude (requires ANTHROPIC_API_KEY)\\n",
372
+ "from hud.agents import ClaudeAgent\\n",
373
+ "\\n",
374
+ "# Create an agent\\n",
375
+ "agent = ClaudeAgent(\\n",
376
+ " model=\\"claude-sonnet-4-20250514\\",\\n",
377
+ " allowed_tools=[\\"act\\"] # Only allow the act tool\\n",
378
+ ")\\n",
379
+ "\\n",
380
+ "# Run the task\\n",
381
+ "result = await agent.run(task)\\n",
382
+ "print(f\\"Final reward: {{result.reward}}\\")"
383
+ ]
384
+ }},
385
+ {{
386
+ "cell_type": "markdown",
387
+ "metadata": {{}},
388
+ "source": [
389
+ "### Next Steps\\n",
390
+ "\\n",
391
+ "1. **Create your own evaluators**: Add new evaluation functions to `server.py`\\n",
392
+ "2. **Build complex environments**: Replace the simple counter with your actual application\\n",
393
+ "3. **Test with agents**: Use different AI models to solve your tasks\\n",
394
+ "\\n",
395
+ "For more examples, check out:\\n",
396
+ "- `environments/text_2048/` - A complete 2048 game environment\\n",
397
+ "- `environments/browser/` - A full browser automation environment with GUI"
398
+ ]
399
+ }},
400
+ {{
401
+ "cell_type": "code",
402
+ "execution_count": null,
403
+ "metadata": {{}},
404
+ "outputs": [],
405
+ "source": []
406
+ }}
407
+ ],
408
+ "metadata": {{
409
+ "kernelspec": {{
410
+ "display_name": "Python 3",
411
+ "language": "python",
412
+ "name": "python3"
413
+ }},
414
+ "language_info": {{
415
+ "codemirror_mode": {{
416
+ "name": "ipython",
417
+ "version": 3
418
+ }},
419
+ "file_extension": ".py",
420
+ "mimetype": "text/x-python",
421
+ "name": "python",
422
+ "nbconvert_exporter": "python",
423
+ "pygments_lexer": "ipython3",
424
+ "version": "3.11.0"
425
+ }}
426
+ }},
427
+ "nbformat": 4,
428
+ "nbformat_minor": 4
429
+ }}
430
+ '''
431
+
111
432
  README_TEMPLATE = '''# {title}
112
433
 
113
- A minimal HUD environment created with `hud init`.
434
+ A minimal HUD environment demonstrating the Task pattern with a simple counter.
114
435
 
115
436
  ## Quick Start
116
437
 
438
+ ### Interactive Development
117
439
  ```bash
118
- # Build and run locally
119
- hud dev
440
+ # 1. Start the environment (optional: with inspector)
441
+ hud dev --build --inspector
442
+
443
+ # 2. Choose your preferred way to test:
120
444
 
121
- # Or build first
122
- docker build -t {name}:dev .
123
- hud dev --image {name}:dev
445
+ # Option A: Interactive notebook test_env.ipynb (great for learning!)
446
+
447
+ # Option B: Simple Python script (runs all tasks from tasks.json)
448
+ python test_task.py
449
+ ```
450
+
451
+ ### Run with an Agent
452
+ ```bash
453
+ # Run the task with Claude
454
+ hud eval tasks.json --agent claude
124
455
  ```
125
456
 
126
- ## Structure
457
+ ## How HUD Environments Work
127
458
 
128
- - `src/hud_controller/server.py` - MCP server with tools
129
- - `src/hud_controller/context.py` - Persistent state across hot-reloads
130
- - `Dockerfile` - Container configuration
131
- - `pyproject.toml` - Python dependencies
459
+ The environment is split into two components:
132
460
 
133
- ## Adding Tools
461
+ - **`env.py`** - Stateful logic that persists across reloads
462
+ - **`server.py`** - MCP server with tools (reloads on file changes)
134
463
 
135
- Add new tools to `server.py`:
464
+ This separation is crucial for `hud dev` - it allows you to modify the MCP tools and see changes immediately without losing the environment state. The environment runs as a separate process and communicates via socket, while the server can be restarted freely.
136
465
 
137
- ```python
138
- @mcp.tool()
139
- async def my_tool(param: str) -> str:
140
- """Tool description."""
141
- return f"Result: {{param}}"
466
+ If you are ever seeing issues with the environment itself, running `hud dev --full-reload` will reload both the environment and the server.
467
+
468
+ ## Publishing Your Environment
469
+
470
+ Once your environment is ready, you can share it with the community:
471
+
472
+ ### 1. Push to Registry
473
+ ```bash
474
+ # Build and push your environment (this requires docker hub login and hud api key)
475
+ hud build
476
+ hud push
142
477
  ```
143
478
 
144
- ## Adding State
479
+ ### 2. Create a Dataset
480
+
481
+ Create a dataset on HuggingFace with your tasks:
145
482
 
146
- Extend the `Context` class in `context.py`:
483
+ **Option A: Upload manually**
484
+ 1. Upload your `tasks.json` to HuggingFace
485
+ 2. Make sure it's **public** to appear on leaderboards
147
486
 
487
+ **Option B: Use the SDK**
148
488
  ```python
149
- class Context:
150
- def __init__(self):
151
- self.count = 0
152
- self.data = {{}} # Add your state
489
+ from hud.datasets import save_tasks
490
+ import json
491
+
492
+ # Load your tasks
493
+ with open("tasks.json") as f:
494
+ tasks = json.load(f)
495
+
496
+ # Push to HuggingFace
497
+ save_tasks(tasks, repo_id="your-org/your-dataset")
498
+ ```
499
+
500
+ ### 3. Run and Track Performance
501
+
502
+ ```bash
503
+ # Run Claude on your benchmark
504
+ hud eval "your-org/your-dataset" --agent claude
505
+
506
+ # View results at:
507
+ # app.hud.so/leaderboards/your-org/your-dataset
153
508
  ```
154
509
 
155
- ## Learn More
510
+ **Note**: Only public HuggingFace datasets appear as leaderboards!
156
511
 
157
- - [HUD Documentation](https://docs.hud.so)
158
- - [MCP Specification](https://modelcontextprotocol.io)
512
+ 📚 Learn more: [Creating Benchmarks](https://docs.hud.so/evaluate-agents/create-benchmarks) | [Leaderboards](https://docs.hud.so/evaluate-agents/leaderboards)
159
513
  '''
160
514
 
161
515
 
@@ -173,7 +527,6 @@ def sanitize_name(name: str) -> str:
173
527
 
174
528
  def create_environment(name: str | None, directory: str, force: bool) -> None:
175
529
  """Create a new HUD environment from templates."""
176
- from hud.utils.design import HUDDesign
177
530
 
178
531
  design = HUDDesign()
179
532
 
@@ -211,38 +564,55 @@ def create_environment(name: str | None, directory: str, force: bool) -> None:
211
564
 
212
565
  # Dockerfile
213
566
  dockerfile_path = target_dir / "Dockerfile"
214
- dockerfile_path.write_text(DOCKERFILE_TEMPLATE.strip() + "\n")
567
+ dockerfile_path.write_text(DOCKERFILE_TEMPLATE.strip() + "\n", encoding="utf-8")
215
568
  files_created.append("Dockerfile")
216
569
 
217
570
  # pyproject.toml
218
571
  pyproject_path = target_dir / "pyproject.toml"
219
572
  pyproject_content = PYPROJECT_TEMPLATE.format(name=package_name).strip() + "\n"
220
- pyproject_path.write_text(pyproject_content)
573
+ pyproject_path.write_text(pyproject_content, encoding="utf-8")
221
574
  files_created.append("pyproject.toml")
222
575
 
223
576
  # README.md
224
577
  readme_path = target_dir / "README.md"
225
578
  readme_content = README_TEMPLATE.format(name=package_name, title=name).strip() + "\n"
226
- readme_path.write_text(readme_content)
579
+ readme_path.write_text(readme_content, encoding="utf-8")
227
580
  files_created.append("README.md")
228
581
 
229
582
  # Python files
230
583
  # __init__.py
231
584
  init_path = src_dir / "__init__.py"
232
- init_path.write_text('"""HUD Controller Package"""\n')
585
+ init_path.write_text('"""HUD Controller Package"""\n', encoding="utf-8")
233
586
  files_created.append("src/hud_controller/__init__.py")
234
587
 
235
- # context.py
236
- context_path = src_dir / "context.py"
237
- context_path.write_text(CONTEXT_TEMPLATE.strip() + "\n")
238
- files_created.append("src/hud_controller/context.py")
588
+ # env.py
589
+ env_path = src_dir / "env.py"
590
+ env_path.write_text(ENV_TEMPLATE.strip() + "\n", encoding="utf-8")
591
+ files_created.append("src/hud_controller/env.py")
239
592
 
240
593
  # server.py (need to escape the double braces for .format())
241
594
  server_path = src_dir / "server.py"
242
595
  server_content = SERVER_TEMPLATE.format(name=package_name).strip() + "\n"
243
- server_path.write_text(server_content)
596
+ server_path.write_text(server_content, encoding="utf-8")
244
597
  files_created.append("src/hud_controller/server.py")
245
598
 
599
+ # tasks.json
600
+ tasks_path = target_dir / "tasks.json"
601
+ tasks_content = TASKS_JSON_TEMPLATE.format(name=package_name).strip() + "\n"
602
+ tasks_path.write_text(tasks_content, encoding="utf-8")
603
+ files_created.append("tasks.json")
604
+
605
+ # test_task.py
606
+ test_task_path = target_dir / "test_task.py"
607
+ test_task_path.write_text(TEST_TASK_TEMPLATE.strip() + "\n", encoding="utf-8")
608
+ files_created.append("test_task.py")
609
+
610
+ # test_env.ipynb
611
+ notebook_path = target_dir / "test_env.ipynb"
612
+ notebook_content = NOTEBOOK_TEMPLATE.format(name=package_name).strip() + "\n"
613
+ notebook_path.write_text(notebook_content, encoding="utf-8")
614
+ files_created.append("test_env.ipynb")
615
+
246
616
  # Success message
247
617
  design.header(f"Created HUD Environment: {name}")
248
618
 
@@ -265,16 +635,24 @@ def create_environment(name: str | None, directory: str, force: bool) -> None:
265
635
  design.info("\n3. Connect from Cursor or test via the MCP inspector:")
266
636
  design.info(" Follow the instructions shown by hud dev --inspector")
267
637
 
268
- design.info("\n4. Customize your environment:")
638
+ design.info("\n4. Test your environment:")
639
+ design.command_example("python test_task.py")
640
+
641
+ design.info("\n5. Customize your environment:")
269
642
  design.info(" - Add tools to src/hud_controller/server.py")
270
- design.info(" - Add state to src/hud_controller/context.py")
643
+ design.info(" - Add state to src/hud_controller/env.py")
644
+ design.info(" - Modify tasks in tasks.json")
645
+ design.info(" - Experiment in test_env.ipynb")
271
646
 
272
647
  # Show a sample of the server code
273
648
  design.section_title("Your MCP server")
274
649
  sample_code = '''@mcp.tool()
275
650
  async def act() -> str:
276
- """Perform an action."""
277
- return f"Action #{ctx.act()}"'''
651
+ """Perform an action that changes the environment state."""
652
+ if env is None:
653
+ raise RuntimeError("Context not initialized")
654
+ count = env.act()
655
+ return f"Action #{count} performed. Current count: {count}"'''
278
656
 
279
657
  syntax = Syntax(sample_code, "python", theme="monokai", line_numbers=False)
280
658
  design.console.print(Panel(syntax, border_style="dim"))