hud-python 0.4.8__tar.gz → 0.4.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic.

Files changed (159)
  1. {hud_python-0.4.8 → hud_python-0.4.9}/.gitignore +3 -1
  2. {hud_python-0.4.8 → hud_python-0.4.9}/PKG-INFO +12 -1
  3. {hud_python-0.4.8 → hud_python-0.4.9}/environments/browser/README.md +58 -6
  4. {hud_python-0.4.8 → hud_python-0.4.9}/environments/browser/pyproject.toml +9 -14
  5. {hud_python-0.4.8 → hud_python-0.4.9}/environments/browser/src/hud_controller/README.md +1 -1
  6. {hud_python-0.4.8 → hud_python-0.4.9}/environments/remote_browser/README.md +2 -2
  7. {hud_python-0.4.8 → hud_python-0.4.9}/environments/text_2048/README.md +2 -2
  8. {hud_python-0.4.8 → hud_python-0.4.9}/hud/agents/base.py +50 -1
  9. {hud_python-0.4.8 → hud_python-0.4.9}/hud/cli/__init__.py +120 -1
  10. {hud_python-0.4.8 → hud_python-0.4.9}/hud/cli/analyze_metadata.py +29 -41
  11. {hud_python-0.4.8 → hud_python-0.4.9}/hud/cli/build.py +7 -0
  12. {hud_python-0.4.8 → hud_python-0.4.9}/hud/cli/debug.py +8 -1
  13. hud_python-0.4.9/hud/cli/eval.py +226 -0
  14. hud_python-0.4.9/hud/cli/list_func.py +212 -0
  15. {hud_python-0.4.8 → hud_python-0.4.9}/hud/cli/pull.py +4 -13
  16. {hud_python-0.4.8 → hud_python-0.4.9}/hud/cli/push.py +84 -41
  17. hud_python-0.4.9/hud/cli/registry.py +155 -0
  18. hud_python-0.4.9/hud/cli/remove.py +200 -0
  19. hud_python-0.4.9/hud/cli/tests/test_analyze_metadata.py +277 -0
  20. hud_python-0.4.9/hud/cli/tests/test_build.py +450 -0
  21. hud_python-0.4.9/hud/cli/tests/test_list_func.py +288 -0
  22. hud_python-0.4.9/hud/cli/tests/test_pull.py +400 -0
  23. hud_python-0.4.9/hud/cli/tests/test_push.py +379 -0
  24. hud_python-0.4.9/hud/cli/tests/test_registry.py +264 -0
  25. {hud_python-0.4.8 → hud_python-0.4.9}/hud/clients/base.py +13 -1
  26. {hud_python-0.4.8 → hud_python-0.4.9}/hud/tools/__init__.py +2 -0
  27. hud_python-0.4.9/hud/tools/response.py +54 -0
  28. {hud_python-0.4.8 → hud_python-0.4.9}/hud/utils/design.py +10 -0
  29. {hud_python-0.4.8 → hud_python-0.4.9}/hud/utils/mcp.py +14 -2
  30. {hud_python-0.4.8 → hud_python-0.4.9}/hud/utils/tests/test_version.py +1 -1
  31. {hud_python-0.4.8 → hud_python-0.4.9}/hud/version.py +1 -1
  32. {hud_python-0.4.8 → hud_python-0.4.9}/pyproject.toml +4 -1
  33. {hud_python-0.4.8 → hud_python-0.4.9}/rl/README.md +10 -18
  34. {hud_python-0.4.8 → hud_python-0.4.9}/LICENSE +0 -0
  35. {hud_python-0.4.8 → hud_python-0.4.9}/README.md +0 -0
  36. {hud_python-0.4.8 → hud_python-0.4.9}/environments/README.md +0 -0
  37. {hud_python-0.4.8 → hud_python-0.4.9}/environments/browser/apps/2048/README.md +0 -0
  38. {hud_python-0.4.8 → hud_python-0.4.9}/environments/browser/apps/2048/backend/pyproject.toml +0 -0
  39. {hud_python-0.4.8 → hud_python-0.4.9}/environments/browser/apps/README.md +0 -0
  40. {hud_python-0.4.8 → hud_python-0.4.9}/environments/browser/apps/todo/README.md +0 -0
  41. {hud_python-0.4.8 → hud_python-0.4.9}/environments/browser/apps/todo/backend/pyproject.toml +0 -0
  42. {hud_python-0.4.8 → hud_python-0.4.9}/environments/remote_browser/pyproject.toml +0 -0
  43. {hud_python-0.4.8 → hud_python-0.4.9}/environments/remote_browser/src/hud_controller/providers/README.md +0 -0
  44. {hud_python-0.4.8 → hud_python-0.4.9}/environments/text_2048/pyproject.toml +0 -0
  45. {hud_python-0.4.8 → hud_python-0.4.9}/examples/README.md +0 -0
  46. {hud_python-0.4.8 → hud_python-0.4.9}/hud/__init__.py +0 -0
  47. {hud_python-0.4.8 → hud_python-0.4.9}/hud/agents/__init__.py +0 -0
  48. {hud_python-0.4.8 → hud_python-0.4.9}/hud/agents/claude.py +0 -0
  49. {hud_python-0.4.8 → hud_python-0.4.9}/hud/agents/langchain.py +0 -0
  50. {hud_python-0.4.8 → hud_python-0.4.9}/hud/agents/misc/__init__.py +0 -0
  51. {hud_python-0.4.8 → hud_python-0.4.9}/hud/agents/misc/response_agent.py +0 -0
  52. {hud_python-0.4.8 → hud_python-0.4.9}/hud/agents/openai.py +0 -0
  53. {hud_python-0.4.8 → hud_python-0.4.9}/hud/agents/openai_chat_generic.py +0 -0
  54. {hud_python-0.4.8 → hud_python-0.4.9}/hud/agents/tests/__init__.py +0 -0
  55. {hud_python-0.4.8 → hud_python-0.4.9}/hud/agents/tests/test_base.py +0 -0
  56. {hud_python-0.4.8 → hud_python-0.4.9}/hud/agents/tests/test_claude.py +0 -0
  57. {hud_python-0.4.8 → hud_python-0.4.9}/hud/agents/tests/test_client.py +0 -0
  58. {hud_python-0.4.8 → hud_python-0.4.9}/hud/agents/tests/test_openai.py +0 -0
  59. {hud_python-0.4.8 → hud_python-0.4.9}/hud/cli/__main__.py +0 -0
  60. {hud_python-0.4.8 → hud_python-0.4.9}/hud/cli/analyze.py +0 -0
  61. {hud_python-0.4.8 → hud_python-0.4.9}/hud/cli/clone.py +0 -0
  62. {hud_python-0.4.8 → hud_python-0.4.9}/hud/cli/cursor.py +0 -0
  63. {hud_python-0.4.8 → hud_python-0.4.9}/hud/cli/docker_utils.py +0 -0
  64. {hud_python-0.4.8 → hud_python-0.4.9}/hud/cli/init.py +0 -0
  65. {hud_python-0.4.8 → hud_python-0.4.9}/hud/cli/interactive.py +0 -0
  66. {hud_python-0.4.8 → hud_python-0.4.9}/hud/cli/mcp_server.py +0 -0
  67. {hud_python-0.4.8 → hud_python-0.4.9}/hud/cli/remote_runner.py +0 -0
  68. {hud_python-0.4.8 → hud_python-0.4.9}/hud/cli/runner.py +0 -0
  69. {hud_python-0.4.8 → hud_python-0.4.9}/hud/cli/tests/__init__.py +0 -0
  70. {hud_python-0.4.8 → hud_python-0.4.9}/hud/cli/tests/test_analyze.py +0 -0
  71. {hud_python-0.4.8 → hud_python-0.4.9}/hud/cli/tests/test_cli_init.py +0 -0
  72. {hud_python-0.4.8 → hud_python-0.4.9}/hud/cli/tests/test_cli_main.py +0 -0
  73. {hud_python-0.4.8 → hud_python-0.4.9}/hud/cli/tests/test_clone.py +0 -0
  74. {hud_python-0.4.8 → hud_python-0.4.9}/hud/cli/tests/test_cursor.py +0 -0
  75. {hud_python-0.4.8 → hud_python-0.4.9}/hud/cli/tests/test_debug.py +0 -0
  76. {hud_python-0.4.8 → hud_python-0.4.9}/hud/cli/tests/test_mcp_server.py +0 -0
  77. {hud_python-0.4.8 → hud_python-0.4.9}/hud/cli/tests/test_utils.py +0 -0
  78. {hud_python-0.4.8 → hud_python-0.4.9}/hud/cli/utils.py +0 -0
  79. {hud_python-0.4.8 → hud_python-0.4.9}/hud/clients/README.md +0 -0
  80. {hud_python-0.4.8 → hud_python-0.4.9}/hud/clients/__init__.py +0 -0
  81. {hud_python-0.4.8 → hud_python-0.4.9}/hud/clients/fastmcp.py +0 -0
  82. {hud_python-0.4.8 → hud_python-0.4.9}/hud/clients/mcp_use.py +0 -0
  83. {hud_python-0.4.8 → hud_python-0.4.9}/hud/clients/tests/__init__.py +0 -0
  84. {hud_python-0.4.8 → hud_python-0.4.9}/hud/clients/tests/test_client_integration.py +0 -0
  85. {hud_python-0.4.8 → hud_python-0.4.9}/hud/clients/tests/test_fastmcp.py +0 -0
  86. {hud_python-0.4.8 → hud_python-0.4.9}/hud/clients/tests/test_protocol.py +0 -0
  87. {hud_python-0.4.8 → hud_python-0.4.9}/hud/clients/utils/__init__.py +0 -0
  88. {hud_python-0.4.8 → hud_python-0.4.9}/hud/clients/utils/retry_transport.py +0 -0
  89. {hud_python-0.4.8 → hud_python-0.4.9}/hud/datasets.py +0 -0
  90. {hud_python-0.4.8 → hud_python-0.4.9}/hud/misc/__init__.py +0 -0
  91. {hud_python-0.4.8 → hud_python-0.4.9}/hud/misc/claude_plays_pokemon.py +0 -0
  92. {hud_python-0.4.8 → hud_python-0.4.9}/hud/otel/__init__.py +0 -0
  93. {hud_python-0.4.8 → hud_python-0.4.9}/hud/otel/collector.py +0 -0
  94. {hud_python-0.4.8 → hud_python-0.4.9}/hud/otel/config.py +0 -0
  95. {hud_python-0.4.8 → hud_python-0.4.9}/hud/otel/context.py +0 -0
  96. {hud_python-0.4.8 → hud_python-0.4.9}/hud/otel/exporters.py +0 -0
  97. {hud_python-0.4.8 → hud_python-0.4.9}/hud/otel/instrumentation.py +0 -0
  98. {hud_python-0.4.8 → hud_python-0.4.9}/hud/otel/processors.py +0 -0
  99. {hud_python-0.4.8 → hud_python-0.4.9}/hud/otel/tests/__init__.py +0 -0
  100. {hud_python-0.4.8 → hud_python-0.4.9}/hud/otel/tests/test_processors.py +0 -0
  101. {hud_python-0.4.8 → hud_python-0.4.9}/hud/py.typed +0 -0
  102. {hud_python-0.4.8 → hud_python-0.4.9}/hud/server/__init__.py +0 -0
  103. {hud_python-0.4.8 → hud_python-0.4.9}/hud/server/context.py +0 -0
  104. {hud_python-0.4.8 → hud_python-0.4.9}/hud/server/helper/__init__.py +0 -0
  105. {hud_python-0.4.8 → hud_python-0.4.9}/hud/server/low_level.py +0 -0
  106. {hud_python-0.4.8 → hud_python-0.4.9}/hud/server/server.py +0 -0
  107. {hud_python-0.4.8 → hud_python-0.4.9}/hud/server/tests/__init__.py +0 -0
  108. {hud_python-0.4.8 → hud_python-0.4.9}/hud/settings.py +0 -0
  109. {hud_python-0.4.8 → hud_python-0.4.9}/hud/shared/__init__.py +0 -0
  110. {hud_python-0.4.8 → hud_python-0.4.9}/hud/shared/exceptions.py +0 -0
  111. {hud_python-0.4.8 → hud_python-0.4.9}/hud/shared/requests.py +0 -0
  112. {hud_python-0.4.8 → hud_python-0.4.9}/hud/shared/tests/__init__.py +0 -0
  113. {hud_python-0.4.8 → hud_python-0.4.9}/hud/shared/tests/test_exceptions.py +0 -0
  114. {hud_python-0.4.8 → hud_python-0.4.9}/hud/shared/tests/test_requests.py +0 -0
  115. {hud_python-0.4.8 → hud_python-0.4.9}/hud/telemetry/__init__.py +0 -0
  116. {hud_python-0.4.8 → hud_python-0.4.9}/hud/telemetry/instrument.py +0 -0
  117. {hud_python-0.4.8 → hud_python-0.4.9}/hud/telemetry/job.py +0 -0
  118. {hud_python-0.4.8 → hud_python-0.4.9}/hud/telemetry/replay.py +0 -0
  119. {hud_python-0.4.8 → hud_python-0.4.9}/hud/telemetry/trace.py +0 -0
  120. {hud_python-0.4.8 → hud_python-0.4.9}/hud/tools/base.py +0 -0
  121. {hud_python-0.4.8 → hud_python-0.4.9}/hud/tools/bash.py +0 -0
  122. {hud_python-0.4.8 → hud_python-0.4.9}/hud/tools/computer/__init__.py +0 -0
  123. {hud_python-0.4.8 → hud_python-0.4.9}/hud/tools/computer/anthropic.py +0 -0
  124. {hud_python-0.4.8 → hud_python-0.4.9}/hud/tools/computer/hud.py +0 -0
  125. {hud_python-0.4.8 → hud_python-0.4.9}/hud/tools/computer/openai.py +0 -0
  126. {hud_python-0.4.8 → hud_python-0.4.9}/hud/tools/computer/settings.py +0 -0
  127. {hud_python-0.4.8 → hud_python-0.4.9}/hud/tools/edit.py +0 -0
  128. {hud_python-0.4.8 → hud_python-0.4.9}/hud/tools/executors/__init__.py +0 -0
  129. {hud_python-0.4.8 → hud_python-0.4.9}/hud/tools/executors/base.py +0 -0
  130. {hud_python-0.4.8 → hud_python-0.4.9}/hud/tools/executors/pyautogui.py +0 -0
  131. {hud_python-0.4.8 → hud_python-0.4.9}/hud/tools/executors/tests/__init__.py +0 -0
  132. {hud_python-0.4.8 → hud_python-0.4.9}/hud/tools/executors/tests/test_base_executor.py +0 -0
  133. {hud_python-0.4.8 → hud_python-0.4.9}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
  134. {hud_python-0.4.8 → hud_python-0.4.9}/hud/tools/executors/xdo.py +0 -0
  135. {hud_python-0.4.8 → hud_python-0.4.9}/hud/tools/playwright.py +0 -0
  136. {hud_python-0.4.8 → hud_python-0.4.9}/hud/tools/tests/__init__.py +0 -0
  137. {hud_python-0.4.8 → hud_python-0.4.9}/hud/tools/tests/test_base.py +0 -0
  138. {hud_python-0.4.8 → hud_python-0.4.9}/hud/tools/tests/test_bash.py +0 -0
  139. {hud_python-0.4.8 → hud_python-0.4.9}/hud/tools/tests/test_bash_extended.py +0 -0
  140. {hud_python-0.4.8 → hud_python-0.4.9}/hud/tools/tests/test_computer.py +0 -0
  141. {hud_python-0.4.8 → hud_python-0.4.9}/hud/tools/tests/test_computer_actions.py +0 -0
  142. {hud_python-0.4.8 → hud_python-0.4.9}/hud/tools/tests/test_edit.py +0 -0
  143. {hud_python-0.4.8 → hud_python-0.4.9}/hud/tools/tests/test_init.py +0 -0
  144. {hud_python-0.4.8 → hud_python-0.4.9}/hud/tools/tests/test_playwright_tool.py +0 -0
  145. {hud_python-0.4.8 → hud_python-0.4.9}/hud/tools/tests/test_tools.py +0 -0
  146. {hud_python-0.4.8 → hud_python-0.4.9}/hud/tools/tests/test_utils.py +0 -0
  147. {hud_python-0.4.8 → hud_python-0.4.9}/hud/tools/types.py +0 -0
  148. {hud_python-0.4.8 → hud_python-0.4.9}/hud/tools/utils.py +0 -0
  149. {hud_python-0.4.8 → hud_python-0.4.9}/hud/types.py +0 -0
  150. {hud_python-0.4.8 → hud_python-0.4.9}/hud/utils/__init__.py +0 -0
  151. {hud_python-0.4.8 → hud_python-0.4.9}/hud/utils/async_utils.py +0 -0
  152. {hud_python-0.4.8 → hud_python-0.4.9}/hud/utils/progress.py +0 -0
  153. {hud_python-0.4.8 → hud_python-0.4.9}/hud/utils/telemetry.py +0 -0
  154. {hud_python-0.4.8 → hud_python-0.4.9}/hud/utils/tests/__init__.py +0 -0
  155. {hud_python-0.4.8 → hud_python-0.4.9}/hud/utils/tests/test_async_utils.py +0 -0
  156. {hud_python-0.4.8 → hud_python-0.4.9}/hud/utils/tests/test_init.py +0 -0
  157. {hud_python-0.4.8 → hud_python-0.4.9}/hud/utils/tests/test_progress.py +0 -0
  158. {hud_python-0.4.8 → hud_python-0.4.9}/hud/utils/tests/test_telemetry.py +0 -0
  159. {hud_python-0.4.8 → hud_python-0.4.9}/rl/pyproject.toml +0 -0
@@ -42,4 +42,6 @@ CLAUDE.md
42
42
 
43
43
  # RL
44
44
  wandb/
45
- outputs/
45
+ outputs/
46
+
47
+ test/
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.4.8
3
+ Version: 0.4.9
4
4
  Summary: SDK for the HUD platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-python
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -38,6 +38,7 @@ Requires-Python: <3.14,>=3.11
38
38
  Requires-Dist: fastmcp>=2.11.2
39
39
  Requires-Dist: httpx<1,>=0.23.0
40
40
  Requires-Dist: hud-mcp-python-sdk>=0.1.0
41
+ Requires-Dist: mcp>=1.13.1
41
42
  Requires-Dist: opentelemetry-api>=1.34.1
42
43
  Requires-Dist: opentelemetry-exporter-otlp-proto-http>=1.34.1
43
44
  Requires-Dist: opentelemetry-instrumentation-mcp>=0.44.1
@@ -61,6 +62,16 @@ Requires-Dist: langchain-anthropic; extra == 'agent'
61
62
  Requires-Dist: langchain-openai; extra == 'agent'
62
63
  Requires-Dist: numpy>=1.24.0; extra == 'agent'
63
64
  Requires-Dist: openai; extra == 'agent'
65
+ Provides-Extra: agents
66
+ Requires-Dist: anthropic; extra == 'agents'
67
+ Requires-Dist: datasets>=2.14.0; extra == 'agents'
68
+ Requires-Dist: dotenv>=0.9.9; extra == 'agents'
69
+ Requires-Dist: hud-mcp-use-python-sdk>=0.1.0; extra == 'agents'
70
+ Requires-Dist: langchain; extra == 'agents'
71
+ Requires-Dist: langchain-anthropic; extra == 'agents'
72
+ Requires-Dist: langchain-openai; extra == 'agents'
73
+ Requires-Dist: numpy>=1.24.0; extra == 'agents'
74
+ Requires-Dist: openai; extra == 'agents'
64
75
  Provides-Extra: dev
65
76
  Requires-Dist: aiodocker>=0.24.0; extra == 'dev'
66
77
  Requires-Dist: anthropic; extra == 'dev'
@@ -2,6 +2,8 @@
2
2
 
3
3
  A browser automation environment for the HUD platform demonstrating best practices for building MCP (Model Context Protocol) environments with evaluation systems.
4
4
 
5
+ **Key Feature**: This environment is **hot-reloadable** - it maintains state (running services, browser sessions, launched apps) across server restarts during development.
6
+
5
7
  ## Quick Start
6
8
 
7
9
  ### Build & Deploy
@@ -14,6 +16,34 @@ docker build -t hud-browser .
14
16
  docker run --rm -i -p 8080:8080 hud-browser
15
17
  ```
16
18
 
19
+ ### Hot-Reloadable Architecture
20
+
21
+ This environment uses a persistent context server architecture that maintains state across MCP server restarts:
22
+
23
+ - **Context Server**: Runs as a separate process holding ServiceManager and state
24
+ - **MCP Server**: Connects via Unix socket, can restart without losing services
25
+ - **State Preservation**: X11, VNC, running apps, and service states persist
26
+ - **Development Friendly**: Edit code and restart MCP server instantly
27
+
28
+ #### Docker Architecture
29
+
30
+ The environment uses a single CMD that follows the proven text_2048 pattern:
31
+
32
+ ```dockerfile
33
+ CMD ["sh", "-c", "\
34
+ # Start services in background \
35
+ python -m hud_controller.context_server & \
36
+ x11vnc ... & \
37
+ # Run MCP server in foreground \
38
+ exec hud-controller mcp \
39
+ "]
40
+ ```
41
+
42
+ This pattern ensures:
43
+ - Background services (`&`) start once and persist
44
+ - Only the `exec` command gets wrapped by watchfiles
45
+ - Services survive hot-reloads during development
46
+
17
47
  ## Deployment to Registry
18
48
 
19
49
  ### 1. Publish to Docker Registry
@@ -169,10 +199,11 @@ Set these in your environment/Docker configuration:
169
199
 
170
200
  ```
171
201
  Docker Container
172
- ├── start.sh # Service startup orchestration
173
202
  ├── MCP Server (FastMCP) # Protocol implementation
174
203
  │ ├── Tools # setup, evaluate, computer, etc.
175
- │ └── Resources # Dynamic registry discovery
204
+ │ └── Resources # Dynamic registry discovery
205
+ ├── Context Server # Persistent state management
206
+ │ └── PersistentContext # Maintains services & browser state
176
207
  ├── Services
177
208
  │ ├── X11 (Xvfb) # Virtual display
178
209
  │ ├── VNC + Websockify # Remote access
@@ -188,8 +219,7 @@ Docker Container
188
219
 
189
220
  ```
190
221
  browser/
191
- ├── Dockerfile # Multi-stage build with optimization
192
- ├── start.sh # Service startup script
222
+ ├── Dockerfile # Multi-stage build with integrated startup
193
223
  ├── apps/ # Launchable web applications
194
224
  │ ├── todo/ # Example app with evaluation APIs
195
225
  │ └── 2048/ # 2048 game app
@@ -197,6 +227,8 @@ browser/
197
227
  │ ├── server.py # FastMCP server + resource definitions
198
228
  │ ├── services.py # Service management
199
229
  │ ├── context.py # Environment context
230
+ │ ├── context_server.py # Persistent context server
231
+ │ ├── persistent_context.py # State persistence wrapper
200
232
  │ ├── evaluators/ # Evaluation system
201
233
  │ ├── setup/ # Setup system
202
234
  │ └── problems/ # Problem definitions
@@ -205,7 +237,7 @@ browser/
205
237
 
206
238
  ## Development Workflow
207
239
 
208
- ### Hot-Reload Development with `hud mcp`
240
+ ### Hot-Reload Development with `hud dev`
209
241
 
210
242
  For rapid iteration without Docker rebuilds:
211
243
 
@@ -214,7 +246,7 @@ For rapid iteration without Docker rebuilds:
214
246
  cd environments/browser
215
247
 
216
248
  # Start hot-reload development proxy
217
- hud mcp . --build
249
+ hud dev . --build
218
250
 
219
251
  # This will:
220
252
  # - Build/use hud-browser:dev image
@@ -225,6 +257,21 @@ hud mcp . --build
225
257
 
226
258
  Add the URL from output to Cursor settings or click the deeplink. Now you can edit code in `src/` and changes apply instantly!
227
259
 
260
+ #### How Hot-Reloading Works
261
+
262
+ This environment uses a persistent context server pattern:
263
+
264
+ 1. **Context Server**: A separate Python process maintains state (services, browser, apps)
265
+ 2. **Socket Communication**: MCP server connects via Unix socket `/tmp/hud_browser_ctx.sock`
266
+ 3. **State Preservation**: X11, VNC, browser sessions, and launched apps persist across reloads
267
+ 4. **Automatic Recovery**: On reload, the server reconnects to existing services
268
+
269
+ This means you can:
270
+ - Edit code and have changes apply immediately
271
+ - Keep browser sessions and apps running
272
+ - Maintain VNC connections
273
+ - Preserve test state between iterations
274
+
228
275
  ### Traditional Development Steps
229
276
 
230
277
  1. **Start with apps** - Build your web applications independently
@@ -392,4 +439,9 @@ When creating new MCP environments:
392
439
  6. **Update service dependencies** in `services.py` as needed
393
440
  7. **Extend Dockerfile** with your environment's requirements
394
441
 
442
+ For hot-reloadability:
443
+ - Keep complex objects out of the persistent context
444
+ - Only store simple, picklable state
445
+ - Recreate tools and clients on each server start
446
+
395
447
  See `src/hud_controller/README.md` for detailed implementation guidance.
@@ -3,25 +3,20 @@ name = "hud-controller"
3
3
  version = "0.1.0"
4
4
  description = "HUD Controller for browser environments with MCP tools"
5
5
  requires-python = ">=3.11,<3.14"
6
- dependencies = [
7
- "hud-python @ git+https://github.com/hud-evals/hud-python.git@l/text-2048",
8
- "playwright",
9
- "pyautogui",
10
- "httpx",
11
- "typer",
12
- ]
13
-
14
- [project.scripts]
15
- hud-controller = "hud_controller.__main__:main"
6
+ dependencies = [ "hud-python", "playwright", "pyautogui", "httpx", "typer",]
16
7
 
17
8
  [build-system]
18
- requires = ["hatchling"]
9
+ requires = [ "hatchling",]
19
10
  build-backend = "hatchling.build"
20
11
 
21
- [tool.hatch.build.targets.wheel]
22
- packages = ["src/hud_controller"]
12
+ [project.scripts]
13
+ hud-controller = "hud_controller.__main__:main"
14
+
15
+ [tool.hud]
16
+ image = "hud-browser:dev"
23
17
 
24
18
  [tool.hatch.metadata]
25
19
  allow-direct-references = true
26
20
 
27
-
21
+ [tool.hatch.build.targets.wheel]
22
+ packages = [ "src/hud_controller",]
@@ -55,7 +55,7 @@ class EvaluatorRegistry:
55
55
  def create_evaluator(cls, spec, context): pass
56
56
  ```
57
57
 
58
- ### BrowserEnvironmentContext
58
+ ### BrowserContext
59
59
 
60
60
  Unified interface for environment interactions:
61
61
  - `call_app_api(app, endpoint, method, data)` - Call app backend API
@@ -34,7 +34,7 @@ docker run --rm -i \
34
34
 
35
35
  Development mode allows you to edit code locally and see changes immediately without rebuilding.
36
36
 
37
- #### Option 1: Using `hud mcp` (Recommended)
37
+ #### Option 1: Using `hud dev` (Recommended)
38
38
 
39
39
  The easiest way to develop with hot-reload:
40
40
 
@@ -44,7 +44,7 @@ export BROWSER_PROVIDER=anchorbrowser
44
44
  export ANCHOR_API_KEY=your-api-key
45
45
 
46
46
  # Start development proxy
47
- hud mcp . --build
47
+ hud dev . --build
48
48
 
49
49
  # This will:
50
50
  # - Build/use hud-remote-browser:dev image
@@ -57,13 +57,13 @@ The agent will play 2048 and try to reach a target tile using the available tool
57
57
 
58
58
  ## Development Mode
59
59
 
60
- ### Option 1: Using `hud mcp` (Recommended)
60
+ ### Option 1: Using `hud dev` (Recommended)
61
61
 
62
62
  The easiest way to develop with hot-reload:
63
63
 
64
64
  ```bash
65
65
  # Start development proxy
66
- hud mcp . --build
66
+ hud dev . --build
67
67
 
68
68
  # This will:
69
69
  # - Build/use hud-text-2048:dev image
@@ -85,6 +85,7 @@ class MCPAgent(ABC):
85
85
  self._tool_map: dict[str, types.Tool] = {} # Simplified: just name to tool
86
86
  self.screenshot_history: list[str] = []
87
87
  self._auto_trace = auto_trace
88
+ self._auto_trace_cm: Any | None = None # Store auto-created trace context manager
88
89
  self.initialization_complete = False
89
90
 
90
91
  # Response agent to automatically interact with the model
@@ -303,6 +304,9 @@ class MCPAgent(ABC):
303
304
  except Exception as e:
304
305
  logger.warning("ResponseAgent failed: %s", e)
305
306
  if decision == "STOP":
307
+ # Try to submit response through lifecycle tool
308
+ await self._maybe_submit_response(response, messages)
309
+
306
310
  logger.info("Stopping execution")
307
311
  final_response = response
308
312
  break
@@ -483,6 +487,40 @@ class MCPAgent(ABC):
483
487
  self._available_tools.append(tool)
484
488
  # Simplified mapping - just tool name to tool
485
489
  self._tool_map[tool.name] = tool
490
+
491
+ # Auto-detect response tool as a lifecycle tool
492
+ if tool.name == "response" and "response" not in self.lifecycle_tools:
493
+ logger.debug("Auto-detected 'response' tool as a lifecycle tool")
494
+ self.lifecycle_tools.append("response")
495
+
496
+ async def _maybe_submit_response(self, response: AgentResponse, messages: list[Any]) -> None:
497
+ """Submit response through lifecycle tool if available.
498
+
499
+ Args:
500
+ response: The agent's response
501
+ messages: The current message history (will be modified in-place)
502
+ """
503
+ # Check if we have a response lifecycle tool
504
+ if "response" in self.lifecycle_tools and "response" in self._tool_map:
505
+ logger.debug("Calling response lifecycle tool")
506
+ try:
507
+ # Call the response tool with the agent's response
508
+ response_tool_call = MCPToolCall(
509
+ name="response",
510
+ arguments={"response": response.content, "messages": messages}
511
+ )
512
+ response_results = await self.call_tools(response_tool_call)
513
+
514
+ # Format and add the response tool results to messages
515
+ response_messages = await self.format_tool_results(
516
+ [response_tool_call], response_results
517
+ )
518
+ messages.extend(response_messages)
519
+
520
+ # Mark the task as done
521
+ logger.info("Response lifecycle tool executed, marking task as done")
522
+ except Exception as e:
523
+ logger.error("Response lifecycle tool failed: %s", e)
486
524
 
487
525
  async def _setup_config(self, mcp_config: dict[str, dict[str, Any]]) -> None:
488
526
  """Inject metadata into the metadata of the initialize request."""
@@ -491,7 +529,7 @@ class MCPAgent(ABC):
491
529
  mcp_config,
492
530
  MCPConfigPatch(meta=self.metadata),
493
531
  )
494
- setup_hud_telemetry(mcp_config, auto_trace=self._auto_trace)
532
+ self._auto_trace_cm = setup_hud_telemetry(mcp_config, auto_trace=self._auto_trace)
495
533
 
496
534
  def get_available_tools(self) -> list[types.Tool]:
497
535
  """Get list of available MCP tools for LLM use (excludes lifecycle tools)."""
@@ -532,6 +570,17 @@ class MCPAgent(ABC):
532
570
 
533
571
  async def _cleanup(self) -> None:
534
572
  """Cleanup resources."""
573
+ # Clean up auto-created trace if any
574
+ if self._auto_trace_cm:
575
+ try:
576
+ self._auto_trace_cm.__exit__(None, None, None)
577
+ logger.info("Closed auto-created trace")
578
+ except Exception as e:
579
+ logger.warning("Failed to close auto-created trace: %s", e)
580
+ finally:
581
+ self._auto_trace_cm = None
582
+
583
+ # Clean up auto-created client
535
584
  if self._auto_created_client and self.mcp_client:
536
585
  try:
537
586
  await self.mcp_client.shutdown()
@@ -23,10 +23,13 @@ from .clone import clone_repository, get_clone_message, print_error, print_tutor
23
23
  from .cursor import get_cursor_config_path, list_cursor_servers, parse_cursor_config
24
24
  from .debug import debug_mcp_stdio
25
25
  from .init import create_environment
26
+ from . import list_func as list_module
26
27
  from .mcp_server import run_mcp_dev_server
27
28
  from .pull import pull_command
28
29
  from .push import push_command
30
+ from .remove import remove_command
29
31
  from .utils import CaptureLogger
32
+ from .eval import eval_command
30
33
 
31
34
  # Create the main Typer app
32
35
  app = typer.Typer(
@@ -442,7 +445,8 @@ def run(
442
445
 
443
446
  # Get URL from options or environment
444
447
  if not url:
445
- url = os.getenv("HUD_MCP_URL", "https://mcp.hud.so/v3/mcp")
448
+ from hud.settings import settings
449
+ url = settings.hud_mcp_url
446
450
 
447
451
  run_remote_server(image, docker_args, transport, port, url, api_key, run_id, verbose)
448
452
 
@@ -561,6 +565,63 @@ def pull(
561
565
  pull_command(target, lock_file, yes, verify_only, verbose)
562
566
 
563
567
 
568
+ @app.command(name="list")
569
+ def list_environments(
570
+ filter_name: str | None = typer.Option(
571
+ None, "--filter", "-f", help="Filter environments by name (case-insensitive)"
572
+ ),
573
+ json_output: bool = typer.Option(
574
+ False, "--json", help="Output as JSON"
575
+ ),
576
+ show_all: bool = typer.Option(
577
+ False, "--all", "-a", help="Show all columns including digest"
578
+ ),
579
+ verbose: bool = typer.Option(
580
+ False, "--verbose", "-v", help="Show detailed output"
581
+ ),
582
+ ) -> None:
583
+ """📋 List all HUD environments in local registry.
584
+
585
+ Shows environments pulled with 'hud pull' stored in ~/.hud/envs/
586
+
587
+ Examples:
588
+ hud list # List all environments
589
+ hud list --filter text # Filter by name
590
+ hud list --json # Output as JSON
591
+ hud list --all # Show digest column
592
+ hud list --verbose # Show full descriptions
593
+ """
594
+ list_module.list_command(filter_name, json_output, show_all, verbose)
595
+
596
+
597
+ @app.command()
598
+ def remove(
599
+ target: str | None = typer.Argument(
600
+ None,
601
+ help="Environment to remove (digest, name, or 'all' for all environments)"
602
+ ),
603
+ yes: bool = typer.Option(
604
+ False, "--yes", "-y", help="Skip confirmation prompt"
605
+ ),
606
+ verbose: bool = typer.Option(
607
+ False, "--verbose", "-v", help="Show detailed output"
608
+ ),
609
+ ) -> None:
610
+ """🗑️ Remove HUD environments from local registry.
611
+
612
+ Removes environment metadata from ~/.hud/envs/
613
+ Note: This does not remove the Docker images.
614
+
615
+ Examples:
616
+ hud remove abc123 # Remove by digest
617
+ hud remove text_2048 # Remove by name
618
+ hud remove hudpython/test_init # Remove by full name
619
+ hud remove all # Remove all environments
620
+ hud remove all --yes # Remove all without confirmation
621
+ """
622
+ remove_command(target, yes, verbose)
623
+
624
+
564
625
  @app.command()
565
626
  def init(
566
627
  name: str = typer.Argument(None, help="Environment name (default: current directory name)"),
@@ -592,6 +653,64 @@ def quickstart() -> None:
592
653
  clone("https://github.com/hud-evals/quickstart.git")
593
654
 
594
655
 
656
+ @app.command()
657
+ def eval(
658
+ source: str = typer.Argument(
659
+ ...,
660
+ help="HuggingFace dataset identifier (e.g. 'hud-evals/SheetBench-50') or task JSON file",
661
+ ),
662
+ full: bool = typer.Option(
663
+ False,
664
+ "--full",
665
+ help="Run the entire dataset (omit for single-task debug mode)",
666
+ ),
667
+ agent: str = typer.Option(
668
+ "claude",
669
+ "--agent",
670
+ help="Agent backend to use (claude or openai)",
671
+ ),
672
+ model: str | None = typer.Option(
673
+ None,
674
+ "--model",
675
+ help="Model name for the chosen agent",
676
+ ),
677
+ allowed_tools: str | None = typer.Option(
678
+ None,
679
+ "--allowed-tools",
680
+ help="Comma-separated list of allowed tools",
681
+ ),
682
+ max_concurrent: int = typer.Option(
683
+ 30,
684
+ "--max-concurrent",
685
+ help="Concurrency level for full-dataset mode",
686
+ ),
687
+ max_steps: int = typer.Option(
688
+ 30,
689
+ "--max-steps",
690
+ help="Maximum steps per task (default: 10 for single, 50 for full)",
691
+ ),
692
+ ) -> None:
693
+ """🚀 Run evaluation on datasets or individual tasks with agents."""
694
+ # Validate agent choice
695
+ valid_agents = ["claude", "openai"]
696
+ if agent not in valid_agents:
697
+ from hud.utils.design import HUDDesign
698
+ design = HUDDesign()
699
+ design.error(f"Invalid agent: {agent}. Must be one of: {', '.join(valid_agents)}")
700
+ raise typer.Exit(1)
701
+
702
+ # Import and run the command
703
+ eval_command(
704
+ source=source,
705
+ full=full,
706
+ agent=agent, # type: ignore
707
+ model=model,
708
+ allowed_tools=allowed_tools,
709
+ max_concurrent=max_concurrent,
710
+ max_steps=max_steps,
711
+ )
712
+
713
+
595
714
  def main() -> None:
596
715
  """Main entry point for the CLI."""
597
716
  # Show header for main help
@@ -12,6 +12,8 @@ from rich.progress import Progress, SpinnerColumn, TextColumn
12
12
  from hud.settings import settings
13
13
  from hud.utils.design import HUDDesign
14
14
 
15
+ from .registry import get_registry_dir, list_registry_entries, extract_digest_from_image, load_from_registry
16
+
15
17
  console = Console()
16
18
  design = HUDDesign()
17
19
 
@@ -50,38 +52,31 @@ def fetch_lock_from_registry(reference: str) -> dict | None:
50
52
 
51
53
  def check_local_cache(reference: str) -> dict | None:
52
54
  """Check local cache for lock file."""
53
- # Extract digest if present
54
- if "@sha256:" in reference:
55
- digest = reference.split("@sha256:")[-1][:12]
56
- elif "/" in reference:
57
- # Try to find by name pattern
58
- cache_dir = Path.home() / ".hud" / "envs"
59
- if cache_dir.exists():
60
- # Look for any cached version of this image
61
- for env_dir in cache_dir.iterdir():
62
- if env_dir.is_dir():
63
- lock_file = env_dir / "hud.lock.yaml"
64
- if lock_file.exists():
65
- with open(lock_file) as f:
66
- lock_data = yaml.safe_load(f)
67
- # Check if this matches our reference
68
- if lock_data and "image" in lock_data:
69
- image = lock_data["image"]
70
- # Match by name (ignoring tag/digest)
71
- ref_base = reference.split("@")[0].split(":")[0]
72
- img_base = image.split("@")[0].split(":")[0]
73
- if ref_base in img_base or img_base in ref_base:
74
- return lock_data
75
- return None
76
- else:
77
- digest = "latest"
78
-
79
- # Check specific digest directory
80
- lock_file = Path.home() / ".hud" / "envs" / digest / "hud.lock.yaml"
81
- if lock_file.exists():
82
- with open(lock_file) as f:
83
- return yaml.safe_load(f)
84
-
55
+ # First try exact digest match
56
+ digest = extract_digest_from_image(reference)
57
+ lock_data = load_from_registry(digest)
58
+ if lock_data:
59
+ return lock_data
60
+
61
+ # If not found and reference has a name, search by name pattern
62
+ if "/" in reference:
63
+ # Look for any cached version of this image
64
+ ref_base = reference.split("@")[0].split(":")[0]
65
+
66
+ for digest, lock_file in list_registry_entries():
67
+ try:
68
+ with open(lock_file) as f:
69
+ lock_data = yaml.safe_load(f)
70
+ # Check if this matches our reference
71
+ if lock_data and "image" in lock_data:
72
+ image = lock_data["image"]
73
+ # Match by name (ignoring tag/digest)
74
+ img_base = image.split("@")[0].split(":")[0]
75
+ if ref_base in img_base or img_base in ref_base:
76
+ return lock_data
77
+ except Exception:
78
+ continue
79
+
85
80
  return None
86
81
 
87
82
 
@@ -147,15 +142,8 @@ async def analyze_from_metadata(reference: str, output_format: str, verbose: boo
147
142
  source = "registry"
148
143
 
149
144
  # Save to local cache for next time
150
- if "@sha256:" in lock_data.get("image", ""):
151
- digest = lock_data["image"].split("@sha256:")[-1][:12]
152
- else:
153
- digest = "latest"
154
-
155
- cache_dir = Path.home() / ".hud" / "envs" / digest
156
- cache_dir.mkdir(parents=True, exist_ok=True)
157
- with open(cache_dir / "hud.lock.yaml", "w") as f: # noqa: ASYNC230
158
- yaml.dump(lock_data, f, default_flow_style=False, sort_keys=False)
145
+ from .registry import save_to_registry
146
+ save_to_registry(lock_data, lock_data.get("image", ""), verbose=False)
159
147
  else:
160
148
  progress.update(task, description="[red]✗ Not found[/red]")
161
149
 
@@ -17,6 +17,8 @@ from hud.clients import MCPClient
17
17
  from hud.utils.design import HUDDesign
18
18
  from hud.version import __version__ as hud_version
19
19
 
20
+ from .registry import save_to_registry
21
+
20
22
 
21
23
  def parse_version(version_str: str) -> tuple[int, int, int]:
22
24
  """Parse version string like '1.0.0' or '1.0' into tuple of integers."""
@@ -459,6 +461,11 @@ def build_environment(
459
461
  # Remove temp image after we're done
460
462
  subprocess.run(["docker", "rmi", temp_tag], capture_output=True) # noqa: S603, S607
461
463
 
464
+ # Add to local registry
465
+ if image_id:
466
+ # Save to local registry using the helper
467
+ save_to_registry(lock_content, lock_content.get("image", tag), verbose)
468
+
462
469
  # Print summary
463
470
  design.section_title("Build Complete")
464
471
 
@@ -167,7 +167,14 @@ async def debug_mcp_stdio(command: list[str], logger: CaptureLogger, max_phase:
167
167
  break
168
168
  except Exception as e:
169
169
  logger.error(f"Failed to parse MCP response: {e}")
170
- continue
170
+ logger.error(f"Raw output that caused the error: {repr(line)}")
171
+ logger.hint("This usually means non-JSON output is being sent to STDOUT")
172
+ logger.hint("Common causes:")
173
+ logger.hint(" - Print statements in your server code")
174
+ logger.hint(" - Library warnings (use warnings.filterwarnings)")
175
+ logger.hint(" - Import-time output from dependencies")
176
+ phases_completed = 1 # Mark as failed
177
+ break # Stop trying to parse
171
178
 
172
179
  if response and "result" in response:
173
180
  logger.success("MCP server initialized successfully")