hud-python 0.4.8__tar.gz → 0.4.10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (160)
  1. {hud_python-0.4.8 → hud_python-0.4.10}/.gitignore +3 -1
  2. {hud_python-0.4.8 → hud_python-0.4.10}/PKG-INFO +12 -1
  3. {hud_python-0.4.8 → hud_python-0.4.10}/environments/browser/README.md +58 -6
  4. {hud_python-0.4.8 → hud_python-0.4.10}/environments/browser/pyproject.toml +9 -14
  5. {hud_python-0.4.8 → hud_python-0.4.10}/environments/browser/src/hud_controller/README.md +1 -1
  6. {hud_python-0.4.8 → hud_python-0.4.10}/environments/remote_browser/README.md +2 -2
  7. {hud_python-0.4.8 → hud_python-0.4.10}/environments/text_2048/README.md +2 -2
  8. {hud_python-0.4.8 → hud_python-0.4.10}/hud/agents/base.py +50 -1
  9. {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/__init__.py +187 -11
  10. {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/analyze_metadata.py +33 -42
  11. {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/build.py +7 -0
  12. {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/debug.py +8 -1
  13. hud_python-0.4.10/hud/cli/env_utils.py +133 -0
  14. hud_python-0.4.10/hud/cli/eval.py +302 -0
  15. hud_python-0.4.10/hud/cli/list_func.py +213 -0
  16. {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/mcp_server.py +3 -79
  17. {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/pull.py +20 -15
  18. {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/push.py +84 -41
  19. hud_python-0.4.10/hud/cli/registry.py +155 -0
  20. hud_python-0.4.10/hud/cli/remove.py +200 -0
  21. {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/runner.py +1 -1
  22. hud_python-0.4.10/hud/cli/tests/test_analyze_metadata.py +277 -0
  23. hud_python-0.4.10/hud/cli/tests/test_build.py +450 -0
  24. hud_python-0.4.10/hud/cli/tests/test_list_func.py +288 -0
  25. hud_python-0.4.10/hud/cli/tests/test_pull.py +400 -0
  26. hud_python-0.4.10/hud/cli/tests/test_push.py +379 -0
  27. hud_python-0.4.10/hud/cli/tests/test_registry.py +264 -0
  28. {hud_python-0.4.8 → hud_python-0.4.10}/hud/clients/base.py +13 -1
  29. {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/__init__.py +2 -0
  30. hud_python-0.4.10/hud/tools/response.py +54 -0
  31. {hud_python-0.4.8 → hud_python-0.4.10}/hud/utils/design.py +10 -0
  32. {hud_python-0.4.8 → hud_python-0.4.10}/hud/utils/mcp.py +14 -2
  33. {hud_python-0.4.8 → hud_python-0.4.10}/hud/utils/tests/test_version.py +1 -1
  34. {hud_python-0.4.8 → hud_python-0.4.10}/hud/version.py +1 -1
  35. {hud_python-0.4.8 → hud_python-0.4.10}/pyproject.toml +4 -1
  36. {hud_python-0.4.8 → hud_python-0.4.10}/rl/README.md +10 -18
  37. {hud_python-0.4.8 → hud_python-0.4.10}/LICENSE +0 -0
  38. {hud_python-0.4.8 → hud_python-0.4.10}/README.md +0 -0
  39. {hud_python-0.4.8 → hud_python-0.4.10}/environments/README.md +0 -0
  40. {hud_python-0.4.8 → hud_python-0.4.10}/environments/browser/apps/2048/README.md +0 -0
  41. {hud_python-0.4.8 → hud_python-0.4.10}/environments/browser/apps/2048/backend/pyproject.toml +0 -0
  42. {hud_python-0.4.8 → hud_python-0.4.10}/environments/browser/apps/README.md +0 -0
  43. {hud_python-0.4.8 → hud_python-0.4.10}/environments/browser/apps/todo/README.md +0 -0
  44. {hud_python-0.4.8 → hud_python-0.4.10}/environments/browser/apps/todo/backend/pyproject.toml +0 -0
  45. {hud_python-0.4.8 → hud_python-0.4.10}/environments/remote_browser/pyproject.toml +0 -0
  46. {hud_python-0.4.8 → hud_python-0.4.10}/environments/remote_browser/src/hud_controller/providers/README.md +0 -0
  47. {hud_python-0.4.8 → hud_python-0.4.10}/environments/text_2048/pyproject.toml +0 -0
  48. {hud_python-0.4.8 → hud_python-0.4.10}/examples/README.md +0 -0
  49. {hud_python-0.4.8 → hud_python-0.4.10}/hud/__init__.py +0 -0
  50. {hud_python-0.4.8 → hud_python-0.4.10}/hud/agents/__init__.py +0 -0
  51. {hud_python-0.4.8 → hud_python-0.4.10}/hud/agents/claude.py +0 -0
  52. {hud_python-0.4.8 → hud_python-0.4.10}/hud/agents/langchain.py +0 -0
  53. {hud_python-0.4.8 → hud_python-0.4.10}/hud/agents/misc/__init__.py +0 -0
  54. {hud_python-0.4.8 → hud_python-0.4.10}/hud/agents/misc/response_agent.py +0 -0
  55. {hud_python-0.4.8 → hud_python-0.4.10}/hud/agents/openai.py +0 -0
  56. {hud_python-0.4.8 → hud_python-0.4.10}/hud/agents/openai_chat_generic.py +0 -0
  57. {hud_python-0.4.8 → hud_python-0.4.10}/hud/agents/tests/__init__.py +0 -0
  58. {hud_python-0.4.8 → hud_python-0.4.10}/hud/agents/tests/test_base.py +0 -0
  59. {hud_python-0.4.8 → hud_python-0.4.10}/hud/agents/tests/test_claude.py +0 -0
  60. {hud_python-0.4.8 → hud_python-0.4.10}/hud/agents/tests/test_client.py +0 -0
  61. {hud_python-0.4.8 → hud_python-0.4.10}/hud/agents/tests/test_openai.py +0 -0
  62. {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/__main__.py +0 -0
  63. {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/analyze.py +0 -0
  64. {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/clone.py +0 -0
  65. {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/cursor.py +0 -0
  66. {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/docker_utils.py +0 -0
  67. {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/init.py +0 -0
  68. {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/interactive.py +0 -0
  69. {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/remote_runner.py +0 -0
  70. {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/tests/__init__.py +0 -0
  71. {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/tests/test_analyze.py +0 -0
  72. {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/tests/test_cli_init.py +0 -0
  73. {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/tests/test_cli_main.py +0 -0
  74. {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/tests/test_clone.py +0 -0
  75. {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/tests/test_cursor.py +0 -0
  76. {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/tests/test_debug.py +0 -0
  77. {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/tests/test_mcp_server.py +0 -0
  78. {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/tests/test_utils.py +0 -0
  79. {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/utils.py +0 -0
  80. {hud_python-0.4.8 → hud_python-0.4.10}/hud/clients/README.md +0 -0
  81. {hud_python-0.4.8 → hud_python-0.4.10}/hud/clients/__init__.py +0 -0
  82. {hud_python-0.4.8 → hud_python-0.4.10}/hud/clients/fastmcp.py +0 -0
  83. {hud_python-0.4.8 → hud_python-0.4.10}/hud/clients/mcp_use.py +0 -0
  84. {hud_python-0.4.8 → hud_python-0.4.10}/hud/clients/tests/__init__.py +0 -0
  85. {hud_python-0.4.8 → hud_python-0.4.10}/hud/clients/tests/test_client_integration.py +0 -0
  86. {hud_python-0.4.8 → hud_python-0.4.10}/hud/clients/tests/test_fastmcp.py +0 -0
  87. {hud_python-0.4.8 → hud_python-0.4.10}/hud/clients/tests/test_protocol.py +0 -0
  88. {hud_python-0.4.8 → hud_python-0.4.10}/hud/clients/utils/__init__.py +0 -0
  89. {hud_python-0.4.8 → hud_python-0.4.10}/hud/clients/utils/retry_transport.py +0 -0
  90. {hud_python-0.4.8 → hud_python-0.4.10}/hud/datasets.py +0 -0
  91. {hud_python-0.4.8 → hud_python-0.4.10}/hud/misc/__init__.py +0 -0
  92. {hud_python-0.4.8 → hud_python-0.4.10}/hud/misc/claude_plays_pokemon.py +0 -0
  93. {hud_python-0.4.8 → hud_python-0.4.10}/hud/otel/__init__.py +0 -0
  94. {hud_python-0.4.8 → hud_python-0.4.10}/hud/otel/collector.py +0 -0
  95. {hud_python-0.4.8 → hud_python-0.4.10}/hud/otel/config.py +0 -0
  96. {hud_python-0.4.8 → hud_python-0.4.10}/hud/otel/context.py +0 -0
  97. {hud_python-0.4.8 → hud_python-0.4.10}/hud/otel/exporters.py +0 -0
  98. {hud_python-0.4.8 → hud_python-0.4.10}/hud/otel/instrumentation.py +0 -0
  99. {hud_python-0.4.8 → hud_python-0.4.10}/hud/otel/processors.py +0 -0
  100. {hud_python-0.4.8 → hud_python-0.4.10}/hud/otel/tests/__init__.py +0 -0
  101. {hud_python-0.4.8 → hud_python-0.4.10}/hud/otel/tests/test_processors.py +0 -0
  102. {hud_python-0.4.8 → hud_python-0.4.10}/hud/py.typed +0 -0
  103. {hud_python-0.4.8 → hud_python-0.4.10}/hud/server/__init__.py +0 -0
  104. {hud_python-0.4.8 → hud_python-0.4.10}/hud/server/context.py +0 -0
  105. {hud_python-0.4.8 → hud_python-0.4.10}/hud/server/helper/__init__.py +0 -0
  106. {hud_python-0.4.8 → hud_python-0.4.10}/hud/server/low_level.py +0 -0
  107. {hud_python-0.4.8 → hud_python-0.4.10}/hud/server/server.py +0 -0
  108. {hud_python-0.4.8 → hud_python-0.4.10}/hud/server/tests/__init__.py +0 -0
  109. {hud_python-0.4.8 → hud_python-0.4.10}/hud/settings.py +0 -0
  110. {hud_python-0.4.8 → hud_python-0.4.10}/hud/shared/__init__.py +0 -0
  111. {hud_python-0.4.8 → hud_python-0.4.10}/hud/shared/exceptions.py +0 -0
  112. {hud_python-0.4.8 → hud_python-0.4.10}/hud/shared/requests.py +0 -0
  113. {hud_python-0.4.8 → hud_python-0.4.10}/hud/shared/tests/__init__.py +0 -0
  114. {hud_python-0.4.8 → hud_python-0.4.10}/hud/shared/tests/test_exceptions.py +0 -0
  115. {hud_python-0.4.8 → hud_python-0.4.10}/hud/shared/tests/test_requests.py +0 -0
  116. {hud_python-0.4.8 → hud_python-0.4.10}/hud/telemetry/__init__.py +0 -0
  117. {hud_python-0.4.8 → hud_python-0.4.10}/hud/telemetry/instrument.py +0 -0
  118. {hud_python-0.4.8 → hud_python-0.4.10}/hud/telemetry/job.py +0 -0
  119. {hud_python-0.4.8 → hud_python-0.4.10}/hud/telemetry/replay.py +0 -0
  120. {hud_python-0.4.8 → hud_python-0.4.10}/hud/telemetry/trace.py +0 -0
  121. {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/base.py +0 -0
  122. {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/bash.py +0 -0
  123. {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/computer/__init__.py +0 -0
  124. {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/computer/anthropic.py +0 -0
  125. {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/computer/hud.py +0 -0
  126. {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/computer/openai.py +0 -0
  127. {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/computer/settings.py +0 -0
  128. {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/edit.py +0 -0
  129. {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/executors/__init__.py +0 -0
  130. {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/executors/base.py +0 -0
  131. {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/executors/pyautogui.py +0 -0
  132. {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/executors/tests/__init__.py +0 -0
  133. {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/executors/tests/test_base_executor.py +0 -0
  134. {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
  135. {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/executors/xdo.py +0 -0
  136. {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/playwright.py +0 -0
  137. {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/tests/__init__.py +0 -0
  138. {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/tests/test_base.py +0 -0
  139. {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/tests/test_bash.py +0 -0
  140. {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/tests/test_bash_extended.py +0 -0
  141. {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/tests/test_computer.py +0 -0
  142. {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/tests/test_computer_actions.py +0 -0
  143. {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/tests/test_edit.py +0 -0
  144. {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/tests/test_init.py +0 -0
  145. {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/tests/test_playwright_tool.py +0 -0
  146. {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/tests/test_tools.py +0 -0
  147. {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/tests/test_utils.py +0 -0
  148. {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/types.py +0 -0
  149. {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/utils.py +0 -0
  150. {hud_python-0.4.8 → hud_python-0.4.10}/hud/types.py +0 -0
  151. {hud_python-0.4.8 → hud_python-0.4.10}/hud/utils/__init__.py +0 -0
  152. {hud_python-0.4.8 → hud_python-0.4.10}/hud/utils/async_utils.py +0 -0
  153. {hud_python-0.4.8 → hud_python-0.4.10}/hud/utils/progress.py +0 -0
  154. {hud_python-0.4.8 → hud_python-0.4.10}/hud/utils/telemetry.py +0 -0
  155. {hud_python-0.4.8 → hud_python-0.4.10}/hud/utils/tests/__init__.py +0 -0
  156. {hud_python-0.4.8 → hud_python-0.4.10}/hud/utils/tests/test_async_utils.py +0 -0
  157. {hud_python-0.4.8 → hud_python-0.4.10}/hud/utils/tests/test_init.py +0 -0
  158. {hud_python-0.4.8 → hud_python-0.4.10}/hud/utils/tests/test_progress.py +0 -0
  159. {hud_python-0.4.8 → hud_python-0.4.10}/hud/utils/tests/test_telemetry.py +0 -0
  160. {hud_python-0.4.8 → hud_python-0.4.10}/rl/pyproject.toml +0 -0
@@ -42,4 +42,6 @@ CLAUDE.md
42
42
 
43
43
  # RL
44
44
  wandb/
45
- outputs/
45
+ outputs/
46
+
47
+ test/
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.4.8
3
+ Version: 0.4.10
4
4
  Summary: SDK for the HUD platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-python
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -38,6 +38,7 @@ Requires-Python: <3.14,>=3.11
38
38
  Requires-Dist: fastmcp>=2.11.2
39
39
  Requires-Dist: httpx<1,>=0.23.0
40
40
  Requires-Dist: hud-mcp-python-sdk>=0.1.0
41
+ Requires-Dist: mcp>=1.13.1
41
42
  Requires-Dist: opentelemetry-api>=1.34.1
42
43
  Requires-Dist: opentelemetry-exporter-otlp-proto-http>=1.34.1
43
44
  Requires-Dist: opentelemetry-instrumentation-mcp>=0.44.1
@@ -61,6 +62,16 @@ Requires-Dist: langchain-anthropic; extra == 'agent'
61
62
  Requires-Dist: langchain-openai; extra == 'agent'
62
63
  Requires-Dist: numpy>=1.24.0; extra == 'agent'
63
64
  Requires-Dist: openai; extra == 'agent'
65
+ Provides-Extra: agents
66
+ Requires-Dist: anthropic; extra == 'agents'
67
+ Requires-Dist: datasets>=2.14.0; extra == 'agents'
68
+ Requires-Dist: dotenv>=0.9.9; extra == 'agents'
69
+ Requires-Dist: hud-mcp-use-python-sdk>=0.1.0; extra == 'agents'
70
+ Requires-Dist: langchain; extra == 'agents'
71
+ Requires-Dist: langchain-anthropic; extra == 'agents'
72
+ Requires-Dist: langchain-openai; extra == 'agents'
73
+ Requires-Dist: numpy>=1.24.0; extra == 'agents'
74
+ Requires-Dist: openai; extra == 'agents'
64
75
  Provides-Extra: dev
65
76
  Requires-Dist: aiodocker>=0.24.0; extra == 'dev'
66
77
  Requires-Dist: anthropic; extra == 'dev'
@@ -2,6 +2,8 @@
2
2
 
3
3
  A browser automation environment for the HUD platform demonstrating best practices for building MCP (Model Context Protocol) environments with evaluation systems.
4
4
 
5
+ **Key Feature**: This environment is **hot-reloadable** - it maintains state (running services, browser sessions, launched apps) across server restarts during development.
6
+
5
7
  ## Quick Start
6
8
 
7
9
  ### Build & Deploy
@@ -14,6 +16,34 @@ docker build -t hud-browser .
14
16
  docker run --rm -i -p 8080:8080 hud-browser
15
17
  ```
16
18
 
19
+ ### Hot-Reloadable Architecture
20
+
21
+ This environment uses a persistent context server architecture that maintains state across MCP server restarts:
22
+
23
+ - **Context Server**: Runs as a separate process holding ServiceManager and state
24
+ - **MCP Server**: Connects via Unix socket, can restart without losing services
25
+ - **State Preservation**: X11, VNC, running apps, and service states persist
26
+ - **Development Friendly**: Edit code and restart MCP server instantly
27
+
28
+ #### Docker Architecture
29
+
30
+ The environment uses a single CMD that follows the proven text_2048 pattern:
31
+
32
+ ```dockerfile
33
+ CMD ["sh", "-c", "\
34
+ # Start services in background \
35
+ python -m hud_controller.context_server & \
36
+ x11vnc ... & \
37
+ # Run MCP server in foreground \
38
+ exec hud-controller mcp \
39
+ "]
40
+ ```
41
+
42
+ This pattern ensures:
43
+ - Background services (`&`) start once and persist
44
+ - Only the `exec` command gets wrapped by watchfiles
45
+ - Services survive hot-reloads during development
46
+
17
47
  ## Deployment to Registry
18
48
 
19
49
  ### 1. Publish to Docker Registry
@@ -169,10 +199,11 @@ Set these in your environment/Docker configuration:
169
199
 
170
200
  ```
171
201
  Docker Container
172
- ├── start.sh # Service startup orchestration
173
202
  ├── MCP Server (FastMCP) # Protocol implementation
174
203
  │ ├── Tools # setup, evaluate, computer, etc.
175
- │ └── Resources # Dynamic registry discovery
204
+ │ └── Resources # Dynamic registry discovery
205
+ ├── Context Server # Persistent state management
206
+ │ └── PersistentContext # Maintains services & browser state
176
207
  ├── Services
177
208
  │ ├── X11 (Xvfb) # Virtual display
178
209
  │ ├── VNC + Websockify # Remote access
@@ -188,8 +219,7 @@ Docker Container
188
219
 
189
220
  ```
190
221
  browser/
191
- ├── Dockerfile # Multi-stage build with optimization
192
- ├── start.sh # Service startup script
222
+ ├── Dockerfile # Multi-stage build with integrated startup
193
223
  ├── apps/ # Launchable web applications
194
224
  │ ├── todo/ # Example app with evaluation APIs
195
225
  │ └── 2048/ # 2048 game app
@@ -197,6 +227,8 @@ browser/
197
227
  │ ├── server.py # FastMCP server + resource definitions
198
228
  │ ├── services.py # Service management
199
229
  │ ├── context.py # Environment context
230
+ │ ├── context_server.py # Persistent context server
231
+ │ ├── persistent_context.py # State persistence wrapper
200
232
  │ ├── evaluators/ # Evaluation system
201
233
  │ ├── setup/ # Setup system
202
234
  │ └── problems/ # Problem definitions
@@ -205,7 +237,7 @@ browser/
205
237
 
206
238
  ## Development Workflow
207
239
 
208
- ### Hot-Reload Development with `hud mcp`
240
+ ### Hot-Reload Development with `hud dev`
209
241
 
210
242
  For rapid iteration without Docker rebuilds:
211
243
 
@@ -214,7 +246,7 @@ For rapid iteration without Docker rebuilds:
214
246
  cd environments/browser
215
247
 
216
248
  # Start hot-reload development proxy
217
- hud mcp . --build
249
+ hud dev . --build
218
250
 
219
251
  # This will:
220
252
  # - Build/use hud-browser:dev image
@@ -225,6 +257,21 @@ hud mcp . --build
225
257
 
226
258
  Add the URL from output to Cursor settings or click the deeplink. Now you can edit code in `src/` and changes apply instantly!
227
259
 
260
+ #### How Hot-Reloading Works
261
+
262
+ This environment uses a persistent context server pattern:
263
+
264
+ 1. **Context Server**: A separate Python process maintains state (services, browser, apps)
265
+ 2. **Socket Communication**: MCP server connects via Unix socket `/tmp/hud_browser_ctx.sock`
266
+ 3. **State Preservation**: X11, VNC, browser sessions, and launched apps persist across reloads
267
+ 4. **Automatic Recovery**: On reload, the server reconnects to existing services
268
+
269
+ This means you can:
270
+ - Edit code and have changes apply immediately
271
+ - Keep browser sessions and apps running
272
+ - Maintain VNC connections
273
+ - Preserve test state between iterations
274
+
228
275
  ### Traditional Development Steps
229
276
 
230
277
  1. **Start with apps** - Build your web applications independently
@@ -392,4 +439,9 @@ When creating new MCP environments:
392
439
  6. **Update service dependencies** in `services.py` as needed
393
440
  7. **Extend Dockerfile** with your environment's requirements
394
441
 
442
+ For hot-reloadability:
443
+ - Keep complex objects out of the persistent context
444
+ - Only store simple, picklable state
445
+ - Recreate tools and clients on each server start
446
+
395
447
  See `src/hud_controller/README.md` for detailed implementation guidance.
@@ -3,25 +3,20 @@ name = "hud-controller"
3
3
  version = "0.1.0"
4
4
  description = "HUD Controller for browser environments with MCP tools"
5
5
  requires-python = ">=3.11,<3.14"
6
- dependencies = [
7
- "hud-python @ git+https://github.com/hud-evals/hud-python.git@l/text-2048",
8
- "playwright",
9
- "pyautogui",
10
- "httpx",
11
- "typer",
12
- ]
13
-
14
- [project.scripts]
15
- hud-controller = "hud_controller.__main__:main"
6
+ dependencies = [ "hud-python", "playwright", "pyautogui", "httpx", "typer",]
16
7
 
17
8
  [build-system]
18
- requires = ["hatchling"]
9
+ requires = [ "hatchling",]
19
10
  build-backend = "hatchling.build"
20
11
 
21
- [tool.hatch.build.targets.wheel]
22
- packages = ["src/hud_controller"]
12
+ [project.scripts]
13
+ hud-controller = "hud_controller.__main__:main"
14
+
15
+ [tool.hud]
16
+ image = "hud-browser:dev"
23
17
 
24
18
  [tool.hatch.metadata]
25
19
  allow-direct-references = true
26
20
 
27
-
21
+ [tool.hatch.build.targets.wheel]
22
+ packages = [ "src/hud_controller",]
@@ -55,7 +55,7 @@ class EvaluatorRegistry:
55
55
  def create_evaluator(cls, spec, context): pass
56
56
  ```
57
57
 
58
- ### BrowserEnvironmentContext
58
+ ### BrowserContext
59
59
 
60
60
  Unified interface for environment interactions:
61
61
  - `call_app_api(app, endpoint, method, data)` - Call app backend API
@@ -34,7 +34,7 @@ docker run --rm -i \
34
34
 
35
35
  Development mode allows you to edit code locally and see changes immediately without rebuilding.
36
36
 
37
- #### Option 1: Using `hud mcp` (Recommended)
37
+ #### Option 1: Using `hud dev` (Recommended)
38
38
 
39
39
  The easiest way to develop with hot-reload:
40
40
 
@@ -44,7 +44,7 @@ export BROWSER_PROVIDER=anchorbrowser
44
44
  export ANCHOR_API_KEY=your-api-key
45
45
 
46
46
  # Start development proxy
47
- hud mcp . --build
47
+ hud dev . --build
48
48
 
49
49
  # This will:
50
50
  # - Build/use hud-remote-browser:dev image
@@ -57,13 +57,13 @@ The agent will play 2048 and try to reach a target tile using the available tool
57
57
 
58
58
  ## Development Mode
59
59
 
60
- ### Option 1: Using `hud mcp` (Recommended)
60
+ ### Option 1: Using `hud dev` (Recommended)
61
61
 
62
62
  The easiest way to develop with hot-reload:
63
63
 
64
64
  ```bash
65
65
  # Start development proxy
66
- hud mcp . --build
66
+ hud dev . --build
67
67
 
68
68
  # This will:
69
69
  # - Build/use hud-text-2048:dev image
@@ -85,6 +85,7 @@ class MCPAgent(ABC):
85
85
  self._tool_map: dict[str, types.Tool] = {} # Simplified: just name to tool
86
86
  self.screenshot_history: list[str] = []
87
87
  self._auto_trace = auto_trace
88
+ self._auto_trace_cm: Any | None = None # Store auto-created trace context manager
88
89
  self.initialization_complete = False
89
90
 
90
91
  # Response agent to automatically interact with the model
@@ -303,6 +304,9 @@ class MCPAgent(ABC):
303
304
  except Exception as e:
304
305
  logger.warning("ResponseAgent failed: %s", e)
305
306
  if decision == "STOP":
307
+ # Try to submit response through lifecycle tool
308
+ await self._maybe_submit_response(response, messages)
309
+
306
310
  logger.info("Stopping execution")
307
311
  final_response = response
308
312
  break
@@ -483,6 +487,40 @@ class MCPAgent(ABC):
483
487
  self._available_tools.append(tool)
484
488
  # Simplified mapping - just tool name to tool
485
489
  self._tool_map[tool.name] = tool
490
+
491
+ # Auto-detect response tool as a lifecycle tool
492
+ if tool.name == "response" and "response" not in self.lifecycle_tools:
493
+ logger.debug("Auto-detected 'response' tool as a lifecycle tool")
494
+ self.lifecycle_tools.append("response")
495
+
496
+ async def _maybe_submit_response(self, response: AgentResponse, messages: list[Any]) -> None:
497
+ """Submit response through lifecycle tool if available.
498
+
499
+ Args:
500
+ response: The agent's response
501
+ messages: The current message history (will be modified in-place)
502
+ """
503
+ # Check if we have a response lifecycle tool
504
+ if "response" in self.lifecycle_tools and "response" in self._tool_map:
505
+ logger.debug("Calling response lifecycle tool")
506
+ try:
507
+ # Call the response tool with the agent's response
508
+ response_tool_call = MCPToolCall(
509
+ name="response",
510
+ arguments={"response": response.content, "messages": messages}
511
+ )
512
+ response_results = await self.call_tools(response_tool_call)
513
+
514
+ # Format and add the response tool results to messages
515
+ response_messages = await self.format_tool_results(
516
+ [response_tool_call], response_results
517
+ )
518
+ messages.extend(response_messages)
519
+
520
+ # Mark the task as done
521
+ logger.info("Response lifecycle tool executed, marking task as done")
522
+ except Exception as e:
523
+ logger.error("Response lifecycle tool failed: %s", e)
486
524
 
487
525
  async def _setup_config(self, mcp_config: dict[str, dict[str, Any]]) -> None:
488
526
  """Inject metadata into the metadata of the initialize request."""
@@ -491,7 +529,7 @@ class MCPAgent(ABC):
491
529
  mcp_config,
492
530
  MCPConfigPatch(meta=self.metadata),
493
531
  )
494
- setup_hud_telemetry(mcp_config, auto_trace=self._auto_trace)
532
+ self._auto_trace_cm = setup_hud_telemetry(mcp_config, auto_trace=self._auto_trace)
495
533
 
496
534
  def get_available_tools(self) -> list[types.Tool]:
497
535
  """Get list of available MCP tools for LLM use (excludes lifecycle tools)."""
@@ -532,6 +570,17 @@ class MCPAgent(ABC):
532
570
 
533
571
  async def _cleanup(self) -> None:
534
572
  """Cleanup resources."""
573
+ # Clean up auto-created trace if any
574
+ if self._auto_trace_cm:
575
+ try:
576
+ self._auto_trace_cm.__exit__(None, None, None)
577
+ logger.info("Closed auto-created trace")
578
+ except Exception as e:
579
+ logger.warning("Failed to close auto-created trace: %s", e)
580
+ finally:
581
+ self._auto_trace_cm = None
582
+
583
+ # Clean up auto-created client
535
584
  if self._auto_created_client and self.mcp_client:
536
585
  try:
537
586
  await self.mcp_client.shutdown()
@@ -23,9 +23,11 @@ from .clone import clone_repository, get_clone_message, print_error, print_tutor
23
23
  from .cursor import get_cursor_config_path, list_cursor_servers, parse_cursor_config
24
24
  from .debug import debug_mcp_stdio
25
25
  from .init import create_environment
26
+ from . import list_func as list_module
26
27
  from .mcp_server import run_mcp_dev_server
27
28
  from .pull import pull_command
28
29
  from .push import push_command
30
+ from .remove import remove_command
29
31
  from .utils import CaptureLogger
30
32
 
31
33
  # Create the main Typer app
@@ -129,7 +131,7 @@ def analyze(
129
131
  def debug(
130
132
  params: list[str] = typer.Argument( # type: ignore[arg-type] # noqa: B008
131
133
  None,
132
- help="Docker image followed by optional Docker run arguments (e.g., 'hud-image:latest -e KEY=value')", # noqa: E501
134
+ help="Docker image, environment directory, or config file followed by optional Docker arguments", # noqa: E501
133
135
  ),
134
136
  config: Path = typer.Option( # noqa: B008
135
137
  None,
@@ -145,6 +147,12 @@ def debug(
145
147
  "--cursor",
146
148
  help="Debug a server from Cursor config",
147
149
  ),
150
+ build: bool = typer.Option(
151
+ False,
152
+ "--build",
153
+ "-b",
154
+ help="Build image before debugging (for directory mode)",
155
+ ),
148
156
  max_phase: int = typer.Option(
149
157
  5,
150
158
  "--max-phase",
@@ -157,15 +165,24 @@ def debug(
157
165
  """🐛 Debug MCP environment - test initialization, tools, and readiness.
158
166
 
159
167
  Examples:
160
- hud debug hud-text-2048:latest
161
- hud debug my-mcp-server:v1 -e API_KEY=xxx -p 8080:8080
168
+ hud debug . # Debug current directory
169
+ hud debug environments/browser # Debug specific directory
170
+ hud debug . --build # Build then debug
171
+ hud debug hud-text-2048:latest # Debug Docker image
172
+ hud debug my-mcp-server:v1 -e API_KEY=xxx
162
173
  hud debug --config mcp-config.json
163
174
  hud debug --cursor text-2048-dev
164
- hud debug hud-browser:dev --max-phase 3
175
+ hud debug . --max-phase 3 # Stop after phase 3
165
176
  """
166
-
177
+ # Import here to avoid circular imports
178
+ from .env_utils import get_image_name, is_environment_directory, build_environment, image_exists
179
+ from hud.utils.design import HUDDesign
180
+
181
+ design = HUDDesign()
182
+
167
183
  # Determine the command to run
168
184
  command = None
185
+ docker_args = []
169
186
 
170
187
  if config:
171
188
  # Load config from JSON file
@@ -183,13 +200,44 @@ def debug(
183
200
  console.print(f"[red]❌ {error or 'Failed to parse cursor config'}[/red]")
184
201
  raise typer.Exit(1)
185
202
  elif params:
186
- image, *docker_args = params
187
- # Build Docker command
188
- command = ["docker", "run", "--rm", "-i", *docker_args, image]
203
+ first_param = params[0]
204
+ docker_args = params[1:] if len(params) > 1 else []
205
+
206
+ # Check if it's a directory
207
+ if Path(first_param).exists() and is_environment_directory(first_param):
208
+ # Directory mode - like hud dev
209
+ directory = first_param
210
+
211
+ # Get or generate image name
212
+ image_name, source = get_image_name(directory)
213
+
214
+ if source == "auto":
215
+ design.info(f"Auto-generated image name: {image_name}")
216
+
217
+ # Build if requested or if image doesn't exist
218
+ if build or not image_exists(image_name):
219
+ if not build and not image_exists(image_name):
220
+ if typer.confirm(f"Image {image_name} not found. Build it now?"):
221
+ build = True
222
+ else:
223
+ raise typer.Exit(1)
224
+
225
+ if build:
226
+ if not build_environment(directory, image_name):
227
+ raise typer.Exit(1)
228
+
229
+ # Build Docker command
230
+ command = ["docker", "run", "--rm", "-i", *docker_args, image_name]
231
+ else:
232
+ # Assume it's an image name
233
+ image = first_param
234
+ command = ["docker", "run", "--rm", "-i", *docker_args, image]
189
235
  else:
190
- console.print("[red]Error: Must specify either a Docker image, --config, or --cursor[/red]")
236
+ console.print("[red]Error: Must specify a directory, Docker image, --config, or --cursor[/red]")
191
237
  console.print("\nExamples:")
192
- console.print(" hud debug hud-text-2048:latest")
238
+ console.print(" hud debug . # Debug current directory")
239
+ console.print(" hud debug environments/browser # Debug specific directory")
240
+ console.print(" hud debug hud-text-2048:latest # Debug Docker image")
193
241
  console.print(" hud debug --config mcp-config.json")
194
242
  console.print(" hud debug --cursor my-server")
195
243
  raise typer.Exit(1)
@@ -442,7 +490,8 @@ def run(
442
490
 
443
491
  # Get URL from options or environment
444
492
  if not url:
445
- url = os.getenv("HUD_MCP_URL", "https://mcp.hud.so/v3/mcp")
493
+ from hud.settings import settings
494
+ url = settings.hud_mcp_url
446
495
 
447
496
  run_remote_server(image, docker_args, transport, port, url, api_key, run_id, verbose)
448
497
 
@@ -561,6 +610,63 @@ def pull(
561
610
  pull_command(target, lock_file, yes, verify_only, verbose)
562
611
 
563
612
 
613
+ @app.command(name="list")
614
+ def list_environments(
615
+ filter_name: str | None = typer.Option(
616
+ None, "--filter", "-f", help="Filter environments by name (case-insensitive)"
617
+ ),
618
+ json_output: bool = typer.Option(
619
+ False, "--json", help="Output as JSON"
620
+ ),
621
+ show_all: bool = typer.Option(
622
+ False, "--all", "-a", help="Show all columns including digest"
623
+ ),
624
+ verbose: bool = typer.Option(
625
+ False, "--verbose", "-v", help="Show detailed output"
626
+ ),
627
+ ) -> None:
628
+ """📋 List all HUD environments in local registry.
629
+
630
+ Shows environments pulled with 'hud pull' stored in ~/.hud/envs/
631
+
632
+ Examples:
633
+ hud list # List all environments
634
+ hud list --filter text # Filter by name
635
+ hud list --json # Output as JSON
636
+ hud list --all # Show digest column
637
+ hud list --verbose # Show full descriptions
638
+ """
639
+ list_module.list_command(filter_name, json_output, show_all, verbose)
640
+
641
+
642
+ @app.command()
643
+ def remove(
644
+ target: str | None = typer.Argument(
645
+ None,
646
+ help="Environment to remove (digest, name, or 'all' for all environments)"
647
+ ),
648
+ yes: bool = typer.Option(
649
+ False, "--yes", "-y", help="Skip confirmation prompt"
650
+ ),
651
+ verbose: bool = typer.Option(
652
+ False, "--verbose", "-v", help="Show detailed output"
653
+ ),
654
+ ) -> None:
655
+ """🗑️ Remove HUD environments from local registry.
656
+
657
+ Removes environment metadata from ~/.hud/envs/
658
+ Note: This does not remove the Docker images.
659
+
660
+ Examples:
661
+ hud remove abc123 # Remove by digest
662
+ hud remove text_2048 # Remove by name
663
+ hud remove hudpython/test_init # Remove by full name
664
+ hud remove all # Remove all environments
665
+ hud remove all --yes # Remove all without confirmation
666
+ """
667
+ remove_command(target, yes, verbose)
668
+
669
+
564
670
  @app.command()
565
671
  def init(
566
672
  name: str = typer.Argument(None, help="Environment name (default: current directory name)"),
@@ -592,6 +698,76 @@ def quickstart() -> None:
592
698
  clone("https://github.com/hud-evals/quickstart.git")
593
699
 
594
700
 
701
+ @app.command()
702
+ def eval(
703
+ source: str = typer.Argument(
704
+ ...,
705
+ help="HuggingFace dataset identifier (e.g. 'hud-evals/SheetBench-50') or task JSON file",
706
+ ),
707
+ full: bool = typer.Option(
708
+ False,
709
+ "--full",
710
+ help="Run the entire dataset (omit for single-task debug mode)",
711
+ ),
712
+ agent: str = typer.Option(
713
+ "claude",
714
+ "--agent",
715
+ help="Agent backend to use (claude or openai)",
716
+ ),
717
+ model: str | None = typer.Option(
718
+ None,
719
+ "--model",
720
+ help="Model name for the chosen agent",
721
+ ),
722
+ allowed_tools: str | None = typer.Option(
723
+ None,
724
+ "--allowed-tools",
725
+ help="Comma-separated list of allowed tools",
726
+ ),
727
+ max_concurrent: int = typer.Option(
728
+ 30,
729
+ "--max-concurrent",
730
+ help="Concurrency level for full-dataset mode",
731
+ ),
732
+ max_steps: int = typer.Option(
733
+ 30,
734
+ "--max-steps",
735
+ help="Maximum steps per task (default: 10 for single, 50 for full)",
736
+ ),
737
+ ) -> None:
738
+ """🚀 Run evaluation on datasets or individual tasks with agents."""
739
+ # Validate agent choice
740
+ valid_agents = ["claude", "openai"]
741
+ if agent not in valid_agents:
742
+ from hud.utils.design import HUDDesign
743
+ design = HUDDesign()
744
+ design.error(f"Invalid agent: {agent}. Must be one of: {', '.join(valid_agents)}")
745
+ raise typer.Exit(1)
746
+
747
+ # Import eval_command lazily to avoid importing agent dependencies
748
+ try:
749
+ from .eval import eval_command
750
+ except ImportError as e:
751
+ from hud.utils.design import HUDDesign
752
+ design = HUDDesign()
753
+ design.error(
754
+ "Evaluation dependencies are not installed. "
755
+ "Please install with: pip install 'hud-python[agent]'"
756
+ )
757
+ raise typer.Exit(1) from e
758
+
759
+ # Run the command
760
+ eval_command(
761
+ source=source,
762
+ full=full,
763
+ agent=agent, # type: ignore
764
+ model=model,
765
+ allowed_tools=allowed_tools,
766
+ max_concurrent=max_concurrent,
767
+ max_steps=max_steps,
768
+ )
769
+
770
+
595
771
  def main() -> None:
596
772
  """Main entry point for the CLI."""
597
773
  # Show header for main help