hud-python 0.4.8__tar.gz → 0.4.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- {hud_python-0.4.8 → hud_python-0.4.10}/.gitignore +3 -1
- {hud_python-0.4.8 → hud_python-0.4.10}/PKG-INFO +12 -1
- {hud_python-0.4.8 → hud_python-0.4.10}/environments/browser/README.md +58 -6
- {hud_python-0.4.8 → hud_python-0.4.10}/environments/browser/pyproject.toml +9 -14
- {hud_python-0.4.8 → hud_python-0.4.10}/environments/browser/src/hud_controller/README.md +1 -1
- {hud_python-0.4.8 → hud_python-0.4.10}/environments/remote_browser/README.md +2 -2
- {hud_python-0.4.8 → hud_python-0.4.10}/environments/text_2048/README.md +2 -2
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/agents/base.py +50 -1
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/__init__.py +187 -11
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/analyze_metadata.py +33 -42
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/build.py +7 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/debug.py +8 -1
- hud_python-0.4.10/hud/cli/env_utils.py +133 -0
- hud_python-0.4.10/hud/cli/eval.py +302 -0
- hud_python-0.4.10/hud/cli/list_func.py +213 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/mcp_server.py +3 -79
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/pull.py +20 -15
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/push.py +84 -41
- hud_python-0.4.10/hud/cli/registry.py +155 -0
- hud_python-0.4.10/hud/cli/remove.py +200 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/runner.py +1 -1
- hud_python-0.4.10/hud/cli/tests/test_analyze_metadata.py +277 -0
- hud_python-0.4.10/hud/cli/tests/test_build.py +450 -0
- hud_python-0.4.10/hud/cli/tests/test_list_func.py +288 -0
- hud_python-0.4.10/hud/cli/tests/test_pull.py +400 -0
- hud_python-0.4.10/hud/cli/tests/test_push.py +379 -0
- hud_python-0.4.10/hud/cli/tests/test_registry.py +264 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/clients/base.py +13 -1
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/__init__.py +2 -0
- hud_python-0.4.10/hud/tools/response.py +54 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/utils/design.py +10 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/utils/mcp.py +14 -2
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/utils/tests/test_version.py +1 -1
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/version.py +1 -1
- {hud_python-0.4.8 → hud_python-0.4.10}/pyproject.toml +4 -1
- {hud_python-0.4.8 → hud_python-0.4.10}/rl/README.md +10 -18
- {hud_python-0.4.8 → hud_python-0.4.10}/LICENSE +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/README.md +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/environments/README.md +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/environments/browser/apps/2048/README.md +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/environments/browser/apps/2048/backend/pyproject.toml +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/environments/browser/apps/README.md +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/environments/browser/apps/todo/README.md +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/environments/browser/apps/todo/backend/pyproject.toml +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/environments/remote_browser/pyproject.toml +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/environments/remote_browser/src/hud_controller/providers/README.md +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/environments/text_2048/pyproject.toml +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/examples/README.md +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/__init__.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/agents/__init__.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/agents/claude.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/agents/langchain.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/agents/misc/__init__.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/agents/misc/response_agent.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/agents/openai.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/agents/openai_chat_generic.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/agents/tests/__init__.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/agents/tests/test_base.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/agents/tests/test_claude.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/agents/tests/test_client.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/agents/tests/test_openai.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/__main__.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/analyze.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/clone.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/cursor.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/docker_utils.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/init.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/interactive.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/remote_runner.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/tests/__init__.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/tests/test_analyze.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/tests/test_cli_init.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/tests/test_cli_main.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/tests/test_clone.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/tests/test_cursor.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/tests/test_debug.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/tests/test_mcp_server.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/tests/test_utils.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/cli/utils.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/clients/README.md +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/clients/__init__.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/clients/fastmcp.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/clients/mcp_use.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/clients/tests/__init__.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/clients/tests/test_client_integration.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/clients/tests/test_fastmcp.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/clients/tests/test_protocol.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/clients/utils/__init__.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/clients/utils/retry_transport.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/datasets.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/misc/__init__.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/misc/claude_plays_pokemon.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/otel/__init__.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/otel/collector.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/otel/config.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/otel/context.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/otel/exporters.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/otel/instrumentation.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/otel/processors.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/otel/tests/__init__.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/otel/tests/test_processors.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/py.typed +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/server/__init__.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/server/context.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/server/helper/__init__.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/server/low_level.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/server/server.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/server/tests/__init__.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/settings.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/shared/__init__.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/shared/exceptions.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/shared/requests.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/shared/tests/__init__.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/shared/tests/test_exceptions.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/shared/tests/test_requests.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/telemetry/__init__.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/telemetry/instrument.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/telemetry/job.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/telemetry/replay.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/telemetry/trace.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/base.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/bash.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/computer/__init__.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/computer/anthropic.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/computer/hud.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/computer/openai.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/computer/settings.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/edit.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/executors/__init__.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/executors/base.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/executors/pyautogui.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/executors/tests/__init__.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/executors/tests/test_base_executor.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/executors/xdo.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/playwright.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/tests/__init__.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/tests/test_base.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/tests/test_bash.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/tests/test_bash_extended.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/tests/test_computer.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/tests/test_computer_actions.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/tests/test_edit.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/tests/test_init.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/tests/test_playwright_tool.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/tests/test_tools.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/tests/test_utils.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/types.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/tools/utils.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/types.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/utils/__init__.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/utils/async_utils.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/utils/progress.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/utils/telemetry.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/utils/tests/__init__.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/utils/tests/test_async_utils.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/utils/tests/test_init.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/utils/tests/test_progress.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/hud/utils/tests/test_telemetry.py +0 -0
- {hud_python-0.4.8 → hud_python-0.4.10}/rl/pyproject.toml +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hud-python
|
|
3
|
-
Version: 0.4.8
|
|
3
|
+
Version: 0.4.10
|
|
4
4
|
Summary: SDK for the HUD platform.
|
|
5
5
|
Project-URL: Homepage, https://github.com/hud-evals/hud-python
|
|
6
6
|
Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
|
|
@@ -38,6 +38,7 @@ Requires-Python: <3.14,>=3.11
|
|
|
38
38
|
Requires-Dist: fastmcp>=2.11.2
|
|
39
39
|
Requires-Dist: httpx<1,>=0.23.0
|
|
40
40
|
Requires-Dist: hud-mcp-python-sdk>=0.1.0
|
|
41
|
+
Requires-Dist: mcp>=1.13.1
|
|
41
42
|
Requires-Dist: opentelemetry-api>=1.34.1
|
|
42
43
|
Requires-Dist: opentelemetry-exporter-otlp-proto-http>=1.34.1
|
|
43
44
|
Requires-Dist: opentelemetry-instrumentation-mcp>=0.44.1
|
|
@@ -61,6 +62,16 @@ Requires-Dist: langchain-anthropic; extra == 'agent'
|
|
|
61
62
|
Requires-Dist: langchain-openai; extra == 'agent'
|
|
62
63
|
Requires-Dist: numpy>=1.24.0; extra == 'agent'
|
|
63
64
|
Requires-Dist: openai; extra == 'agent'
|
|
65
|
+
Provides-Extra: agents
|
|
66
|
+
Requires-Dist: anthropic; extra == 'agents'
|
|
67
|
+
Requires-Dist: datasets>=2.14.0; extra == 'agents'
|
|
68
|
+
Requires-Dist: dotenv>=0.9.9; extra == 'agents'
|
|
69
|
+
Requires-Dist: hud-mcp-use-python-sdk>=0.1.0; extra == 'agents'
|
|
70
|
+
Requires-Dist: langchain; extra == 'agents'
|
|
71
|
+
Requires-Dist: langchain-anthropic; extra == 'agents'
|
|
72
|
+
Requires-Dist: langchain-openai; extra == 'agents'
|
|
73
|
+
Requires-Dist: numpy>=1.24.0; extra == 'agents'
|
|
74
|
+
Requires-Dist: openai; extra == 'agents'
|
|
64
75
|
Provides-Extra: dev
|
|
65
76
|
Requires-Dist: aiodocker>=0.24.0; extra == 'dev'
|
|
66
77
|
Requires-Dist: anthropic; extra == 'dev'
|
|
@@ -2,6 +2,8 @@
|
|
|
2
2
|
|
|
3
3
|
A browser automation environment for the HUD platform demonstrating best practices for building MCP (Model Context Protocol) environments with evaluation systems.
|
|
4
4
|
|
|
5
|
+
**Key Feature**: This environment is **hot-reloadable** - it maintains state (running services, browser sessions, launched apps) across server restarts during development.
|
|
6
|
+
|
|
5
7
|
## Quick Start
|
|
6
8
|
|
|
7
9
|
### Build & Deploy
|
|
@@ -14,6 +16,34 @@ docker build -t hud-browser .
|
|
|
14
16
|
docker run --rm -i -p 8080:8080 hud-browser
|
|
15
17
|
```
|
|
16
18
|
|
|
19
|
+
### Hot-Reloadable Architecture
|
|
20
|
+
|
|
21
|
+
This environment uses a persistent context server architecture that maintains state across MCP server restarts:
|
|
22
|
+
|
|
23
|
+
- **Context Server**: Runs as a separate process holding ServiceManager and state
|
|
24
|
+
- **MCP Server**: Connects via Unix socket, can restart without losing services
|
|
25
|
+
- **State Preservation**: X11, VNC, running apps, and service states persist
|
|
26
|
+
- **Development Friendly**: Edit code and restart MCP server instantly
|
|
27
|
+
|
|
28
|
+
#### Docker Architecture
|
|
29
|
+
|
|
30
|
+
The environment uses a single CMD that follows the proven text_2048 pattern:
|
|
31
|
+
|
|
32
|
+
```dockerfile
|
|
33
|
+
CMD ["sh", "-c", "\
|
|
34
|
+
# Start services in background \
|
|
35
|
+
python -m hud_controller.context_server & \
|
|
36
|
+
x11vnc ... & \
|
|
37
|
+
# Run MCP server in foreground \
|
|
38
|
+
exec hud-controller mcp \
|
|
39
|
+
"]
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
This pattern ensures:
|
|
43
|
+
- Background services (`&`) start once and persist
|
|
44
|
+
- Only the `exec` command gets wrapped by watchfiles
|
|
45
|
+
- Services survive hot-reloads during development
|
|
46
|
+
|
|
17
47
|
## Deployment to Registry
|
|
18
48
|
|
|
19
49
|
### 1. Publish to Docker Registry
|
|
@@ -169,10 +199,11 @@ Set these in your environment/Docker configuration:
|
|
|
169
199
|
|
|
170
200
|
```
|
|
171
201
|
Docker Container
|
|
172
|
-
├── start.sh # Service startup orchestration
|
|
173
202
|
├── MCP Server (FastMCP) # Protocol implementation
|
|
174
203
|
│ ├── Tools # setup, evaluate, computer, etc.
|
|
175
|
-
│ └── Resources
|
|
204
|
+
│ └── Resources # Dynamic registry discovery
|
|
205
|
+
├── Context Server # Persistent state management
|
|
206
|
+
│ └── PersistentContext # Maintains services & browser state
|
|
176
207
|
├── Services
|
|
177
208
|
│ ├── X11 (Xvfb) # Virtual display
|
|
178
209
|
│ ├── VNC + Websockify # Remote access
|
|
@@ -188,8 +219,7 @@ Docker Container
|
|
|
188
219
|
|
|
189
220
|
```
|
|
190
221
|
browser/
|
|
191
|
-
├── Dockerfile # Multi-stage build with
|
|
192
|
-
├── start.sh # Service startup script
|
|
222
|
+
├── Dockerfile # Multi-stage build with integrated startup
|
|
193
223
|
├── apps/ # Launchable web applications
|
|
194
224
|
│ ├── todo/ # Example app with evaluation APIs
|
|
195
225
|
│ └── 2048/ # 2048 game app
|
|
@@ -197,6 +227,8 @@ browser/
|
|
|
197
227
|
│ ├── server.py # FastMCP server + resource definitions
|
|
198
228
|
│ ├── services.py # Service management
|
|
199
229
|
│ ├── context.py # Environment context
|
|
230
|
+
│ ├── context_server.py # Persistent context server
|
|
231
|
+
│ ├── persistent_context.py # State persistence wrapper
|
|
200
232
|
│ ├── evaluators/ # Evaluation system
|
|
201
233
|
│ ├── setup/ # Setup system
|
|
202
234
|
│ └── problems/ # Problem definitions
|
|
@@ -205,7 +237,7 @@ browser/
|
|
|
205
237
|
|
|
206
238
|
## Development Workflow
|
|
207
239
|
|
|
208
|
-
### Hot-Reload Development with `hud mcp`
|
|
240
|
+
### Hot-Reload Development with `hud dev`
|
|
209
241
|
|
|
210
242
|
For rapid iteration without Docker rebuilds:
|
|
211
243
|
|
|
@@ -214,7 +246,7 @@ For rapid iteration without Docker rebuilds:
|
|
|
214
246
|
cd environments/browser
|
|
215
247
|
|
|
216
248
|
# Start hot-reload development proxy
|
|
217
|
-
hud mcp . --build
|
|
249
|
+
hud dev . --build
|
|
218
250
|
|
|
219
251
|
# This will:
|
|
220
252
|
# - Build/use hud-browser:dev image
|
|
@@ -225,6 +257,21 @@ hud mcp . --build
|
|
|
225
257
|
|
|
226
258
|
Add the URL from output to Cursor settings or click the deeplink. Now you can edit code in `src/` and changes apply instantly!
|
|
227
259
|
|
|
260
|
+
#### How Hot-Reloading Works
|
|
261
|
+
|
|
262
|
+
This environment uses a persistent context server pattern:
|
|
263
|
+
|
|
264
|
+
1. **Context Server**: A separate Python process maintains state (services, browser, apps)
|
|
265
|
+
2. **Socket Communication**: MCP server connects via Unix socket `/tmp/hud_browser_ctx.sock`
|
|
266
|
+
3. **State Preservation**: X11, VNC, browser sessions, and launched apps persist across reloads
|
|
267
|
+
4. **Automatic Recovery**: On reload, the server reconnects to existing services
|
|
268
|
+
|
|
269
|
+
This means you can:
|
|
270
|
+
- Edit code and have changes apply immediately
|
|
271
|
+
- Keep browser sessions and apps running
|
|
272
|
+
- Maintain VNC connections
|
|
273
|
+
- Preserve test state between iterations
|
|
274
|
+
|
|
228
275
|
### Traditional Development Steps
|
|
229
276
|
|
|
230
277
|
1. **Start with apps** - Build your web applications independently
|
|
@@ -392,4 +439,9 @@ When creating new MCP environments:
|
|
|
392
439
|
6. **Update service dependencies** in `services.py` as needed
|
|
393
440
|
7. **Extend Dockerfile** with your environment's requirements
|
|
394
441
|
|
|
442
|
+
For hot-reloadability:
|
|
443
|
+
- Keep complex objects out of the persistent context
|
|
444
|
+
- Only store simple, picklable state
|
|
445
|
+
- Recreate tools and clients on each server start
|
|
446
|
+
|
|
395
447
|
See `src/hud_controller/README.md` for detailed implementation guidance.
|
|
@@ -3,25 +3,20 @@ name = "hud-controller"
|
|
|
3
3
|
version = "0.1.0"
|
|
4
4
|
description = "HUD Controller for browser environments with MCP tools"
|
|
5
5
|
requires-python = ">=3.11,<3.14"
|
|
6
|
-
dependencies = [
|
|
7
|
-
"hud-python @ git+https://github.com/hud-evals/hud-python.git@l/text-2048",
|
|
8
|
-
"playwright",
|
|
9
|
-
"pyautogui",
|
|
10
|
-
"httpx",
|
|
11
|
-
"typer",
|
|
12
|
-
]
|
|
13
|
-
|
|
14
|
-
[project.scripts]
|
|
15
|
-
hud-controller = "hud_controller.__main__:main"
|
|
6
|
+
dependencies = [ "hud-python", "playwright", "pyautogui", "httpx", "typer",]
|
|
16
7
|
|
|
17
8
|
[build-system]
|
|
18
|
-
requires = ["hatchling"]
|
|
9
|
+
requires = [ "hatchling",]
|
|
19
10
|
build-backend = "hatchling.build"
|
|
20
11
|
|
|
21
|
-
[
|
|
22
|
-
|
|
12
|
+
[project.scripts]
|
|
13
|
+
hud-controller = "hud_controller.__main__:main"
|
|
14
|
+
|
|
15
|
+
[tool.hud]
|
|
16
|
+
image = "hud-browser:dev"
|
|
23
17
|
|
|
24
18
|
[tool.hatch.metadata]
|
|
25
19
|
allow-direct-references = true
|
|
26
20
|
|
|
27
|
-
|
|
21
|
+
[tool.hatch.build.targets.wheel]
|
|
22
|
+
packages = [ "src/hud_controller",]
|
|
@@ -55,7 +55,7 @@ class EvaluatorRegistry:
|
|
|
55
55
|
def create_evaluator(cls, spec, context): pass
|
|
56
56
|
```
|
|
57
57
|
|
|
58
|
-
###
|
|
58
|
+
### BrowserContext
|
|
59
59
|
|
|
60
60
|
Unified interface for environment interactions:
|
|
61
61
|
- `call_app_api(app, endpoint, method, data)` - Call app backend API
|
|
@@ -34,7 +34,7 @@ docker run --rm -i \
|
|
|
34
34
|
|
|
35
35
|
Development mode allows you to edit code locally and see changes immediately without rebuilding.
|
|
36
36
|
|
|
37
|
-
#### Option 1: Using `hud mcp` (Recommended)
|
|
37
|
+
#### Option 1: Using `hud dev` (Recommended)
|
|
38
38
|
|
|
39
39
|
The easiest way to develop with hot-reload:
|
|
40
40
|
|
|
@@ -44,7 +44,7 @@ export BROWSER_PROVIDER=anchorbrowser
|
|
|
44
44
|
export ANCHOR_API_KEY=your-api-key
|
|
45
45
|
|
|
46
46
|
# Start development proxy
|
|
47
|
-
hud mcp . --build
|
|
47
|
+
hud dev . --build
|
|
48
48
|
|
|
49
49
|
# This will:
|
|
50
50
|
# - Build/use hud-remote-browser:dev image
|
|
@@ -57,13 +57,13 @@ The agent will play 2048 and try to reach a target tile using the available tool
|
|
|
57
57
|
|
|
58
58
|
## Development Mode
|
|
59
59
|
|
|
60
|
-
### Option 1: Using `hud mcp` (Recommended)
|
|
60
|
+
### Option 1: Using `hud dev` (Recommended)
|
|
61
61
|
|
|
62
62
|
The easiest way to develop with hot-reload:
|
|
63
63
|
|
|
64
64
|
```bash
|
|
65
65
|
# Start development proxy
|
|
66
|
-
hud mcp . --build
|
|
66
|
+
hud dev . --build
|
|
67
67
|
|
|
68
68
|
# This will:
|
|
69
69
|
# - Build/use hud-text-2048:dev image
|
|
@@ -85,6 +85,7 @@ class MCPAgent(ABC):
|
|
|
85
85
|
self._tool_map: dict[str, types.Tool] = {} # Simplified: just name to tool
|
|
86
86
|
self.screenshot_history: list[str] = []
|
|
87
87
|
self._auto_trace = auto_trace
|
|
88
|
+
self._auto_trace_cm: Any | None = None # Store auto-created trace context manager
|
|
88
89
|
self.initialization_complete = False
|
|
89
90
|
|
|
90
91
|
# Response agent to automatically interact with the model
|
|
@@ -303,6 +304,9 @@ class MCPAgent(ABC):
|
|
|
303
304
|
except Exception as e:
|
|
304
305
|
logger.warning("ResponseAgent failed: %s", e)
|
|
305
306
|
if decision == "STOP":
|
|
307
|
+
# Try to submit response through lifecycle tool
|
|
308
|
+
await self._maybe_submit_response(response, messages)
|
|
309
|
+
|
|
306
310
|
logger.info("Stopping execution")
|
|
307
311
|
final_response = response
|
|
308
312
|
break
|
|
@@ -483,6 +487,40 @@ class MCPAgent(ABC):
|
|
|
483
487
|
self._available_tools.append(tool)
|
|
484
488
|
# Simplified mapping - just tool name to tool
|
|
485
489
|
self._tool_map[tool.name] = tool
|
|
490
|
+
|
|
491
|
+
# Auto-detect response tool as a lifecycle tool
|
|
492
|
+
if tool.name == "response" and "response" not in self.lifecycle_tools:
|
|
493
|
+
logger.debug("Auto-detected 'response' tool as a lifecycle tool")
|
|
494
|
+
self.lifecycle_tools.append("response")
|
|
495
|
+
|
|
496
|
+
async def _maybe_submit_response(self, response: AgentResponse, messages: list[Any]) -> None:
|
|
497
|
+
"""Submit response through lifecycle tool if available.
|
|
498
|
+
|
|
499
|
+
Args:
|
|
500
|
+
response: The agent's response
|
|
501
|
+
messages: The current message history (will be modified in-place)
|
|
502
|
+
"""
|
|
503
|
+
# Check if we have a response lifecycle tool
|
|
504
|
+
if "response" in self.lifecycle_tools and "response" in self._tool_map:
|
|
505
|
+
logger.debug("Calling response lifecycle tool")
|
|
506
|
+
try:
|
|
507
|
+
# Call the response tool with the agent's response
|
|
508
|
+
response_tool_call = MCPToolCall(
|
|
509
|
+
name="response",
|
|
510
|
+
arguments={"response": response.content, "messages": messages}
|
|
511
|
+
)
|
|
512
|
+
response_results = await self.call_tools(response_tool_call)
|
|
513
|
+
|
|
514
|
+
# Format and add the response tool results to messages
|
|
515
|
+
response_messages = await self.format_tool_results(
|
|
516
|
+
[response_tool_call], response_results
|
|
517
|
+
)
|
|
518
|
+
messages.extend(response_messages)
|
|
519
|
+
|
|
520
|
+
# Mark the task as done
|
|
521
|
+
logger.info("Response lifecycle tool executed, marking task as done")
|
|
522
|
+
except Exception as e:
|
|
523
|
+
logger.error("Response lifecycle tool failed: %s", e)
|
|
486
524
|
|
|
487
525
|
async def _setup_config(self, mcp_config: dict[str, dict[str, Any]]) -> None:
|
|
488
526
|
"""Inject metadata into the metadata of the initialize request."""
|
|
@@ -491,7 +529,7 @@ class MCPAgent(ABC):
|
|
|
491
529
|
mcp_config,
|
|
492
530
|
MCPConfigPatch(meta=self.metadata),
|
|
493
531
|
)
|
|
494
|
-
setup_hud_telemetry(mcp_config, auto_trace=self._auto_trace)
|
|
532
|
+
self._auto_trace_cm = setup_hud_telemetry(mcp_config, auto_trace=self._auto_trace)
|
|
495
533
|
|
|
496
534
|
def get_available_tools(self) -> list[types.Tool]:
|
|
497
535
|
"""Get list of available MCP tools for LLM use (excludes lifecycle tools)."""
|
|
@@ -532,6 +570,17 @@ class MCPAgent(ABC):
|
|
|
532
570
|
|
|
533
571
|
async def _cleanup(self) -> None:
|
|
534
572
|
"""Cleanup resources."""
|
|
573
|
+
# Clean up auto-created trace if any
|
|
574
|
+
if self._auto_trace_cm:
|
|
575
|
+
try:
|
|
576
|
+
self._auto_trace_cm.__exit__(None, None, None)
|
|
577
|
+
logger.info("Closed auto-created trace")
|
|
578
|
+
except Exception as e:
|
|
579
|
+
logger.warning("Failed to close auto-created trace: %s", e)
|
|
580
|
+
finally:
|
|
581
|
+
self._auto_trace_cm = None
|
|
582
|
+
|
|
583
|
+
# Clean up auto-created client
|
|
535
584
|
if self._auto_created_client and self.mcp_client:
|
|
536
585
|
try:
|
|
537
586
|
await self.mcp_client.shutdown()
|
|
@@ -23,9 +23,11 @@ from .clone import clone_repository, get_clone_message, print_error, print_tutor
|
|
|
23
23
|
from .cursor import get_cursor_config_path, list_cursor_servers, parse_cursor_config
|
|
24
24
|
from .debug import debug_mcp_stdio
|
|
25
25
|
from .init import create_environment
|
|
26
|
+
from . import list_func as list_module
|
|
26
27
|
from .mcp_server import run_mcp_dev_server
|
|
27
28
|
from .pull import pull_command
|
|
28
29
|
from .push import push_command
|
|
30
|
+
from .remove import remove_command
|
|
29
31
|
from .utils import CaptureLogger
|
|
30
32
|
|
|
31
33
|
# Create the main Typer app
|
|
@@ -129,7 +131,7 @@ def analyze(
|
|
|
129
131
|
def debug(
|
|
130
132
|
params: list[str] = typer.Argument( # type: ignore[arg-type] # noqa: B008
|
|
131
133
|
None,
|
|
132
|
-
help="Docker image followed by optional Docker
|
|
134
|
+
help="Docker image, environment directory, or config file followed by optional Docker arguments", # noqa: E501
|
|
133
135
|
),
|
|
134
136
|
config: Path = typer.Option( # noqa: B008
|
|
135
137
|
None,
|
|
@@ -145,6 +147,12 @@ def debug(
|
|
|
145
147
|
"--cursor",
|
|
146
148
|
help="Debug a server from Cursor config",
|
|
147
149
|
),
|
|
150
|
+
build: bool = typer.Option(
|
|
151
|
+
False,
|
|
152
|
+
"--build",
|
|
153
|
+
"-b",
|
|
154
|
+
help="Build image before debugging (for directory mode)",
|
|
155
|
+
),
|
|
148
156
|
max_phase: int = typer.Option(
|
|
149
157
|
5,
|
|
150
158
|
"--max-phase",
|
|
@@ -157,15 +165,24 @@ def debug(
|
|
|
157
165
|
"""🐛 Debug MCP environment - test initialization, tools, and readiness.
|
|
158
166
|
|
|
159
167
|
Examples:
|
|
160
|
-
hud debug
|
|
161
|
-
hud debug
|
|
168
|
+
hud debug . # Debug current directory
|
|
169
|
+
hud debug environments/browser # Debug specific directory
|
|
170
|
+
hud debug . --build # Build then debug
|
|
171
|
+
hud debug hud-text-2048:latest # Debug Docker image
|
|
172
|
+
hud debug my-mcp-server:v1 -e API_KEY=xxx
|
|
162
173
|
hud debug --config mcp-config.json
|
|
163
174
|
hud debug --cursor text-2048-dev
|
|
164
|
-
hud debug
|
|
175
|
+
hud debug . --max-phase 3 # Stop after phase 3
|
|
165
176
|
"""
|
|
166
|
-
|
|
177
|
+
# Import here to avoid circular imports
|
|
178
|
+
from .env_utils import get_image_name, is_environment_directory, build_environment, image_exists
|
|
179
|
+
from hud.utils.design import HUDDesign
|
|
180
|
+
|
|
181
|
+
design = HUDDesign()
|
|
182
|
+
|
|
167
183
|
# Determine the command to run
|
|
168
184
|
command = None
|
|
185
|
+
docker_args = []
|
|
169
186
|
|
|
170
187
|
if config:
|
|
171
188
|
# Load config from JSON file
|
|
@@ -183,13 +200,44 @@ def debug(
|
|
|
183
200
|
console.print(f"[red]❌ {error or 'Failed to parse cursor config'}[/red]")
|
|
184
201
|
raise typer.Exit(1)
|
|
185
202
|
elif params:
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
203
|
+
first_param = params[0]
|
|
204
|
+
docker_args = params[1:] if len(params) > 1 else []
|
|
205
|
+
|
|
206
|
+
# Check if it's a directory
|
|
207
|
+
if Path(first_param).exists() and is_environment_directory(first_param):
|
|
208
|
+
# Directory mode - like hud dev
|
|
209
|
+
directory = first_param
|
|
210
|
+
|
|
211
|
+
# Get or generate image name
|
|
212
|
+
image_name, source = get_image_name(directory)
|
|
213
|
+
|
|
214
|
+
if source == "auto":
|
|
215
|
+
design.info(f"Auto-generated image name: {image_name}")
|
|
216
|
+
|
|
217
|
+
# Build if requested or if image doesn't exist
|
|
218
|
+
if build or not image_exists(image_name):
|
|
219
|
+
if not build and not image_exists(image_name):
|
|
220
|
+
if typer.confirm(f"Image {image_name} not found. Build it now?"):
|
|
221
|
+
build = True
|
|
222
|
+
else:
|
|
223
|
+
raise typer.Exit(1)
|
|
224
|
+
|
|
225
|
+
if build:
|
|
226
|
+
if not build_environment(directory, image_name):
|
|
227
|
+
raise typer.Exit(1)
|
|
228
|
+
|
|
229
|
+
# Build Docker command
|
|
230
|
+
command = ["docker", "run", "--rm", "-i", *docker_args, image_name]
|
|
231
|
+
else:
|
|
232
|
+
# Assume it's an image name
|
|
233
|
+
image = first_param
|
|
234
|
+
command = ["docker", "run", "--rm", "-i", *docker_args, image]
|
|
189
235
|
else:
|
|
190
|
-
console.print("[red]Error: Must specify
|
|
236
|
+
console.print("[red]Error: Must specify a directory, Docker image, --config, or --cursor[/red]")
|
|
191
237
|
console.print("\nExamples:")
|
|
192
|
-
console.print(" hud debug
|
|
238
|
+
console.print(" hud debug . # Debug current directory")
|
|
239
|
+
console.print(" hud debug environments/browser # Debug specific directory")
|
|
240
|
+
console.print(" hud debug hud-text-2048:latest # Debug Docker image")
|
|
193
241
|
console.print(" hud debug --config mcp-config.json")
|
|
194
242
|
console.print(" hud debug --cursor my-server")
|
|
195
243
|
raise typer.Exit(1)
|
|
@@ -442,7 +490,8 @@ def run(
|
|
|
442
490
|
|
|
443
491
|
# Get URL from options or environment
|
|
444
492
|
if not url:
|
|
445
|
-
|
|
493
|
+
from hud.settings import settings
|
|
494
|
+
url = settings.hud_mcp_url
|
|
446
495
|
|
|
447
496
|
run_remote_server(image, docker_args, transport, port, url, api_key, run_id, verbose)
|
|
448
497
|
|
|
@@ -561,6 +610,63 @@ def pull(
|
|
|
561
610
|
pull_command(target, lock_file, yes, verify_only, verbose)
|
|
562
611
|
|
|
563
612
|
|
|
613
|
+
@app.command(name="list")
def list_environments(
    filter_name: str | None = typer.Option(
        None, "--filter", "-f", help="Filter environments by name (case-insensitive)"
    ),
    json_output: bool = typer.Option(False, "--json", help="Output as JSON"),
    show_all: bool = typer.Option(False, "--all", "-a", help="Show all columns including digest"),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Show detailed output"),
) -> None:
    """📋 List all HUD environments in local registry.

    Shows environments pulled with 'hud pull' stored in ~/.hud/envs/

    Examples:
        hud list                    # List all environments
        hud list --filter text      # Filter by name
        hud list --json             # Output as JSON
        hud list --all              # Show digest column
        hud list --verbose          # Show full descriptions
    """
    # The command is registered explicitly as `list`, while the function is
    # named `list_environments`, so the builtin `list` is not shadowed here.
    # The docstring above is left untouched because it doubles as the CLI help.
    #
    # This wrapper only declares the CLI options; the values are forwarded
    # positionally, in declaration order, to list_module.list_command.
    list_module.list_command(filter_name, json_output, show_all, verbose)
|
|
640
|
+
|
|
641
|
+
|
|
642
|
+
@app.command()
def remove(
    target: str | None = typer.Argument(
        None, help="Environment to remove (digest, name, or 'all' for all environments)"
    ),
    yes: bool = typer.Option(False, "--yes", "-y", help="Skip confirmation prompt"),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Show detailed output"),
) -> None:
    """🗑️ Remove HUD environments from local registry.

    Removes environment metadata from ~/.hud/envs/
    Note: This does not remove the Docker images.

    Examples:
        hud remove abc123               # Remove by digest
        hud remove text_2048            # Remove by name
        hud remove hudpython/test_init  # Remove by full name
        hud remove all                  # Remove all environments
        hud remove all --yes            # Remove all without confirmation
    """
    # `target` defaults to None, so a missing argument is passed through as
    # None and must be handled by remove_command, not by this wrapper.
    # The docstring above is left untouched because it doubles as the CLI help.
    remove_command(target, yes, verbose)
|
|
668
|
+
|
|
669
|
+
|
|
564
670
|
@app.command()
|
|
565
671
|
def init(
|
|
566
672
|
name: str = typer.Argument(None, help="Environment name (default: current directory name)"),
|
|
@@ -592,6 +698,76 @@ def quickstart() -> None:
|
|
|
592
698
|
clone("https://github.com/hud-evals/quickstart.git")
|
|
593
699
|
|
|
594
700
|
|
|
701
|
+
@app.command()
def eval(
    source: str = typer.Argument(
        ...,
        help="HuggingFace dataset identifier (e.g. 'hud-evals/SheetBench-50') or task JSON file",
    ),
    full: bool = typer.Option(
        False,
        "--full",
        help="Run the entire dataset (omit for single-task debug mode)",
    ),
    agent: str = typer.Option(
        "claude",
        "--agent",
        help="Agent backend to use (claude or openai)",
    ),
    model: str | None = typer.Option(
        None,
        "--model",
        help="Model name for the chosen agent",
    ),
    allowed_tools: str | None = typer.Option(
        None,
        "--allowed-tools",
        help="Comma-separated list of allowed tools",
    ),
    max_concurrent: int = typer.Option(
        30,
        "--max-concurrent",
        help="Concurrency level for full-dataset mode",
    ),
    max_steps: int = typer.Option(
        30,
        "--max-steps",
        # The declared default is 30 regardless of --full, so the help text
        # states that value rather than per-mode defaults that never apply.
        help="Maximum steps per task (default: 30)",
    ),
) -> None:
    """🚀 Run evaluation on datasets or individual tasks with agents."""
    # NOTE: this function shadows the builtin `eval` inside this module. The
    # name is kept because it is the module's public name and no explicit
    # command name is passed to @app.command().
    #
    # Exits with status 1 (typer.Exit) when `agent` is not a supported backend
    # or when the evaluation module cannot be imported.

    # Validate agent choice before importing anything heavy.
    valid_agents = ("claude", "openai")
    if agent not in valid_agents:
        from hud.utils.design import HUDDesign

        design = HUDDesign()
        design.error(f"Invalid agent: {agent}. Must be one of: {', '.join(valid_agents)}")
        raise typer.Exit(1)

    # Import eval_command lazily to avoid importing agent dependencies
    # for every other CLI command.
    try:
        from .eval import eval_command
    except ImportError as e:
        from hud.utils.design import HUDDesign

        design = HUDDesign()
        design.error(
            "Evaluation dependencies are not installed. "
            "Please install with: pip install 'hud-python[agent]'"
        )
        raise typer.Exit(1) from e

    # Run the command
    eval_command(
        source=source,
        full=full,
        agent=agent,  # type: ignore
        model=model,
        allowed_tools=allowed_tools,
        max_concurrent=max_concurrent,
        max_steps=max_steps,
    )
|
|
769
|
+
|
|
770
|
+
|
|
595
771
|
def main() -> None:
|
|
596
772
|
"""Main entry point for the CLI."""
|
|
597
773
|
# Show header for main help
|