hud-python 0.4.12__tar.gz → 0.4.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- {hud_python-0.4.12 → hud_python-0.4.14}/PKG-INFO +6 -7
- {hud_python-0.4.12 → hud_python-0.4.14}/environments/README.md +12 -12
- hud_python-0.4.14/environments/browser/README.md +213 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/environments/remote_browser/pyproject.toml +1 -1
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/__init__.py +8 -1
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/dev.py +41 -13
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/eval.py +36 -31
- hud_python-0.4.14/hud/cli/init.py +658 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/list_func.py +1 -1
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/clients/fastmcp.py +2 -12
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/clients/mcp_use.py +1 -7
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/otel/instrumentation.py +5 -1
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/server/server.py +1 -1
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/utils/tests/test_version.py +1 -1
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/version.py +1 -1
- {hud_python-0.4.12 → hud_python-0.4.14}/pyproject.toml +4 -5
- hud_python-0.4.12/environments/browser/README.md +0 -447
- hud_python-0.4.12/environments/browser/src/hud_controller/README.md +0 -117
- hud_python-0.4.12/hud/cli/init.py +0 -279
- {hud_python-0.4.12 → hud_python-0.4.14}/.gitignore +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/LICENSE +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/README.md +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/environments/browser/apps/2048/README.md +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/environments/browser/apps/2048/backend/pyproject.toml +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/environments/browser/apps/README.md +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/environments/browser/apps/todo/README.md +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/environments/browser/apps/todo/backend/pyproject.toml +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/environments/browser/pyproject.toml +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/environments/remote_browser/README.md +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/environments/remote_browser/src/hud_controller/providers/README.md +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/environments/text_2048/README.md +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/environments/text_2048/pyproject.toml +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/examples/README.md +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/__init__.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/__main__.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/agents/__init__.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/agents/base.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/agents/claude.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/agents/langchain.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/agents/misc/__init__.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/agents/misc/response_agent.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/agents/openai.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/agents/openai_chat_generic.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/agents/tests/__init__.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/agents/tests/test_base.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/agents/tests/test_claude.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/agents/tests/test_client.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/agents/tests/test_openai.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/__main__.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/analyze.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/build.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/clone.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/debug.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/pull.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/push.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/remove.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/tests/__init__.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/tests/test_analyze.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/tests/test_analyze_metadata.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/tests/test_build.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/tests/test_cli_init.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/tests/test_cli_main.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/tests/test_clone.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/tests/test_cursor.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/tests/test_debug.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/tests/test_list_func.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/tests/test_main_module.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/tests/test_mcp_server.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/tests/test_pull.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/tests/test_push.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/tests/test_registry.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/tests/test_utils.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/utils/__init__.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/utils/cursor.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/utils/docker.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/utils/environment.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/utils/interactive.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/utils/logging.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/utils/metadata.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/utils/registry.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/utils/remote_runner.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/utils/runner.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/cli/utils/server.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/clients/README.md +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/clients/__init__.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/clients/base.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/clients/tests/__init__.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/clients/tests/test_client_integration.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/clients/tests/test_fastmcp.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/clients/tests/test_protocol.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/clients/utils/__init__.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/clients/utils/retry_transport.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/datasets.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/misc/__init__.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/misc/claude_plays_pokemon.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/otel/__init__.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/otel/collector.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/otel/config.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/otel/context.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/otel/exporters.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/otel/processors.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/otel/tests/__init__.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/otel/tests/test_processors.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/py.typed +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/server/__init__.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/server/context.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/server/helper/__init__.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/server/low_level.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/server/tests/__init__.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/settings.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/shared/__init__.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/shared/exceptions.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/shared/requests.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/shared/tests/__init__.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/shared/tests/test_exceptions.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/shared/tests/test_requests.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/telemetry/__init__.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/telemetry/instrument.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/telemetry/job.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/telemetry/replay.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/telemetry/tests/__init__.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/telemetry/tests/test_replay.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/telemetry/tests/test_trace.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/telemetry/trace.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/tools/__init__.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/tools/base.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/tools/bash.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/tools/computer/__init__.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/tools/computer/anthropic.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/tools/computer/hud.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/tools/computer/openai.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/tools/computer/settings.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/tools/edit.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/tools/executors/__init__.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/tools/executors/base.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/tools/executors/pyautogui.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/tools/executors/tests/__init__.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/tools/executors/tests/test_base_executor.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/tools/executors/xdo.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/tools/playwright.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/tools/response.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/tools/tests/__init__.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/tools/tests/test_base.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/tools/tests/test_bash.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/tools/tests/test_bash_extended.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/tools/tests/test_computer.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/tools/tests/test_computer_actions.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/tools/tests/test_edit.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/tools/tests/test_init.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/tools/tests/test_playwright_tool.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/tools/tests/test_response.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/tools/tests/test_tools.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/tools/tests/test_tools_init.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/tools/tests/test_utils.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/tools/types.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/tools/utils.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/types.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/utils/__init__.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/utils/async_utils.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/utils/design.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/utils/mcp.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/utils/progress.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/utils/telemetry.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/utils/tests/__init__.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/utils/tests/test_async_utils.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/utils/tests/test_init.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/utils/tests/test_mcp.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/utils/tests/test_progress.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/hud/utils/tests/test_telemetry.py +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/rl/README.md +0 -0
- {hud_python-0.4.12 → hud_python-0.4.14}/rl/pyproject.toml +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hud-python
|
|
3
|
-
Version: 0.4.12
|
|
3
|
+
Version: 0.4.14
|
|
4
4
|
Summary: SDK for the HUD platform.
|
|
5
5
|
Project-URL: Homepage, https://github.com/hud-evals/hud-python
|
|
6
6
|
Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
|
|
@@ -35,10 +35,9 @@ Classifier: Programming Language :: Python :: 3.11
|
|
|
35
35
|
Classifier: Programming Language :: Python :: 3.12
|
|
36
36
|
Classifier: Programming Language :: Python :: 3.13
|
|
37
37
|
Requires-Python: <3.14,>=3.11
|
|
38
|
-
Requires-Dist: fastmcp>=2.11.2
|
|
39
38
|
Requires-Dist: httpx<1,>=0.23.0
|
|
40
|
-
Requires-Dist: hud-
|
|
41
|
-
Requires-Dist: mcp>=
|
|
39
|
+
Requires-Dist: hud-fastmcp-python-sdk>=0.1.2
|
|
40
|
+
Requires-Dist: hud-mcp-python-sdk>=3.13.2
|
|
42
41
|
Requires-Dist: opentelemetry-api>=1.34.1
|
|
43
42
|
Requires-Dist: opentelemetry-exporter-otlp-proto-http>=1.34.1
|
|
44
43
|
Requires-Dist: opentelemetry-instrumentation-mcp>=0.44.1
|
|
@@ -56,6 +55,7 @@ Provides-Extra: agent
|
|
|
56
55
|
Requires-Dist: anthropic; extra == 'agent'
|
|
57
56
|
Requires-Dist: datasets>=2.14.0; extra == 'agent'
|
|
58
57
|
Requires-Dist: dotenv>=0.9.9; extra == 'agent'
|
|
58
|
+
Requires-Dist: hud-mcp-use-python-sdk>=2.3.13; extra == 'agent'
|
|
59
59
|
Requires-Dist: ipykernel; extra == 'agent'
|
|
60
60
|
Requires-Dist: ipython<9; extra == 'agent'
|
|
61
61
|
Requires-Dist: jupyter-client; extra == 'agent'
|
|
@@ -63,13 +63,13 @@ Requires-Dist: jupyter-core; extra == 'agent'
|
|
|
63
63
|
Requires-Dist: langchain; extra == 'agent'
|
|
64
64
|
Requires-Dist: langchain-anthropic; extra == 'agent'
|
|
65
65
|
Requires-Dist: langchain-openai; extra == 'agent'
|
|
66
|
-
Requires-Dist: mcp-use; extra == 'agent'
|
|
67
66
|
Requires-Dist: numpy>=1.24.0; extra == 'agent'
|
|
68
67
|
Requires-Dist: openai; extra == 'agent'
|
|
69
68
|
Provides-Extra: agents
|
|
70
69
|
Requires-Dist: anthropic; extra == 'agents'
|
|
71
70
|
Requires-Dist: datasets>=2.14.0; extra == 'agents'
|
|
72
71
|
Requires-Dist: dotenv>=0.9.9; extra == 'agents'
|
|
72
|
+
Requires-Dist: hud-mcp-use-python-sdk>=2.3.13; extra == 'agents'
|
|
73
73
|
Requires-Dist: ipykernel; extra == 'agents'
|
|
74
74
|
Requires-Dist: ipython<9; extra == 'agents'
|
|
75
75
|
Requires-Dist: jupyter-client; extra == 'agents'
|
|
@@ -77,7 +77,6 @@ Requires-Dist: jupyter-core; extra == 'agents'
|
|
|
77
77
|
Requires-Dist: langchain; extra == 'agents'
|
|
78
78
|
Requires-Dist: langchain-anthropic; extra == 'agents'
|
|
79
79
|
Requires-Dist: langchain-openai; extra == 'agents'
|
|
80
|
-
Requires-Dist: mcp-use; extra == 'agents'
|
|
81
80
|
Requires-Dist: numpy>=1.24.0; extra == 'agents'
|
|
82
81
|
Requires-Dist: openai; extra == 'agents'
|
|
83
82
|
Provides-Extra: dev
|
|
@@ -85,6 +84,7 @@ Requires-Dist: aiodocker>=0.24.0; extra == 'dev'
|
|
|
85
84
|
Requires-Dist: anthropic; extra == 'dev'
|
|
86
85
|
Requires-Dist: datasets>=2.14.0; extra == 'dev'
|
|
87
86
|
Requires-Dist: dotenv>=0.9.9; extra == 'dev'
|
|
87
|
+
Requires-Dist: hud-mcp-use-python-sdk>=2.3.13; extra == 'dev'
|
|
88
88
|
Requires-Dist: inspect-ai>=0.3.80; extra == 'dev'
|
|
89
89
|
Requires-Dist: ipykernel; extra == 'dev'
|
|
90
90
|
Requires-Dist: ipython<9; extra == 'dev'
|
|
@@ -93,7 +93,6 @@ Requires-Dist: jupyter-core; extra == 'dev'
|
|
|
93
93
|
Requires-Dist: langchain; extra == 'dev'
|
|
94
94
|
Requires-Dist: langchain-anthropic; extra == 'dev'
|
|
95
95
|
Requires-Dist: langchain-openai; extra == 'dev'
|
|
96
|
-
Requires-Dist: mcp-use; extra == 'dev'
|
|
97
96
|
Requires-Dist: numpy>=1.24.0; extra == 'dev'
|
|
98
97
|
Requires-Dist: openai; extra == 'dev'
|
|
99
98
|
Requires-Dist: pillow>=11.1.0; extra == 'dev'
|
|
@@ -351,7 +351,7 @@ from . import basic, advanced # This registers all @setup.tool() decorated func
|
|
|
351
351
|
|
|
352
352
|
# In setup/basic.py
|
|
353
353
|
from . import setup
|
|
354
|
-
from
|
|
354
|
+
from mcp.types import TextContent
|
|
355
355
|
|
|
356
356
|
@setup.tool()
|
|
357
357
|
async def reset(**kwargs):
|
|
@@ -361,14 +361,14 @@ async def reset(**kwargs):
|
|
|
361
361
|
**kwargs: Additional parameters
|
|
362
362
|
|
|
363
363
|
Returns:
|
|
364
|
-
|
|
364
|
+
TextContent
|
|
365
365
|
"""
|
|
366
366
|
# Access environment from the hub
|
|
367
367
|
env = setup.env
|
|
368
368
|
await env.reset_state()
|
|
369
|
-
return
|
|
370
|
-
|
|
371
|
-
|
|
369
|
+
return TextContent(
|
|
370
|
+
text="Environment reset to initial state",
|
|
371
|
+
type="text"
|
|
372
372
|
)
|
|
373
373
|
|
|
374
374
|
@setup.tool()
|
|
@@ -379,14 +379,14 @@ async def seed_data(num_items: int = 5):
|
|
|
379
379
|
num_items: Number of items to create
|
|
380
380
|
|
|
381
381
|
Returns:
|
|
382
|
-
|
|
382
|
+
TextContent
|
|
383
383
|
"""
|
|
384
384
|
# Access environment from the hub
|
|
385
385
|
env = setup.env
|
|
386
386
|
items = await env.create_items(num_items)
|
|
387
|
-
return
|
|
388
|
-
|
|
389
|
-
|
|
387
|
+
return TextContent(
|
|
388
|
+
text=f"Created {len(items)} items",
|
|
389
|
+
type="text"
|
|
390
390
|
)
|
|
391
391
|
|
|
392
392
|
# In evaluate/__init__.py
|
|
@@ -827,13 +827,13 @@ Before making changes:
|
|
|
827
827
|
```python
|
|
828
828
|
# In setup/my_new_setup.py
|
|
829
829
|
from . import setup
|
|
830
|
-
from hud.tools import BaseSetup,
|
|
830
|
+
from hud.tools import BaseSetup, TextContent
|
|
831
831
|
|
|
832
832
|
@setup("my_new_setup", description="Clear description of what this does")
|
|
833
833
|
class MyNewSetup(BaseSetup):
|
|
834
|
-
async def __call__(self, context, param1: str, param2: int = 10) ->
|
|
834
|
+
async def __call__(self, context, param1: str, param2: int = 10) -> TextContent:
|
|
835
835
|
# Implementation
|
|
836
|
-
return
|
|
836
|
+
return TextContent(...)
|
|
837
837
|
```
|
|
838
838
|
|
|
839
839
|
**Adding New Evaluators**
|
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
# Browser Environment
|
|
2
|
+
|
|
3
|
+
A browser automation environment for HUD that provides GUI access and web app interaction capabilities. This environment supports hot-reloading during development while maintaining persistent state.
|
|
4
|
+
|
|
5
|
+
## Architecture Overview
|
|
6
|
+
|
|
7
|
+
The browser environment uses a two-process architecture:
|
|
8
|
+
|
|
9
|
+
1. **Context Server** (`context.py`): Long-running process that maintains persistent state
|
|
10
|
+
2. **MCP Server** (`server.py`): Hot-reloadable process that handles tool requests
|
|
11
|
+
|
|
12
|
+
### Key Components
|
|
13
|
+
|
|
14
|
+
- **BrowserContext**: Stores persistent state (running apps, ports, playwright instance)
|
|
15
|
+
- **ServiceManager**: Manages X11, VNC, and app processes
|
|
16
|
+
- **BaseHub Tools**: Setup and evaluate tools organized by app (2048, todo)
|
|
17
|
+
- **Multiprocessing Proxy**: Enables state sharing between processes
|
|
18
|
+
|
|
19
|
+
## Context Management and Common Pitfalls
|
|
20
|
+
|
|
21
|
+
### Understanding the Proxy System
|
|
22
|
+
|
|
23
|
+
The browser environment uses Python's `multiprocessing.Manager` to share state between the context server and MCP server. This introduces important constraints:
|
|
24
|
+
|
|
25
|
+
#### ❌ Common Pitfall: Unpicklable Objects
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
# BAD: This will fail with "cannot pickle 'coroutine' object"
|
|
29
|
+
@setup.tool("my_tool")
|
|
30
|
+
async def my_tool():
|
|
31
|
+
env = setup.env
|
|
32
|
+
result = await env.call_app_api("app", "/api/endpoint") # Returns coroutine
|
|
33
|
+
# The coroutine can't be serialized through the proxy!
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
#### ✅ Solution: Direct HTTP Calls
|
|
37
|
+
|
|
38
|
+
```python
|
|
39
|
+
# GOOD: Make HTTP calls directly
|
|
40
|
+
@setup.tool("my_tool")
|
|
41
|
+
async def my_tool():
|
|
42
|
+
import httpx
|
|
43
|
+
|
|
44
|
+
# Get the backend port from persistent context
|
|
45
|
+
persistent_ctx = setup.env
|
|
46
|
+
backend_port = persistent_ctx.get_app_backend_port("app")
|
|
47
|
+
|
|
48
|
+
# Make API call directly
|
|
49
|
+
url = f"http://localhost:{backend_port}/api/endpoint"
|
|
50
|
+
async with httpx.AsyncClient() as client:
|
|
51
|
+
response = await client.get(url)
|
|
52
|
+
response.raise_for_status()
|
|
53
|
+
result = response.json()
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### State Synchronization Issues
|
|
57
|
+
|
|
58
|
+
#### ❌ Common Pitfall: Direct List/Dict Manipulation
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
# BAD: Regular Python lists don't sync through proxy
|
|
62
|
+
class ServiceManager:
|
|
63
|
+
def __init__(self):
|
|
64
|
+
self._launched_apps = [] # Won't sync!
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
#### ✅ Solution: Store State in Persistent Context
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
# GOOD: Use the persistent context for shared state
|
|
71
|
+
class BrowserContext:
|
|
72
|
+
def __init__(self):
|
|
73
|
+
self._running_apps: List[str] = []
|
|
74
|
+
self._app_ports: Dict[str, Dict[str, int]] = {}
|
|
75
|
+
|
|
76
|
+
def add_running_app(self, app_name: str) -> None:
|
|
77
|
+
"""Add app to running list."""
|
|
78
|
+
if app_name not in self._running_apps:
|
|
79
|
+
self._running_apps.append(app_name)
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### Accessing Shared Resources
|
|
83
|
+
|
|
84
|
+
#### ❌ Common Pitfall: Direct Attribute Access
|
|
85
|
+
|
|
86
|
+
```python
|
|
87
|
+
# BAD: Direct attribute access on proxy objects
|
|
88
|
+
playwright_tool = env.playwright # May not work with proxy
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
#### ✅ Solution: Use Getter Methods
|
|
92
|
+
|
|
93
|
+
```python
|
|
94
|
+
# GOOD: Use proxy-friendly getter methods
|
|
95
|
+
playwright_tool = persistent_ctx.get_playwright_tool()
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## Best Practices
|
|
99
|
+
|
|
100
|
+
### 1. Tool Implementation Pattern
|
|
101
|
+
|
|
102
|
+
All setup and evaluate tools should follow this pattern:
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
@setup.tool("tool_name")
|
|
106
|
+
async def tool_name(param1: type, param2: type):
|
|
107
|
+
"""Tool description."""
|
|
108
|
+
try:
|
|
109
|
+
# Get persistent context
|
|
110
|
+
persistent_ctx = setup.env # or evaluate.env
|
|
111
|
+
|
|
112
|
+
# Get app ports
|
|
113
|
+
backend_port = persistent_ctx.get_app_backend_port("app_name")
|
|
114
|
+
|
|
115
|
+
# Make HTTP request
|
|
116
|
+
url = f"http://localhost:{backend_port}/api/endpoint"
|
|
117
|
+
async with httpx.AsyncClient() as client:
|
|
118
|
+
response = await client.method(url, json=data)
|
|
119
|
+
response.raise_for_status()
|
|
120
|
+
result = response.json()
|
|
121
|
+
|
|
122
|
+
# Return result
|
|
123
|
+
return TextContent(
|
|
124
|
+
text=f"Success message",
|
|
125
|
+
type="text"
|
|
126
|
+
)
|
|
127
|
+
except Exception as e:
|
|
128
|
+
logger.error(f"tool_name failed: {e}")
|
|
129
|
+
return TextContent(
|
|
130
|
+
text=f"Failed: {str(e)}",
|
|
131
|
+
type="text"
|
|
132
|
+
)
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
### 2. App Launch Pattern
|
|
136
|
+
|
|
137
|
+
When launching apps, ensure ports are stored in the persistent context:
|
|
138
|
+
|
|
139
|
+
```python
|
|
140
|
+
# In launch_app tool
|
|
141
|
+
app_info = await service_manager.launch_app(app_name)
|
|
142
|
+
|
|
143
|
+
# Store ports in persistent context for later access
|
|
144
|
+
try:
|
|
145
|
+
backend_port = service_manager.get_app_port(app_name)
|
|
146
|
+
frontend_port = service_manager.get_app_frontend_port(app_name)
|
|
147
|
+
persistent_ctx.set_app_ports(app_name, frontend_port, backend_port)
|
|
148
|
+
except Exception as e:
|
|
149
|
+
logger.error(f"Failed to store ports: {e}")
|
|
150
|
+
|
|
151
|
+
# Track app in persistent context
|
|
152
|
+
persistent_ctx.add_running_app(app_name)
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
### 3. Import Organization
|
|
156
|
+
|
|
157
|
+
Keep imports at module level:
|
|
158
|
+
|
|
159
|
+
```python
|
|
160
|
+
# At top of file
|
|
161
|
+
import logging
|
|
162
|
+
import httpx
|
|
163
|
+
from mcp.types import TextContent
|
|
164
|
+
from . import setup
|
|
165
|
+
|
|
166
|
+
# Not inside functions
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
## Troubleshooting
|
|
170
|
+
|
|
171
|
+
### "Cannot pickle 'coroutine' object"
|
|
172
|
+
|
|
173
|
+
**Cause**: Trying to return an async function result through the proxy.
|
|
174
|
+
|
|
175
|
+
**Fix**: Don't use async methods on proxied objects. Make direct HTTP calls instead.
|
|
176
|
+
|
|
177
|
+
### "App not launched" errors
|
|
178
|
+
|
|
179
|
+
**Cause**: State synchronization issue between ServiceManager and persistent context.
|
|
180
|
+
|
|
181
|
+
**Fix**: Ensure `launch_app` stores app info in the persistent context, and setup/evaluate tools check the persistent context's app list.
|
|
182
|
+
|
|
183
|
+
### "Object has no attribute" on proxy objects
|
|
184
|
+
|
|
185
|
+
**Cause**: Direct attribute access on multiprocessing proxy objects.
|
|
186
|
+
|
|
187
|
+
**Fix**: Use getter/setter methods instead of direct attribute access.
|
|
188
|
+
|
|
189
|
+
## Development Workflow
|
|
190
|
+
|
|
191
|
+
1. **Start the environment**: `hud dev`
|
|
192
|
+
2. **Make changes**: Edit tools in `src/hud_controller/`
|
|
193
|
+
3. **Test immediately**: The MCP server hot-reloads automatically
|
|
194
|
+
4. **Check logs**: Look for serialization or proxy errors
|
|
195
|
+
|
|
196
|
+
## Adding New Apps
|
|
197
|
+
|
|
198
|
+
1. Create app directory in `apps/`
|
|
199
|
+
2. Add setup tools in `src/hud_controller/setup/app_name.py`
|
|
200
|
+
3. Add evaluate tools in `src/hud_controller/evaluate/app_name.py`
|
|
201
|
+
4. Follow the HTTP pattern - no `call_app_api` usage
|
|
202
|
+
5. Store app ports in persistent context when launching
|
|
203
|
+
|
|
204
|
+
## Key Files
|
|
205
|
+
|
|
206
|
+
- `context.py`: Persistent state management
|
|
207
|
+
- `server.py`: MCP server and tool definitions
|
|
208
|
+
- `services.py`: Process management for X11, VNC, apps
|
|
209
|
+
- `setup/`: Setup tools organized by app
|
|
210
|
+
- `evaluate/`: Evaluation tools organized by app
|
|
211
|
+
|
|
212
|
+
Remember: When in doubt, make direct HTTP calls and store state in the persistent context!
|
|
213
|
+
|
|
@@ -3,7 +3,7 @@ name = "hud-remote-browser"
|
|
|
3
3
|
version = "0.1.0"
|
|
4
4
|
description = "HUD Remote Browser Controller with MCP tools for cloud browser providers"
|
|
5
5
|
requires-python = ">=3.11,<3.13"
|
|
6
|
-
dependencies = [ "hud-python
|
|
6
|
+
dependencies = [ "hud-python>=0.4.12", "pyautogui", "playwright", "httpx", "typer", "google-api-python-client", "google-auth",]
|
|
7
7
|
|
|
8
8
|
[build-system]
|
|
9
9
|
requires = [ "hatchling",]
|
|
@@ -348,6 +348,11 @@ def dev(
|
|
|
348
348
|
),
|
|
349
349
|
port: int = typer.Option(8765, "--port", "-p", help="HTTP server port (ignored for stdio)"),
|
|
350
350
|
no_reload: bool = typer.Option(False, "--no-reload", help="Disable hot-reload"),
|
|
351
|
+
full_reload: bool = typer.Option(
|
|
352
|
+
False,
|
|
353
|
+
"--full-reload",
|
|
354
|
+
help="Restart entire container on file changes (instead of just server process)",
|
|
355
|
+
),
|
|
351
356
|
verbose: bool = typer.Option(False, "--verbose", "-v", help="Show server logs"),
|
|
352
357
|
inspector: bool = typer.Option(
|
|
353
358
|
False, "--inspector", help="Launch MCP Inspector (HTTP mode only)"
|
|
@@ -375,12 +380,13 @@ def dev(
|
|
|
375
380
|
hud dev . --inspector # Launch MCP Inspector (HTTP mode only)
|
|
376
381
|
hud dev . --interactive # Launch interactive testing mode (HTTP mode only)
|
|
377
382
|
hud dev . --no-logs # Disable Docker log streaming
|
|
383
|
+
hud dev . --full-reload # Restart entire container on file changes (instead of just server)
|
|
378
384
|
|
|
379
385
|
# With Docker arguments (after all options):
|
|
380
386
|
hud dev . -e BROWSER_PROVIDER=anchorbrowser -e ANCHOR_API_KEY=xxx
|
|
381
387
|
hud dev . -e API_KEY=secret -v /tmp/data:/data --network host
|
|
382
388
|
hud dev . --build -e DEBUG=true --memory 2g
|
|
383
|
-
"""
|
|
389
|
+
""" # noqa: E501
|
|
384
390
|
# Parse directory and Docker arguments
|
|
385
391
|
if params:
|
|
386
392
|
directory = params[0]
|
|
@@ -397,6 +403,7 @@ def dev(
|
|
|
397
403
|
transport,
|
|
398
404
|
port,
|
|
399
405
|
no_reload,
|
|
406
|
+
full_reload,
|
|
400
407
|
verbose,
|
|
401
408
|
inspector,
|
|
402
409
|
no_logs,
|
|
@@ -35,6 +35,7 @@ def create_proxy_server(
|
|
|
35
35
|
directory: str | Path,
|
|
36
36
|
image_name: str,
|
|
37
37
|
no_reload: bool = False,
|
|
38
|
+
full_reload: bool = False,
|
|
38
39
|
verbose: bool = False,
|
|
39
40
|
docker_args: list[str] | None = None,
|
|
40
41
|
interactive: bool = False,
|
|
@@ -48,8 +49,12 @@ def create_proxy_server(
|
|
|
48
49
|
design.warning(f"Could not extract CMD from {image_name}, using default")
|
|
49
50
|
original_cmd = ["python", "-m", "hud_controller.server"]
|
|
50
51
|
|
|
51
|
-
# Generate container name from image
|
|
52
|
-
|
|
52
|
+
# Generate unique container name from image to avoid conflicts between multiple instances
|
|
53
|
+
import os
|
|
54
|
+
|
|
55
|
+
pid = str(os.getpid())[-6:] # Last 6 digits of process ID for uniqueness
|
|
56
|
+
base_name = image_name.replace(":", "-").replace("/", "-")
|
|
57
|
+
container_name = f"{base_name}-{pid}"
|
|
53
58
|
|
|
54
59
|
# Build the docker run command
|
|
55
60
|
docker_cmd = [
|
|
@@ -73,14 +78,20 @@ def create_proxy_server(
|
|
|
73
78
|
if interactive:
|
|
74
79
|
no_reload = True
|
|
75
80
|
|
|
76
|
-
|
|
77
|
-
|
|
81
|
+
# Validate reload options
|
|
82
|
+
if no_reload and full_reload:
|
|
83
|
+
design.warning("Cannot use --full-reload with --no-reload, ignoring --full-reload")
|
|
84
|
+
full_reload = False
|
|
85
|
+
|
|
86
|
+
if not no_reload and not full_reload:
|
|
87
|
+
# Standard hot-reload: inject supervisor for server restart within container
|
|
78
88
|
modified_cmd = inject_supervisor(original_cmd)
|
|
79
89
|
docker_cmd.extend(["--entrypoint", modified_cmd[0]])
|
|
80
90
|
docker_cmd.append(image_name)
|
|
81
91
|
docker_cmd.extend(modified_cmd[1:])
|
|
82
92
|
else:
|
|
83
|
-
# No reload
|
|
93
|
+
# No reload or full reload: use original CMD without supervisor
|
|
94
|
+
# Note: Full reload logic (container restart) would be implemented here in the future
|
|
84
95
|
docker_cmd.append(image_name)
|
|
85
96
|
|
|
86
97
|
# Create configuration following MCPConfig schema
|
|
@@ -96,9 +107,14 @@ def create_proxy_server(
|
|
|
96
107
|
|
|
97
108
|
# Debug output - only if verbose
|
|
98
109
|
if verbose:
|
|
99
|
-
if not no_reload:
|
|
110
|
+
if not no_reload and not full_reload:
|
|
111
|
+
design.info("Mode: Hot-reload (server restart within container)")
|
|
100
112
|
design.info("Watching: /app/src for changes")
|
|
113
|
+
elif full_reload:
|
|
114
|
+
design.info("Mode: Full reload (container restart on file changes)")
|
|
115
|
+
design.info("Note: Full container restart not yet implemented, using no-reload mode")
|
|
101
116
|
else:
|
|
117
|
+
design.info("Mode: No reload")
|
|
102
118
|
design.info("Container will run without hot-reload")
|
|
103
119
|
design.command_example(f"docker logs -f {container_name}", "View container logs")
|
|
104
120
|
|
|
@@ -127,6 +143,7 @@ async def start_mcp_proxy(
|
|
|
127
143
|
transport: str,
|
|
128
144
|
port: int,
|
|
129
145
|
no_reload: bool = False,
|
|
146
|
+
full_reload: bool = False,
|
|
130
147
|
verbose: bool = False,
|
|
131
148
|
inspector: bool = False,
|
|
132
149
|
no_logs: bool = False,
|
|
@@ -212,8 +229,12 @@ async def start_mcp_proxy(
|
|
|
212
229
|
design.error(f"Source directory not found: {src_path}")
|
|
213
230
|
raise click.Abort
|
|
214
231
|
|
|
215
|
-
# Extract container name from the proxy configuration
|
|
216
|
-
|
|
232
|
+
# Extract container name from the proxy configuration (must match create_proxy_server naming)
|
|
233
|
+
import os
|
|
234
|
+
|
|
235
|
+
pid = str(os.getpid())[-6:] # Last 6 digits of process ID for uniqueness
|
|
236
|
+
base_name = image_name.replace(":", "-").replace("/", "-")
|
|
237
|
+
container_name = f"{base_name}-{pid}"
|
|
217
238
|
|
|
218
239
|
# Remove any existing container with the same name (silently)
|
|
219
240
|
# Note: The proxy creates containers on-demand when clients connect
|
|
@@ -347,6 +368,7 @@ async def start_mcp_proxy(
|
|
|
347
368
|
# Always show waiting message
|
|
348
369
|
log_design.info("") # Empty line for spacing
|
|
349
370
|
log_design.progress_message("⏳ Waiting for first client connection to start container...")
|
|
371
|
+
log_design.info(f"📋 Looking for container: {container_name}") # noqa: G004
|
|
350
372
|
|
|
351
373
|
# Keep trying to stream logs - container is created on demand
|
|
352
374
|
has_shown_started = False
|
|
@@ -397,7 +419,8 @@ async def start_mcp_proxy(
|
|
|
397
419
|
|
|
398
420
|
# Show all logs with gold formatting like hud debug
|
|
399
421
|
# Format all logs in gold/dim style like hud debug's stderr
|
|
400
|
-
|
|
422
|
+
# Use stdout console to avoid stderr redirection when not verbose
|
|
423
|
+
log_design._stdout_console.print(
|
|
401
424
|
f"[rgb(192,150,12)]■[/rgb(192,150,12)] {decoded_line}", highlight=False
|
|
402
425
|
)
|
|
403
426
|
|
|
@@ -408,16 +431,19 @@ async def start_mcp_proxy(
|
|
|
408
431
|
await asyncio.sleep(1)
|
|
409
432
|
continue # Loop back to check if container exists
|
|
410
433
|
|
|
411
|
-
except Exception:
|
|
412
|
-
# Some unexpected error
|
|
434
|
+
except Exception as e:
|
|
435
|
+
# Some unexpected error - show it so we can debug
|
|
436
|
+
log_design.warning(f"Failed to stream Docker logs: {e}") # noqa: G004
|
|
413
437
|
if verbose:
|
|
414
|
-
|
|
438
|
+
import traceback
|
|
439
|
+
|
|
440
|
+
log_design.warning(f"Traceback: {traceback.format_exc()}") # noqa: G004
|
|
415
441
|
await asyncio.sleep(1)
|
|
416
442
|
|
|
417
443
|
# CRITICAL: Create proxy AFTER all logging setup to prevent it from resetting logging config
|
|
418
444
|
# This is important because FastMCP might initialize loggers during creation
|
|
419
445
|
proxy = create_proxy_server(
|
|
420
|
-
directory, image_name, no_reload, verbose, docker_args or [], interactive
|
|
446
|
+
directory, image_name, no_reload, full_reload, verbose, docker_args or [], interactive
|
|
421
447
|
)
|
|
422
448
|
|
|
423
449
|
# One more attempt to suppress the FastMCP server log
|
|
@@ -548,6 +574,7 @@ def run_mcp_dev_server(
|
|
|
548
574
|
transport: str = "http",
|
|
549
575
|
port: int = 8765,
|
|
550
576
|
no_reload: bool = False,
|
|
577
|
+
full_reload: bool = False,
|
|
551
578
|
verbose: bool = False,
|
|
552
579
|
inspector: bool = False,
|
|
553
580
|
no_logs: bool = False,
|
|
@@ -706,6 +733,7 @@ def run_mcp_dev_server(
|
|
|
706
733
|
transport,
|
|
707
734
|
port,
|
|
708
735
|
no_reload,
|
|
736
|
+
full_reload,
|
|
709
737
|
verbose,
|
|
710
738
|
inspector,
|
|
711
739
|
no_logs,
|
|
@@ -26,15 +26,6 @@ def build_agent(
|
|
|
26
26
|
"""Create and return the requested agent type."""
|
|
27
27
|
|
|
28
28
|
# Import agents lazily to avoid dependency issues
|
|
29
|
-
try:
|
|
30
|
-
from hud.agents.misc.response_agent import ResponseAgent
|
|
31
|
-
except ImportError as e:
|
|
32
|
-
design.error(
|
|
33
|
-
"Agent dependencies are not installed. "
|
|
34
|
-
"Please install with: pip install 'hud-python[agent]'"
|
|
35
|
-
)
|
|
36
|
-
raise typer.Exit(1) from e
|
|
37
|
-
|
|
38
29
|
if agent_type == "openai":
|
|
39
30
|
try:
|
|
40
31
|
from hud.agents import OperatorAgent
|
|
@@ -45,12 +36,12 @@ def build_agent(
|
|
|
45
36
|
)
|
|
46
37
|
raise typer.Exit(1) from e
|
|
47
38
|
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
39
|
+
if allowed_tools:
|
|
40
|
+
return OperatorAgent(
|
|
41
|
+
allowed_tools=allowed_tools,
|
|
42
|
+
)
|
|
43
|
+
else:
|
|
44
|
+
return OperatorAgent()
|
|
54
45
|
|
|
55
46
|
# Fallback Claude agent (Anthropic)
|
|
56
47
|
try:
|
|
@@ -63,13 +54,16 @@ def build_agent(
|
|
|
63
54
|
raise typer.Exit(1) from e
|
|
64
55
|
|
|
65
56
|
model = model or "claude-sonnet-4-20250514"
|
|
66
|
-
allowed_tools = allowed_tools or ["anthropic_computer"]
|
|
67
57
|
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
58
|
+
if allowed_tools:
|
|
59
|
+
return ClaudeAgent(
|
|
60
|
+
model=model,
|
|
61
|
+
allowed_tools=allowed_tools,
|
|
62
|
+
)
|
|
63
|
+
else:
|
|
64
|
+
return ClaudeAgent(
|
|
65
|
+
model=model,
|
|
66
|
+
)
|
|
73
67
|
|
|
74
68
|
|
|
75
69
|
async def run_single_task(
|
|
@@ -100,8 +94,8 @@ async def run_single_task(
|
|
|
100
94
|
with open(path) as f: # noqa: ASYNC230
|
|
101
95
|
json_data = json.load(f)
|
|
102
96
|
|
|
103
|
-
# Check if JSON contains
|
|
104
|
-
if isinstance(json_data, list):
|
|
97
|
+
# Check if JSON contains multiple tasks (list with more than 1 task)
|
|
98
|
+
if isinstance(json_data, list) and len(json_data) > 1:
|
|
105
99
|
design.info(f"Found {len(json_data)} tasks in JSON file, running as dataset…")
|
|
106
100
|
|
|
107
101
|
# Build agent class and config for run_dataset
|
|
@@ -118,8 +112,10 @@ async def run_single_task(
|
|
|
118
112
|
raise typer.Exit(1) from e
|
|
119
113
|
|
|
120
114
|
agent_config: dict[str, Any] = {
|
|
121
|
-
"allowed_tools": allowed_tools or ["openai_computer"],
|
|
122
115
|
}
|
|
116
|
+
if allowed_tools:
|
|
117
|
+
agent_config["allowed_tools"] = allowed_tools
|
|
118
|
+
|
|
123
119
|
else:
|
|
124
120
|
try:
|
|
125
121
|
from hud.agents import ClaudeAgent
|
|
@@ -134,8 +130,9 @@ async def run_single_task(
|
|
|
134
130
|
|
|
135
131
|
agent_config = {
|
|
136
132
|
"model": model or "claude-sonnet-4-20250514",
|
|
137
|
-
"allowed_tools": allowed_tools or ["anthropic_computer"],
|
|
138
133
|
}
|
|
134
|
+
if allowed_tools:
|
|
135
|
+
agent_config["allowed_tools"] = allowed_tools
|
|
139
136
|
|
|
140
137
|
# Run as dataset with single-task concurrency to maintain debug behavior
|
|
141
138
|
results = await run_dataset(
|
|
@@ -146,7 +143,6 @@ async def run_single_task(
|
|
|
146
143
|
max_concurrent=1, # Run sequentially for debug mode
|
|
147
144
|
metadata={"source": str(path)},
|
|
148
145
|
max_steps=max_steps,
|
|
149
|
-
auto_respond=True,
|
|
150
146
|
)
|
|
151
147
|
|
|
152
148
|
# Display summary
|
|
@@ -154,8 +150,15 @@ async def run_single_task(
|
|
|
154
150
|
design.success(f"Completed {len(results)} tasks: {successful} successful")
|
|
155
151
|
return
|
|
156
152
|
|
|
157
|
-
# Single task JSON
|
|
158
|
-
|
|
153
|
+
# Single task JSON (either direct object or list with 1 task)
|
|
154
|
+
if isinstance(json_data, list) and len(json_data) == 1:
|
|
155
|
+
design.info("Found 1 task in JSON file, running as single task…")
|
|
156
|
+
task = Task(**json_data[0])
|
|
157
|
+
elif isinstance(json_data, dict):
|
|
158
|
+
task = Task(**json_data)
|
|
159
|
+
else:
|
|
160
|
+
design.error("JSON file must contain a list of tasks when using --full flag")
|
|
161
|
+
raise typer.Exit(1)
|
|
159
162
|
else:
|
|
160
163
|
# Load from HuggingFace dataset
|
|
161
164
|
try:
|
|
@@ -238,8 +241,10 @@ async def run_full_dataset(
|
|
|
238
241
|
raise typer.Exit(1) from e
|
|
239
242
|
|
|
240
243
|
agent_config: dict[str, Any] = {
|
|
241
|
-
"allowed_tools": allowed_tools or ["openai_computer"],
|
|
242
244
|
}
|
|
245
|
+
if allowed_tools:
|
|
246
|
+
agent_config["allowed_tools"] = allowed_tools
|
|
247
|
+
|
|
243
248
|
else:
|
|
244
249
|
try:
|
|
245
250
|
from hud.agents import ClaudeAgent
|
|
@@ -254,8 +259,9 @@ async def run_full_dataset(
|
|
|
254
259
|
|
|
255
260
|
agent_config = {
|
|
256
261
|
"model": model or "claude-sonnet-4-20250514",
|
|
257
|
-
"allowed_tools": allowed_tools or ["anthropic_computer"],
|
|
258
262
|
}
|
|
263
|
+
if allowed_tools:
|
|
264
|
+
agent_config["allowed_tools"] = allowed_tools
|
|
259
265
|
|
|
260
266
|
design.info("🚀 Running evaluation…")
|
|
261
267
|
return await run_dataset(
|
|
@@ -266,7 +272,6 @@ async def run_full_dataset(
|
|
|
266
272
|
max_concurrent=max_concurrent,
|
|
267
273
|
metadata={"dataset": source},
|
|
268
274
|
max_steps=max_steps,
|
|
269
|
-
auto_respond=True,
|
|
270
275
|
)
|
|
271
276
|
|
|
272
277
|
|