hud-python 0.4.20__tar.gz → 0.4.21__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- {hud_python-0.4.20 → hud_python-0.4.21}/PKG-INFO +2 -4
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/__init__.py +7 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/agents/base.py +40 -10
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/agents/claude.py +13 -8
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/agents/tests/test_client.py +6 -27
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/__init__.py +50 -20
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/build.py +3 -44
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/eval.py +25 -6
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/init.py +4 -4
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/push.py +3 -1
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/tests/test_push.py +6 -6
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/clients/__init__.py +3 -2
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/clients/base.py +20 -9
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/clients/mcp_use.py +44 -22
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/datasets/task.py +6 -2
- hud_python-0.4.21/hud/native/__init__.py +6 -0
- hud_python-0.4.21/hud/native/comparator.py +546 -0
- hud_python-0.4.21/hud/native/tests/__init__.py +1 -0
- hud_python-0.4.21/hud/native/tests/test_comparator.py +539 -0
- hud_python-0.4.21/hud/native/tests/test_native_init.py +79 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/otel/instrumentation.py +0 -2
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/server/server.py +9 -2
- hud_python-0.4.21/hud/shared/exceptions.py +364 -0
- hud_python-0.4.21/hud/shared/hints.py +177 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/shared/requests.py +15 -3
- hud_python-0.4.21/hud/shared/tests/test_exceptions.py +420 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/tools/__init__.py +2 -0
- hud_python-0.4.21/hud/tools/submit.py +66 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/types.py +33 -5
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/utils/design.py +57 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/utils/mcp.py +6 -0
- hud_python-0.4.21/hud/utils/pretty_errors.py +68 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/utils/tests/test_version.py +1 -1
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/version.py +1 -1
- {hud_python-0.4.20 → hud_python-0.4.21}/pyproject.toml +2 -3
- hud_python-0.4.20/hud/shared/exceptions.py +0 -191
- hud_python-0.4.20/hud/shared/tests/test_exceptions.py +0 -179
- {hud_python-0.4.20 → hud_python-0.4.21}/.gitignore +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/LICENSE +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/README.md +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/environments/README.md +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/environments/browser/README.md +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/environments/browser/apps/2048/README.md +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/environments/browser/apps/2048/backend/pyproject.toml +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/environments/browser/apps/README.md +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/environments/browser/apps/todo/README.md +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/environments/browser/apps/todo/backend/pyproject.toml +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/environments/browser/pyproject.toml +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/environments/remote_browser/README.md +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/environments/remote_browser/pyproject.toml +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/environments/remote_browser/src/hud_controller/providers/README.md +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/environments/text_2048/README.md +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/environments/text_2048/pyproject.toml +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/examples/README.md +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/__main__.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/agents/__init__.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/agents/langchain.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/agents/misc/__init__.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/agents/misc/response_agent.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/agents/openai.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/agents/openai_chat_generic.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/agents/tests/__init__.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/agents/tests/test_base.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/agents/tests/test_claude.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/agents/tests/test_openai.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/__main__.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/analyze.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/clone.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/debug.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/dev.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/hf.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/list_func.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/pull.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/remove.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/rl/README.md +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/rl/__init__.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/rl/init.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/rl/pod.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/rl/ssh.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/rl/train.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/rl/utils.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/tests/__init__.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/tests/test_analyze.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/tests/test_analyze_metadata.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/tests/test_build.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/tests/test_cli_init.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/tests/test_cli_main.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/tests/test_clone.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/tests/test_cursor.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/tests/test_debug.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/tests/test_list_func.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/tests/test_main_module.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/tests/test_mcp_server.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/tests/test_pull.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/tests/test_registry.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/tests/test_utils.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/utils/__init__.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/utils/cursor.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/utils/docker.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/utils/environment.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/utils/interactive.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/utils/logging.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/utils/metadata.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/utils/registry.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/utils/remote_runner.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/utils/runner.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/cli/utils/server.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/clients/README.md +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/clients/fastmcp.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/clients/tests/__init__.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/clients/tests/test_client_integration.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/clients/tests/test_fastmcp.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/clients/tests/test_protocol.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/clients/utils/__init__.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/clients/utils/retry_transport.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/datasets/__init__.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/datasets/execution/__init__.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/datasets/execution/parallel.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/datasets/execution/runner.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/datasets/utils.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/misc/__init__.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/misc/claude_plays_pokemon.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/otel/__init__.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/otel/collector.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/otel/config.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/otel/context.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/otel/exporters.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/otel/processors.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/otel/tests/__init__.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/otel/tests/test_processors.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/py.typed +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/server/__init__.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/server/context.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/server/helper/__init__.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/server/low_level.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/server/tests/__init__.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/settings.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/shared/__init__.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/shared/tests/__init__.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/shared/tests/test_requests.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/telemetry/__init__.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/telemetry/instrument.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/telemetry/job.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/telemetry/replay.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/telemetry/tests/__init__.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/telemetry/tests/test_replay.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/telemetry/tests/test_trace.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/telemetry/trace.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/tools/base.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/tools/bash.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/tools/computer/__init__.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/tools/computer/anthropic.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/tools/computer/hud.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/tools/computer/openai.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/tools/computer/settings.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/tools/edit.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/tools/executors/__init__.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/tools/executors/base.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/tools/executors/pyautogui.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/tools/executors/tests/__init__.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/tools/executors/tests/test_base_executor.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/tools/executors/xdo.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/tools/playwright.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/tools/response.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/tools/tests/__init__.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/tools/tests/test_base.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/tools/tests/test_bash.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/tools/tests/test_bash_extended.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/tools/tests/test_computer.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/tools/tests/test_computer_actions.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/tools/tests/test_edit.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/tools/tests/test_init.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/tools/tests/test_playwright_tool.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/tools/tests/test_response.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/tools/tests/test_tools.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/tools/tests/test_tools_init.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/tools/tests/test_utils.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/tools/types.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/tools/utils.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/utils/__init__.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/utils/async_utils.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/utils/progress.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/utils/telemetry.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/utils/tests/__init__.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/utils/tests/test_async_utils.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/utils/tests/test_init.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/utils/tests/test_mcp.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/utils/tests/test_progress.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/hud/utils/tests/test_telemetry.py +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/rl/README.md +0 -0
- {hud_python-0.4.20 → hud_python-0.4.21}/rl/pyproject.toml +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hud-python
|
|
3
|
-
Version: 0.4.20
|
|
3
|
+
Version: 0.4.21
|
|
4
4
|
Summary: SDK for the HUD platform.
|
|
5
5
|
Project-URL: Homepage, https://github.com/hud-evals/hud-python
|
|
6
6
|
Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
|
|
@@ -38,6 +38,7 @@ Requires-Python: <3.14,>=3.11
|
|
|
38
38
|
Requires-Dist: httpx<1,>=0.23.0
|
|
39
39
|
Requires-Dist: hud-fastmcp-python-sdk>=0.1.2
|
|
40
40
|
Requires-Dist: hud-mcp-python-sdk>=3.13.2
|
|
41
|
+
Requires-Dist: hud-mcp-use-python-sdk>=2.3.16
|
|
41
42
|
Requires-Dist: opentelemetry-api>=1.34.1
|
|
42
43
|
Requires-Dist: opentelemetry-exporter-otlp-proto-http>=1.34.1
|
|
43
44
|
Requires-Dist: opentelemetry-instrumentation-mcp>=0.44.1
|
|
@@ -56,7 +57,6 @@ Provides-Extra: agent
|
|
|
56
57
|
Requires-Dist: anthropic; extra == 'agent'
|
|
57
58
|
Requires-Dist: datasets>=2.14.0; extra == 'agent'
|
|
58
59
|
Requires-Dist: dotenv>=0.9.9; extra == 'agent'
|
|
59
|
-
Requires-Dist: hud-mcp-use-python-sdk>=2.3.13; extra == 'agent'
|
|
60
60
|
Requires-Dist: ipykernel; extra == 'agent'
|
|
61
61
|
Requires-Dist: ipython<9; extra == 'agent'
|
|
62
62
|
Requires-Dist: jupyter-client; extra == 'agent'
|
|
@@ -70,7 +70,6 @@ Provides-Extra: agents
|
|
|
70
70
|
Requires-Dist: anthropic; extra == 'agents'
|
|
71
71
|
Requires-Dist: datasets>=2.14.0; extra == 'agents'
|
|
72
72
|
Requires-Dist: dotenv>=0.9.9; extra == 'agents'
|
|
73
|
-
Requires-Dist: hud-mcp-use-python-sdk>=2.3.13; extra == 'agents'
|
|
74
73
|
Requires-Dist: ipykernel; extra == 'agents'
|
|
75
74
|
Requires-Dist: ipython<9; extra == 'agents'
|
|
76
75
|
Requires-Dist: jupyter-client; extra == 'agents'
|
|
@@ -85,7 +84,6 @@ Requires-Dist: aiodocker>=0.24.0; extra == 'dev'
|
|
|
85
84
|
Requires-Dist: anthropic; extra == 'dev'
|
|
86
85
|
Requires-Dist: datasets>=2.14.0; extra == 'dev'
|
|
87
86
|
Requires-Dist: dotenv>=0.9.9; extra == 'dev'
|
|
88
|
-
Requires-Dist: hud-mcp-use-python-sdk>=2.3.13; extra == 'dev'
|
|
89
87
|
Requires-Dist: inspect-ai>=0.3.80; extra == 'dev'
|
|
90
88
|
Requires-Dist: ipykernel; extra == 'dev'
|
|
91
89
|
Requires-Dist: ipython<9; extra == 'dev'
|
|
@@ -111,10 +111,12 @@ class MCPAgent(ABC):
|
|
|
111
111
|
# Initialize these here so methods can be called before initialize()
|
|
112
112
|
self._available_tools: list[types.Tool] = []
|
|
113
113
|
self._tool_map: dict[str, types.Tool] = {} # Simplified: just name to tool
|
|
114
|
-
self.
|
|
114
|
+
self.response_tool_name = None
|
|
115
|
+
self.initialization_complete = False
|
|
116
|
+
|
|
117
|
+
# Trace
|
|
115
118
|
self._auto_trace = auto_trace
|
|
116
119
|
self._auto_trace_cm: Any | None = None # Store auto-created trace context manager
|
|
117
|
-
self.initialization_complete = False
|
|
118
120
|
|
|
119
121
|
# Response agent to automatically interact with the model
|
|
120
122
|
self.response_agent = response_agent
|
|
@@ -530,6 +532,9 @@ class MCPAgent(ABC):
|
|
|
530
532
|
self._available_tools = []
|
|
531
533
|
self._tool_map = {}
|
|
532
534
|
|
|
535
|
+
# Track response tools by server
|
|
536
|
+
response_tools_by_server: dict[str, str] = {} # server_name -> tool_name
|
|
537
|
+
|
|
533
538
|
for tool in all_tools:
|
|
534
539
|
# Check if tool should be included
|
|
535
540
|
if self.allowed_tools and tool.name not in self.allowed_tools:
|
|
@@ -541,10 +546,36 @@ class MCPAgent(ABC):
|
|
|
541
546
|
# Simplified mapping - just tool name to tool
|
|
542
547
|
self._tool_map[tool.name] = tool
|
|
543
548
|
|
|
544
|
-
#
|
|
545
|
-
if tool.name
|
|
546
|
-
|
|
547
|
-
|
|
549
|
+
# Track response tools
|
|
550
|
+
if "response" in tool.name or tool.name == "response":
|
|
551
|
+
# Extract server name from tool name (e.g., "grader_response" -> "grader")
|
|
552
|
+
if "_" in tool.name:
|
|
553
|
+
server_name = tool.name.split("_", 1)[0]
|
|
554
|
+
response_tools_by_server[server_name] = tool.name
|
|
555
|
+
else:
|
|
556
|
+
response_tools_by_server["_default"] = tool.name
|
|
557
|
+
|
|
558
|
+
# Find the response tool to use (prioritize last server in config)
|
|
559
|
+
if response_tools_by_server and hasattr(self.mcp_client, "mcp_config"):
|
|
560
|
+
# Get server names in order from mcp_config
|
|
561
|
+
server_names = list(self.mcp_client.mcp_config.keys())
|
|
562
|
+
|
|
563
|
+
# Try to find response tool from last server first
|
|
564
|
+
response_tool_name = None
|
|
565
|
+
for server_name in reversed(server_names):
|
|
566
|
+
if server_name in response_tools_by_server:
|
|
567
|
+
response_tool_name = response_tools_by_server[server_name]
|
|
568
|
+
break
|
|
569
|
+
|
|
570
|
+
# Fallback to any response tool
|
|
571
|
+
if not response_tool_name and response_tools_by_server:
|
|
572
|
+
response_tool_name = next(iter(response_tools_by_server.values()))
|
|
573
|
+
|
|
574
|
+
# Add to lifecycle tools if found
|
|
575
|
+
if response_tool_name and response_tool_name not in self.lifecycle_tools:
|
|
576
|
+
self.design.debug(f"Auto-detected '{response_tool_name}' tool as a lifecycle tool")
|
|
577
|
+
self.response_tool_name = response_tool_name
|
|
578
|
+
self.lifecycle_tools.append(response_tool_name)
|
|
548
579
|
|
|
549
580
|
# Check if all required tools are available
|
|
550
581
|
if self.required_tools:
|
|
@@ -565,13 +596,12 @@ class MCPAgent(ABC):
|
|
|
565
596
|
response: The agent's response
|
|
566
597
|
messages: The current message history (will be modified in-place)
|
|
567
598
|
"""
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
self.design.debug("Calling response lifecycle tool")
|
|
599
|
+
if self.response_tool_name:
|
|
600
|
+
self.design.debug(f"Calling response lifecycle tool: {self.response_tool_name}")
|
|
571
601
|
try:
|
|
572
602
|
# Call the response tool with the agent's response
|
|
573
603
|
response_tool_call = MCPToolCall(
|
|
574
|
-
name=
|
|
604
|
+
name=self.response_tool_name, arguments={"response": response.content}
|
|
575
605
|
)
|
|
576
606
|
response_results = await self.call_tools(response_tool_call)
|
|
577
607
|
|
|
@@ -306,19 +306,20 @@ class ClaudeAgent(MCPAgent):
|
|
|
306
306
|
"""Convert MCP tools to Claude tool format."""
|
|
307
307
|
claude_tools = []
|
|
308
308
|
self._claude_to_mcp_tool_map = {} # Reset mapping
|
|
309
|
-
|
|
309
|
+
|
|
310
310
|
# Find computer tool by priority
|
|
311
311
|
computer_tool_priority = ["anthropic_computer", "computer_anthropic", "computer"]
|
|
312
312
|
selected_computer_tool = None
|
|
313
|
-
|
|
313
|
+
|
|
314
314
|
for priority_name in computer_tool_priority:
|
|
315
315
|
for tool in self._available_tools:
|
|
316
|
-
|
|
316
|
+
# Check both exact match and suffix match (for prefixed tools)
|
|
317
|
+
if tool.name == priority_name or tool.name.endswith(f"_{priority_name}"):
|
|
317
318
|
selected_computer_tool = tool
|
|
318
319
|
break
|
|
319
320
|
if selected_computer_tool:
|
|
320
321
|
break
|
|
321
|
-
|
|
322
|
+
|
|
322
323
|
# Add the selected computer tool if found
|
|
323
324
|
if selected_computer_tool:
|
|
324
325
|
claude_tool = {
|
|
@@ -330,14 +331,18 @@ class ClaudeAgent(MCPAgent):
|
|
|
330
331
|
# Map Claude's "computer" back to the actual MCP tool name
|
|
331
332
|
self._claude_to_mcp_tool_map["computer"] = selected_computer_tool.name
|
|
332
333
|
claude_tools.append(claude_tool)
|
|
333
|
-
logger.debug(
|
|
334
|
-
|
|
334
|
+
logger.debug("Using %s as computer tool for Claude", selected_computer_tool.name)
|
|
335
|
+
|
|
335
336
|
# Add other non-computer tools
|
|
336
337
|
for tool in self._available_tools:
|
|
337
338
|
# Skip computer tools (already handled) and lifecycle tools
|
|
338
|
-
|
|
339
|
+
is_computer_tool = any(
|
|
340
|
+
tool.name == priority_name or tool.name.endswith(f"_{priority_name}")
|
|
341
|
+
for priority_name in computer_tool_priority
|
|
342
|
+
)
|
|
343
|
+
if is_computer_tool or tool.name in self.lifecycle_tools:
|
|
339
344
|
continue
|
|
340
|
-
|
|
345
|
+
|
|
341
346
|
claude_tool = {
|
|
342
347
|
"name": tool.name,
|
|
343
348
|
"description": tool.description or f"Execute {tool.name}",
|
|
@@ -33,29 +33,6 @@ class TestMCPClient:
|
|
|
33
33
|
with patch("mcp_use.client.MCPClient.from_dict", return_value=mock_instance):
|
|
34
34
|
yield mock_instance
|
|
35
35
|
|
|
36
|
-
@pytest.mark.asyncio
|
|
37
|
-
async def test_init_with_config(self, mock_telemetry):
|
|
38
|
-
"""Test client initialization with config dictionary."""
|
|
39
|
-
mcp_config = {
|
|
40
|
-
"test_server": {
|
|
41
|
-
"command": "python",
|
|
42
|
-
"args": ["-m", "test_server"],
|
|
43
|
-
"env": {"TEST": "true"},
|
|
44
|
-
}
|
|
45
|
-
}
|
|
46
|
-
|
|
47
|
-
with patch("mcp_use.client.MCPClient.from_dict") as mock_from_dict:
|
|
48
|
-
mock_instance = MagicMock()
|
|
49
|
-
mock_instance.create_all_sessions = AsyncMock(return_value={})
|
|
50
|
-
mock_from_dict.return_value = mock_instance
|
|
51
|
-
client = MCPClient(mcp_config=mcp_config, verbose=True)
|
|
52
|
-
# Initialize to trigger connection
|
|
53
|
-
await client.initialize()
|
|
54
|
-
|
|
55
|
-
assert client.verbose is True
|
|
56
|
-
# Verify MCPUseClient.from_dict was called with proper config
|
|
57
|
-
mock_from_dict.assert_called_once_with({"mcpServers": mcp_config})
|
|
58
|
-
|
|
59
36
|
@pytest.mark.asyncio
|
|
60
37
|
async def test_connect_single_server(self, mock_telemetry, mock_mcp_use_client):
|
|
61
38
|
"""Test connecting to a single server."""
|
|
@@ -146,10 +123,10 @@ class TestMCPClient:
|
|
|
146
123
|
# Verify sessions were created
|
|
147
124
|
mock_mcp_use_client.create_all_sessions.assert_called_once()
|
|
148
125
|
|
|
149
|
-
# Check tools from both servers
|
|
126
|
+
# Check tools from both servers - should be prefixed with server names
|
|
150
127
|
tools = await client.list_tools()
|
|
151
128
|
names = {t.name for t in tools}
|
|
152
|
-
assert names == {"
|
|
129
|
+
assert names == {"server1_tool1", "server2_tool2"}
|
|
153
130
|
|
|
154
131
|
@pytest.mark.asyncio
|
|
155
132
|
async def test_call_tool(self, mock_telemetry, mock_mcp_use_client):
|
|
@@ -220,8 +197,10 @@ class TestMCPClient:
|
|
|
220
197
|
|
|
221
198
|
await client.initialize()
|
|
222
199
|
|
|
223
|
-
|
|
224
|
-
|
|
200
|
+
# Calling a non-existent tool should return an error result
|
|
201
|
+
result = await client.call_tool(name="nonexistent", arguments={})
|
|
202
|
+
assert result.isError is True
|
|
203
|
+
assert "Tool 'nonexistent' not found" in result.content[0].text
|
|
225
204
|
|
|
226
205
|
@pytest.mark.asyncio
|
|
227
206
|
async def test_get_telemetry_data(self, mock_telemetry, mock_mcp_use_client):
|
|
@@ -43,6 +43,12 @@ app = typer.Typer(
|
|
|
43
43
|
|
|
44
44
|
console = Console()
|
|
45
45
|
|
|
46
|
+
# Standard support hint appended to error outputs
|
|
47
|
+
SUPPORT_HINT = (
|
|
48
|
+
"If this looks like an issue with the sdk, please make a github issue at "
|
|
49
|
+
"https://github.com/hud-evals/hud-python/issues"
|
|
50
|
+
)
|
|
51
|
+
|
|
46
52
|
|
|
47
53
|
# Capture IMAGE and any following Docker args as a single variadic argument list.
|
|
48
54
|
@app.command(context_settings={"allow_extra_args": True, "ignore_unknown_options": True})
|
|
@@ -818,6 +824,11 @@ def eval(
|
|
|
818
824
|
"--max-concurrent-per-worker",
|
|
819
825
|
help="Maximum concurrent tasks per worker in parallel mode",
|
|
820
826
|
),
|
|
827
|
+
verbose: bool = typer.Option(
|
|
828
|
+
False,
|
|
829
|
+
"--verbose",
|
|
830
|
+
help="Enable verbose output from the agent",
|
|
831
|
+
),
|
|
821
832
|
) -> None:
|
|
822
833
|
"""🚀 Run evaluation on datasets or individual tasks with agents."""
|
|
823
834
|
from hud.utils.design import HUDDesign
|
|
@@ -912,6 +923,7 @@ def eval(
|
|
|
912
923
|
parallel=parallel,
|
|
913
924
|
max_workers=max_workers,
|
|
914
925
|
max_concurrent_per_worker=max_concurrent_per_worker,
|
|
926
|
+
verbose=verbose,
|
|
915
927
|
)
|
|
916
928
|
|
|
917
929
|
|
|
@@ -950,27 +962,45 @@ def hf(
|
|
|
950
962
|
|
|
951
963
|
def main() -> None:
|
|
952
964
|
"""Main entry point for the CLI."""
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
|
|
965
|
+
try:
|
|
966
|
+
# Show header for main help
|
|
967
|
+
if len(sys.argv) == 1 or (len(sys.argv) == 2 and sys.argv[1] in ["--help", "-h"]):
|
|
968
|
+
console.print(
|
|
969
|
+
Panel.fit(
|
|
970
|
+
"[bold cyan]🚀 HUD CLI[/bold cyan]\nMCP Environment Analysis & Debugging",
|
|
971
|
+
border_style="cyan",
|
|
972
|
+
)
|
|
959
973
|
)
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
|
|
974
|
+
console.print("\n[yellow]Quick Start:[/yellow]")
|
|
975
|
+
console.print(
|
|
976
|
+
" 1. Create a new environment: [cyan]hud init my-env && cd my-env[/cyan]"
|
|
977
|
+
)
|
|
978
|
+
console.print(" 2. Develop with hot-reload: [cyan]hud dev --interactive[/cyan]")
|
|
979
|
+
console.print(" 3. Build for production: [cyan]hud build[/cyan]")
|
|
980
|
+
console.print(" 4. Share your environment: [cyan]hud push[/cyan]")
|
|
981
|
+
console.print(" 5. Get shared environments: [cyan]hud pull <org/name:tag>[/cyan]")
|
|
982
|
+
console.print(" 6. Run and test: [cyan]hud run <image>[/cyan]")
|
|
983
|
+
console.print("\n[yellow]RL Training:[/yellow]")
|
|
984
|
+
console.print(" 1. Generate config: [cyan]hud rl init my-env:latest[/cyan]")
|
|
985
|
+
console.print(
|
|
986
|
+
" 2. Create dataset: [cyan]hud hf tasks.json --name my-org/my-tasks[/cyan]"
|
|
987
|
+
)
|
|
988
|
+
console.print(" 3. Start training: [cyan]hud rl --model Qwen/Qwen2.5-3B[/cyan]\n")
|
|
989
|
+
|
|
990
|
+
app()
|
|
991
|
+
except typer.Exit as e:
|
|
992
|
+
# Append SDK support hint for non-zero exits
|
|
993
|
+
try:
|
|
994
|
+
exit_code = getattr(e, "exit_code", 0)
|
|
995
|
+
except Exception:
|
|
996
|
+
exit_code = 1
|
|
997
|
+
if exit_code != 0:
|
|
998
|
+
from hud.utils.design import design
|
|
999
|
+
|
|
1000
|
+
design.info(SUPPORT_HINT)
|
|
1001
|
+
raise
|
|
1002
|
+
except Exception:
|
|
1003
|
+
raise
|
|
974
1004
|
|
|
975
1005
|
|
|
976
1006
|
if __name__ == "__main__":
|
|
@@ -204,30 +204,10 @@ async def analyze_mcp_environment(
|
|
|
204
204
|
"success": True,
|
|
205
205
|
}
|
|
206
206
|
except Exception as e:
|
|
207
|
-
import traceback
|
|
207
|
+
from hud.shared.exceptions import HudException
|
|
208
208
|
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
design.error(f"Failed to analyze environment: {error_msg}")
|
|
212
|
-
design.error(f"Traceback:\n{traceback.format_exc()}")
|
|
213
|
-
|
|
214
|
-
# Common issues
|
|
215
|
-
if "Connection reset" in error_msg or "EOF" in error_msg:
|
|
216
|
-
design.warning(
|
|
217
|
-
"The MCP server may have crashed on startup. Check your server.py for errors."
|
|
218
|
-
)
|
|
219
|
-
elif "timeout" in error_msg:
|
|
220
|
-
design.warning(
|
|
221
|
-
"The MCP server took too long to initialize. It might need more startup time."
|
|
222
|
-
)
|
|
223
|
-
|
|
224
|
-
return {
|
|
225
|
-
"initializeMs": 0,
|
|
226
|
-
"toolCount": 0,
|
|
227
|
-
"tools": [],
|
|
228
|
-
"success": False,
|
|
229
|
-
"error": error_msg,
|
|
230
|
-
}
|
|
209
|
+
# Convert to HudException for better error messages and hints
|
|
210
|
+
raise HudException from e
|
|
231
211
|
finally:
|
|
232
212
|
# Only shutdown if we successfully initialized
|
|
233
213
|
if initialized:
|
|
@@ -340,27 +320,6 @@ def build_environment(
|
|
|
340
320
|
finally:
|
|
341
321
|
loop.close()
|
|
342
322
|
|
|
343
|
-
if not analysis["success"]:
|
|
344
|
-
design.error("Failed to analyze MCP environment")
|
|
345
|
-
if "error" in analysis:
|
|
346
|
-
design.error(f"Error: {analysis['error']}")
|
|
347
|
-
|
|
348
|
-
# Provide helpful debugging tips
|
|
349
|
-
design.section_title("Debugging Tips")
|
|
350
|
-
design.info("1. Debug your environment build:")
|
|
351
|
-
design.command_example("hud debug . --build")
|
|
352
|
-
design.dim_info(" This will", "test MCP server connection and show detailed logs")
|
|
353
|
-
design.info("")
|
|
354
|
-
design.info("2. Check for common issues:")
|
|
355
|
-
design.info(" - Server crashes on startup")
|
|
356
|
-
design.info(" - Missing dependencies")
|
|
357
|
-
design.info(" - Syntax errors in server.py")
|
|
358
|
-
design.info("")
|
|
359
|
-
design.info("3. Run with verbose mode:")
|
|
360
|
-
design.command_example("hud build . --verbose")
|
|
361
|
-
|
|
362
|
-
raise typer.Exit(1)
|
|
363
|
-
|
|
364
323
|
design.success(f"Analyzed environment: {analysis['toolCount']} tools found")
|
|
365
324
|
|
|
366
325
|
# Extract environment variables from Dockerfile
|
|
@@ -22,6 +22,7 @@ def build_agent(
|
|
|
22
22
|
*,
|
|
23
23
|
model: str | None = None,
|
|
24
24
|
allowed_tools: list[str] | None = None,
|
|
25
|
+
verbose: bool = False,
|
|
25
26
|
) -> Any:
|
|
26
27
|
"""Create and return the requested agent type."""
|
|
27
28
|
|
|
@@ -39,9 +40,10 @@ def build_agent(
|
|
|
39
40
|
if allowed_tools:
|
|
40
41
|
return OperatorAgent(
|
|
41
42
|
allowed_tools=allowed_tools,
|
|
43
|
+
verbose=verbose,
|
|
42
44
|
)
|
|
43
45
|
else:
|
|
44
|
-
return OperatorAgent()
|
|
46
|
+
return OperatorAgent(verbose=verbose)
|
|
45
47
|
|
|
46
48
|
# Fallback Claude agent (Anthropic)
|
|
47
49
|
try:
|
|
@@ -59,10 +61,12 @@ def build_agent(
|
|
|
59
61
|
return ClaudeAgent(
|
|
60
62
|
model=model,
|
|
61
63
|
allowed_tools=allowed_tools,
|
|
64
|
+
verbose=verbose,
|
|
62
65
|
)
|
|
63
66
|
else:
|
|
64
67
|
return ClaudeAgent(
|
|
65
68
|
model=model,
|
|
69
|
+
verbose=verbose,
|
|
66
70
|
)
|
|
67
71
|
|
|
68
72
|
|
|
@@ -73,6 +77,7 @@ async def run_single_task(
|
|
|
73
77
|
model: str | None = None,
|
|
74
78
|
allowed_tools: list[str] | None = None,
|
|
75
79
|
max_steps: int = 10,
|
|
80
|
+
verbose: bool = False,
|
|
76
81
|
) -> None:
|
|
77
82
|
"""Load one task and execute it, or detect if JSON contains a list and run as dataset."""
|
|
78
83
|
|
|
@@ -82,7 +87,7 @@ async def run_single_task(
|
|
|
82
87
|
except ImportError as e:
|
|
83
88
|
design.error(
|
|
84
89
|
"Dataset dependencies are not installed. "
|
|
85
|
-
"Please install with: pip install 'hud-python[agent]'"
|
|
90
|
+
"Please install with: pip install 'hud-python\u27E6agent\u27E7'"
|
|
86
91
|
)
|
|
87
92
|
raise typer.Exit(1) from e
|
|
88
93
|
|
|
@@ -106,11 +111,11 @@ async def run_single_task(
|
|
|
106
111
|
except ImportError as e:
|
|
107
112
|
design.error(
|
|
108
113
|
"OpenAI agent dependencies are not installed. "
|
|
109
|
-
"Please install with: pip install 'hud-python[agent]'"
|
|
114
|
+
"Please install with: pip install 'hud-python\u27E6agent\u27E7'"
|
|
110
115
|
)
|
|
111
116
|
raise typer.Exit(1) from e
|
|
112
117
|
|
|
113
|
-
agent_config: dict[str, Any] = {}
|
|
118
|
+
agent_config: dict[str, Any] = {"verbose": verbose}
|
|
114
119
|
if allowed_tools:
|
|
115
120
|
agent_config["allowed_tools"] = allowed_tools
|
|
116
121
|
|
|
@@ -128,6 +133,7 @@ async def run_single_task(
|
|
|
128
133
|
|
|
129
134
|
agent_config = {
|
|
130
135
|
"model": model or "claude-sonnet-4-20250514",
|
|
136
|
+
"verbose": verbose,
|
|
131
137
|
}
|
|
132
138
|
if allowed_tools:
|
|
133
139
|
agent_config["allowed_tools"] = allowed_tools
|
|
@@ -182,6 +188,7 @@ async def run_single_task(
|
|
|
182
188
|
agent_type,
|
|
183
189
|
model=model,
|
|
184
190
|
allowed_tools=allowed_tools,
|
|
191
|
+
verbose=verbose,
|
|
185
192
|
)
|
|
186
193
|
design.info(task.prompt)
|
|
187
194
|
result = await agent.run(task, max_steps=max_steps)
|
|
@@ -199,6 +206,7 @@ async def run_full_dataset(
|
|
|
199
206
|
parallel: bool = False,
|
|
200
207
|
max_workers: int | None = None,
|
|
201
208
|
max_concurrent_per_worker: int = 25,
|
|
209
|
+
verbose: bool = False,
|
|
202
210
|
) -> list[Any]:
|
|
203
211
|
"""Run evaluation across the entire dataset.
|
|
204
212
|
|
|
@@ -211,7 +219,7 @@ async def run_full_dataset(
|
|
|
211
219
|
except ImportError as e:
|
|
212
220
|
design.error(
|
|
213
221
|
"Dataset dependencies are not installed. "
|
|
214
|
-
"Please install with: pip install 'hud-python[agent]'"
|
|
222
|
+
"Please install with: pip install 'hud-python[[agent]]'"
|
|
215
223
|
)
|
|
216
224
|
raise typer.Exit(1) from e
|
|
217
225
|
|
|
@@ -245,7 +253,7 @@ async def run_full_dataset(
|
|
|
245
253
|
)
|
|
246
254
|
raise typer.Exit(1) from e
|
|
247
255
|
|
|
248
|
-
agent_config: dict[str, Any] = {}
|
|
256
|
+
agent_config: dict[str, Any] = {"verbose": verbose}
|
|
249
257
|
if allowed_tools:
|
|
250
258
|
agent_config["allowed_tools"] = allowed_tools
|
|
251
259
|
|
|
@@ -263,6 +271,7 @@ async def run_full_dataset(
|
|
|
263
271
|
|
|
264
272
|
agent_config = {
|
|
265
273
|
"model": model or "claude-sonnet-4-20250514",
|
|
274
|
+
"verbose": verbose,
|
|
266
275
|
}
|
|
267
276
|
if allowed_tools:
|
|
268
277
|
agent_config["allowed_tools"] = allowed_tools
|
|
@@ -360,6 +369,11 @@ def eval_command(
|
|
|
360
369
|
"--max-concurrent-per-worker",
|
|
361
370
|
help="Maximum concurrent tasks per worker in parallel mode",
|
|
362
371
|
),
|
|
372
|
+
verbose: bool = typer.Option(
|
|
373
|
+
False,
|
|
374
|
+
"--verbose",
|
|
375
|
+
help="Enable verbose output from the agent",
|
|
376
|
+
),
|
|
363
377
|
) -> None:
|
|
364
378
|
"""🚀 Run evaluation on datasets or individual tasks with agents.
|
|
365
379
|
|
|
@@ -387,6 +401,9 @@ def eval_command(
|
|
|
387
401
|
|
|
388
402
|
# Run with OpenAI Operator agent
|
|
389
403
|
hud eval hud-evals/OSWorld-Gold-Beta --agent openai
|
|
404
|
+
|
|
405
|
+
# Run with verbose output for debugging
|
|
406
|
+
hud eval task.json --verbose
|
|
390
407
|
"""
|
|
391
408
|
from hud.settings import settings
|
|
392
409
|
|
|
@@ -428,6 +445,7 @@ def eval_command(
|
|
|
428
445
|
parallel=parallel,
|
|
429
446
|
max_workers=max_workers,
|
|
430
447
|
max_concurrent_per_worker=max_concurrent_per_worker,
|
|
448
|
+
verbose=verbose,
|
|
431
449
|
)
|
|
432
450
|
)
|
|
433
451
|
else:
|
|
@@ -438,5 +456,6 @@ def eval_command(
|
|
|
438
456
|
model=model,
|
|
439
457
|
allowed_tools=allowed_tools_list,
|
|
440
458
|
max_steps=max_steps,
|
|
459
|
+
verbose=verbose,
|
|
441
460
|
)
|
|
442
461
|
)
|
|
@@ -182,15 +182,15 @@ async def run_task(task_data: dict):
|
|
|
182
182
|
await client.initialize()
|
|
183
183
|
|
|
184
184
|
result = await client.call_tool(task.setup_tool) # type: ignore
|
|
185
|
-
print(f"✅ Setup: {
|
|
185
|
+
print(f"✅ Setup: {result.content}")
|
|
186
186
|
|
|
187
187
|
print("\\n🔄 Performing actions:")
|
|
188
188
|
for _ in range(10):
|
|
189
|
-
result = await client.call_tool(name="act", arguments={
|
|
190
|
-
print(f" {
|
|
189
|
+
result = await client.call_tool(name="act", arguments={})
|
|
190
|
+
print(f" {result.content}")
|
|
191
191
|
|
|
192
192
|
result = await client.call_tool(task.evaluate_tool) # type: ignore
|
|
193
|
-
print(f"\\n📊 Evaluation: {
|
|
193
|
+
print(f"\\n📊 Evaluation: {result.content}")
|
|
194
194
|
|
|
195
195
|
return result.content
|
|
196
196
|
except Exception as e:
|
|
@@ -11,7 +11,6 @@ import requests
|
|
|
11
11
|
import typer
|
|
12
12
|
import yaml
|
|
13
13
|
|
|
14
|
-
from hud.settings import settings
|
|
15
14
|
from hud.utils.design import HUDDesign
|
|
16
15
|
|
|
17
16
|
|
|
@@ -127,6 +126,9 @@ def push_environment(
|
|
|
127
126
|
design = HUDDesign()
|
|
128
127
|
design.header("HUD Environment Push")
|
|
129
128
|
|
|
129
|
+
# Import settings lazily after any environment setup
|
|
130
|
+
from hud.settings import settings
|
|
131
|
+
|
|
130
132
|
# Find hud.lock.yaml in specified directory
|
|
131
133
|
env_dir = Path(directory)
|
|
132
134
|
lock_path = env_dir / "hud.lock.yaml"
|
|
@@ -123,7 +123,7 @@ class TestPushEnvironment:
|
|
|
123
123
|
mock_design.error.assert_called()
|
|
124
124
|
|
|
125
125
|
@mock.patch("hud.cli.push.HUDDesign")
|
|
126
|
-
@mock.patch("hud.cli.push.settings")
|
|
126
|
+
@mock.patch("hud.settings.settings")
|
|
127
127
|
def test_push_no_api_key(self, mock_settings, mock_design_class, tmp_path):
|
|
128
128
|
"""Test pushing without API key."""
|
|
129
129
|
mock_design = mock.Mock()
|
|
@@ -143,7 +143,7 @@ class TestPushEnvironment:
|
|
|
143
143
|
@mock.patch("subprocess.Popen")
|
|
144
144
|
@mock.patch("subprocess.run")
|
|
145
145
|
@mock.patch("hud.cli.push.get_docker_username")
|
|
146
|
-
@mock.patch("hud.cli.push.settings")
|
|
146
|
+
@mock.patch("hud.settings.settings")
|
|
147
147
|
@mock.patch("hud.cli.push.HUDDesign")
|
|
148
148
|
def test_push_auto_detect_username(
|
|
149
149
|
self,
|
|
@@ -205,7 +205,7 @@ class TestPushEnvironment:
|
|
|
205
205
|
assert "testuser/image%3A0.1.0" in call_args[0][0]
|
|
206
206
|
|
|
207
207
|
@mock.patch("subprocess.run")
|
|
208
|
-
@mock.patch("hud.cli.push.settings")
|
|
208
|
+
@mock.patch("hud.settings.settings")
|
|
209
209
|
@mock.patch("hud.cli.push.HUDDesign")
|
|
210
210
|
def test_push_explicit_image(self, mock_design_class, mock_settings, mock_run, tmp_path):
|
|
211
211
|
"""Test pushing with explicit image name."""
|
|
@@ -226,7 +226,7 @@ class TestPushEnvironment:
|
|
|
226
226
|
|
|
227
227
|
@mock.patch("subprocess.Popen")
|
|
228
228
|
@mock.patch("subprocess.run")
|
|
229
|
-
@mock.patch("hud.cli.push.settings")
|
|
229
|
+
@mock.patch("hud.settings.settings")
|
|
230
230
|
@mock.patch("hud.cli.push.HUDDesign")
|
|
231
231
|
def test_push_with_tag(self, mock_design_class, mock_settings, mock_run, mock_popen, tmp_path):
|
|
232
232
|
"""Test pushing with explicit tag."""
|
|
@@ -282,7 +282,7 @@ class TestPushEnvironment:
|
|
|
282
282
|
mock_process.returncode = 1
|
|
283
283
|
mock_popen.return_value = mock_process
|
|
284
284
|
|
|
285
|
-
with mock.patch("hud.cli.push.settings") as mock_settings:
|
|
285
|
+
with mock.patch("hud.settings.settings") as mock_settings:
|
|
286
286
|
mock_settings.api_key = "test-key"
|
|
287
287
|
with (
|
|
288
288
|
mock.patch("subprocess.run"),
|
|
@@ -292,7 +292,7 @@ class TestPushEnvironment:
|
|
|
292
292
|
|
|
293
293
|
@mock.patch("hud.cli.push.get_docker_image_labels")
|
|
294
294
|
@mock.patch("subprocess.run")
|
|
295
|
-
@mock.patch("hud.cli.push.settings")
|
|
295
|
+
@mock.patch("hud.settings.settings")
|
|
296
296
|
@mock.patch("hud.cli.push.HUDDesign")
|
|
297
297
|
def test_push_with_labels(
|
|
298
298
|
self, mock_design_class, mock_settings, mock_run, mock_get_labels, tmp_path
|
|
@@ -4,9 +4,10 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
from .base import AgentMCPClient, BaseHUDClient
|
|
6
6
|
from .fastmcp import FastMCPHUDClient
|
|
7
|
+
from .mcp_use import MCPUseHUDClient
|
|
7
8
|
|
|
8
|
-
# Default to
|
|
9
|
-
MCPClient = FastMCPHUDClient
|
|
9
|
+
# Default to MCP-use for new features
|
|
10
|
+
MCPClient = MCPUseHUDClient
|
|
10
11
|
|
|
11
12
|
__all__ = [
|
|
12
13
|
"AgentMCPClient",
|