hud-python 0.4.21__tar.gz → 0.4.23__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- {hud_python-0.4.21 → hud_python-0.4.23}/PKG-INFO +3 -1
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/agents/base.py +37 -37
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/agents/claude.py +11 -6
- hud_python-0.4.23/hud/agents/grounded_openai.py +282 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/agents/misc/response_agent.py +3 -2
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/agents/openai.py +2 -2
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/agents/openai_chat_generic.py +3 -1
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/agents/tests/test_client.py +6 -1
- hud_python-0.4.23/hud/agents/tests/test_grounded_openai_agent.py +155 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/__init__.py +34 -24
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/analyze.py +27 -26
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/build.py +50 -46
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/debug.py +7 -7
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/dev.py +107 -99
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/eval.py +33 -31
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/hf.py +53 -53
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/init.py +28 -28
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/list_func.py +22 -22
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/pull.py +36 -36
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/push.py +76 -74
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/remove.py +42 -40
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/rl/__init__.py +2 -2
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/rl/init.py +41 -41
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/rl/pod.py +97 -91
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/rl/ssh.py +42 -40
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/rl/train.py +75 -73
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/rl/utils.py +10 -10
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/tests/test_analyze.py +1 -1
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/tests/test_analyze_metadata.py +2 -2
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/tests/test_pull.py +45 -45
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/tests/test_push.py +31 -29
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/tests/test_registry.py +15 -15
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/utils/environment.py +11 -11
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/utils/interactive.py +18 -18
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/utils/logging.py +12 -12
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/utils/metadata.py +12 -12
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/utils/registry.py +5 -5
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/utils/runner.py +23 -23
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/utils/server.py +16 -16
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/settings.py +6 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/shared/hints.py +7 -7
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/tools/executors/tests/test_base_executor.py +1 -1
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/tools/executors/xdo.py +1 -1
- hud_python-0.4.23/hud/tools/grounding/__init__.py +13 -0
- hud_python-0.4.23/hud/tools/grounding/config.py +54 -0
- hud_python-0.4.23/hud/tools/grounding/grounded_tool.py +314 -0
- hud_python-0.4.23/hud/tools/grounding/grounder.py +302 -0
- hud_python-0.4.23/hud/tools/grounding/tests/__init__.py +1 -0
- hud_python-0.4.23/hud/tools/grounding/tests/test_grounded_tool.py +196 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/tools/tests/test_playwright_tool.py +1 -1
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/tools/tests/test_tools_init.py +1 -1
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/tools/tests/test_utils.py +2 -2
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/types.py +4 -4
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/utils/__init__.py +3 -3
- hud_python-0.4.23/hud/utils/agent_factories.py +86 -0
- hud_python-0.4.21/hud/utils/design.py → hud_python-0.4.23/hud/utils/hud_console.py +39 -33
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/utils/pretty_errors.py +6 -6
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/utils/tests/test_version.py +1 -1
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/version.py +1 -1
- {hud_python-0.4.21 → hud_python-0.4.23}/pyproject.toml +2 -1
- {hud_python-0.4.21 → hud_python-0.4.23}/.gitignore +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/LICENSE +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/README.md +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/environments/README.md +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/environments/browser/README.md +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/environments/browser/apps/2048/README.md +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/environments/browser/apps/2048/backend/pyproject.toml +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/environments/browser/apps/README.md +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/environments/browser/apps/todo/README.md +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/environments/browser/apps/todo/backend/pyproject.toml +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/environments/browser/pyproject.toml +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/environments/remote_browser/README.md +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/environments/remote_browser/pyproject.toml +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/environments/remote_browser/src/hud_controller/providers/README.md +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/environments/text_2048/README.md +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/environments/text_2048/pyproject.toml +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/examples/README.md +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/__init__.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/__main__.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/agents/__init__.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/agents/langchain.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/agents/misc/__init__.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/agents/tests/__init__.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/agents/tests/test_base.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/agents/tests/test_claude.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/agents/tests/test_openai.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/__main__.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/clone.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/rl/README.md +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/tests/__init__.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/tests/test_build.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/tests/test_cli_init.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/tests/test_cli_main.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/tests/test_clone.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/tests/test_cursor.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/tests/test_debug.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/tests/test_list_func.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/tests/test_main_module.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/tests/test_mcp_server.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/tests/test_utils.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/utils/__init__.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/utils/cursor.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/utils/docker.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/cli/utils/remote_runner.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/clients/README.md +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/clients/__init__.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/clients/base.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/clients/fastmcp.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/clients/mcp_use.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/clients/tests/__init__.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/clients/tests/test_client_integration.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/clients/tests/test_fastmcp.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/clients/tests/test_protocol.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/clients/utils/__init__.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/clients/utils/retry_transport.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/datasets/__init__.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/datasets/execution/__init__.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/datasets/execution/parallel.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/datasets/execution/runner.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/datasets/task.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/datasets/utils.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/misc/__init__.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/misc/claude_plays_pokemon.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/native/__init__.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/native/comparator.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/native/tests/__init__.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/native/tests/test_comparator.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/native/tests/test_native_init.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/otel/__init__.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/otel/collector.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/otel/config.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/otel/context.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/otel/exporters.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/otel/instrumentation.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/otel/processors.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/otel/tests/__init__.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/otel/tests/test_processors.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/py.typed +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/server/__init__.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/server/context.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/server/helper/__init__.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/server/low_level.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/server/server.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/server/tests/__init__.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/shared/__init__.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/shared/exceptions.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/shared/requests.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/shared/tests/__init__.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/shared/tests/test_exceptions.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/shared/tests/test_requests.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/telemetry/__init__.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/telemetry/instrument.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/telemetry/job.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/telemetry/replay.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/telemetry/tests/__init__.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/telemetry/tests/test_replay.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/telemetry/tests/test_trace.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/telemetry/trace.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/tools/__init__.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/tools/base.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/tools/bash.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/tools/computer/__init__.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/tools/computer/anthropic.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/tools/computer/hud.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/tools/computer/openai.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/tools/computer/settings.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/tools/edit.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/tools/executors/__init__.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/tools/executors/base.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/tools/executors/pyautogui.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/tools/executors/tests/__init__.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/tools/playwright.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/tools/response.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/tools/submit.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/tools/tests/__init__.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/tools/tests/test_base.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/tools/tests/test_bash.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/tools/tests/test_bash_extended.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/tools/tests/test_computer.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/tools/tests/test_computer_actions.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/tools/tests/test_edit.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/tools/tests/test_init.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/tools/tests/test_response.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/tools/tests/test_tools.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/tools/types.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/tools/utils.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/utils/async_utils.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/utils/mcp.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/utils/progress.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/utils/telemetry.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/utils/tests/__init__.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/utils/tests/test_async_utils.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/utils/tests/test_init.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/utils/tests/test_mcp.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/utils/tests/test_progress.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/hud/utils/tests/test_telemetry.py +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/rl/README.md +0 -0
- {hud_python-0.4.21 → hud_python-0.4.23}/rl/pyproject.toml +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hud-python
|
|
3
|
-
Version: 0.4.
|
|
3
|
+
Version: 0.4.23
|
|
4
4
|
Summary: SDK for the HUD platform.
|
|
5
5
|
Project-URL: Homepage, https://github.com/hud-evals/hud-python
|
|
6
6
|
Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
|
|
@@ -66,6 +66,7 @@ Requires-Dist: langchain-anthropic; extra == 'agent'
|
|
|
66
66
|
Requires-Dist: langchain-openai; extra == 'agent'
|
|
67
67
|
Requires-Dist: numpy>=1.24.0; extra == 'agent'
|
|
68
68
|
Requires-Dist: openai; extra == 'agent'
|
|
69
|
+
Requires-Dist: pillow>=11.1.0; extra == 'agent'
|
|
69
70
|
Provides-Extra: agents
|
|
70
71
|
Requires-Dist: anthropic; extra == 'agents'
|
|
71
72
|
Requires-Dist: datasets>=2.14.0; extra == 'agents'
|
|
@@ -79,6 +80,7 @@ Requires-Dist: langchain-anthropic; extra == 'agents'
|
|
|
79
80
|
Requires-Dist: langchain-openai; extra == 'agents'
|
|
80
81
|
Requires-Dist: numpy>=1.24.0; extra == 'agents'
|
|
81
82
|
Requires-Dist: openai; extra == 'agents'
|
|
83
|
+
Requires-Dist: pillow>=11.1.0; extra == 'agents'
|
|
82
84
|
Provides-Extra: dev
|
|
83
85
|
Requires-Dist: aiodocker>=0.24.0; extra == 'dev'
|
|
84
86
|
Requires-Dist: anthropic; extra == 'dev'
|
|
@@ -11,7 +11,7 @@ from typing import TYPE_CHECKING, Any, ClassVar, Literal
|
|
|
11
11
|
import mcp.types as types
|
|
12
12
|
|
|
13
13
|
from hud.types import AgentResponse, MCPToolCall, MCPToolResult, Trace
|
|
14
|
-
from hud.utils.
|
|
14
|
+
from hud.utils.hud_console import HUDConsole
|
|
15
15
|
from hud.utils.mcp import MCPConfigPatch, patch_mcp_config, setup_hud_telemetry
|
|
16
16
|
|
|
17
17
|
if TYPE_CHECKING:
|
|
@@ -37,7 +37,7 @@ class MCPAgent(ABC):
|
|
|
37
37
|
and automatic marking of lifecycle tools (setup/evaluate) from a `Task`.
|
|
38
38
|
- Messaging: system prompt handling, optional inclusion of setup output on
|
|
39
39
|
the first turn, and control over initial screenshots.
|
|
40
|
-
- Telemetry & UX: standardized logging/printing via `
|
|
40
|
+
- Telemetry & UX: standardized logging/printing via `HUDConsole` and optional
|
|
41
41
|
automatic tracing (`auto_trace`).
|
|
42
42
|
|
|
43
43
|
Subclasses implement provider-specific formatting and response fetching
|
|
@@ -92,11 +92,11 @@ class MCPAgent(ABC):
|
|
|
92
92
|
self._auto_created_client = False # Track if we created the client
|
|
93
93
|
|
|
94
94
|
self.model_name = model_name
|
|
95
|
-
self.
|
|
95
|
+
self.console = HUDConsole(logger=logger)
|
|
96
96
|
|
|
97
97
|
# Set verbose mode if requested
|
|
98
98
|
if verbose:
|
|
99
|
-
self.
|
|
99
|
+
self.console.set_verbose(True)
|
|
100
100
|
|
|
101
101
|
# Filtering
|
|
102
102
|
self.allowed_tools = allowed_tools
|
|
@@ -131,7 +131,7 @@ class MCPAgent(ABC):
|
|
|
131
131
|
|
|
132
132
|
self.mcp_client = MCPClient(mcp_config=task.mcp_config)
|
|
133
133
|
self._auto_created_client = True
|
|
134
|
-
self.
|
|
134
|
+
self.console.info_log("Auto-created MCPClient from task.mcp_config")
|
|
135
135
|
|
|
136
136
|
# Ensure we have a client
|
|
137
137
|
if self.mcp_client is None:
|
|
@@ -168,7 +168,7 @@ class MCPAgent(ABC):
|
|
|
168
168
|
await self._filter_tools()
|
|
169
169
|
|
|
170
170
|
num_tools = len(self._available_tools)
|
|
171
|
-
self.
|
|
171
|
+
self.console.success_log(
|
|
172
172
|
f"Agent initialized with {num_tools} available tools (after filtering)"
|
|
173
173
|
)
|
|
174
174
|
|
|
@@ -243,7 +243,7 @@ class MCPAgent(ABC):
|
|
|
243
243
|
|
|
244
244
|
# Execute the setup tool and append the initial observation to the context
|
|
245
245
|
if task.setup_tool is not None:
|
|
246
|
-
self.
|
|
246
|
+
self.console.progress_log(f"Setting up tool phase: {task.setup_tool}")
|
|
247
247
|
results = await self.call_tools(task.setup_tool)
|
|
248
248
|
if any(result.isError for result in results):
|
|
249
249
|
raise RuntimeError(f"{results}")
|
|
@@ -257,7 +257,7 @@ class MCPAgent(ABC):
|
|
|
257
257
|
prompt_result = await self._run_context(start_context, max_steps=max_steps)
|
|
258
258
|
|
|
259
259
|
except Exception as e:
|
|
260
|
-
self.
|
|
260
|
+
self.console.error_log(f"Task execution failed: {e}")
|
|
261
261
|
# Create an error result but don't return yet - we still want to evaluate
|
|
262
262
|
prompt_result = Trace(reward=0.0, done=True, content=str(e), isError=True)
|
|
263
263
|
prompt_result.populate_from_context()
|
|
@@ -265,7 +265,7 @@ class MCPAgent(ABC):
|
|
|
265
265
|
# Always evaluate if we have a prompt result and evaluate tool
|
|
266
266
|
if prompt_result is not None and task.evaluate_tool is not None:
|
|
267
267
|
try:
|
|
268
|
-
self.
|
|
268
|
+
self.console.progress_log(f"Evaluating tool phase: {task.evaluate_tool}")
|
|
269
269
|
results = await self.call_tools(task.evaluate_tool)
|
|
270
270
|
|
|
271
271
|
if any(result.isError for result in results):
|
|
@@ -288,7 +288,7 @@ class MCPAgent(ABC):
|
|
|
288
288
|
prompt_result.content = eval_content
|
|
289
289
|
|
|
290
290
|
except Exception as e:
|
|
291
|
-
self.
|
|
291
|
+
self.console.error_log(f"Evaluation phase failed: {e}")
|
|
292
292
|
# Continue with the prompt result even if evaluation failed
|
|
293
293
|
|
|
294
294
|
return (
|
|
@@ -319,21 +319,21 @@ class MCPAgent(ABC):
|
|
|
319
319
|
|
|
320
320
|
# Add initial context
|
|
321
321
|
messages.extend(await self.format_message(context))
|
|
322
|
-
self.
|
|
322
|
+
self.console.debug(f"Messages: {messages}")
|
|
323
323
|
|
|
324
324
|
step_count = 0
|
|
325
325
|
while max_steps == -1 or step_count < max_steps:
|
|
326
326
|
step_count += 1
|
|
327
327
|
if max_steps == -1:
|
|
328
|
-
self.
|
|
328
|
+
self.console.debug(f"Step {step_count} (unlimited)")
|
|
329
329
|
else:
|
|
330
|
-
self.
|
|
330
|
+
self.console.debug(f"Step {step_count}/{max_steps}")
|
|
331
331
|
|
|
332
332
|
try:
|
|
333
333
|
# 1. Get model response
|
|
334
334
|
response = await self.get_response(messages)
|
|
335
335
|
|
|
336
|
-
self.
|
|
336
|
+
self.console.debug(f"Agent:\n{response}")
|
|
337
337
|
|
|
338
338
|
# Check if we should stop
|
|
339
339
|
if response.done or not response.tool_calls:
|
|
@@ -345,16 +345,16 @@ class MCPAgent(ABC):
|
|
|
345
345
|
response.content
|
|
346
346
|
)
|
|
347
347
|
except Exception as e:
|
|
348
|
-
self.
|
|
348
|
+
self.console.warning_log(f"ResponseAgent failed: {e}")
|
|
349
349
|
if decision == "STOP":
|
|
350
350
|
# Try to submit response through lifecycle tool
|
|
351
351
|
await self._maybe_submit_response(response, messages)
|
|
352
352
|
|
|
353
|
-
self.
|
|
353
|
+
self.console.debug("Stopping execution")
|
|
354
354
|
final_response = response
|
|
355
355
|
break
|
|
356
356
|
else:
|
|
357
|
-
self.
|
|
357
|
+
self.console.debug("Continuing execution")
|
|
358
358
|
messages.extend(await self.format_message(decision))
|
|
359
359
|
continue
|
|
360
360
|
|
|
@@ -376,21 +376,21 @@ class MCPAgent(ABC):
|
|
|
376
376
|
for call, result in zip(tool_calls, tool_results, strict=False):
|
|
377
377
|
step_info += f"\n{call}\n{result}"
|
|
378
378
|
|
|
379
|
-
self.
|
|
379
|
+
self.console.info_log(step_info)
|
|
380
380
|
|
|
381
381
|
except Exception as e:
|
|
382
|
-
self.
|
|
382
|
+
self.console.error_log(f"Step failed: {e}")
|
|
383
383
|
error = str(e)
|
|
384
384
|
break
|
|
385
385
|
|
|
386
386
|
except KeyboardInterrupt:
|
|
387
|
-
self.
|
|
387
|
+
self.console.warning_log("Agent execution interrupted by user")
|
|
388
388
|
error = "Interrupted by user"
|
|
389
389
|
except asyncio.CancelledError:
|
|
390
|
-
self.
|
|
390
|
+
self.console.warning_log("Agent execution cancelled")
|
|
391
391
|
error = "Cancelled"
|
|
392
392
|
except Exception as e:
|
|
393
|
-
self.
|
|
393
|
+
self.console.error_log(f"Unexpected error: {e}")
|
|
394
394
|
error = str(e)
|
|
395
395
|
|
|
396
396
|
# Build result
|
|
@@ -431,17 +431,17 @@ class MCPAgent(ABC):
|
|
|
431
431
|
results: list[MCPToolResult] = []
|
|
432
432
|
for tc in tool_call:
|
|
433
433
|
try:
|
|
434
|
-
self.
|
|
434
|
+
self.console.debug(f"Calling tool: {tc}")
|
|
435
435
|
results.append(await self.mcp_client.call_tool(tc))
|
|
436
436
|
except TimeoutError as e:
|
|
437
|
-
self.
|
|
437
|
+
self.console.error_log(f"Tool execution timed out: {e}")
|
|
438
438
|
try:
|
|
439
439
|
await self.mcp_client.shutdown()
|
|
440
440
|
except Exception as close_err:
|
|
441
|
-
self.
|
|
441
|
+
self.console.debug(f"Failed to close MCP client cleanly: {close_err}")
|
|
442
442
|
raise
|
|
443
443
|
except Exception as e:
|
|
444
|
-
self.
|
|
444
|
+
self.console.error_log(f"Tool execution failed: {e}")
|
|
445
445
|
results.append(_format_error_result(str(e)))
|
|
446
446
|
return results
|
|
447
447
|
|
|
@@ -573,7 +573,7 @@ class MCPAgent(ABC):
|
|
|
573
573
|
|
|
574
574
|
# Add to lifecycle tools if found
|
|
575
575
|
if response_tool_name and response_tool_name not in self.lifecycle_tools:
|
|
576
|
-
self.
|
|
576
|
+
self.console.debug(f"Auto-detected '{response_tool_name}' tool as a lifecycle tool")
|
|
577
577
|
self.response_tool_name = response_tool_name
|
|
578
578
|
self.lifecycle_tools.append(response_tool_name)
|
|
579
579
|
|
|
@@ -597,7 +597,7 @@ class MCPAgent(ABC):
|
|
|
597
597
|
messages: The current message history (will be modified in-place)
|
|
598
598
|
"""
|
|
599
599
|
if self.response_tool_name:
|
|
600
|
-
self.
|
|
600
|
+
self.console.debug(f"Calling response lifecycle tool: {self.response_tool_name}")
|
|
601
601
|
try:
|
|
602
602
|
# Call the response tool with the agent's response
|
|
603
603
|
response_tool_call = MCPToolCall(
|
|
@@ -612,9 +612,9 @@ class MCPAgent(ABC):
|
|
|
612
612
|
messages.extend(response_messages)
|
|
613
613
|
|
|
614
614
|
# Mark the task as done
|
|
615
|
-
self.
|
|
615
|
+
self.console.debug("Response lifecycle tool executed, marking task as done")
|
|
616
616
|
except Exception as e:
|
|
617
|
-
self.
|
|
617
|
+
self.console.error_log(f"Response lifecycle tool failed: {e}")
|
|
618
618
|
|
|
619
619
|
async def _setup_config(self, mcp_config: dict[str, dict[str, Any]]) -> None:
|
|
620
620
|
"""Inject metadata into the metadata of the initialize request."""
|
|
@@ -668,9 +668,9 @@ class MCPAgent(ABC):
|
|
|
668
668
|
if self._auto_trace_cm:
|
|
669
669
|
try:
|
|
670
670
|
self._auto_trace_cm.__exit__(None, None, None)
|
|
671
|
-
self.
|
|
671
|
+
self.console.debug("Closed auto-created trace")
|
|
672
672
|
except Exception as e:
|
|
673
|
-
self.
|
|
673
|
+
self.console.warning_log(f"Failed to close auto-created trace: {e}")
|
|
674
674
|
finally:
|
|
675
675
|
self._auto_trace_cm = None
|
|
676
676
|
|
|
@@ -678,9 +678,9 @@ class MCPAgent(ABC):
|
|
|
678
678
|
if self._auto_created_client and self.mcp_client:
|
|
679
679
|
try:
|
|
680
680
|
await self.mcp_client.shutdown()
|
|
681
|
-
self.
|
|
681
|
+
self.console.debug("Closed auto-created MCPClient")
|
|
682
682
|
except Exception as e:
|
|
683
|
-
self.
|
|
683
|
+
self.console.warning_log(f"Failed to close auto-created client: {e}")
|
|
684
684
|
finally:
|
|
685
685
|
self.mcp_client = None
|
|
686
686
|
self._auto_created_client = False
|
|
@@ -713,13 +713,13 @@ class MCPAgent(ABC):
|
|
|
713
713
|
if self._is_connection_error(e):
|
|
714
714
|
msg = self._get_connection_error_message(e)
|
|
715
715
|
# Always show connection errors, not just when logging is enabled
|
|
716
|
-
self.
|
|
717
|
-
self.
|
|
716
|
+
self.console.error(f"❌ {msg}")
|
|
717
|
+
self.console.info("💡 Make sure the MCP server is started before running the agent.")
|
|
718
718
|
|
|
719
719
|
# For localhost, provide specific instructions
|
|
720
720
|
error_str = str(e).lower()
|
|
721
721
|
if "localhost" in error_str or "127.0.0.1" in error_str:
|
|
722
|
-
self.
|
|
722
|
+
self.console.info(" Run 'hud dev' in another terminal to start the MCP server")
|
|
723
723
|
|
|
724
724
|
raise RuntimeError(msg) from e
|
|
725
725
|
raise
|
|
@@ -364,16 +364,21 @@ class ClaudeAgent(MCPAgent):
|
|
|
364
364
|
messages_cached = copy.deepcopy(messages)
|
|
365
365
|
|
|
366
366
|
# Mark last user message with cache control
|
|
367
|
-
if
|
|
367
|
+
if (
|
|
368
|
+
messages_cached
|
|
369
|
+
and isinstance(messages_cached[-1], dict)
|
|
370
|
+
and messages_cached[-1].get("role") == "user"
|
|
371
|
+
):
|
|
368
372
|
last_content = messages_cached[-1]["content"]
|
|
369
373
|
# Content is formatted to be list of ContentBlock in format_blocks and format_message
|
|
370
374
|
if isinstance(last_content, list):
|
|
371
375
|
for block in last_content:
|
|
372
|
-
# Only add cache control to block types that support it
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
376
|
+
# Only add cache control to dict-like block types that support it
|
|
377
|
+
if isinstance(block, dict):
|
|
378
|
+
block_type = block.get("type")
|
|
379
|
+
if block_type in ["text", "image", "tool_use", "tool_result"]:
|
|
380
|
+
cache_control: BetaCacheControlEphemeralParam = {"type": "ephemeral"}
|
|
381
|
+
block["cache_control"] = cache_control # type: ignore[reportGeneralTypeIssues]
|
|
377
382
|
|
|
378
383
|
return messages_cached
|
|
379
384
|
|
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
"""Grounded OpenAI agent that separates visual grounding from reasoning."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from typing import Any, ClassVar
|
|
7
|
+
|
|
8
|
+
from hud import instrument
|
|
9
|
+
from hud.tools.grounding import GroundedComputerTool, Grounder, GrounderConfig
|
|
10
|
+
from hud.types import AgentResponse, MCPToolCall, MCPToolResult
|
|
11
|
+
|
|
12
|
+
from .openai_chat_generic import GenericOpenAIChatAgent
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class GroundedOpenAIChatAgent(GenericOpenAIChatAgent):
|
|
16
|
+
"""OpenAI agent that uses a separate grounding model for element detection.
|
|
17
|
+
|
|
18
|
+
This agent:
|
|
19
|
+
- Exposes only a synthetic "computer" tool to the planning model
|
|
20
|
+
- Intercepts tool calls to ground element descriptions to coordinates
|
|
21
|
+
- Converts grounded results to real computer tool calls
|
|
22
|
+
- Maintains screenshot state for grounding operations
|
|
23
|
+
|
|
24
|
+
The architecture separates concerns:
|
|
25
|
+
- Planning model (GPT-4o etc) focuses on high-level reasoning
|
|
26
|
+
- Grounding model (Qwen2-VL etc) handles visual element detection
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
metadata: ClassVar[dict[str, Any]] = {}
|
|
30
|
+
|
|
31
|
+
def __init__(
|
|
32
|
+
self,
|
|
33
|
+
*,
|
|
34
|
+
grounder_config: GrounderConfig,
|
|
35
|
+
model_name: str = "gpt-4o-mini",
|
|
36
|
+
allowed_tools: list[str] | None = None,
|
|
37
|
+
append_setup_output: bool = False,
|
|
38
|
+
system_prompt: str | None = None,
|
|
39
|
+
**kwargs: Any,
|
|
40
|
+
) -> None:
|
|
41
|
+
"""Initialize the grounded OpenAI agent.
|
|
42
|
+
|
|
43
|
+
Args:
|
|
44
|
+
grounder_config: Configuration for the grounding model
|
|
45
|
+
openai_client: OpenAI client for the planning model
|
|
46
|
+
model: Name of the OpenAI model to use for planning (e.g., "gpt-4o", "gpt-4o-mini")
|
|
47
|
+
real_computer_tool_name: Name of the actual computer tool to execute
|
|
48
|
+
**kwargs: Additional arguments passed to GenericOpenAIChatAgent
|
|
49
|
+
"""
|
|
50
|
+
# Set defaults for grounded agent
|
|
51
|
+
if allowed_tools is None:
|
|
52
|
+
allowed_tools = ["computer"]
|
|
53
|
+
|
|
54
|
+
if system_prompt is None:
|
|
55
|
+
system_prompt = (
|
|
56
|
+
"You are a helpful AI assistant that can control the computer "
|
|
57
|
+
"through visual interaction.\n\n"
|
|
58
|
+
"IMPORTANT: Always explain your reasoning and observations before taking actions:\n"
|
|
59
|
+
"1. First, describe what you see on the screen\n"
|
|
60
|
+
"2. Explain what you plan to do and why\n"
|
|
61
|
+
"3. Then use the computer tool with natural language descriptions\n\n"
|
|
62
|
+
"For example:\n"
|
|
63
|
+
"- 'I can see a login form with username and password fields. "
|
|
64
|
+
"I need to click on the username field first.'\n"
|
|
65
|
+
"- 'There's a blue submit button at the bottom. "
|
|
66
|
+
"I'll click on it to submit the form.'\n"
|
|
67
|
+
"- 'I notice a red close button in the top right corner. "
|
|
68
|
+
"I'll click it to close this dialog.'\n\n"
|
|
69
|
+
"Use descriptive element descriptions like:\n"
|
|
70
|
+
"- Colors: 'red button', 'blue link', 'green checkmark'\n"
|
|
71
|
+
"- Position: 'top right corner', 'bottom of the page', 'left sidebar'\n"
|
|
72
|
+
"- Text content: 'Submit button', 'Login link', 'Cancel option'\n"
|
|
73
|
+
"- Element type: 'text field', 'dropdown menu', 'checkbox'"
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
super().__init__(
|
|
77
|
+
model_name=model_name,
|
|
78
|
+
allowed_tools=allowed_tools,
|
|
79
|
+
append_setup_output=append_setup_output,
|
|
80
|
+
system_prompt=system_prompt,
|
|
81
|
+
**kwargs,
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
self.grounder = Grounder(grounder_config)
|
|
85
|
+
self.grounded_tool = None
|
|
86
|
+
|
|
87
|
+
async def initialize(self, task: Any = None) -> None:
    """Run base-class setup, then build the synthetic grounded tool.

    The grounded tool wraps the grounder and the MCP client, so it can
    only be constructed once the parent initialization has produced a
    live ``mcp_client``.

    Args:
        task: Optional task forwarded unchanged to the parent initializer.

    Raises:
        ValueError: If the parent left ``mcp_client`` unset.
    """
    await super().initialize(task)

    client = self.mcp_client
    if client is None:
        raise ValueError("mcp_client must be initialized before creating grounded tool")
    self.grounded_tool = GroundedComputerTool(
        grounder=self.grounder,
        mcp_client=client,
        computer_tool_name="computer",
    )
|
|
97
|
+
|
|
98
|
+
def get_tool_schemas(self) -> list[Any]:
    """Expose only the synthetic grounded tool to the planning model.

    The planner never sees the real MCP tools; it is offered a single
    "computer" tool whose schema comes from the grounded tool itself.

    Returns:
        A one-element list with the grounded tool's OpenAI schema, or an
        empty list when the grounded tool has not been created yet.
    """
    tool = self.grounded_tool
    return [] if tool is None else [tool.get_openai_tool_schema()]
|
|
110
|
+
|
|
111
|
+
@instrument(
    span_type="agent",
    record_args=False,
    record_result=True,
)
async def get_response(self, messages: Any) -> AgentResponse:
    """Query the planning model and translate its tool call for MCP execution.

    This method:
    1. Ensures the conversation contains at least one screenshot
       (taking one via the MCP "computer" tool on the first turn)
    2. Calls the planning model with the grounded tool schema
    3. Converts the first returned tool call into an ``MCPToolCall``
       (actual execution happens later in ``call_tools``)

    Args:
        messages: Conversation messages in OpenAI chat format. Mutated
            in place: the screenshot turn and the assistant turn are
            appended, and a shallow copy is stored as
            ``self.conversation_history``.

    Returns:
        AgentResponse with either content or tool calls for MCP execution
    """
    tool_schemas = self.get_tool_schemas()

    # Take initial screenshot and add to messages if this is the first turn.
    # "First turn" is detected by the absence of any image_url content block
    # anywhere in the conversation so far.
    has_image = any(
        isinstance(m.get("content"), list)
        and any(
            block.get("type") == "image_url"
            for block in m["content"]
            if isinstance(block, dict)
        )
        for m in messages
        if isinstance(m.get("content"), list)
    )

    if not has_image:
        if self.mcp_client is None:
            raise ValueError("mcp_client is not initialized")
        # NOTE(review): the screenshot tool name is hard-coded to "computer"
        # here; confirm this matches real_computer_tool_name when it differs.
        screenshot_result = await self.mcp_client.call_tool(
            MCPToolCall(name="computer", arguments={"action": "screenshot"})
        )

        for block in screenshot_result.content:
            # Check for ImageContent type from MCP (duck-typed via the
            # presence of both `data` and `mimeType` attributes).
            if hasattr(block, "data") and hasattr(block, "mimeType"):
                mime_type = getattr(block, "mimeType", "image/png")
                data = getattr(block, "data", "")
                # Embed the screenshot as an OpenAI-style data-URL image turn.
                messages.append(
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image_url",
                                "image_url": {"url": f"data:{mime_type};base64,{data}"},
                            }
                        ],
                    }
                )
                # Only the first image block is used.
                break

    # Forward caller-supplied completion kwargs, minus the keys this method
    # sets itself, to avoid duplicate-argument errors.
    protected_keys = {"model", "messages", "tools", "parallel_tool_calls"}
    extra = {k: v for k, v in (self.completion_kwargs or {}).items() if k not in protected_keys}

    # parallel_tool_calls=False: the loop below only handles one call per turn.
    response = await self.oai.chat.completions.create(
        model=self.model_name,
        messages=messages,
        tools=tool_schemas,
        parallel_tool_calls=False,
        **extra,
    )

    choice = response.choices[0]
    msg = choice.message

    # Record the assistant turn so the next round trip sees it.
    assistant_msg: dict[str, Any] = {"role": "assistant"}
    if msg.content:
        assistant_msg["content"] = msg.content
    if msg.tool_calls:
        assistant_msg["tool_calls"] = msg.tool_calls

    messages.append(assistant_msg)

    # Shallow copy: call_tools later scans this history for the latest
    # screenshot, so it must reflect the turns appended above.
    self.conversation_history = messages.copy()

    if not msg.tool_calls:
        # Pure-text reply: the episode is done on a normal finish reason.
        return AgentResponse(
            content=msg.content or "",
            tool_calls=[],
            done=choice.finish_reason in ("stop", "length"),
            raw=response,
        )

    # Only the first tool call is honored (parallel calls were disabled).
    tc = msg.tool_calls[0]

    if tc.function.name != "computer":
        # The model was only offered the synthetic "computer" tool;
        # anything else is a protocol violation, so stop the episode.
        return AgentResponse(
            content=f"Error: Model called unexpected tool '{tc.function.name}'",
            tool_calls=[],
            done=True,
            raw=response,
        )

    # Parse the arguments
    try:
        args = json.loads(tc.function.arguments or "{}")
    except json.JSONDecodeError:
        return AgentResponse(
            content="Error: Invalid tool arguments", tool_calls=[], done=True, raw=response
        )

    # Hand the call back to the agent loop; call_tools intercepts it and
    # routes it through the grounded tool rather than raw MCP.
    tool_call = MCPToolCall(name="computer", arguments=args, id=tc.id)

    return AgentResponse(
        content=msg.content or "", tool_calls=[tool_call], done=False, raw=response
    )
|
|
224
|
+
|
|
225
|
+
async def call_tools(
    self, tool_call: MCPToolCall | list[MCPToolCall] | None = None
) -> list[MCPToolResult]:
    """Override call_tools to intercept computer tool calls.

    Calls named "computer" are executed through the grounded tool
    (which resolves natural-language element descriptions to concrete
    coordinates); all other calls fall through to the parent MCP path.

    Args:
        tool_call: A single call, a list of calls, or None.

    Returns:
        One MCPToolResult per input call, in order. Grounded-tool
        failures are captured as isError results rather than raised.
    """
    if tool_call is None:
        return []

    # Normalize to a list so the loop below handles both shapes.
    if isinstance(tool_call, MCPToolCall):
        tool_call = [tool_call]

    results: list[MCPToolResult] = []
    for tc in tool_call:
        if tc.name == "computer":
            # Execute through grounded tool instead of MCP
            try:
                # Extract latest screenshot from conversation history:
                # scan user turns newest-first for a data-URL image block.
                screenshot_b64 = None
                for m in reversed(self.conversation_history):
                    if m.get("role") == "user" and isinstance(m.get("content"), list):
                        for block in m["content"]:
                            if (
                                isinstance(block, dict)
                                and block.get("type") == "image_url"
                                and isinstance(block.get("image_url"), dict)
                            ):
                                url = block["image_url"].get("url", "")
                                if url.startswith("data:"):
                                    # Strip the "data:<mime>;base64," prefix;
                                    # a malformed URL without a comma yields None.
                                    screenshot_b64 = (
                                        url.split(",", 1)[1] if "," in url else None
                                    )
                                    break
                        if screenshot_b64:
                            break

                # Pass screenshot to grounded tool
                args_with_screenshot = dict(tc.arguments) if tc.arguments else {}
                if screenshot_b64:
                    # presumably the grounded tool accepts this kwarg for
                    # visual grounding — verify against GroundedComputerTool
                    args_with_screenshot["screenshot_b64"] = screenshot_b64

                if self.grounded_tool is None:
                    # Caught by the except below and surfaced as an error result.
                    raise ValueError("Grounded tool is not initialized")
                content_blocks = await self.grounded_tool(**args_with_screenshot)
                results.append(MCPToolResult(content=content_blocks, isError=False))
            except Exception as e:
                # Create error result instead of propagating, so one failed
                # action does not abort the whole batch.
                from mcp.types import TextContent

                error_content = TextContent(text=str(e), type="text")
                results.append(MCPToolResult(content=[error_content], isError=True))
        else:
            # For non-computer tools, use parent implementation
            parent_results = await super().call_tools(tc)
            results.extend(parent_results)

    return results
|
|
@@ -16,7 +16,7 @@ class ResponseAgent:
|
|
|
16
16
|
based on the agent's final response message.
|
|
17
17
|
"""
|
|
18
18
|
|
|
19
|
-
def __init__(self, api_key: str | None = None) -> None:
|
|
19
|
+
def __init__(self, api_key: str | None = None, model: str = "gpt-4o") -> None:
|
|
20
20
|
self.api_key = api_key or settings.openai_api_key or os.environ.get("OPENAI_API_KEY")
|
|
21
21
|
if not self.api_key:
|
|
22
22
|
raise ValueError(
|
|
@@ -24,6 +24,7 @@ class ResponseAgent:
|
|
|
24
24
|
)
|
|
25
25
|
|
|
26
26
|
self.client = AsyncOpenAI(api_key=self.api_key)
|
|
27
|
+
self.model = model
|
|
27
28
|
|
|
28
29
|
self.system_prompt = """
|
|
29
30
|
You are an assistant that helps determine the appropriate response to an agent's message.
|
|
@@ -54,7 +55,7 @@ class ResponseAgent:
|
|
|
54
55
|
"""
|
|
55
56
|
try:
|
|
56
57
|
response = await self.client.chat.completions.create(
|
|
57
|
-
model=
|
|
58
|
+
model=self.model,
|
|
58
59
|
messages=[
|
|
59
60
|
{"role": "system", "content": self.system_prompt},
|
|
60
61
|
{
|
|
@@ -204,7 +204,7 @@ class OperatorAgent(MCPAgent):
|
|
|
204
204
|
break
|
|
205
205
|
|
|
206
206
|
if not latest_screenshot:
|
|
207
|
-
self.
|
|
207
|
+
self.console.warning_log("No screenshot provided for response to action")
|
|
208
208
|
return AgentResponse(
|
|
209
209
|
content="No screenshot available for next action",
|
|
210
210
|
tool_calls=[],
|
|
@@ -327,7 +327,7 @@ class OperatorAgent(MCPAgent):
|
|
|
327
327
|
for content in result.content:
|
|
328
328
|
if isinstance(content, types.TextContent):
|
|
329
329
|
# Don't add error text as input_text, just track it
|
|
330
|
-
self.
|
|
330
|
+
self.console.error_log(f"Tool error: {content.text}")
|
|
331
331
|
elif isinstance(content, types.ImageContent):
|
|
332
332
|
# Even error results might have images
|
|
333
333
|
latest_screenshot = content.data
|
|
@@ -17,7 +17,7 @@ from __future__ import annotations
|
|
|
17
17
|
|
|
18
18
|
import json
|
|
19
19
|
import logging
|
|
20
|
-
from typing import TYPE_CHECKING, Any, cast
|
|
20
|
+
from typing import TYPE_CHECKING, Any, ClassVar, cast
|
|
21
21
|
|
|
22
22
|
import mcp.types as types
|
|
23
23
|
|
|
@@ -36,6 +36,8 @@ logger = logging.getLogger(__name__)
|
|
|
36
36
|
class GenericOpenAIChatAgent(MCPAgent):
|
|
37
37
|
"""MCP-enabled agent that speaks the OpenAI *chat.completions* protocol."""
|
|
38
38
|
|
|
39
|
+
metadata: ClassVar[dict[str, Any]] = {}
|
|
40
|
+
|
|
39
41
|
def __init__(
|
|
40
42
|
self,
|
|
41
43
|
*,
|
|
@@ -200,7 +200,12 @@ class TestMCPClient:
|
|
|
200
200
|
# Calling a non-existent tool should return an error result
|
|
201
201
|
result = await client.call_tool(name="nonexistent", arguments={})
|
|
202
202
|
assert result.isError is True
|
|
203
|
-
|
|
203
|
+
# Check that the error message is in the text content
|
|
204
|
+
text_content = ""
|
|
205
|
+
for content in result.content:
|
|
206
|
+
if isinstance(content, types.TextContent):
|
|
207
|
+
text_content += content.text
|
|
208
|
+
assert "Tool 'nonexistent' not found" in text_content
|
|
204
209
|
|
|
205
210
|
@pytest.mark.asyncio
|
|
206
211
|
async def test_get_telemetry_data(self, mock_telemetry, mock_mcp_use_client):
|