hud-python 0.4.63__tar.gz → 0.4.64__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hud_python-0.4.63 → hud_python-0.4.64}/PKG-INFO +3 -3
- {hud_python-0.4.63 → hud_python-0.4.64}/README.md +2 -2
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/base.py +6 -4
- hud_python-0.4.64/hud/agents/claude.py +365 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/langchain.py +4 -1
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/openai_chat_generic.py +4 -1
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/tests/test_base.py +0 -1
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/eval.py +3 -3
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_eval.py +93 -25
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/tests/test_version.py +1 -1
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/version.py +1 -1
- {hud_python-0.4.63 → hud_python-0.4.64}/pyproject.toml +1 -1
- hud_python-0.4.63/hud/agents/claude.py +0 -419
- {hud_python-0.4.63 → hud_python-0.4.64}/.gitignore +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/LICENSE +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/environments/README.md +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/environments/blank/README.md +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/environments/blank/environment/README.md +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/environments/blank/environment/pyproject.toml +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/environments/blank/server/README.md +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/environments/blank/server/pyproject.toml +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/environments/browser/README.md +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/environments/browser/browser-base/README.md +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/environments/browser/environment/2048/README.md +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/environments/browser/environment/2048/backend/pyproject.toml +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/environments/browser/environment/README.md +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/environments/browser/environment/pyproject.toml +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/environments/browser/environment/todo/README.md +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/environments/browser/environment/todo/backend/pyproject.toml +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/environments/browser/pyproject.toml +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/environments/browser/server/pyproject.toml +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/environments/deepresearch/README.md +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/environments/deepresearch/environment/pyproject.toml +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/environments/deepresearch/pyproject.toml +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/environments/deepresearch/server/pyproject.toml +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/environments/jupyter/README.md +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/environments/jupyter/server/pyproject.toml +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/environments/online_mind2web/README.md +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/environments/online_mind2web/pyproject.toml +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/environments/online_mind2web/src/hud_controller/providers/README.md +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/environments/remote_browser/README.md +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/environments/remote_browser/pyproject.toml +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/environments/remote_browser/src/hud_controller/providers/README.md +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/environments/rubrics/README.md +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/environments/rubrics/environment/pyproject.toml +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/environments/rubrics/pyproject.toml +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/environments/rubrics/server/pyproject.toml +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/environments/text_2048/README.md +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/environments/text_2048/pyproject.toml +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/examples/README.md +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/__main__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/gemini.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/grounded_openai.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/lite_llm.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/misc/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/misc/integration_test_agent.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/misc/response_agent.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/openai.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/tests/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/tests/test_base_runtime.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/tests/test_claude.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/tests/test_client.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/tests/test_gemini.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/tests/test_grounded_openai_agent.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/tests/test_openai.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/utils.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/__main__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/analyze.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/build.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/clone.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/debug.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/dev.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/eval_config.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/flows/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/flows/dev.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/flows/tasks.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/get.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/init.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/list_func.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/pull.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/push.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/remove.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/rl/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/rl/celebrate.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/rl/config.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/rl/display.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/rl/gpu.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/rl/gpu_utils.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/rl/local_runner.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/rl/presets.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/rl/remote_runner.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/rl/rl_api.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/rl/viewer.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/rl/vllm.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/rl/wait_utils.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_analyze.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_analyze_metadata.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_analyze_module.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_build.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_build_failure.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_build_module.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_cli_init.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_cli_main.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_cli_more_wrappers.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_cli_root.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_clone.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_convert.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_cursor.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_debug.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_list_func.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_main_module.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_mcp_server.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_pull.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_push.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_push_happy.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_push_wrapper.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_registry.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_utils.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/config.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/cursor.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/docker.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/env_check.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/environment.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/interactive.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/local_runner.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/logging.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/metadata.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/package_runner.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/registry.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/remote_runner.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/runner.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/server.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/source_hash.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/tasks.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/tests/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/tests/test_config.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/tests/test_docker.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/tests/test_docker_hints.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/tests/test_env_check.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/tests/test_environment.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/tests/test_interactive_module.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/tests/test_local_runner.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/tests/test_logging_utils.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/tests/test_metadata.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/tests/test_package_runner.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/tests/test_registry_utils.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/tests/test_remote_runner.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/tests/test_runner_modules.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/tests/test_source_hash.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/tests/test_tasks.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/version_check.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/clients/README.md +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/clients/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/clients/base.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/clients/fastmcp.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/clients/mcp_use.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/clients/tests/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/clients/tests/test_client_integration.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/clients/tests/test_fastmcp.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/clients/tests/test_mcp_use_retry.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/clients/tests/test_protocol.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/clients/utils/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/clients/utils/mcp_use_retry.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/clients/utils/retry.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/clients/utils/retry_transport.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/datasets/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/datasets/parallel.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/datasets/runner.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/datasets/tests/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/datasets/tests/test_runner.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/datasets/tests/test_utils.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/datasets/utils.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/misc/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/misc/claude_plays_pokemon.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/native/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/native/comparator.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/native/tests/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/native/tests/test_comparator.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/native/tests/test_native_init.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/otel/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/otel/collector.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/otel/config.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/otel/context.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/otel/exporters.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/otel/instrumentation.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/otel/processors.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/otel/tests/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/otel/tests/test_instrumentation.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/otel/tests/test_processors.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/py.typed +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/rl/README.md +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/rl/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/rl/actor.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/rl/buffer.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/rl/chat_template.jinja +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/rl/config.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/rl/distributed.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/rl/learner.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/rl/tests/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/rl/tests/test_learner.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/rl/train.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/rl/types.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/rl/utils/start_vllm_server.sh +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/rl/utils.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/rl/vllm_adapter.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/samples/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/samples/browser.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/server/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/server/context.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/server/helper/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/server/low_level.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/server/router.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/server/server.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/server/tests/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/server/tests/test_add_tool.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/server/tests/test_context.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/server/tests/test_mcp_server_handlers.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/server/tests/test_mcp_server_integration.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/server/tests/test_mcp_server_more.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/server/tests/test_run_wrapper.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/server/tests/test_server_extra.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/server/tests/test_sigterm_runner.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/settings.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/shared/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/shared/exceptions.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/shared/hints.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/shared/requests.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/shared/tests/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/shared/tests/test_exceptions.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/shared/tests/test_hints.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/shared/tests/test_requests.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/telemetry/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/telemetry/async_context.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/telemetry/instrument.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/telemetry/job.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/telemetry/replay.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/telemetry/tests/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/telemetry/tests/test_async_context.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/telemetry/tests/test_instrument.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/telemetry/tests/test_job.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/telemetry/tests/test_replay.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/telemetry/tests/test_trace.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/telemetry/trace.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/telemetry/utils.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/base.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/bash.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/computer/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/computer/anthropic.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/computer/gemini.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/computer/hud.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/computer/openai.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/computer/qwen.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/computer/settings.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/edit.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/executors/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/executors/base.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/executors/pyautogui.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/executors/tests/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/executors/tests/test_base_executor.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/executors/xdo.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/grounding/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/grounding/config.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/grounding/grounded_tool.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/grounding/grounder.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/grounding/tests/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/grounding/tests/test_grounded_tool.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/jupyter.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/playwright.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/response.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/submit.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/tests/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/tests/test_base.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/tests/test_bash.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/tests/test_bash_extended.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/tests/test_computer.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/tests/test_computer_actions.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/tests/test_edit.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/tests/test_init.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/tests/test_jupyter_tool.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/tests/test_playwright_tool.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/tests/test_response.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/tests/test_submit.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/tests/test_tools.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/tests/test_tools_init.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/tests/test_types.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/tests/test_utils.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/types.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/utils.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/types.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/agent_factories.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/async_utils.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/group_eval.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/hud_console.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/mcp.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/pretty_errors.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/progress.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/task_tracking.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/tasks.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/telemetry.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/tests/__init__.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/tests/test_agent_factories.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/tests/test_async_utils.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/tests/test_init.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/tests/test_mcp.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/tests/test_pretty_errors.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/tests/test_progress.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/tests/test_tasks.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/tests/test_telemetry.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/tests/test_tool_shorthand.py +0 -0
- {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/tool_shorthand.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hud-python
|
|
3
|
-
Version: 0.4.63
|
|
3
|
+
Version: 0.4.64
|
|
4
4
|
Summary: SDK for the HUD platform.
|
|
5
5
|
Project-URL: Homepage, https://github.com/hud-evals/hud-python
|
|
6
6
|
Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
|
|
@@ -227,7 +227,7 @@ async def main() -> None:
|
|
|
227
227
|
client = MCPClient(mcp_config=task.mcp_config)
|
|
228
228
|
agent = ClaudeAgent(
|
|
229
229
|
mcp_client=client,
|
|
230
|
-
model="claude-sonnet-4-
|
|
230
|
+
model="claude-sonnet-4-5", # requires ANTHROPIC_API_KEY
|
|
231
231
|
)
|
|
232
232
|
|
|
233
233
|
result = await agent.run(task)
|
|
@@ -292,7 +292,7 @@ results = await run_dataset(
|
|
|
292
292
|
name="My SheetBench-50 Evaluation",
|
|
293
293
|
dataset="hud-evals/SheetBench-50", # <-- HuggingFace dataset
|
|
294
294
|
agent_class=ClaudeAgent, # <-- Your custom agent can replace this (see https://docs.hud.ai/evaluate-agents/create-agents)
|
|
295
|
-
agent_config={"model": "claude-sonnet-4-
|
|
295
|
+
agent_config={"model": "claude-sonnet-4-5"},
|
|
296
296
|
max_concurrent=50,
|
|
297
297
|
max_steps=30,
|
|
298
298
|
)
|
|
@@ -86,7 +86,7 @@ async def main() -> None:
|
|
|
86
86
|
client = MCPClient(mcp_config=task.mcp_config)
|
|
87
87
|
agent = ClaudeAgent(
|
|
88
88
|
mcp_client=client,
|
|
89
|
-
model="claude-sonnet-4-
|
|
89
|
+
model="claude-sonnet-4-5", # requires ANTHROPIC_API_KEY
|
|
90
90
|
)
|
|
91
91
|
|
|
92
92
|
result = await agent.run(task)
|
|
@@ -151,7 +151,7 @@ results = await run_dataset(
|
|
|
151
151
|
name="My SheetBench-50 Evaluation",
|
|
152
152
|
dataset="hud-evals/SheetBench-50", # <-- HuggingFace dataset
|
|
153
153
|
agent_class=ClaudeAgent, # <-- Your custom agent can replace this (see https://docs.hud.ai/evaluate-agents/create-agents)
|
|
154
|
-
agent_config={"model": "claude-sonnet-4-
|
|
154
|
+
agent_config={"model": "claude-sonnet-4-5"},
|
|
155
155
|
max_concurrent=50,
|
|
156
156
|
max_steps=30,
|
|
157
157
|
)
|
|
@@ -25,8 +25,6 @@ if TYPE_CHECKING:
|
|
|
25
25
|
|
|
26
26
|
logger = logging.getLogger(__name__)
|
|
27
27
|
|
|
28
|
-
GLOBAL_SYSTEM_PROMPT = "You are an assistant that can use tools to help the user. You will be given a task and you will need to use the tools to complete the task." # noqa: E501
|
|
29
|
-
|
|
30
28
|
|
|
31
29
|
class MCPAgent(ABC):
|
|
32
30
|
"""
|
|
@@ -58,7 +56,7 @@ class MCPAgent(ABC):
|
|
|
58
56
|
disallowed_tools: list[str] | None = None,
|
|
59
57
|
response_tool_name: str | None = None,
|
|
60
58
|
# Messages
|
|
61
|
-
system_prompt: str =
|
|
59
|
+
system_prompt: str | None = None,
|
|
62
60
|
append_setup_output: bool = True,
|
|
63
61
|
initial_screenshot: bool = True,
|
|
64
62
|
# Misc
|
|
@@ -155,7 +153,10 @@ class MCPAgent(ABC):
|
|
|
155
153
|
# If task is provided, apply agent_config and add lifecycle tools
|
|
156
154
|
if isinstance(task, Task) and task.agent_config:
|
|
157
155
|
if task.agent_config.get("system_prompt"):
|
|
158
|
-
self.system_prompt
|
|
156
|
+
if self.system_prompt is None:
|
|
157
|
+
self.system_prompt = task.agent_config["system_prompt"]
|
|
158
|
+
else:
|
|
159
|
+
self.system_prompt += "\n\n" + task.agent_config["system_prompt"]
|
|
159
160
|
if "append_setup_output" in task.agent_config:
|
|
160
161
|
self.append_setup_output = task.agent_config["append_setup_output"]
|
|
161
162
|
if "initial_screenshot" in task.agent_config:
|
|
@@ -242,6 +243,7 @@ class MCPAgent(ABC):
|
|
|
242
243
|
return await self._run_context(context, max_steps=max_steps)
|
|
243
244
|
|
|
244
245
|
except Exception as e:
|
|
246
|
+
logger.exception("Error while running agent:")
|
|
245
247
|
# Always return a Trace object for any exception
|
|
246
248
|
if self._is_connection_error(e):
|
|
247
249
|
# Return error trace for connection failures
|
|
@@ -0,0 +1,365 @@
|
|
|
1
|
+
"""Claude MCP Agent implementation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import copy
|
|
6
|
+
import logging
|
|
7
|
+
import re
|
|
8
|
+
from inspect import cleandoc
|
|
9
|
+
from typing import TYPE_CHECKING, Any, ClassVar, Literal, cast
|
|
10
|
+
|
|
11
|
+
from anthropic import Anthropic, AsyncAnthropic, Omit
|
|
12
|
+
from anthropic.types import (
|
|
13
|
+
CacheControlEphemeralParam,
|
|
14
|
+
)
|
|
15
|
+
from anthropic.types.beta import (
|
|
16
|
+
BetaBase64ImageSourceParam,
|
|
17
|
+
BetaContentBlockParam,
|
|
18
|
+
BetaImageBlockParam,
|
|
19
|
+
BetaMessageParam,
|
|
20
|
+
BetaTextBlockParam,
|
|
21
|
+
BetaToolBash20250124Param,
|
|
22
|
+
BetaToolComputerUse20250124Param,
|
|
23
|
+
BetaToolParam,
|
|
24
|
+
BetaToolResultBlockParam,
|
|
25
|
+
BetaToolTextEditor20250728Param,
|
|
26
|
+
BetaToolUnionParam,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
import hud
|
|
30
|
+
|
|
31
|
+
if TYPE_CHECKING:
|
|
32
|
+
from hud.datasets import Task
|
|
33
|
+
|
|
34
|
+
import mcp.types as types
|
|
35
|
+
|
|
36
|
+
from hud.settings import settings
|
|
37
|
+
from hud.tools.computer.settings import computer_settings
|
|
38
|
+
from hud.types import AgentResponse, MCPToolCall, MCPToolResult
|
|
39
|
+
from hud.utils.hud_console import HUDConsole
|
|
40
|
+
|
|
41
|
+
from .base import MCPAgent
|
|
42
|
+
|
|
43
|
+
logger = logging.getLogger(__name__)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class ClaudeAgent(MCPAgent):
|
|
47
|
+
"""
|
|
48
|
+
Claude agent that uses MCP servers for tool execution.
|
|
49
|
+
|
|
50
|
+
This agent uses Claude's native tool calling capabilities but executes
|
|
51
|
+
tools through MCP servers instead of direct implementation.
|
|
52
|
+
"""
|
|
53
|
+
|
|
54
|
+
metadata: ClassVar[dict[str, Any]] = {
|
|
55
|
+
"display_width": computer_settings.ANTHROPIC_COMPUTER_WIDTH,
|
|
56
|
+
"display_height": computer_settings.ANTHROPIC_COMPUTER_HEIGHT,
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
def __init__(
|
|
60
|
+
self,
|
|
61
|
+
model_client: AsyncAnthropic | None = None,
|
|
62
|
+
model: str = "claude-sonnet-4-5",
|
|
63
|
+
max_tokens: int = 16384,
|
|
64
|
+
use_computer_beta: bool = True,
|
|
65
|
+
validate_api_key: bool = True,
|
|
66
|
+
computer_tool_regex: str = r"(^|_)(anthropic_computer|computer_anthropic|computer)$",
|
|
67
|
+
**kwargs: Any,
|
|
68
|
+
) -> None:
|
|
69
|
+
"""
|
|
70
|
+
Initialize Claude MCP agent.
|
|
71
|
+
|
|
72
|
+
Args:
|
|
73
|
+
model_client: AsyncAnthropic client (created if not provided)
|
|
74
|
+
model: Claude model to use
|
|
75
|
+
max_tokens: Maximum tokens for response
|
|
76
|
+
use_computer_beta: Whether to use computer-use beta features
|
|
77
|
+
computer_tool_regex: we use this regex to identify the computer tool
|
|
78
|
+
**kwargs: Additional arguments passed to BaseMCPAgent (including mcp_client)
|
|
79
|
+
"""
|
|
80
|
+
super().__init__(**kwargs)
|
|
81
|
+
|
|
82
|
+
# Initialize client if not provided
|
|
83
|
+
if model_client is None:
|
|
84
|
+
api_key = settings.anthropic_api_key
|
|
85
|
+
if not api_key:
|
|
86
|
+
raise ValueError("Anthropic API key not found. Set ANTHROPIC_API_KEY.")
|
|
87
|
+
model_client = AsyncAnthropic(api_key=api_key)
|
|
88
|
+
|
|
89
|
+
# validate api key if requested
|
|
90
|
+
if validate_api_key:
|
|
91
|
+
try:
|
|
92
|
+
Anthropic(api_key=model_client.api_key).models.list()
|
|
93
|
+
except Exception as e:
|
|
94
|
+
raise ValueError(f"Anthropic API key is invalid: {e}") from e
|
|
95
|
+
|
|
96
|
+
self.anthropic_client = model_client
|
|
97
|
+
self.model = model
|
|
98
|
+
self.max_tokens = max_tokens
|
|
99
|
+
self.use_computer_beta = use_computer_beta
|
|
100
|
+
self.hud_console = HUDConsole(logger=logger)
|
|
101
|
+
|
|
102
|
+
self.model_name = "Claude"
|
|
103
|
+
self.checkpoint_name = self.model
|
|
104
|
+
|
|
105
|
+
self.computer_tool_regex = computer_tool_regex
|
|
106
|
+
|
|
107
|
+
# these will be initialized in _convert_tools_for_claude
|
|
108
|
+
self.has_computer_tool = False
|
|
109
|
+
self.tool_mapping: dict[str, str] = {}
|
|
110
|
+
self.claude_tools: list[BetaToolUnionParam] = []
|
|
111
|
+
|
|
112
|
+
async def initialize(self, task: str | Task | None = None) -> None:
    """Run the base-class initialization, then prepare Claude's tool definitions.

    Args:
        task: Optional task (or task string) forwarded unchanged to the parent
            ``initialize``.
    """
    await super().initialize(task)

    # The Claude tool list can only be derived once the parent has finished
    # initializing, so the conversion is done last.
    self._convert_tools_for_claude()
|
|
117
|
+
|
|
118
|
+
async def get_system_messages(self) -> list[Any]:
    """Return no system messages.

    For Claude the system prompt is passed through the ``system`` argument of
    the API call made in ``get_response``, so nothing is added to the
    conversation here.
    """
    system_messages: list[Any] = []
    return system_messages
|
|
121
|
+
|
|
122
|
+
async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
    """Translate MCP content blocks into a single Claude user message.

    Args:
        blocks: MCP content blocks; only text and image blocks are supported.

    Returns:
        A one-element list holding a ``user`` message whose content is the
        converted blocks, in their original order.

    Raises:
        ValueError: If a block is neither ``TextContent`` nor ``ImageContent``.
    """
    converted: list[BetaContentBlockParam] = []

    for block in blocks:
        if isinstance(block, types.TextContent):
            # Forward the text only; no other MCP fields are sent to Anthropic.
            item = BetaTextBlockParam(type="text", text=block.text)
        elif isinstance(block, types.ImageContent):
            # The MCP MIME type is passed through as the Anthropic media type.
            image_source = BetaBase64ImageSourceParam(
                type="base64",
                media_type=cast(
                    "Literal['image/jpeg', 'image/png', 'image/gif', 'image/webp']",
                    block.mimeType,
                ),
                data=block.data,
            )
            item = BetaImageBlockParam(type="image", source=image_source)
        else:
            raise ValueError(f"Unknown content block type: {type(block)}")
        converted.append(item)

    return [BetaMessageParam(role="user", content=converted)]
|
|
155
|
+
|
|
156
|
+
@hud.instrument(
    span_type="agent",
    record_args=False,  # Messages can be large
    record_result=True,
)
async def get_response(self, messages: list[BetaMessageParam]) -> AgentResponse:
    """Send the conversation to Claude and collect its reply.

    The assistant turn returned by the API is appended to ``messages`` in
    place. Parallel tool use is disabled in the request, and the computer-use
    beta flag is sent only when a computer tool was registered.

    Args:
        messages: Conversation so far; mutated by appending the assistant reply.

    Returns:
        An ``AgentResponse`` whose ``content`` is the thinking text followed by
        the plain text, whose ``tool_calls`` holds every requested tool use, and
        whose ``done`` flag is ``False`` whenever at least one tool was called.
    """
    # Cache markers are applied to a deep copy, so ``messages`` stays unmarked.
    request_messages = self._add_prompt_caching(messages)
    system_arg = Omit() if self.system_prompt is None else self.system_prompt
    beta_flags = ["computer-use-2025-01-24"] if self.has_computer_tool else []

    response = await self.anthropic_client.beta.messages.create(
        model=self.model,
        system=system_arg,
        max_tokens=self.max_tokens,
        messages=request_messages,
        tools=self.claude_tools,
        tool_choice={"type": "auto", "disable_parallel_tool_use": True},
        betas=beta_flags,
    )

    assistant_turn = BetaMessageParam(role="assistant", content=response.content)
    messages.append(assistant_turn)

    result = AgentResponse(content="", tool_calls=[], done=True)
    text_parts = []
    thinking_parts = []

    for block in response.content:
        kind = block.type
        if kind == "tool_use":
            result.tool_calls.append(
                MCPToolCall(
                    id=block.id,
                    # Map Claude's tool name back to the MCP name when known.
                    name=self.tool_mapping.get(block.name, block.name),
                    arguments=block.input,
                )
            )
        elif kind == "text":
            text_parts.append(block.text)
        elif kind == "thinking":
            thinking_parts.append(f"Thinking: {block.thinking}\n")

    # The turn is finished only when Claude requested no tools.
    result.done = not result.tool_calls
    result.content = "".join(thinking_parts) + "".join(text_parts)
    return result
|
|
208
|
+
|
|
209
|
+
async def format_tool_results(
    self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
) -> list[BetaMessageParam]:
    """Format tool results into a single Claude user message.

    Args:
        tool_calls: Tool calls previously requested by Claude.
        tool_results: Results for those calls, in the same order.

    Returns:
        A one-element list containing a ``user`` message with one
        ``tool_result`` block for every call that carries an id.

    Raises:
        ValueError: If the two lists differ in length (``zip(strict=True)``).
    """
    user_content = []

    for tool_call, result in zip(tool_calls, tool_results, strict=True):
        tool_use_id = tool_call.id
        if not tool_use_id:
            # Without an id the result cannot be tied to a tool_use block.
            self.hud_console.warning(f"No tool_use_id found for {tool_call.name}")
            continue

        # Convert MCP tool results to Claude format
        claude_blocks = []

        if result.isError:
            # Report the first text block as the error, else a generic message.
            error_msg = next(
                (item.text for item in result.content if isinstance(item, types.TextContent)),
                "Tool execution failed",
            )
            claude_blocks.append(text_to_content_block(f"Error: {error_msg}"))
        else:
            for content in result.content:
                if isinstance(content, types.TextContent):
                    claude_blocks.append(text_to_content_block(content.text))
                elif isinstance(content, types.ImageContent):
                    # Forward the MIME type reported by the tool, as format_blocks
                    # does, instead of labelling every image as PNG; fall back to
                    # PNG only when the tool reported none.
                    claude_blocks.append(
                        BetaImageBlockParam(
                            type="image",
                            source=BetaBase64ImageSourceParam(
                                type="base64",
                                media_type=cast(
                                    "Literal['image/jpeg', 'image/png', 'image/gif', 'image/webp']",
                                    content.mimeType or "image/png",
                                ),
                                data=content.data,
                            ),
                        )
                    )

        # One tool_result block per tool call, even when it has no content.
        user_content.append(tool_use_content_block(tool_use_id, claude_blocks))

    # All tool results travel together in a single user message.
    return [BetaMessageParam(role="user", content=user_content)]
|
|
252
|
+
|
|
253
|
+
async def create_user_message(self, text: str) -> BetaMessageParam:
    """Wrap plain text as a ``user`` message in Claude's message format.

    Args:
        text: Message text, used verbatim as the message content.

    Returns:
        A ``BetaMessageParam`` with role ``user``.
    """
    user_message = BetaMessageParam(role="user", content=text)
    return user_message
|
|
256
|
+
|
|
257
|
+
def _convert_tools_for_claude(self) -> None:
    """Rebuild the Claude tool list and name mapping from the available MCP tools.

    Resets and repopulates ``has_computer_tool``, ``tool_mapping`` (Claude tool
    name -> MCP tool name) and ``claude_tools``. Only the first tool that maps
    to Claude's ``computer`` tool is kept; later ones are logged and skipped.

    Raises:
        ValueError: If a generic tool lacks a description or an input schema.
    """

    def to_api_tool(tool: types.Tool) -> BetaToolUnionParam:
        """Convert a tool to the API format"""
        ephemeral = CacheControlEphemeralParam(type="ephemeral")
        mcp_name = tool.name

        # These three tool kinds are sent with a fixed Anthropic type and name
        # rather than the MCP description and schema.
        if mcp_name == "str_replace_based_edit_tool":
            return BetaToolTextEditor20250728Param(
                type="text_editor_20250728",
                name="str_replace_based_edit_tool",
                cache_control=ephemeral,
            )
        if mcp_name == "bash":
            return BetaToolBash20250124Param(
                type="bash_20250124",
                name="bash",
                cache_control=ephemeral,
            )
        if re.fullmatch(self.computer_tool_regex, mcp_name):
            return BetaToolComputerUse20250124Param(
                type="computer_20250124",
                name="computer",
                display_number=1,
                display_width_px=computer_settings.ANTHROPIC_COMPUTER_WIDTH,
                display_height_px=computer_settings.ANTHROPIC_COMPUTER_HEIGHT,
                cache_control=ephemeral,
            )

        # Any other tool must describe itself before it can be sent to Claude.
        if tool.description is None or tool.inputSchema is None:
            raise ValueError(
                cleandoc(f"""MCP tool {tool.name} requires both a description and inputSchema.
                Add these by:
                1. Adding a docstring to your @mcp.tool decorated function for the description
                2. Using pydantic Field() annotations on function parameters for the schema
                """)
            )
        return BetaToolParam(
            name=mcp_name,
            description=tool.description,
            input_schema=tool.inputSchema,
            cache_control=ephemeral,
        )

    # Start from a clean slate so repeated calls do not accumulate entries.
    self.has_computer_tool = False
    self.tool_mapping = {}
    self.claude_tools = []

    for tool in self.get_available_tools():
        claude_tool = to_api_tool(tool)
        api_name = claude_tool["name"]
        is_computer = api_name == "computer"

        if is_computer and self.has_computer_tool:
            # A computer tool is already registered; keep the first one only.
            logger.warning(
                "Multiple computer tools found. Ignoring %s since %s is already present",
                tool.name,
                self.tool_mapping["computer"],
            )
            continue
        if is_computer:
            self.has_computer_tool = True

        self.tool_mapping[api_name] = tool.name
        self.claude_tools.append(claude_tool)
|
|
317
|
+
|
|
318
|
+
def _add_prompt_caching(self, messages: list[BetaMessageParam]) -> list[BetaMessageParam]:
    """Return a deep copy of ``messages`` with cache markers on the last user turn.

    The input list is never modified. When the final message is a dict with
    role ``user`` and list content, every dict block in it except ``thinking``
    and ``redacted_thinking`` blocks receives an ephemeral ``cache_control``
    entry. In every other case the copy is returned unchanged.

    Args:
        messages: Conversation to copy and annotate.

    Returns:
        The annotated deep copy.
    """
    cached = copy.deepcopy(messages)
    ephemeral: CacheControlEphemeralParam = {"type": "ephemeral"}

    if not cached:
        return cached

    final = cached[-1]
    if not isinstance(final, dict) or final.get("role") != "user":
        return cached

    body = final["content"]
    if not isinstance(body, list):
        # Plain-string content has no blocks to mark.
        return cached

    unmarked_types = ("redacted_thinking", "thinking")
    for block in body:
        # Only dict-like blocks can carry the marker.
        if isinstance(block, dict) and block["type"] not in unmarked_types:
            block["cache_control"] = ephemeral

    return cached
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
def base64_to_content_block(base64: str) -> BetaImageBlockParam:
    """Wrap a base64-encoded image in a Claude image content block.

    The block is always labelled with media type ``image/png``.

    Args:
        base64: Base64-encoded image data.

    Returns:
        An ``image`` block with a ``base64`` source.
    """
    image_source: BetaBase64ImageSourceParam = {
        "type": "base64",
        "media_type": "image/png",
        "data": base64,
    }
    image_block: BetaImageBlockParam = {"type": "image", "source": image_source}
    return image_block
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
def text_to_content_block(text: str) -> BetaTextBlockParam:
    """Wrap a string in a Claude ``text`` content block.

    Args:
        text: Text to place in the block, used verbatim.

    Returns:
        A block of the form ``{"type": "text", "text": text}``.
    """
    text_block: BetaTextBlockParam = {"type": "text", "text": text}
    return text_block
|
|
359
|
+
|
|
360
|
+
|
|
361
|
+
def tool_use_content_block(
    tool_use_id: str, content: list[BetaTextBlockParam | BetaImageBlockParam]
) -> BetaToolResultBlockParam:
    """Build the ``tool_result`` block that answers a given tool use.

    Args:
        tool_use_id: Id of the ``tool_use`` block this result belongs to.
        content: Text and image blocks produced by the tool; stored as-is
            (the same list object, not a copy).

    Returns:
        A ``tool_result`` content block.
    """
    result_block: BetaToolResultBlockParam = {
        "type": "tool_result",
        "tool_use_id": tool_use_id,
        "content": content,
    }
    return result_block
|
|
@@ -89,7 +89,10 @@ class LangChainAgent(MCPAgent):
|
|
|
89
89
|
|
|
90
90
|
async def get_system_messages(self) -> list[BaseMessage]:
    """Return the system prompt as a LangChain system message, if one is set.

    Returns:
        A one-element list with a ``SystemMessage`` wrapping
        ``self.system_prompt``, or an empty list when the prompt is ``None``.
    """
    if self.system_prompt is None:
        return []
    return [SystemMessage(content=self.system_prompt)]
|
|
93
96
|
|
|
94
97
|
async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[BaseMessage]:
|
|
95
98
|
"""Create initial messages for LangChain."""
|
|
@@ -84,7 +84,10 @@ class GenericOpenAIChatAgent(MCPAgent):
|
|
|
84
84
|
|
|
85
85
|
async def get_system_messages(self) -> list[Any]:
    """Return the system prompt as an OpenAI-style system message, if one is set.

    Returns:
        A one-element list with a ``{"role": "system", "content": ...}`` dict,
        or an empty list when ``self.system_prompt`` is ``None``.
    """
    if self.system_prompt is None:
        return []
    return [{"role": "system", "content": self.system_prompt}]
|
|
88
91
|
|
|
89
92
|
async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
|
|
90
93
|
"""Format blocks for OpenAI."""
|
|
@@ -96,7 +96,6 @@ class TestBaseMCPAgent:
|
|
|
96
96
|
assert agent.allowed_tools is None
|
|
97
97
|
assert agent.disallowed_tools is None
|
|
98
98
|
assert agent.initial_screenshot is True
|
|
99
|
-
assert agent.system_prompt is not None # Default system prompt is set
|
|
100
99
|
|
|
101
100
|
def test_init_with_params(self):
|
|
102
101
|
"""Test initialization with custom parameters."""
|
|
@@ -232,7 +232,7 @@ def build_agent(
|
|
|
232
232
|
)
|
|
233
233
|
raise typer.Exit(1) from e
|
|
234
234
|
|
|
235
|
-
model = model or "claude-sonnet-4-
|
|
235
|
+
model = model or "claude-sonnet-4-5"
|
|
236
236
|
|
|
237
237
|
if allowed_tools:
|
|
238
238
|
return ClaudeAgent(
|
|
@@ -393,7 +393,7 @@ async def run_single_task(
|
|
|
393
393
|
|
|
394
394
|
agent_class = ClaudeAgent
|
|
395
395
|
agent_config = {
|
|
396
|
-
"model": model or "claude-sonnet-4-
|
|
396
|
+
"model": model or "claude-sonnet-4-5",
|
|
397
397
|
"verbose": verbose,
|
|
398
398
|
"validate_api_key": False,
|
|
399
399
|
}
|
|
@@ -626,7 +626,7 @@ async def run_full_dataset(
|
|
|
626
626
|
raise typer.Exit(1) from e
|
|
627
627
|
|
|
628
628
|
agent_config = {
|
|
629
|
-
"model": model or "claude-sonnet-4-
|
|
629
|
+
"model": model or "claude-sonnet-4-5",
|
|
630
630
|
"verbose": verbose,
|
|
631
631
|
"validate_api_key": False,
|
|
632
632
|
}
|