hud-python 0.5.25__tar.gz → 0.5.27__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hud_python-0.5.25 → hud_python-0.5.27}/PKG-INFO +1 -1
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/claude.py +96 -21
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/tests/test_claude.py +138 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/dev.py +11 -12
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/eval.py +16 -26
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/flows/dev.py +3 -2
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/remove.py +3 -2
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_build.py +2 -2
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/interactive.py +5 -3
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/version_check.py +6 -9
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/datasets/loader.py +15 -10
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/datasets/runner.py +1 -1
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/datasets/tests/test_loader.py +62 -6
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/datasets/utils.py +17 -4
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/__init__.py +2 -1
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/scenarios.py +128 -45
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/tests/test_scenarios.py +78 -9
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/tests/test_tools.py +56 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/eval/context.py +14 -4
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/eval/display.py +12 -7
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/eval/manager.py +6 -50
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/eval/task.py +59 -5
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/eval/tests/test_context.py +48 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/eval/tests/test_eval.py +35 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/eval/tests/test_task.py +56 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/eval/types.py +1 -1
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/computer/anthropic.py +2 -2
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/types.py +11 -2
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/utils/tests/test_version.py +1 -1
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/version.py +1 -1
- {hud_python-0.5.25 → hud_python-0.5.27}/pyproject.toml +1 -1
- {hud_python-0.5.25 → hud_python-0.5.27}/.gitignore +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/LICENSE +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/README.md +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/examples/README.md +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/__main__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/base.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/gateway.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/gemini.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/gemini_cua.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/grounded_openai.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/misc/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/misc/integration_test_agent.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/misc/response_agent.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/openai.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/openai_chat.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/operator.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/resolver.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/tests/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/tests/conftest.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/tests/test_base.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/tests/test_base_runtime.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/tests/test_gemini.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/tests/test_grounded_openai_agent.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/tests/test_integration_test_agent.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/tests/test_openai.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/tests/test_operator.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/tests/test_resolver.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/tests/test_run_eval.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/types.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/__main__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/analyze.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/build.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/clone.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/convert/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/convert/base.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/convert/harbor.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/convert/tests/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/convert/tests/conftest.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/convert/tests/test_harbor.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/debug.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/deploy.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/flows/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/flows/init.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/flows/tasks.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/flows/templates.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/flows/tests/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/flows/tests/test_dev.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/get.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/init.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/link.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/list_func.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/pull.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/push.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/rft.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/rft_status.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_analyze.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_analyze_metadata.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_analyze_module.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_build_failure.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_build_module.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_cli_init.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_cli_main.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_cli_more_wrappers.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_cli_root.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_clone.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_convert.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_cursor.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_debug.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_debug_directory_mode.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_deploy.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_dev.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_eval.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_eval_bedrock.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_init.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_list_func.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_main_module.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_mcp_server.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_pull.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_push.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_push_happy.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_push_wrapper.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_registry.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_utils.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/build_display.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/build_logs.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/celebrate.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/config.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/context.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/cursor.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/docker.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/env_check.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/environment.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/git.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/local_runner.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/logging.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/mcp.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/metadata.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/package_runner.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/registry.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/remote_runner.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/runner.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/server.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/source_hash.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/tasks.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/tests/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/tests/test_config.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/tests/test_docker.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/tests/test_docker_hints.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/tests/test_env_check.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/tests/test_environment.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/tests/test_git.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/tests/test_interactive_module.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/tests/test_local_runner.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/tests/test_logging_utils.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/tests/test_metadata.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/tests/test_package_runner.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/tests/test_registry_utils.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/tests/test_remote_runner.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/tests/test_runner_modules.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/tests/test_source_hash.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/tests/test_tasks.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/validation.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/viewer.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/datasets/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/datasets/tests/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/datasets/tests/test_utils.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/connection.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/connectors/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/connectors/base.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/connectors/local.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/connectors/mcp_config.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/connectors/openai.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/connectors/remote.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/environment.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/integrations/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/integrations/adk.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/integrations/anthropic.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/integrations/gemini.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/integrations/langchain.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/integrations/llamaindex.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/integrations/openai.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/mock.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/router.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/tests/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/tests/test_connection.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/tests/test_connectors.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/tests/test_environment.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/tests/test_integrations.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/tests/test_local_connectors.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/types.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/utils/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/utils/formats.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/utils/schema.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/utils/tool_wrappers.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/eval/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/eval/instrument.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/eval/parallel.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/eval/tests/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/eval/tests/test_manager.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/eval/tests/test_parallel.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/eval/utils.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/native/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/native/comparator.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/native/tests/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/native/tests/test_comparator.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/native/tests/test_native_init.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/patches/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/patches/mcp_patches.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/patches/warnings.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/py.typed +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/samples/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/samples/browser.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/server/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/server/context.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/server/helper/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/server/low_level.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/server/router.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/server/server.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/server/tests/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/server/tests/test_add_tool.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/server/tests/test_context.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/server/tests/test_mcp_server_handlers.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/server/tests/test_mcp_server_integration.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/server/tests/test_mcp_server_more.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/server/tests/test_run_wrapper.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/server/tests/test_server_extra.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/server/tests/test_sigterm_runner.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/settings.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/shared/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/shared/exceptions.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/shared/hints.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/shared/requests.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/shared/tests/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/shared/tests/test_exceptions.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/shared/tests/test_hints.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/shared/tests/test_requests.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/telemetry/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/telemetry/exporter.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/telemetry/instrument.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/telemetry/tests/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/telemetry/tests/test_eval_telemetry.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/telemetry/tests/test_exporter.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/telemetry/tests/test_instrument.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/agent.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/base.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/coding/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/coding/apply_patch.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/coding/bash.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/coding/edit.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/coding/gemini_edit.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/coding/gemini_shell.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/coding/session.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/coding/shell.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/coding/tests/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/coding/tests/test_apply_patch.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/coding/tests/test_bash.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/coding/tests/test_bash_extended.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/coding/tests/test_bash_integration.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/coding/tests/test_edit.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/coding/tests/test_gemini_tools.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/coding/tests/test_shell.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/coding/utils.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/computer/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/computer/gemini.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/computer/glm.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/computer/hud.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/computer/openai.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/computer/qwen.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/computer/settings.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/computer/tests/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/computer/tests/test_computer.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/computer/tests/test_computer_actions.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/computer/tests/test_glm_computer.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/executors/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/executors/base.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/executors/pyautogui.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/executors/tests/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/executors/tests/test_base_executor.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/executors/xdo.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/filesystem/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/filesystem/base.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/filesystem/gemini.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/filesystem/glob.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/filesystem/grep.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/filesystem/list.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/filesystem/read.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/filesystem/tests/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/filesystem/tests/test_glob.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/filesystem/tests/test_grep.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/filesystem/tests/test_list.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/filesystem/tests/test_read.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/grounding/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/grounding/config.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/grounding/grounded_tool.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/grounding/grounder.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/grounding/tests/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/grounding/tests/test_grounded_tool.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/hosted/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/hosted/base.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/hosted/code_execution.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/hosted/google_search.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/hosted/url_context.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/hosted/web_fetch.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/hosted/web_search.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/jupyter.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/memory/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/memory/base.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/memory/claude.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/memory/gemini.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/memory/session.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/memory/tests/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/memory/tests/test_claude.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/memory/tests/test_gemini.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/memory/tests/test_session.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/native_types.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/playwright.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/response.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/submit.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/tests/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/tests/test_agent_tool.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/tests/test_base.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/tests/test_init.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/tests/test_jupyter_tool.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/tests/test_native_tool_e2e.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/tests/test_native_types.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/tests/test_playwright_tool.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/tests/test_response.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/tests/test_submit.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/tests/test_tools.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/tests/test_tools_init.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/tests/test_types.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/tests/test_utils.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/types.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/utils.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/utils/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/utils/env.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/utils/hud_console.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/utils/mcp.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/utils/pretty_errors.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/utils/strict_schema.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/utils/telemetry.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/utils/tests/__init__.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/utils/tests/test_init.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/utils/tests/test_mcp.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/utils/tests/test_pretty_errors.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/utils/tests/test_telemetry.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/utils/tests/test_tool_shorthand.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/utils/tool_shorthand.py +0 -0
- {hud_python-0.5.25 → hud_python-0.5.27}/hud/utils/types.py +0 -0
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
import copy
|
|
6
|
+
import json
|
|
6
7
|
import logging
|
|
7
8
|
from inspect import cleandoc
|
|
8
9
|
from typing import TYPE_CHECKING, Any, ClassVar, Literal, cast
|
|
@@ -85,7 +86,12 @@ class ClaudeAgent(MCPAgent):
|
|
|
85
86
|
logger.debug("Legacy fallback: detected %s as computer tool", tool.name)
|
|
86
87
|
model_lower = (self.model or "").lower()
|
|
87
88
|
if any(
|
|
88
|
-
fnmatch.fnmatch(model_lower, p)
|
|
89
|
+
fnmatch.fnmatch(model_lower, p)
|
|
90
|
+
for p in (
|
|
91
|
+
"claude-opus-4-5*",
|
|
92
|
+
"claude-opus-4-6*",
|
|
93
|
+
"claude-sonnet-4-6*",
|
|
94
|
+
)
|
|
89
95
|
):
|
|
90
96
|
return NativeToolSpec(
|
|
91
97
|
api_type="computer_20251124",
|
|
@@ -149,15 +155,15 @@ class ClaudeAgent(MCPAgent):
|
|
|
149
155
|
|
|
150
156
|
# these will be initialized in _convert_tools_for_claude
|
|
151
157
|
self.has_computer_tool = False
|
|
152
|
-
self.tool_mapping
|
|
153
|
-
self.claude_tools
|
|
154
|
-
self._required_betas
|
|
158
|
+
self.tool_mapping = {}
|
|
159
|
+
self.claude_tools = []
|
|
160
|
+
self._required_betas = set()
|
|
155
161
|
|
|
156
162
|
def _on_tools_ready(self) -> None:
|
|
157
163
|
"""Build Claude-specific tool mappings after tools are discovered."""
|
|
158
164
|
self._convert_tools_for_claude()
|
|
159
165
|
|
|
160
|
-
async def get_system_messages(self) -> list[
|
|
166
|
+
async def get_system_messages(self) -> list[types.ContentBlock]:
|
|
161
167
|
"""No system messages for Claude because applied in get_response"""
|
|
162
168
|
return []
|
|
163
169
|
|
|
@@ -195,10 +201,42 @@ class ClaudeAgent(MCPAgent):
|
|
|
195
201
|
|
|
196
202
|
return [BetaMessageParam(role="user", content=anthropic_blocks)]
|
|
197
203
|
|
|
204
|
+
@staticmethod
|
|
205
|
+
def _extract_invalid_tool_json(exc: Exception) -> str | None:
|
|
206
|
+
"""Extract malformed tool JSON payload from Anthropic stream errors.
|
|
207
|
+
|
|
208
|
+
Returns None when the exception is unrelated to tool JSON parsing.
|
|
209
|
+
"""
|
|
210
|
+
message = str(exc)
|
|
211
|
+
parse_error_prefix = "Unable to parse tool parameter JSON from model."
|
|
212
|
+
if parse_error_prefix not in message:
|
|
213
|
+
return None
|
|
214
|
+
|
|
215
|
+
marker = "JSON: "
|
|
216
|
+
marker_index = message.find(marker)
|
|
217
|
+
if marker_index == -1:
|
|
218
|
+
return ""
|
|
219
|
+
|
|
220
|
+
return message[marker_index + len(marker) :].strip()
|
|
221
|
+
|
|
222
|
+
@staticmethod
|
|
223
|
+
def _build_invalid_tool_json_retry_message(invalid_json: str) -> BetaMessageParam:
|
|
224
|
+
"""Build a user message prompting the model to re-emit valid tool JSON."""
|
|
225
|
+
wrapped = json.dumps({"INVALID_JSON": invalid_json}, ensure_ascii=True)
|
|
226
|
+
retry_text = (
|
|
227
|
+
"Your previous tool-call arguments were invalid JSON and could not be parsed.\n"
|
|
228
|
+
"Retry the same intended tool call once with valid JSON arguments only.\n"
|
|
229
|
+
"Ensure all strings are quoted and all arrays/objects are valid JSON.\n"
|
|
230
|
+
f"Malformed payload (wrapped): {wrapped}"
|
|
231
|
+
)
|
|
232
|
+
return BetaMessageParam(
|
|
233
|
+
role="user",
|
|
234
|
+
content=[text_to_content_block(retry_text)],
|
|
235
|
+
)
|
|
236
|
+
|
|
198
237
|
async def get_response(self, messages: list[BetaMessageParam]) -> AgentResponse:
|
|
199
238
|
"""Get response from Claude including any tool calls."""
|
|
200
239
|
messages_cached = self._add_prompt_caching(messages)
|
|
201
|
-
|
|
202
240
|
# betas to use - collected during tool conversion based on native specs
|
|
203
241
|
# Only pass betas when non-empty; an empty list can produce an empty
|
|
204
242
|
# anthropic-beta header which the API rejects.
|
|
@@ -223,21 +261,58 @@ class ClaudeAgent(MCPAgent):
|
|
|
223
261
|
) from None
|
|
224
262
|
else:
|
|
225
263
|
# Regular Anthropic client supports .stream()
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
264
|
+
response = None
|
|
265
|
+
invalid_json_failures = 0
|
|
266
|
+
for _ in range(3):
|
|
267
|
+
messages_cached = self._add_prompt_caching(messages)
|
|
268
|
+
try:
|
|
269
|
+
async with self.anthropic_client.beta.messages.stream(
|
|
270
|
+
model=self.config.model,
|
|
271
|
+
system=self.system_prompt if self.system_prompt is not None else Omit(),
|
|
272
|
+
max_tokens=self.max_tokens,
|
|
273
|
+
messages=messages_cached,
|
|
274
|
+
tools=self.claude_tools,
|
|
275
|
+
tool_choice={"type": "auto", "disable_parallel_tool_use": True},
|
|
276
|
+
betas=betas,
|
|
277
|
+
) as stream:
|
|
278
|
+
# allow backend to accumulate message content
|
|
279
|
+
async for _ in stream:
|
|
280
|
+
pass
|
|
281
|
+
# get final message
|
|
282
|
+
response = await stream.get_final_message()
|
|
283
|
+
messages.append(
|
|
284
|
+
BetaMessageParam(
|
|
285
|
+
role="assistant",
|
|
286
|
+
content=response.content,
|
|
287
|
+
)
|
|
288
|
+
)
|
|
289
|
+
break
|
|
290
|
+
except ValueError as exc:
|
|
291
|
+
invalid_json = self._extract_invalid_tool_json(exc)
|
|
292
|
+
is_retryable = invalid_json is not None
|
|
293
|
+
if not is_retryable:
|
|
294
|
+
raise
|
|
295
|
+
|
|
296
|
+
invalid_json_failures += 1
|
|
297
|
+
if invalid_json_failures == 1:
|
|
298
|
+
logger.warning(
|
|
299
|
+
"Claude returned invalid streamed tool JSON; "
|
|
300
|
+
"retrying same generation once"
|
|
301
|
+
)
|
|
302
|
+
continue
|
|
303
|
+
|
|
304
|
+
if invalid_json_failures == 2:
|
|
305
|
+
logger.warning(
|
|
306
|
+
"Claude returned invalid streamed tool JSON twice; "
|
|
307
|
+
"retrying once with INVALID_JSON guidance"
|
|
308
|
+
)
|
|
309
|
+
messages.append(self._build_invalid_tool_json_retry_message(invalid_json))
|
|
310
|
+
continue
|
|
311
|
+
|
|
312
|
+
raise
|
|
313
|
+
|
|
314
|
+
if response is None:
|
|
315
|
+
raise ValueError("Claude response missing after stream retries")
|
|
241
316
|
|
|
242
317
|
# Process response
|
|
243
318
|
result = AgentResponse(content="", tool_calls=[], done=True)
|
|
@@ -99,6 +99,30 @@ class MockStreamContextManager:
|
|
|
99
99
|
return self.response
|
|
100
100
|
|
|
101
101
|
|
|
102
|
+
class MockErrorStreamContextManager:
|
|
103
|
+
"""Mock stream context manager that raises a fixed error while streaming."""
|
|
104
|
+
|
|
105
|
+
def __init__(self, error: Exception) -> None:
|
|
106
|
+
self.error = error
|
|
107
|
+
|
|
108
|
+
async def __aenter__(self) -> MockErrorStreamContextManager:
|
|
109
|
+
return self
|
|
110
|
+
|
|
111
|
+
async def __aexit__(
|
|
112
|
+
self, exc_type: type | None, exc_val: Exception | None, exc_tb: Any
|
|
113
|
+
) -> bool:
|
|
114
|
+
return False
|
|
115
|
+
|
|
116
|
+
def __aiter__(self) -> MockErrorStreamContextManager:
|
|
117
|
+
return self
|
|
118
|
+
|
|
119
|
+
async def __anext__(self) -> None:
|
|
120
|
+
raise self.error
|
|
121
|
+
|
|
122
|
+
async def get_final_message(self) -> MagicMock:
|
|
123
|
+
raise AssertionError("get_final_message should not be called when stream iteration fails")
|
|
124
|
+
|
|
125
|
+
|
|
102
126
|
class TestClaudeHelperFunctions:
|
|
103
127
|
"""Test helper functions for Claude message formatting."""
|
|
104
128
|
|
|
@@ -410,6 +434,120 @@ class TestClaudeAgent:
|
|
|
410
434
|
assert response.tool_calls[0].name == "my_tool"
|
|
411
435
|
assert response.tool_calls[0].arguments == {"x": "value"}
|
|
412
436
|
|
|
437
|
+
@pytest.mark.asyncio
|
|
438
|
+
async def test_get_response_retries_same_generation_once_on_invalid_streamed_tool_json(
|
|
439
|
+
self, mock_anthropic: AsyncAnthropic
|
|
440
|
+
) -> None:
|
|
441
|
+
"""First invalid streamed tool JSON should retry without adding guidance."""
|
|
442
|
+
invalid_json_error = ValueError(
|
|
443
|
+
"Unable to parse tool parameter JSON from model. Please retry your request or "
|
|
444
|
+
"adjust your "
|
|
445
|
+
'prompt. Error: expected value at line 1 column 10. JSON: {"labels": bug}'
|
|
446
|
+
)
|
|
447
|
+
first_stream = MockErrorStreamContextManager(invalid_json_error)
|
|
448
|
+
|
|
449
|
+
mock_response = MagicMock()
|
|
450
|
+
mock_response.content = [MagicMock(type="text", text="Recovered")]
|
|
451
|
+
second_stream = MockStreamContextManager(mock_response)
|
|
452
|
+
|
|
453
|
+
mock_anthropic.beta.messages.stream = MagicMock(side_effect=[first_stream, second_stream])
|
|
454
|
+
|
|
455
|
+
agent = ClaudeAgent.create(
|
|
456
|
+
model_client=mock_anthropic,
|
|
457
|
+
validate_api_key=False,
|
|
458
|
+
)
|
|
459
|
+
agent.claude_tools = []
|
|
460
|
+
agent.tool_mapping = {}
|
|
461
|
+
agent.has_computer_tool = False
|
|
462
|
+
agent._initialized = True
|
|
463
|
+
|
|
464
|
+
messages: list[BetaMessageParam] = [
|
|
465
|
+
cast(
|
|
466
|
+
"BetaMessageParam",
|
|
467
|
+
{"role": "user", "content": [{"type": "text", "text": "Create a Linear ticket"}]},
|
|
468
|
+
)
|
|
469
|
+
]
|
|
470
|
+
|
|
471
|
+
response = await agent.get_response(messages)
|
|
472
|
+
|
|
473
|
+
assert response.content == "Recovered"
|
|
474
|
+
assert mock_anthropic.beta.messages.stream.call_count == 2
|
|
475
|
+
# Original user message + assistant response (no guidance message needed)
|
|
476
|
+
assert len(messages) == 2
|
|
477
|
+
assert messages[1]["role"] == "assistant"
|
|
478
|
+
|
|
479
|
+
@pytest.mark.asyncio
|
|
480
|
+
async def test_get_response_adds_invalid_json_guidance_after_second_failure(
|
|
481
|
+
self, mock_anthropic: AsyncAnthropic
|
|
482
|
+
) -> None:
|
|
483
|
+
"""Second consecutive invalid JSON failure should add INVALID_JSON guidance."""
|
|
484
|
+
invalid_json_error = ValueError(
|
|
485
|
+
"Unable to parse tool parameter JSON from model. Please retry your request or "
|
|
486
|
+
"adjust your "
|
|
487
|
+
'prompt. Error: expected value at line 1 column 10. JSON: {"labels": bug}'
|
|
488
|
+
)
|
|
489
|
+
first_stream = MockErrorStreamContextManager(invalid_json_error)
|
|
490
|
+
second_stream = MockErrorStreamContextManager(invalid_json_error)
|
|
491
|
+
|
|
492
|
+
mock_response = MagicMock()
|
|
493
|
+
mock_response.content = [MagicMock(type="text", text="Recovered after guidance")]
|
|
494
|
+
third_stream = MockStreamContextManager(mock_response)
|
|
495
|
+
|
|
496
|
+
mock_anthropic.beta.messages.stream = MagicMock(
|
|
497
|
+
side_effect=[first_stream, second_stream, third_stream]
|
|
498
|
+
)
|
|
499
|
+
|
|
500
|
+
agent = ClaudeAgent.create(
|
|
501
|
+
model_client=mock_anthropic,
|
|
502
|
+
validate_api_key=False,
|
|
503
|
+
)
|
|
504
|
+
agent.claude_tools = []
|
|
505
|
+
agent.tool_mapping = {}
|
|
506
|
+
agent.has_computer_tool = False
|
|
507
|
+
agent._initialized = True
|
|
508
|
+
|
|
509
|
+
messages: list[BetaMessageParam] = [
|
|
510
|
+
cast(
|
|
511
|
+
"BetaMessageParam",
|
|
512
|
+
{"role": "user", "content": [{"type": "text", "text": "Create a Linear ticket"}]},
|
|
513
|
+
)
|
|
514
|
+
]
|
|
515
|
+
|
|
516
|
+
response = await agent.get_response(messages)
|
|
517
|
+
|
|
518
|
+
assert response.content == "Recovered after guidance"
|
|
519
|
+
assert mock_anthropic.beta.messages.stream.call_count == 3
|
|
520
|
+
# Original user message + INVALID_JSON guidance + assistant response
|
|
521
|
+
assert len(messages) == 3
|
|
522
|
+
retry_message = messages[1]
|
|
523
|
+
assert retry_message["role"] == "user"
|
|
524
|
+
retry_content = cast("list[dict[str, Any]]", retry_message["content"])
|
|
525
|
+
assert "INVALID_JSON" in retry_content[0]["text"]
|
|
526
|
+
|
|
527
|
+
@pytest.mark.asyncio
|
|
528
|
+
async def test_get_response_does_not_retry_unrelated_value_error(
|
|
529
|
+
self, mock_anthropic: AsyncAnthropic
|
|
530
|
+
) -> None:
|
|
531
|
+
"""Non-tool-json ValueErrors should propagate immediately."""
|
|
532
|
+
unrelated_error = ValueError("stream exploded for unrelated reason")
|
|
533
|
+
mock_anthropic.beta.messages.stream = MagicMock(
|
|
534
|
+
return_value=MockErrorStreamContextManager(unrelated_error)
|
|
535
|
+
)
|
|
536
|
+
|
|
537
|
+
agent = ClaudeAgent.create(
|
|
538
|
+
model_client=mock_anthropic,
|
|
539
|
+
validate_api_key=False,
|
|
540
|
+
)
|
|
541
|
+
agent.claude_tools = []
|
|
542
|
+
agent.tool_mapping = {}
|
|
543
|
+
agent.has_computer_tool = False
|
|
544
|
+
agent._initialized = True
|
|
545
|
+
|
|
546
|
+
with pytest.raises(ValueError, match="unrelated reason"):
|
|
547
|
+
await agent.get_response([])
|
|
548
|
+
|
|
549
|
+
assert mock_anthropic.beta.messages.stream.call_count == 1
|
|
550
|
+
|
|
413
551
|
|
|
414
552
|
class TestClaudeAgentBedrock:
|
|
415
553
|
"""Test ClaudeAgent class with Bedrock."""
|
|
@@ -50,42 +50,41 @@ def show_dev_server_info(
|
|
|
50
50
|
|
|
51
51
|
# Server section
|
|
52
52
|
hud_console.section_title("Server")
|
|
53
|
-
hud_console.print(f"{hud_console.sym.ITEM} {escape(server_name)}")
|
|
53
|
+
hud_console.console.print(f"{hud_console.sym.ITEM} {escape(server_name)}", highlight=False)
|
|
54
|
+
_print = lambda msg: hud_console.console.print(msg, highlight=False)
|
|
54
55
|
if transport == "http":
|
|
55
|
-
|
|
56
|
+
_print(f"{hud_console.sym.ITEM} http://localhost:{port}/mcp")
|
|
56
57
|
else:
|
|
57
|
-
|
|
58
|
+
_print(f"{hud_console.sym.ITEM} (stdio)")
|
|
58
59
|
|
|
59
60
|
# Quick Links (only for HTTP mode)
|
|
60
61
|
if transport == "http":
|
|
61
62
|
hud_console.section_title("Quick Links")
|
|
62
|
-
|
|
63
|
-
|
|
63
|
+
_print(f"{hud_console.sym.ITEM} Docs: http://localhost:{port}/docs")
|
|
64
|
+
_print(f"{hud_console.sym.ITEM} Cursor:")
|
|
64
65
|
# Display the Cursor link on its own line to prevent wrapping
|
|
65
66
|
hud_console.link(cursor_deeplink)
|
|
66
67
|
|
|
67
68
|
# Show eval endpoint if in Docker mode
|
|
68
69
|
if docker_mode:
|
|
69
|
-
hud_console.
|
|
70
|
-
f"{hud_console.sym.ITEM} Eval API: http://localhost:{port}/eval (POST)"
|
|
71
|
-
)
|
|
70
|
+
_print(f"{hud_console.sym.ITEM} Eval API: http://localhost:{port}/eval (POST)")
|
|
72
71
|
|
|
73
72
|
# Show debugging URLs from telemetry
|
|
74
73
|
if telemetry:
|
|
75
74
|
if "live_url" in telemetry:
|
|
76
75
|
url = escape(telemetry["live_url"])
|
|
77
|
-
|
|
76
|
+
_print(f"{hud_console.sym.ITEM} Live URL: {url}")
|
|
78
77
|
if "vnc_url" in telemetry:
|
|
79
|
-
|
|
78
|
+
_print(f"{hud_console.sym.ITEM} VNC URL: {escape(telemetry['vnc_url'])}")
|
|
80
79
|
if "cdp_url" in telemetry:
|
|
81
|
-
|
|
80
|
+
_print(f"{hud_console.sym.ITEM} CDP URL: {escape(telemetry['cdp_url'])}")
|
|
82
81
|
|
|
83
82
|
# Check for VNC (browser environment)
|
|
84
83
|
if env_dir and (env_dir / "environment" / "server.py").exists():
|
|
85
84
|
try:
|
|
86
85
|
content = (env_dir / "environment" / "server.py").read_text()
|
|
87
86
|
if "x11vnc" in content.lower() or "vnc" in content.lower():
|
|
88
|
-
|
|
87
|
+
_print(f"{hud_console.sym.ITEM} VNC: http://localhost:8080/vnc.html")
|
|
89
88
|
except Exception: # noqa: S110
|
|
90
89
|
pass
|
|
91
90
|
|
|
@@ -96,7 +96,7 @@ _DEFAULT_CONFIG_TEMPLATE = """# HUD Eval Configuration
|
|
|
96
96
|
# max_steps = 10
|
|
97
97
|
# group_size = 1
|
|
98
98
|
# byok = false # Remote only; use encrypted env vars on the platform.
|
|
99
|
-
# task_ids = ["
|
|
99
|
+
# task_ids = ["checkout-smoke", "0"] # slugs or 0-based indices
|
|
100
100
|
# verbose = true
|
|
101
101
|
# very_verbose = true
|
|
102
102
|
# auto_respond = true
|
|
@@ -627,15 +627,18 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
|
|
|
627
627
|
hud_console.error(f"No tasks found in: {cfg.source}")
|
|
628
628
|
raise typer.Exit(1)
|
|
629
629
|
|
|
630
|
-
# Filter by task
|
|
630
|
+
# Filter by task slugs (or positional indices) if provided
|
|
631
631
|
if cfg.task_ids:
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
632
|
+
selector_set = set(cfg.task_ids)
|
|
633
|
+
filtered = []
|
|
634
|
+
for i, task in enumerate(tasks):
|
|
635
|
+
task_slug = getattr(task, "slug", None)
|
|
636
|
+
if (isinstance(task_slug, str) and task_slug in selector_set) or str(i) in selector_set:
|
|
637
|
+
filtered.append(task)
|
|
635
638
|
if not filtered:
|
|
636
|
-
hud_console.error(f"No tasks found matching
|
|
639
|
+
hud_console.error(f"No tasks found matching slugs/indices: {', '.join(cfg.task_ids)}")
|
|
637
640
|
raise typer.Exit(1)
|
|
638
|
-
hud_console.info(f"Filtered to {len(filtered)} task(s) by
|
|
641
|
+
hud_console.info(f"Filtered to {len(filtered)} task(s) by slug/index")
|
|
639
642
|
tasks = filtered
|
|
640
643
|
elif not cfg.all:
|
|
641
644
|
# Single task mode (no --all, --full, or --task-ids)
|
|
@@ -687,33 +690,16 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
|
|
|
687
690
|
sanitized[agent_name] = agent_settings
|
|
688
691
|
eval_cfg_dict["agent_config"] = sanitized
|
|
689
692
|
|
|
690
|
-
|
|
691
|
-
tasks_data = (
|
|
692
|
-
[t.model_dump(mode="json", exclude_none=True) for t in tasks_to_create]
|
|
693
|
-
if tasks_to_create
|
|
694
|
-
else None
|
|
695
|
-
)
|
|
696
|
-
|
|
697
|
-
ids = await _send_job_enter(
|
|
693
|
+
await _send_job_enter(
|
|
698
694
|
job_id=job_id,
|
|
699
695
|
name=f"eval ({cfg.source})" if cfg.source else "eval",
|
|
700
696
|
variants=None,
|
|
701
697
|
group=cfg.group_size,
|
|
702
698
|
api_key=None,
|
|
703
699
|
taskset=cfg.taskset,
|
|
704
|
-
tasks=tasks_data,
|
|
705
700
|
hud_eval_config=eval_cfg_dict,
|
|
706
701
|
)
|
|
707
702
|
|
|
708
|
-
if cfg.taskset and ids:
|
|
709
|
-
if len(ids) != len(tasks_to_create):
|
|
710
|
-
hud_console.warning(
|
|
711
|
-
f"Task count mismatch: sent {len(tasks_to_create)} tasks, "
|
|
712
|
-
f"received {len(ids)} IDs. Some tasks may not be linked."
|
|
713
|
-
)
|
|
714
|
-
for task_obj, task_version_id in zip(tasks_to_create, ids, strict=False):
|
|
715
|
-
task_obj.id = task_version_id
|
|
716
|
-
|
|
717
703
|
trace_ids = await submit_rollouts(
|
|
718
704
|
tasks=tasks,
|
|
719
705
|
job_id=job_id,
|
|
@@ -809,7 +795,11 @@ def eval_command(
|
|
|
809
795
|
help="Automatically prompt the agent to continue if it does not respond with a tool call",
|
|
810
796
|
),
|
|
811
797
|
group_size: int | None = typer.Option(None, "--group-size", help="Runs per task"),
|
|
812
|
-
task_ids: str | None = typer.Option(
|
|
798
|
+
task_ids: str | None = typer.Option(
|
|
799
|
+
None,
|
|
800
|
+
"--task-ids",
|
|
801
|
+
help="Comma-separated task slugs (or 0-based indices) to run",
|
|
802
|
+
),
|
|
813
803
|
yes: bool = typer.Option(False, "--yes", "-y", help="Skip confirmation"),
|
|
814
804
|
remote: bool = typer.Option(
|
|
815
805
|
False, "--remote", help="Submit tasks to platform for remote execution"
|
|
@@ -138,8 +138,9 @@ def show_dev_ui(
|
|
|
138
138
|
# Show other info below
|
|
139
139
|
label = "Base image" if is_docker else "Server"
|
|
140
140
|
hud_console.info("")
|
|
141
|
-
hud_console.print(
|
|
142
|
-
|
|
141
|
+
_print = lambda msg: hud_console.console.print(msg, highlight=False)
|
|
142
|
+
_print(f"{hud_console.sym.ITEM} {escape(label)}: {escape(server_name)}")
|
|
143
|
+
_print(f"{hud_console.sym.ITEM} Cursor:")
|
|
143
144
|
# Display the Cursor link on its own line to prevent wrapping
|
|
144
145
|
hud_console.link(cursor_deeplink)
|
|
145
146
|
hud_console.info("")
|
|
@@ -5,6 +5,7 @@ from __future__ import annotations
|
|
|
5
5
|
import shutil
|
|
6
6
|
|
|
7
7
|
import typer
|
|
8
|
+
from rich.markup import escape
|
|
8
9
|
|
|
9
10
|
from hud.utils.hud_console import HUDConsole
|
|
10
11
|
|
|
@@ -91,8 +92,8 @@ def remove_environment(
|
|
|
91
92
|
if image:
|
|
92
93
|
hud_console.info("")
|
|
93
94
|
hud_console.info("Note: The Docker image may still exist locally.")
|
|
94
|
-
hud_console.
|
|
95
|
-
f"To remove it, run: [cyan]docker rmi {image.split('@')[0]}[/cyan]"
|
|
95
|
+
hud_console.print(
|
|
96
|
+
f"To remove it, run: [cyan]docker rmi {escape(image.split('@')[0])}[/cyan]"
|
|
96
97
|
)
|
|
97
98
|
except Exception as e:
|
|
98
99
|
hud_console.error(f"Failed to remove environment: {e}")
|
|
@@ -60,12 +60,12 @@ class TestIncrementVersion:
|
|
|
60
60
|
def test_increment_minor(self):
|
|
61
61
|
"""Test incrementing minor version."""
|
|
62
62
|
assert increment_version("1.2.3", "minor") == "1.3.0"
|
|
63
|
-
assert increment_version("0.5.
|
|
63
|
+
assert increment_version("0.5.27", "minor") == "0.6.0"
|
|
64
64
|
|
|
65
65
|
def test_increment_major(self):
|
|
66
66
|
"""Test incrementing major version."""
|
|
67
67
|
assert increment_version("1.2.3", "major") == "2.0.0"
|
|
68
|
-
assert increment_version("0.5.
|
|
68
|
+
assert increment_version("0.5.27", "major") == "1.0.0"
|
|
69
69
|
|
|
70
70
|
def test_increment_with_v_prefix(self):
|
|
71
71
|
"""Test incrementing version with v prefix."""
|
|
@@ -433,7 +433,7 @@ class InteractiveMCPTester:
|
|
|
433
433
|
# Show next steps tutorial
|
|
434
434
|
self.console.section_title("Next Steps")
|
|
435
435
|
self.console.info("🏗️ Ready to test with real agents? Run:")
|
|
436
|
-
self.console.
|
|
436
|
+
self.console.print(" [cyan]hud build[/cyan]")
|
|
437
437
|
self.console.info("")
|
|
438
438
|
self.console.info("This will:")
|
|
439
439
|
self.console.info(" 1. Build your environment image")
|
|
@@ -441,8 +441,10 @@ class InteractiveMCPTester:
|
|
|
441
441
|
self.console.info(" 3. Prepare it for testing with agents")
|
|
442
442
|
self.console.info("")
|
|
443
443
|
self.console.info("Then you can:")
|
|
444
|
-
self.console.
|
|
445
|
-
self.console.
|
|
444
|
+
self.console.print(" • Test locally: [cyan]hud run <image>[/cyan]")
|
|
445
|
+
self.console.print(
|
|
446
|
+
" • Push to registry: [cyan]hud push --image <registry/name>[/cyan]"
|
|
447
|
+
)
|
|
446
448
|
self.console.info(" • Use with agents via the lock file")
|
|
447
449
|
|
|
448
450
|
console.print("\n[dim]Happy testing! 🎉[/dim]")
|
|
@@ -26,6 +26,7 @@ from typing import NamedTuple
|
|
|
26
26
|
|
|
27
27
|
import httpx
|
|
28
28
|
from packaging import version
|
|
29
|
+
from rich.markup import escape
|
|
29
30
|
|
|
30
31
|
from hud.utils.hud_console import HUDConsole
|
|
31
32
|
|
|
@@ -241,16 +242,12 @@ def display_update_prompt(console: HUDConsole | None = None) -> None:
|
|
|
241
242
|
else:
|
|
242
243
|
upgrade_cmd = "uv tool upgrade hud-python"
|
|
243
244
|
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
f"
|
|
247
|
-
f"[
|
|
248
|
-
f"
|
|
249
|
-
f" Run: [bold yellow]{upgrade_cmd}[/bold yellow] to update"
|
|
245
|
+
console.print(
|
|
246
|
+
f"[yellow]🆕 A new version of hud-python is available: "
|
|
247
|
+
f"[bold cyan]{escape(info.latest)}[/bold cyan] "
|
|
248
|
+
f"(current: [dim]{escape(info.current)}[/dim])\n"
|
|
249
|
+
f" Run: [bold yellow]{escape(upgrade_cmd)}[/bold yellow] to update[/yellow]"
|
|
250
250
|
)
|
|
251
|
-
|
|
252
|
-
# Display using console info
|
|
253
|
-
console.info(f"[yellow]{update_msg}[/yellow]")
|
|
254
251
|
except Exception: # noqa: S110
|
|
255
252
|
# Never let version checking disrupt the user's workflow
|
|
256
253
|
pass
|
|
@@ -110,6 +110,8 @@ def _load_from_huggingface(dataset_name: str) -> list[Task]:
|
|
|
110
110
|
|
|
111
111
|
def _load_raw_from_api(dataset_name: str) -> list[dict[str, Any]]:
|
|
112
112
|
"""Load raw task dicts from HUD API."""
|
|
113
|
+
from hud.datasets.utils import _normalize_task_dict
|
|
114
|
+
|
|
113
115
|
headers = {}
|
|
114
116
|
if settings.api_key:
|
|
115
117
|
headers["Authorization"] = f"Bearer {settings.api_key}"
|
|
@@ -126,13 +128,11 @@ def _load_raw_from_api(dataset_name: str) -> list[dict[str, Any]]:
|
|
|
126
128
|
# Extract tasks dict from response
|
|
127
129
|
tasks_dict = data.get("tasks", {})
|
|
128
130
|
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
return raw_items
|
|
131
|
+
return [
|
|
132
|
+
_normalize_task_dict(task_data)
|
|
133
|
+
for task_data in tasks_dict.values()
|
|
134
|
+
if isinstance(task_data, dict)
|
|
135
|
+
]
|
|
136
136
|
|
|
137
137
|
|
|
138
138
|
def _load_from_api(dataset_name: str) -> list[Task]:
|
|
@@ -282,8 +282,13 @@ def save_tasks(
|
|
|
282
282
|
"Use Task.from_v4(legacy_task) to convert from LegacyTask."
|
|
283
283
|
)
|
|
284
284
|
|
|
285
|
-
# Convert tasks to dicts (Task is a Pydantic model)
|
|
286
|
-
|
|
285
|
+
# Convert tasks to dicts (Task is a Pydantic model).
|
|
286
|
+
# id is internal/platform-assigned; uploads should identify via slug.
|
|
287
|
+
task_dicts: list[dict[str, Any]] = []
|
|
288
|
+
for task in tasks:
|
|
289
|
+
task_data = task.model_dump(mode="json", exclude_none=True)
|
|
290
|
+
task_data.pop("id", None)
|
|
291
|
+
task_dicts.append(task_data)
|
|
287
292
|
|
|
288
293
|
# Build request payload
|
|
289
294
|
payload: dict[str, Any] = {
|
|
@@ -296,7 +301,7 @@ def save_tasks(
|
|
|
296
301
|
try:
|
|
297
302
|
with httpx.Client(timeout=60) as client:
|
|
298
303
|
response = client.post(
|
|
299
|
-
f"{settings.hud_api_url}/tasks/
|
|
304
|
+
f"{settings.hud_api_url}/tasks/upload",
|
|
300
305
|
json=payload,
|
|
301
306
|
headers=headers,
|
|
302
307
|
)
|
|
@@ -187,7 +187,7 @@ async def run_single_task(
|
|
|
187
187
|
```
|
|
188
188
|
"""
|
|
189
189
|
# Determine trace name
|
|
190
|
-
effective_trace_name = trace_name or task_id or task.
|
|
190
|
+
effective_trace_name = trace_name or task_id or task.slug or "single_task"
|
|
191
191
|
|
|
192
192
|
# Run with explicit eval context parameters
|
|
193
193
|
async with hud.eval(
|