hud-python 0.5.0__tar.gz → 0.5.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hud_python-0.5.0 → hud_python-0.5.1}/PKG-INFO +27 -14
- {hud_python-0.5.0 → hud_python-0.5.1}/README.md +26 -13
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/__init__.py +1 -1
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/base.py +26 -2
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/misc/response_agent.py +5 -1
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/openai_chat.py +12 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/tests/test_base.py +64 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/eval.py +56 -27
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/init.py +4 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/datasets/runner.py +4 -3
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/datasets/utils.py +7 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/connectors/remote.py +3 -4
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/environment.py +11 -3
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/scenarios.py +46 -9
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/eval/context.py +26 -23
- hud_python-0.5.1/hud/eval/instrument.py +185 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/utils/hud_console.py +7 -3
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/utils/tests/test_version.py +1 -1
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/version.py +1 -1
- {hud_python-0.5.0 → hud_python-0.5.1}/pyproject.toml +1 -1
- hud_python-0.5.0/hud/eval/instrument.py +0 -115
- {hud_python-0.5.0 → hud_python-0.5.1}/.gitignore +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/LICENSE +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/examples/README.md +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/__main__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/claude.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/gemini.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/gemini_cua.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/grounded_openai.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/misc/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/misc/integration_test_agent.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/openai.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/operator.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/tests/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/tests/conftest.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/tests/test_base_runtime.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/tests/test_claude.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/tests/test_client.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/tests/test_gemini.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/tests/test_grounded_openai_agent.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/tests/test_openai.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/tests/test_operator.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/tests/test_run_eval.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/__main__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/analyze.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/build.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/clone.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/debug.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/dev.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/flows/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/flows/dev.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/flows/init.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/flows/tasks.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/flows/templates.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/flows/tests/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/flows/tests/test_dev.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/get.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/list_func.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/pull.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/push.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/remove.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/rft.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/rft_status.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_analyze.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_analyze_metadata.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_analyze_module.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_build.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_build_failure.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_build_module.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_cli_init.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_cli_main.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_cli_more_wrappers.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_cli_root.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_clone.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_convert.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_cursor.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_debug.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_dev.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_eval.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_eval_bedrock.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_init.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_list_func.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_main_module.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_mcp_server.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_pull.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_push.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_push_happy.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_push_wrapper.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_registry.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_utils.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/celebrate.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/config.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/cursor.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/docker.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/env_check.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/environment.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/git.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/interactive.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/local_runner.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/logging.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/metadata.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/package_runner.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/registry.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/remote_runner.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/runner.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/server.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/source_hash.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/tasks.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/tests/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/tests/test_config.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/tests/test_docker.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/tests/test_docker_hints.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/tests/test_env_check.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/tests/test_environment.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/tests/test_git.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/tests/test_interactive_module.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/tests/test_local_runner.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/tests/test_logging_utils.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/tests/test_metadata.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/tests/test_package_runner.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/tests/test_registry_utils.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/tests/test_remote_runner.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/tests/test_runner_modules.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/tests/test_source_hash.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/tests/test_tasks.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/version_check.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/viewer.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/clients/README.md +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/clients/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/clients/base.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/clients/environment.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/clients/fastmcp.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/clients/mcp_use.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/clients/tests/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/clients/tests/test_analyze_scenarios.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/clients/tests/test_client_integration.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/clients/tests/test_fastmcp.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/clients/tests/test_mcp_use_retry.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/clients/tests/test_protocol.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/clients/utils/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/clients/utils/mcp_use_retry.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/clients/utils/retry.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/clients/utils/retry_transport.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/datasets/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/datasets/loader.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/datasets/tests/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/datasets/tests/test_loader.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/datasets/tests/test_utils.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/connection.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/connectors/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/connectors/base.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/connectors/local.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/connectors/mcp_config.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/connectors/openai.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/integrations/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/integrations/adk.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/integrations/anthropic.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/integrations/gemini.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/integrations/langchain.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/integrations/llamaindex.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/integrations/openai.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/mock.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/router.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/tests/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/tests/test_connection.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/tests/test_connectors.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/tests/test_environment.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/tests/test_integrations.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/tests/test_local_connectors.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/tests/test_scenarios.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/tests/test_tools.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/types.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/utils/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/utils/formats.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/utils/schema.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/utils/tool_wrappers.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/eval/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/eval/display.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/eval/manager.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/eval/parallel.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/eval/task.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/eval/tests/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/eval/tests/test_context.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/eval/tests/test_eval.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/eval/tests/test_manager.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/eval/tests/test_parallel.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/eval/tests/test_task.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/eval/types.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/eval/utils.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/native/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/native/comparator.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/native/tests/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/native/tests/test_comparator.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/native/tests/test_native_init.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/patches/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/patches/mcp_patches.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/patches/warnings.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/py.typed +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/samples/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/samples/browser.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/server/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/server/context.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/server/helper/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/server/low_level.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/server/router.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/server/server.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/server/tests/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/server/tests/test_add_tool.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/server/tests/test_context.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/server/tests/test_mcp_server_handlers.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/server/tests/test_mcp_server_integration.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/server/tests/test_mcp_server_more.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/server/tests/test_run_wrapper.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/server/tests/test_server_extra.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/server/tests/test_sigterm_runner.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/settings.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/shared/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/shared/exceptions.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/shared/hints.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/shared/requests.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/shared/tests/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/shared/tests/test_exceptions.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/shared/tests/test_hints.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/shared/tests/test_requests.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/telemetry/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/telemetry/exporter.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/telemetry/instrument.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/telemetry/tests/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/telemetry/tests/test_eval_telemetry.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/telemetry/tests/test_exporter.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/telemetry/tests/test_instrument.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/apply_patch.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/base.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/bash.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/computer/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/computer/anthropic.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/computer/gemini.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/computer/hud.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/computer/openai.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/computer/qwen.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/computer/settings.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/edit.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/executors/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/executors/base.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/executors/pyautogui.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/executors/tests/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/executors/tests/test_base_executor.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/executors/xdo.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/grounding/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/grounding/config.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/grounding/grounded_tool.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/grounding/grounder.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/grounding/tests/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/grounding/tests/test_grounded_tool.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/jupyter.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/playwright.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/response.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/shell.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/submit.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/tests/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/tests/test_apply_patch.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/tests/test_base.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/tests/test_bash.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/tests/test_bash_extended.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/tests/test_computer.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/tests/test_computer_actions.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/tests/test_edit.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/tests/test_init.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/tests/test_jupyter_tool.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/tests/test_playwright_tool.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/tests/test_response.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/tests/test_shell.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/tests/test_submit.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/tests/test_tools.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/tests/test_tools_init.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/tests/test_types.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/tests/test_utils.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/types.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/utils.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/types.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/utils/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/utils/env.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/utils/mcp.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/utils/pretty_errors.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/utils/strict_schema.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/utils/telemetry.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/utils/tests/__init__.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/utils/tests/test_init.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/utils/tests/test_mcp.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/utils/tests/test_pretty_errors.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/utils/tests/test_telemetry.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/utils/tests/test_tool_shorthand.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/utils/tool_shorthand.py +0 -0
- {hud_python-0.5.0 → hud_python-0.5.1}/hud/utils/types.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hud-python
|
|
3
|
-
Version: 0.5.0
|
|
3
|
+
Version: 0.5.1
|
|
4
4
|
Summary: SDK for the HUD platform.
|
|
5
5
|
Project-URL: Homepage, https://github.com/hud-evals/hud-python
|
|
6
6
|
Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
|
|
@@ -166,14 +166,21 @@ from hud import Environment
|
|
|
166
166
|
env = Environment("my-env")
|
|
167
167
|
|
|
168
168
|
@env.tool()
|
|
169
|
-
def
|
|
170
|
-
"""
|
|
171
|
-
return
|
|
172
|
-
|
|
173
|
-
@env.scenario("
|
|
174
|
-
async def
|
|
175
|
-
response = yield
|
|
176
|
-
yield 1.0 if answer in response else 0.0 # Reward
|
|
169
|
+
def add(a: int, b: int) -> int:
|
|
170
|
+
"""Add two numbers."""
|
|
171
|
+
return a + b
|
|
172
|
+
|
|
173
|
+
@env.scenario("solve-math")
|
|
174
|
+
async def solve_math(problem: str, answer: int):
|
|
175
|
+
response = yield problem # Prompt
|
|
176
|
+
yield 1.0 if str(answer) in response else 0.0 # Reward
|
|
177
|
+
|
|
178
|
+
async with env("solve-math", problem="What is 2+2?", answer=4) as ctx:
|
|
179
|
+
# Your agent logic here - call tools, get response
|
|
180
|
+
result = await ctx.call_tool("add", a=2, b=2)
|
|
181
|
+
await ctx.submit(f"The answer is {result}")
|
|
182
|
+
|
|
183
|
+
print(ctx.reward) # 1.0
|
|
177
184
|
```
|
|
178
185
|
|
|
179
186
|
The agent runs between the yields. First yield sends the prompt, second yield scores the result. → [Docs](https://docs.hud.ai/quick-links/environments) · [Templates](https://hud.ai/environments)
|
|
@@ -183,14 +190,20 @@ The agent runs between the yields. First yield sends the prompt, second yield sc
|
|
|
183
190
|
Test different models. Repeat runs to see the distribution:
|
|
184
191
|
|
|
185
192
|
```python
|
|
186
|
-
import
|
|
193
|
+
from openai import AsyncOpenAI
|
|
194
|
+
import os
|
|
187
195
|
|
|
188
|
-
|
|
196
|
+
client = AsyncOpenAI(
|
|
197
|
+
base_url="https://inference.hud.ai",
|
|
198
|
+
api_key=os.environ["HUD_API_KEY"]
|
|
199
|
+
)
|
|
189
200
|
|
|
190
|
-
|
|
201
|
+
# Using the env from above
|
|
202
|
+
async with env("solve-math", problem="What is 2+2?", answer=4, variants={"model": ["gpt-4o", "claude-sonnet-4-5"]}, group=5) as ctx:
|
|
191
203
|
response = await client.chat.completions.create(
|
|
192
204
|
model=ctx.variants["model"],
|
|
193
|
-
messages=[{"role": "user", "content": ctx.prompt}]
|
|
205
|
+
messages=[{"role": "user", "content": ctx.prompt}],
|
|
206
|
+
tools=ctx.tools # Environment tools available to the model
|
|
194
207
|
)
|
|
195
208
|
await ctx.submit(response.choices[0].message.content)
|
|
196
209
|
```
|
|
@@ -205,7 +218,7 @@ Push to GitHub, connect on hud.ai, run at scale:
|
|
|
205
218
|
hud init # Scaffold environment
|
|
206
219
|
git push # Push to GitHub
|
|
207
220
|
# Connect on hud.ai → New → Environment
|
|
208
|
-
hud eval my-
|
|
221
|
+
hud eval my-eval --model gpt-4o --group-size 100
|
|
209
222
|
# Or create and run tasks on the platform
|
|
210
223
|
```
|
|
211
224
|
|
|
@@ -68,14 +68,21 @@ from hud import Environment
|
|
|
68
68
|
env = Environment("my-env")
|
|
69
69
|
|
|
70
70
|
@env.tool()
|
|
71
|
-
def
|
|
72
|
-
"""
|
|
73
|
-
return
|
|
74
|
-
|
|
75
|
-
@env.scenario("
|
|
76
|
-
async def
|
|
77
|
-
response = yield
|
|
78
|
-
yield 1.0 if answer in response else 0.0 # Reward
|
|
71
|
+
def add(a: int, b: int) -> int:
|
|
72
|
+
"""Add two numbers."""
|
|
73
|
+
return a + b
|
|
74
|
+
|
|
75
|
+
@env.scenario("solve-math")
|
|
76
|
+
async def solve_math(problem: str, answer: int):
|
|
77
|
+
response = yield problem # Prompt
|
|
78
|
+
yield 1.0 if str(answer) in response else 0.0 # Reward
|
|
79
|
+
|
|
80
|
+
async with env("solve-math", problem="What is 2+2?", answer=4) as ctx:
|
|
81
|
+
# Your agent logic here - call tools, get response
|
|
82
|
+
result = await ctx.call_tool("add", a=2, b=2)
|
|
83
|
+
await ctx.submit(f"The answer is {result}")
|
|
84
|
+
|
|
85
|
+
print(ctx.reward) # 1.0
|
|
79
86
|
```
|
|
80
87
|
|
|
81
88
|
The agent runs between the yields. First yield sends the prompt, second yield scores the result. → [Docs](https://docs.hud.ai/quick-links/environments) · [Templates](https://hud.ai/environments)
|
|
@@ -85,14 +92,20 @@ The agent runs between the yields. First yield sends the prompt, second yield sc
|
|
|
85
92
|
Test different models. Repeat runs to see the distribution:
|
|
86
93
|
|
|
87
94
|
```python
|
|
88
|
-
import
|
|
95
|
+
from openai import AsyncOpenAI
|
|
96
|
+
import os
|
|
89
97
|
|
|
90
|
-
|
|
98
|
+
client = AsyncOpenAI(
|
|
99
|
+
base_url="https://inference.hud.ai",
|
|
100
|
+
api_key=os.environ["HUD_API_KEY"]
|
|
101
|
+
)
|
|
91
102
|
|
|
92
|
-
|
|
103
|
+
# Using the env from above
|
|
104
|
+
async with env("solve-math", problem="What is 2+2?", answer=4, variants={"model": ["gpt-4o", "claude-sonnet-4-5"]}, group=5) as ctx:
|
|
93
105
|
response = await client.chat.completions.create(
|
|
94
106
|
model=ctx.variants["model"],
|
|
95
|
-
messages=[{"role": "user", "content": ctx.prompt}]
|
|
107
|
+
messages=[{"role": "user", "content": ctx.prompt}],
|
|
108
|
+
tools=ctx.tools # Environment tools available to the model
|
|
96
109
|
)
|
|
97
110
|
await ctx.submit(response.choices[0].message.content)
|
|
98
111
|
```
|
|
@@ -107,7 +120,7 @@ Push to GitHub, connect on hud.ai, run at scale:
|
|
|
107
120
|
hud init # Scaffold environment
|
|
108
121
|
git push # Push to GitHub
|
|
109
122
|
# Connect on hud.ai → New → Environment
|
|
110
|
-
hud eval my-
|
|
123
|
+
hud eval my-eval --model gpt-4o --group-size 100
|
|
111
124
|
# Or create and run tasks on the platform
|
|
112
125
|
```
|
|
113
126
|
|
|
@@ -18,7 +18,7 @@ from .telemetry.instrument import instrument
|
|
|
18
18
|
def trace(*args: object, **kwargs: object) -> EvalContext:
|
|
19
19
|
"""Deprecated: Use hud.eval() instead.
|
|
20
20
|
|
|
21
|
-
.. deprecated:: 0.5.0
|
|
21
|
+
.. deprecated:: 0.5.1
|
|
22
22
|
hud.trace() is deprecated. Use hud.eval() or env.eval() instead.
|
|
23
23
|
"""
|
|
24
24
|
warnings.warn(
|
|
@@ -182,7 +182,23 @@ class MCPAgent(ABC):
|
|
|
182
182
|
raise TypeError(f"ctx must be EvalContext, got {type(ctx).__name__}")
|
|
183
183
|
|
|
184
184
|
if not ctx.prompt:
|
|
185
|
-
|
|
185
|
+
if ctx.has_scenario:
|
|
186
|
+
# Scenario was specified but prompt is still empty
|
|
187
|
+
# (e.g., scenario returned empty string, or edge case not caught in scenarios.py)
|
|
188
|
+
scenario = ctx._task.scenario if ctx._task else "unknown"
|
|
189
|
+
raise ValueError(
|
|
190
|
+
f"ctx.prompt is not set.\n\n"
|
|
191
|
+
f"Scenario '{scenario}' was specified but returned an empty prompt.\n"
|
|
192
|
+
f"Check that the scenario's setup function returns a non-empty string."
|
|
193
|
+
)
|
|
194
|
+
else:
|
|
195
|
+
# No scenario specified at all
|
|
196
|
+
raise ValueError(
|
|
197
|
+
"ctx.prompt is not set.\n\n"
|
|
198
|
+
"No scenario was specified in your task file.\n"
|
|
199
|
+
"Either add a 'scenario' field to your task, or set ctx.prompt manually "
|
|
200
|
+
"before running the agent."
|
|
201
|
+
)
|
|
186
202
|
|
|
187
203
|
# Store context for tool calls
|
|
188
204
|
self.ctx = ctx
|
|
@@ -194,6 +210,11 @@ class MCPAgent(ABC):
|
|
|
194
210
|
try:
|
|
195
211
|
result = await self._run_context(text_to_blocks(ctx.prompt), max_steps=max_steps)
|
|
196
212
|
|
|
213
|
+
# Propagate error state to context for platform visibility
|
|
214
|
+
if result.isError and hasattr(ctx, "error"):
|
|
215
|
+
error_msg = result.info.get("error") if result.info else result.content
|
|
216
|
+
ctx.error = Exception(str(error_msg)) if error_msg else Exception("Agent error")
|
|
217
|
+
|
|
197
218
|
# Submit final answer to context (only if scenario is running)
|
|
198
219
|
if result.content and ctx.has_scenario:
|
|
199
220
|
await ctx.submit(result.content)
|
|
@@ -202,6 +223,9 @@ class MCPAgent(ABC):
|
|
|
202
223
|
|
|
203
224
|
except Exception as e:
|
|
204
225
|
logger.exception("Error while running agent:")
|
|
226
|
+
# Propagate error to context for platform visibility
|
|
227
|
+
if hasattr(ctx, "error"):
|
|
228
|
+
ctx.error = e
|
|
205
229
|
return Trace(
|
|
206
230
|
reward=0.0,
|
|
207
231
|
done=True,
|
|
@@ -537,7 +561,7 @@ def find_reward(result: MCPToolResult) -> float:
|
|
|
537
561
|
except json.JSONDecodeError:
|
|
538
562
|
pass
|
|
539
563
|
|
|
540
|
-
logger.error("Couldn't parse reward from result: %s", result)
|
|
564
|
+
logger.error("Couldn't parse reward from result: %s", str(result.structuredContent))
|
|
541
565
|
return 0.0
|
|
542
566
|
|
|
543
567
|
|
|
@@ -1,11 +1,14 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import logging
|
|
3
4
|
from typing import Literal
|
|
4
5
|
|
|
5
6
|
from openai import AsyncOpenAI
|
|
6
7
|
|
|
7
8
|
from hud.settings import settings
|
|
8
9
|
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
9
12
|
ResponseType = Literal["STOP", "CONTINUE"]
|
|
10
13
|
|
|
11
14
|
DEFAULT_SYSTEM_PROMPT = """\
|
|
@@ -97,5 +100,6 @@ class ResponseAgent:
|
|
|
97
100
|
else:
|
|
98
101
|
return "CONTINUE"
|
|
99
102
|
|
|
100
|
-
except Exception:
|
|
103
|
+
except Exception as e:
|
|
104
|
+
logger.warning("Auto-respond failed: %s", e)
|
|
101
105
|
return "CONTINUE" # Default to continue on error
|
|
@@ -70,6 +70,18 @@ class OpenAIChatAgent(MCPAgent):
|
|
|
70
70
|
super().__init__(params, **kwargs)
|
|
71
71
|
self.config: OpenAIChatConfig
|
|
72
72
|
|
|
73
|
+
if (
|
|
74
|
+
self.config.api_key
|
|
75
|
+
and self.config.base_url
|
|
76
|
+
and settings.hud_gateway_url in self.config.base_url
|
|
77
|
+
and settings.api_key
|
|
78
|
+
and self.config.api_key != settings.api_key
|
|
79
|
+
):
|
|
80
|
+
raise ValueError(
|
|
81
|
+
"OpenAIChatAgent api_key is not allowed with HUD Gateway. "
|
|
82
|
+
"Use HUD_API_KEY for gateway auth and BYOK headers for provider keys."
|
|
83
|
+
)
|
|
84
|
+
|
|
73
85
|
if self.config.openai_client is not None:
|
|
74
86
|
self.oai = self.config.openai_client
|
|
75
87
|
elif self.config.api_key is not None or self.config.base_url is not None:
|
|
@@ -350,3 +350,67 @@ class TestMCPAgentToolSchemas:
|
|
|
350
350
|
assert len(schemas) == 1
|
|
351
351
|
assert schemas[0]["name"] == "my_tool"
|
|
352
352
|
assert schemas[0]["description"] == "My tool description"
|
|
353
|
+
|
|
354
|
+
|
|
355
|
+
class TestMCPAgentErrorPropagation:
|
|
356
|
+
"""Tests for error propagation to EvalContext."""
|
|
357
|
+
|
|
358
|
+
@pytest.mark.asyncio
|
|
359
|
+
async def test_exception_propagates_to_ctx_error(self) -> None:
|
|
360
|
+
"""Test that exceptions during run() set ctx.error for platform visibility."""
|
|
361
|
+
|
|
362
|
+
class FailingAgent(MockMCPAgent):
|
|
363
|
+
async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
|
|
364
|
+
raise RuntimeError("Agent crashed")
|
|
365
|
+
|
|
366
|
+
ctx = MockEvalContext(prompt="Do something")
|
|
367
|
+
agent = FailingAgent()
|
|
368
|
+
|
|
369
|
+
result = await agent.run(ctx)
|
|
370
|
+
|
|
371
|
+
# Should return error trace
|
|
372
|
+
assert result.isError is True
|
|
373
|
+
assert result.content is not None
|
|
374
|
+
assert "Agent crashed" in result.content
|
|
375
|
+
|
|
376
|
+
assert ctx.error is not None
|
|
377
|
+
assert isinstance(ctx.error, BaseException)
|
|
378
|
+
assert "Agent crashed" in str(ctx.error)
|
|
379
|
+
|
|
380
|
+
@pytest.mark.asyncio
|
|
381
|
+
async def test_step_error_propagates_to_ctx_error(self) -> None:
|
|
382
|
+
"""Test that step-level errors (caught internally) set ctx.error."""
|
|
383
|
+
step_count = [0]
|
|
384
|
+
|
|
385
|
+
class FailOnSecondStepAgent(MockMCPAgent):
|
|
386
|
+
async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
|
|
387
|
+
step_count[0] += 1
|
|
388
|
+
if step_count[0] == 1:
|
|
389
|
+
return AgentResponse(
|
|
390
|
+
content="",
|
|
391
|
+
tool_calls=[MCPToolCall(name="test_tool", arguments={})],
|
|
392
|
+
done=False,
|
|
393
|
+
)
|
|
394
|
+
else:
|
|
395
|
+
raise ValueError("Step 2 failed")
|
|
396
|
+
|
|
397
|
+
ctx = MockEvalContext(prompt="Do something")
|
|
398
|
+
agent = FailOnSecondStepAgent()
|
|
399
|
+
|
|
400
|
+
result = await agent.run(ctx)
|
|
401
|
+
|
|
402
|
+
# Should return error trace
|
|
403
|
+
assert result.isError is True
|
|
404
|
+
assert ctx.error is not None
|
|
405
|
+
assert "Step 2 failed" in str(ctx.error)
|
|
406
|
+
|
|
407
|
+
@pytest.mark.asyncio
|
|
408
|
+
async def test_no_error_when_successful(self) -> None:
|
|
409
|
+
"""Test that ctx.error remains None on successful run."""
|
|
410
|
+
ctx = MockEvalContext(prompt="Do something")
|
|
411
|
+
agent = MockMCPAgent()
|
|
412
|
+
|
|
413
|
+
result = await agent.run(ctx)
|
|
414
|
+
|
|
415
|
+
assert result.isError is False
|
|
416
|
+
assert ctx.error is None
|
|
@@ -91,10 +91,11 @@ _DEFAULT_CONFIG_TEMPLATE = """# HUD Eval Configuration
|
|
|
91
91
|
[eval]
|
|
92
92
|
# source = "hud-evals/SheetBench-50"
|
|
93
93
|
# agent = "claude"
|
|
94
|
-
#
|
|
94
|
+
# all = false # Run all problems instead of just 1
|
|
95
95
|
# max_concurrent = 30
|
|
96
96
|
# max_steps = 10
|
|
97
97
|
# group_size = 1
|
|
98
|
+
# byok = false # Remote only; use encrypted env vars on the platform.
|
|
98
99
|
# task_ids = ["task_1", "task_2"]
|
|
99
100
|
# verbose = true
|
|
100
101
|
# very_verbose = true
|
|
@@ -152,12 +153,13 @@ class EvalConfig(BaseModel):
|
|
|
152
153
|
"source",
|
|
153
154
|
"agent_type",
|
|
154
155
|
"task_ids",
|
|
155
|
-
"
|
|
156
|
+
"all",
|
|
156
157
|
"max_concurrent",
|
|
157
158
|
"max_steps",
|
|
158
159
|
"verbose",
|
|
159
160
|
"very_verbose",
|
|
160
161
|
"group_size",
|
|
162
|
+
"byok",
|
|
161
163
|
"remote",
|
|
162
164
|
"auto_respond",
|
|
163
165
|
"quiet",
|
|
@@ -171,13 +173,14 @@ class EvalConfig(BaseModel):
|
|
|
171
173
|
agent_type: AgentType | None = None
|
|
172
174
|
model: str | None = None
|
|
173
175
|
task_ids: list[str] | None = None
|
|
174
|
-
|
|
176
|
+
all: bool = False # Run all problems instead of just 1
|
|
175
177
|
max_concurrent: int = 30
|
|
176
|
-
max_steps: int
|
|
178
|
+
max_steps: int = 10
|
|
177
179
|
verbose: bool = False
|
|
178
180
|
very_verbose: bool = False
|
|
179
|
-
auto_respond: bool | None = None # Continue without prompting
|
|
181
|
+
auto_respond: bool | None = None # Continue without prompting
|
|
180
182
|
group_size: int = 1
|
|
183
|
+
byok: bool = False
|
|
181
184
|
remote: bool = False
|
|
182
185
|
quiet: bool = False # Suppress opening browser for eval links
|
|
183
186
|
gateway: bool = False # Use HUD Gateway for LLM API calls
|
|
@@ -208,6 +211,11 @@ class EvalConfig(BaseModel):
|
|
|
208
211
|
|
|
209
212
|
def validate_api_keys(self) -> None:
|
|
210
213
|
"""Validate required API keys for the selected agent. Raises typer.Exit on failure."""
|
|
214
|
+
# BYOK requires remote execution (check before agent_type guard)
|
|
215
|
+
if self.byok and not self.remote:
|
|
216
|
+
hud_console.error("--byok requires --remote (BYOK only works with remote execution)")
|
|
217
|
+
raise typer.Exit(1)
|
|
218
|
+
|
|
211
219
|
if self.agent_type is None:
|
|
212
220
|
return
|
|
213
221
|
|
|
@@ -284,14 +292,11 @@ class EvalConfig(BaseModel):
|
|
|
284
292
|
if self.model:
|
|
285
293
|
kwargs["model"] = self.model
|
|
286
294
|
|
|
287
|
-
if
|
|
295
|
+
# For gateway base_url, inject HUD API key if not already set
|
|
296
|
+
if self.agent_type == AgentType.OPENAI_COMPATIBLE and "api_key" not in kwargs:
|
|
288
297
|
base_url = kwargs.get("base_url", "")
|
|
289
|
-
if
|
|
290
|
-
|
|
291
|
-
if settings.hud_gateway_url in base_url:
|
|
292
|
-
kwargs["api_key"] = settings.api_key
|
|
293
|
-
elif settings.openai_api_key:
|
|
294
|
-
kwargs["api_key"] = settings.openai_api_key
|
|
298
|
+
if settings.hud_gateway_url in base_url and settings.api_key:
|
|
299
|
+
kwargs["api_key"] = settings.api_key
|
|
295
300
|
|
|
296
301
|
# Auto-detect Bedrock when Claude is selected with a Bedrock ARN
|
|
297
302
|
# Check both model and checkpoint_name for ARN patterns
|
|
@@ -454,12 +459,20 @@ class EvalConfig(BaseModel):
|
|
|
454
459
|
|
|
455
460
|
overrides.update({k: v for k, v in cli_args.items() if v is not None and v is not False})
|
|
456
461
|
|
|
457
|
-
for k in ("
|
|
462
|
+
for k in ("all", "verbose", "very_verbose", "remote", "quiet", "gateway"):
|
|
458
463
|
if cli_args.get(k) is True:
|
|
459
464
|
overrides[k] = True
|
|
460
465
|
elif k in overrides and cli_args.get(k) is False:
|
|
461
466
|
del overrides[k]
|
|
462
467
|
|
|
468
|
+
# --full is a shortcut for --all --auto-respond --max-steps 100
|
|
469
|
+
if overrides.get("full"):
|
|
470
|
+
overrides["all"] = True
|
|
471
|
+
if "auto_respond" not in overrides:
|
|
472
|
+
overrides["auto_respond"] = True
|
|
473
|
+
if "max_steps" not in overrides:
|
|
474
|
+
overrides["max_steps"] = 100
|
|
475
|
+
|
|
463
476
|
if config:
|
|
464
477
|
merged_agent_config = dict(self.agent_config)
|
|
465
478
|
for item in config:
|
|
@@ -541,15 +554,13 @@ class EvalConfig(BaseModel):
|
|
|
541
554
|
table.add_row(
|
|
542
555
|
"task_ids", ", ".join(self.task_ids[:5]) + ("..." if len(self.task_ids) > 5 else "")
|
|
543
556
|
)
|
|
544
|
-
table.add_row("
|
|
545
|
-
table.add_row("max_steps", str(self.max_steps
|
|
557
|
+
table.add_row("all", str(self.all))
|
|
558
|
+
table.add_row("max_steps", str(self.max_steps))
|
|
546
559
|
if not self.remote:
|
|
547
560
|
table.add_row("max_concurrent", str(self.max_concurrent))
|
|
548
561
|
if self.group_size > 1:
|
|
549
562
|
table.add_row("group_size", str(self.group_size))
|
|
550
|
-
|
|
551
|
-
effective_auto_respond = self.auto_respond if self.auto_respond is not None else self.full
|
|
552
|
-
if effective_auto_respond:
|
|
563
|
+
if self.auto_respond:
|
|
553
564
|
table.add_row("auto_respond", "[bold green]True[/bold green]")
|
|
554
565
|
if self.very_verbose:
|
|
555
566
|
table.add_row("very_verbose", "[bold green]True[/bold green]")
|
|
@@ -559,6 +570,8 @@ class EvalConfig(BaseModel):
|
|
|
559
570
|
table.add_row("remote", "[bold green]True[/bold green] (submitting to platform)")
|
|
560
571
|
if self.gateway:
|
|
561
572
|
table.add_row("gateway", "[bold green]True[/bold green] (routing via HUD Gateway)")
|
|
573
|
+
if self.byok:
|
|
574
|
+
table.add_row("byok", "[bold green]True[/bold green] (remote only)")
|
|
562
575
|
|
|
563
576
|
# Tool filters (only if set)
|
|
564
577
|
if self.allowed_tools:
|
|
@@ -642,8 +655,8 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
|
|
|
642
655
|
raise typer.Exit(1)
|
|
643
656
|
hud_console.info(f"Filtered to {len(filtered)} task(s) by ID")
|
|
644
657
|
tasks = filtered
|
|
645
|
-
elif not cfg.
|
|
646
|
-
# Single task mode (no --full,
|
|
658
|
+
elif not cfg.all:
|
|
659
|
+
# Single task mode (no --all, --full, or --task-ids)
|
|
647
660
|
tasks = [tasks[0]]
|
|
648
661
|
hud_console.info("Using first task (run with --full or --task-ids for more)…")
|
|
649
662
|
|
|
@@ -651,14 +664,17 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
|
|
|
651
664
|
|
|
652
665
|
# Prepare agent kwargs
|
|
653
666
|
agent_kwargs = cfg.get_agent_kwargs()
|
|
654
|
-
auto_respond = cfg.auto_respond
|
|
667
|
+
auto_respond = cfg.auto_respond
|
|
655
668
|
if auto_respond:
|
|
656
669
|
agent_kwargs = {**agent_kwargs, "auto_respond": True}
|
|
657
670
|
|
|
658
|
-
max_steps = cfg.max_steps
|
|
671
|
+
max_steps = cfg.max_steps
|
|
659
672
|
|
|
660
673
|
# Remote execution - submit to HUD platform
|
|
661
674
|
if cfg.remote:
|
|
675
|
+
agent_kwargs = {
|
|
676
|
+
k: v for k, v in agent_kwargs.items() if k not in ("api_key", "model_client")
|
|
677
|
+
}
|
|
662
678
|
# Create a job ID for tracking
|
|
663
679
|
import uuid
|
|
664
680
|
|
|
@@ -676,9 +692,10 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
|
|
|
676
692
|
agent_params=agent_kwargs,
|
|
677
693
|
max_steps=max_steps,
|
|
678
694
|
group_size=cfg.group_size,
|
|
695
|
+
use_byok=cfg.byok,
|
|
679
696
|
)
|
|
680
697
|
|
|
681
|
-
hud_console.success(f"Tasks submitted. View at: https://hud.ai/
|
|
698
|
+
hud_console.success(f"Tasks submitted. View at: https://hud.ai/jobs/{job_id}")
|
|
682
699
|
return [], tasks
|
|
683
700
|
|
|
684
701
|
# Single task mode - show extra info
|
|
@@ -724,7 +741,12 @@ def eval_command(
|
|
|
724
741
|
None,
|
|
725
742
|
help="Agent: claude, openai, operator, gemini, gemini_cua, openai_compatible, integration_test", # noqa: E501
|
|
726
743
|
),
|
|
727
|
-
|
|
744
|
+
all: bool = typer.Option(False, "--all", help="Run all problems instead of just 1"),
|
|
745
|
+
full: bool = typer.Option(
|
|
746
|
+
False,
|
|
747
|
+
"--full",
|
|
748
|
+
help="Run the entire dataset. Shortcut for --all --auto-respond --max-steps 100",
|
|
749
|
+
),
|
|
728
750
|
model: str | None = typer.Option(None, "--model", "-m", help="Model name"),
|
|
729
751
|
config: list[str] | None = typer.Option( # noqa: B008
|
|
730
752
|
None, "--config", "-c", help="Agent config: key=value"
|
|
@@ -743,10 +765,10 @@ def eval_command(
|
|
|
743
765
|
max_steps: int | None = typer.Option(None, "--max-steps", help="Max steps per task"),
|
|
744
766
|
verbose: bool = typer.Option(False, "--verbose", "-v", help="Verbose output"),
|
|
745
767
|
very_verbose: bool = typer.Option(False, "--very-verbose", "-vv", help="Debug logs"),
|
|
746
|
-
auto_respond: bool
|
|
747
|
-
|
|
768
|
+
auto_respond: bool = typer.Option(
|
|
769
|
+
False,
|
|
748
770
|
"--auto-respond",
|
|
749
|
-
help="
|
|
771
|
+
help="Automatically prompt the agent to continue if it does not respond with a tool call",
|
|
750
772
|
),
|
|
751
773
|
group_size: int | None = typer.Option(None, "--group-size", help="Runs per task"),
|
|
752
774
|
task_ids: str | None = typer.Option(None, "--task-ids", help="Comma-separated task IDs to run"),
|
|
@@ -754,6 +776,11 @@ def eval_command(
|
|
|
754
776
|
remote: bool = typer.Option(
|
|
755
777
|
False, "--remote", help="Submit tasks to platform for remote execution"
|
|
756
778
|
),
|
|
779
|
+
byok: bool = typer.Option(
|
|
780
|
+
False,
|
|
781
|
+
"--byok",
|
|
782
|
+
help="Remote only: use BYOK keys from encrypted env vars for inference",
|
|
783
|
+
),
|
|
757
784
|
quiet: bool = typer.Option(
|
|
758
785
|
False, "--quiet", "-q", help="Suppress opening browser for eval links"
|
|
759
786
|
),
|
|
@@ -778,6 +805,7 @@ def eval_command(
|
|
|
778
805
|
source=source,
|
|
779
806
|
agent=agent,
|
|
780
807
|
model=model,
|
|
808
|
+
all=all,
|
|
781
809
|
full=full,
|
|
782
810
|
max_concurrent=max_concurrent,
|
|
783
811
|
max_steps=max_steps,
|
|
@@ -790,6 +818,7 @@ def eval_command(
|
|
|
790
818
|
group_size=group_size,
|
|
791
819
|
config=config,
|
|
792
820
|
remote=remote,
|
|
821
|
+
byok=byok,
|
|
793
822
|
quiet=quiet,
|
|
794
823
|
gateway=gateway,
|
|
795
824
|
)
|
|
@@ -23,6 +23,8 @@ PRESET_MAP: dict[str, str | None] = {
|
|
|
23
23
|
"deep-research": "hud-deepresearch",
|
|
24
24
|
"browser": "hud-browser",
|
|
25
25
|
"rubrics": "hud-rubrics",
|
|
26
|
+
"verilog-coding-template": "verilog-coding-template",
|
|
27
|
+
"data-science-template": "data-science-template",
|
|
26
28
|
}
|
|
27
29
|
|
|
28
30
|
SKIP_DIR_NAMES = {"node_modules", "__pycache__", "dist", "build", ".next", ".git"}
|
|
@@ -92,6 +94,8 @@ def _prompt_for_preset() -> str:
|
|
|
92
94
|
{"name": "browser", "message": "browser"},
|
|
93
95
|
{"name": "deep-research", "message": "deep-research"},
|
|
94
96
|
{"name": "rubrics", "message": "rubrics"},
|
|
97
|
+
{"name": "verilog-coding-template", "message": "verilog-coding-template"},
|
|
98
|
+
{"name": "data-science-template", "message": "data-science-template"},
|
|
95
99
|
]
|
|
96
100
|
display_choices = [c["message"] for c in choices]
|
|
97
101
|
selected = questionary.select(
|
|
@@ -99,8 +99,8 @@ async def run_dataset(
|
|
|
99
99
|
) as ctx:
|
|
100
100
|
# Create agent fresh for each context (ensures correct tool initialization)
|
|
101
101
|
agent = agent_cls.create(**(agent_params or {}))
|
|
102
|
-
|
|
103
|
-
|
|
102
|
+
await agent.run(ctx, max_steps=max_steps)
|
|
103
|
+
# Reward is computed by EvalContext.__aexit__ from evaluate tools
|
|
104
104
|
|
|
105
105
|
# For parallel execution, results are collected via ctx.results
|
|
106
106
|
if hasattr(ctx, "results") and ctx.results:
|
|
@@ -207,6 +207,7 @@ async def run_single_task(
|
|
|
207
207
|
ctx.metadata.update(metadata)
|
|
208
208
|
|
|
209
209
|
result = await agent.run(ctx, max_steps=max_steps)
|
|
210
|
-
|
|
210
|
+
# Reward is computed by EvalContext.__aexit__ from evaluate tools
|
|
211
211
|
|
|
212
|
+
# Return the Trace (ctx.reward is set by EvalContext.__aexit__)
|
|
212
213
|
return result
|
|
@@ -51,6 +51,10 @@ class SingleTaskRequest(BaseModel):
|
|
|
51
51
|
description="Additional metadata to inject into the trace context.",
|
|
52
52
|
)
|
|
53
53
|
trace_id: str | None = Field(default=None, description="Pre-assigned trace ID.")
|
|
54
|
+
use_byok: bool = Field(
|
|
55
|
+
default=False,
|
|
56
|
+
description="If True, use BYOK headers from encrypted env vars for inference.",
|
|
57
|
+
)
|
|
54
58
|
|
|
55
59
|
@model_validator(mode="after")
|
|
56
60
|
def _validate_task(self) -> SingleTaskRequest:
|
|
@@ -110,6 +114,7 @@ async def submit_rollouts(
|
|
|
110
114
|
group_size: int = 1,
|
|
111
115
|
batch_size: int = 50,
|
|
112
116
|
metadata: dict[str, Any] | None = None,
|
|
117
|
+
use_byok: bool = False,
|
|
113
118
|
) -> None:
|
|
114
119
|
"""Submit rollouts to the HUD platform API for remote execution (fire-and-forget).
|
|
115
120
|
|
|
@@ -122,6 +127,7 @@ async def submit_rollouts(
|
|
|
122
127
|
group_size: Number of rollouts per task (for variance estimation)
|
|
123
128
|
batch_size: Number of rollouts per API batch request
|
|
124
129
|
metadata: Additional metadata for each rollout
|
|
130
|
+
use_byok: If True, use BYOK keys from encrypted env vars (remote only)
|
|
125
131
|
"""
|
|
126
132
|
from hud.eval.utils import is_v4_format
|
|
127
133
|
|
|
@@ -168,6 +174,7 @@ async def submit_rollouts(
|
|
|
168
174
|
trace_name=trace_name,
|
|
169
175
|
group_id=base_task_id if group_size > 1 else None,
|
|
170
176
|
metadata=metadata or {},
|
|
177
|
+
use_byok=use_byok,
|
|
171
178
|
)
|
|
172
179
|
)
|
|
173
180
|
|
|
@@ -61,13 +61,12 @@ class RemoteConnectorMixin(MCPConfigConnectorMixin):
|
|
|
61
61
|
self._hub_config = hub_config
|
|
62
62
|
|
|
63
63
|
# Create mcp_config with standard MCP URL and hub slug in headers
|
|
64
|
+
# Note: Authorization is injected at request time by httpx/aiohttp hooks
|
|
65
|
+
# in hud.eval.instrument (uses contextvar for api_key).
|
|
64
66
|
mcp_config = {
|
|
65
67
|
"hud": {
|
|
66
68
|
"url": settings.hud_mcp_url,
|
|
67
|
-
"headers": {
|
|
68
|
-
"Authorization": f"Bearer {settings.api_key}",
|
|
69
|
-
"Environment-Name": slug,
|
|
70
|
-
},
|
|
69
|
+
"headers": {"Environment-Name": slug},
|
|
71
70
|
}
|
|
72
71
|
}
|
|
73
72
|
|
|
@@ -323,7 +323,8 @@ class Environment(
|
|
|
323
323
|
if conn.is_connected:
|
|
324
324
|
await conn.disconnect()
|
|
325
325
|
name, err = errors[0]
|
|
326
|
-
|
|
326
|
+
str_err = str(err).replace("Client failed to connect: ", "") # Strip from FastMCP
|
|
327
|
+
raise ConnectionError(f"Failed to connect to {name}: {str_err}") from err
|
|
327
328
|
|
|
328
329
|
await self._build_routing()
|
|
329
330
|
|
|
@@ -399,13 +400,20 @@ class Environment(
|
|
|
399
400
|
if self._router.is_local(name):
|
|
400
401
|
# Call tool manager directly to avoid FastMCP context requirement
|
|
401
402
|
result = await self._tool_manager.call_tool(name, arguments)
|
|
402
|
-
return MCPToolResult(
|
|
403
|
+
return MCPToolResult(
|
|
404
|
+
content=result.content,
|
|
405
|
+
structuredContent=result.structured_content,
|
|
406
|
+
)
|
|
403
407
|
|
|
404
408
|
connection_name = self._router.get_connection(name)
|
|
405
409
|
if connection_name:
|
|
406
410
|
conn = self._connections[connection_name]
|
|
407
411
|
result = await conn.call_tool(name, arguments)
|
|
408
|
-
return MCPToolResult(
|
|
412
|
+
return MCPToolResult(
|
|
413
|
+
content=result.content,
|
|
414
|
+
isError=result.isError,
|
|
415
|
+
structuredContent=result.structuredContent,
|
|
416
|
+
)
|
|
409
417
|
|
|
410
418
|
raise ValueError(f"Tool not found: {name}")
|
|
411
419
|
|