hud-python 0.4.50__tar.gz → 0.4.52__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- {hud_python-0.4.50 → hud_python-0.4.52}/PKG-INFO +2 -1
- {hud_python-0.4.50 → hud_python-0.4.52}/environments/blank/server/pyproject.toml +1 -1
- {hud_python-0.4.50 → hud_python-0.4.52}/environments/deepresearch/server/pyproject.toml +1 -1
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/__init__.py +13 -1
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/agents/base.py +5 -1
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/agents/lite_llm.py +1 -1
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/agents/tests/test_base.py +8 -16
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/__init__.py +12 -22
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/eval.py +53 -84
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/tests/test_build.py +2 -1
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/tests/test_eval.py +4 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/tests/test_mcp_server.py +1 -1
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/utils/tasks.py +4 -1
- hud_python-0.4.52/hud/cli/utils/version_check.py +257 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/clients/base.py +1 -1
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/clients/mcp_use.py +3 -1
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/datasets/parallel.py +2 -2
- hud_python-0.4.52/hud/datasets/runner.py +184 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/otel/config.py +8 -6
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/otel/context.py +4 -4
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/otel/exporters.py +231 -57
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/rl/learner.py +1 -1
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/server/router.py +1 -1
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/shared/exceptions.py +0 -5
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/shared/tests/test_exceptions.py +17 -16
- hud_python-0.4.52/hud/telemetry/__init__.py +50 -0
- hud_python-0.4.52/hud/telemetry/async_context.py +331 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/telemetry/job.py +51 -12
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/telemetry/tests/test_trace.py +4 -4
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/telemetry/trace.py +16 -17
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/computer/qwen.py +4 -1
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/executors/base.py +4 -2
- hud_python-0.4.52/hud/utils/task_tracking.py +223 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/utils/tests/test_version.py +1 -1
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/version.py +1 -1
- {hud_python-0.4.50 → hud_python-0.4.52}/pyproject.toml +2 -1
- hud_python-0.4.50/hud/datasets/runner.py +0 -123
- hud_python-0.4.50/hud/telemetry/__init__.py +0 -26
- {hud_python-0.4.50 → hud_python-0.4.52}/.gitignore +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/LICENSE +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/README.md +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/environments/README.md +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/environments/blank/README.md +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/environments/blank/environment/README.md +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/environments/blank/environment/pyproject.toml +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/environments/blank/server/README.md +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/environments/browser/README.md +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/environments/browser/environment/2048/README.md +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/environments/browser/environment/2048/backend/pyproject.toml +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/environments/browser/environment/README.md +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/environments/browser/environment/pyproject.toml +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/environments/browser/environment/todo/README.md +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/environments/browser/environment/todo/backend/pyproject.toml +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/environments/browser/pyproject.toml +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/environments/browser/server/pyproject.toml +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/environments/deepresearch/README.md +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/environments/deepresearch/environment/pyproject.toml +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/environments/deepresearch/pyproject.toml +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/environments/remote_browser/README.md +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/environments/remote_browser/pyproject.toml +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/environments/remote_browser/src/hud_controller/providers/README.md +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/environments/text_2048/README.md +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/environments/text_2048/pyproject.toml +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/examples/README.md +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/__main__.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/agents/__init__.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/agents/claude.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/agents/grounded_openai.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/agents/langchain.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/agents/misc/__init__.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/agents/misc/integration_test_agent.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/agents/misc/response_agent.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/agents/openai.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/agents/openai_chat_generic.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/agents/tests/__init__.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/agents/tests/test_claude.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/agents/tests/test_client.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/agents/tests/test_grounded_openai_agent.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/agents/tests/test_openai.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/__main__.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/analyze.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/build.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/clone.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/debug.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/dev.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/flows/__init__.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/flows/tasks.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/get.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/init.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/list_func.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/pull.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/push.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/remove.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/rl/__init__.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/rl/celebrate.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/rl/config.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/rl/display.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/rl/gpu.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/rl/gpu_utils.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/rl/local_runner.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/rl/presets.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/rl/remote_runner.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/rl/rl_api.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/rl/viewer.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/rl/vllm.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/rl/wait_utils.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/tests/__init__.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/tests/test_analyze.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/tests/test_analyze_metadata.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/tests/test_cli_init.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/tests/test_cli_main.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/tests/test_clone.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/tests/test_cursor.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/tests/test_debug.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/tests/test_list_func.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/tests/test_main_module.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/tests/test_pull.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/tests/test_push.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/tests/test_registry.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/tests/test_utils.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/utils/__init__.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/utils/config.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/utils/cursor.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/utils/docker.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/utils/env_check.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/utils/environment.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/utils/interactive.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/utils/local_runner.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/utils/logging.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/utils/metadata.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/utils/package_runner.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/utils/registry.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/utils/remote_runner.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/utils/runner.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/utils/server.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/cli/utils/source_hash.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/clients/README.md +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/clients/__init__.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/clients/fastmcp.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/clients/tests/__init__.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/clients/tests/test_client_integration.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/clients/tests/test_fastmcp.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/clients/tests/test_mcp_use_retry.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/clients/tests/test_protocol.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/clients/utils/__init__.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/clients/utils/mcp_use_retry.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/clients/utils/retry.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/clients/utils/retry_transport.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/datasets/__init__.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/datasets/utils.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/misc/__init__.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/misc/claude_plays_pokemon.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/native/__init__.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/native/comparator.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/native/tests/__init__.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/native/tests/test_comparator.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/native/tests/test_native_init.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/otel/__init__.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/otel/collector.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/otel/instrumentation.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/otel/processors.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/otel/tests/__init__.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/otel/tests/test_processors.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/py.typed +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/rl/README.md +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/rl/__init__.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/rl/actor.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/rl/buffer.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/rl/chat_template.jinja +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/rl/config.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/rl/distributed.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/rl/tests/__init__.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/rl/tests/test_learner.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/rl/train.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/rl/types.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/rl/utils/start_vllm_server.sh +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/rl/utils.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/rl/vllm_adapter.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/samples/__init__.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/samples/browser.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/server/__init__.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/server/context.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/server/helper/__init__.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/server/low_level.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/server/server.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/server/tests/__init__.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/server/tests/test_add_tool.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/server/tests/test_context.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/server/tests/test_mcp_server_handlers.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/server/tests/test_mcp_server_integration.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/server/tests/test_mcp_server_more.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/server/tests/test_run_wrapper.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/server/tests/test_server_extra.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/server/tests/test_sigterm_runner.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/settings.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/shared/__init__.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/shared/hints.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/shared/requests.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/shared/tests/__init__.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/shared/tests/test_requests.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/telemetry/instrument.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/telemetry/replay.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/telemetry/tests/__init__.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/telemetry/tests/test_replay.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/__init__.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/base.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/bash.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/computer/__init__.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/computer/anthropic.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/computer/hud.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/computer/openai.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/computer/settings.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/edit.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/executors/__init__.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/executors/pyautogui.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/executors/tests/__init__.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/executors/tests/test_base_executor.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/executors/xdo.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/grounding/__init__.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/grounding/config.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/grounding/grounded_tool.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/grounding/grounder.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/grounding/tests/__init__.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/grounding/tests/test_grounded_tool.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/playwright.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/response.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/submit.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/tests/__init__.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/tests/test_base.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/tests/test_bash.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/tests/test_bash_extended.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/tests/test_computer.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/tests/test_computer_actions.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/tests/test_edit.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/tests/test_init.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/tests/test_playwright_tool.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/tests/test_response.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/tests/test_tools.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/tests/test_tools_init.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/tests/test_utils.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/types.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/tools/utils.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/types.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/utils/__init__.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/utils/agent_factories.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/utils/async_utils.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/utils/group_eval.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/utils/hud_console.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/utils/mcp.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/utils/pretty_errors.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/utils/progress.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/utils/tasks.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/utils/telemetry.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/utils/tests/__init__.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/utils/tests/test_async_utils.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/utils/tests/test_init.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/utils/tests/test_mcp.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/utils/tests/test_progress.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/utils/tests/test_telemetry.py +0 -0
- {hud_python-0.4.50 → hud_python-0.4.52}/hud/utils/tool_shorthand.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hud-python
|
|
3
|
-
Version: 0.4.50
|
|
3
|
+
Version: 0.4.52
|
|
4
4
|
Summary: SDK for the HUD platform.
|
|
5
5
|
Project-URL: Homepage, https://github.com/hud-evals/hud-python
|
|
6
6
|
Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
|
|
@@ -48,6 +48,7 @@ Requires-Dist: opentelemetry-api>=1.34.1
|
|
|
48
48
|
Requires-Dist: opentelemetry-exporter-otlp-proto-http>=1.34.1
|
|
49
49
|
Requires-Dist: opentelemetry-instrumentation-mcp==0.47.0
|
|
50
50
|
Requires-Dist: opentelemetry-sdk>=1.34.1
|
|
51
|
+
Requires-Dist: packaging>=21.0
|
|
51
52
|
Requires-Dist: pathspec>=0.12.1
|
|
52
53
|
Requires-Dist: pillow>=11.1.0
|
|
53
54
|
Requires-Dist: prompt-toolkit==3.0.51
|
|
@@ -5,10 +5,22 @@ tools for building, evaluating, and training AI agents.
|
|
|
5
5
|
|
|
6
6
|
from __future__ import annotations
|
|
7
7
|
|
|
8
|
-
from .telemetry import Trace, clear_trace, create_job, get_trace, instrument, job, trace
|
|
8
|
+
from .telemetry import (
|
|
9
|
+
Trace,
|
|
10
|
+
async_job,
|
|
11
|
+
async_trace,
|
|
12
|
+
clear_trace,
|
|
13
|
+
create_job,
|
|
14
|
+
get_trace,
|
|
15
|
+
instrument,
|
|
16
|
+
job,
|
|
17
|
+
trace,
|
|
18
|
+
)
|
|
9
19
|
|
|
10
20
|
__all__ = [
|
|
11
21
|
"Trace",
|
|
22
|
+
"async_job",
|
|
23
|
+
"async_trace",
|
|
12
24
|
"clear_trace",
|
|
13
25
|
"create_job",
|
|
14
26
|
"get_trace",
|
|
@@ -55,6 +55,7 @@ class MCPAgent(ABC):
|
|
|
55
55
|
# Filtering
|
|
56
56
|
allowed_tools: list[str] | None = None,
|
|
57
57
|
disallowed_tools: list[str] | None = None,
|
|
58
|
+
response_tool_name: str | None = None,
|
|
58
59
|
# Messages
|
|
59
60
|
system_prompt: str = GLOBAL_SYSTEM_PROMPT,
|
|
60
61
|
append_setup_output: bool = True,
|
|
@@ -74,6 +75,7 @@ class MCPAgent(ABC):
|
|
|
74
75
|
that provides `mcp_config`.
|
|
75
76
|
allowed_tools: Names of tools to allow (None means allow all).
|
|
76
77
|
disallowed_tools: Names of tools to always exclude.
|
|
78
|
+
response_tool_name: Name of the tool to use for response.
|
|
77
79
|
system_prompt: System prompt to seed the conversation.
|
|
78
80
|
append_setup_output: Whether to append setup tool output to the
|
|
79
81
|
first turn's messages.
|
|
@@ -108,7 +110,7 @@ class MCPAgent(ABC):
|
|
|
108
110
|
|
|
109
111
|
# Initialize these here so methods can be called before initialize()
|
|
110
112
|
self._tool_map: dict[str, types.Tool] = {} # Simplified: just name to tool
|
|
111
|
-
self.response_tool_name =
|
|
113
|
+
self.response_tool_name = response_tool_name
|
|
112
114
|
|
|
113
115
|
# Trace
|
|
114
116
|
self._auto_trace = auto_trace
|
|
@@ -168,6 +170,8 @@ class MCPAgent(ABC):
|
|
|
168
170
|
self.disallowed_tools.extend(task.agent_config["disallowed_tools"])
|
|
169
171
|
else: # If disallowed_tools is None, we overwrite it
|
|
170
172
|
self.disallowed_tools = task.agent_config["disallowed_tools"]
|
|
173
|
+
if "response_tool_name" in task.agent_config:
|
|
174
|
+
self.response_tool_name = task.agent_config["response_tool_name"]
|
|
171
175
|
|
|
172
176
|
all_tools = await self.mcp_client.list_tools()
|
|
173
177
|
self._available_tools = []
|
|
@@ -47,7 +47,7 @@ class LiteAgent(GenericOpenAIChatAgent):
|
|
|
47
47
|
**agent_kwargs,
|
|
48
48
|
)
|
|
49
49
|
|
|
50
|
-
def get_tool_schemas(self) -> list[
|
|
50
|
+
def get_tool_schemas(self) -> list[Any]:
|
|
51
51
|
# Prefer LiteLLM's stricter transformer (handles Bedrock & friends)
|
|
52
52
|
if transform_mcp_tool_to_openai_tool is not None:
|
|
53
53
|
return [
|
|
@@ -94,7 +94,7 @@ class TestBaseMCPAgent:
|
|
|
94
94
|
|
|
95
95
|
assert agent.mcp_client is not None
|
|
96
96
|
assert agent.allowed_tools is None
|
|
97
|
-
assert agent.disallowed_tools
|
|
97
|
+
assert agent.disallowed_tools is None
|
|
98
98
|
assert agent.initial_screenshot is True
|
|
99
99
|
assert agent.system_prompt is not None # Default system prompt is set
|
|
100
100
|
|
|
@@ -241,6 +241,13 @@ class TestBaseMCPAgent:
|
|
|
241
241
|
assert "tool2" not in tool_names # Not in allowed list
|
|
242
242
|
assert "tool3" not in tool_names # In disallowed list
|
|
243
243
|
|
|
244
|
+
# Make sure tool schemas are correct
|
|
245
|
+
schemas = agent.get_tool_schemas()
|
|
246
|
+
assert len(schemas) == 1
|
|
247
|
+
assert schemas[0]["name"] == "tool1"
|
|
248
|
+
assert schemas[0]["description"] == "Tool 1"
|
|
249
|
+
assert schemas[0]["parameters"] == {"type": "object"}
|
|
250
|
+
|
|
244
251
|
@pytest.mark.asyncio
|
|
245
252
|
async def test_call_tool_success(self):
|
|
246
253
|
"""Test successful tool call."""
|
|
@@ -322,21 +329,6 @@ class TestBaseMCPAgent:
|
|
|
322
329
|
# call_tools doesn't validate empty names, it will return error
|
|
323
330
|
await agent.call_tools(tool_call)
|
|
324
331
|
|
|
325
|
-
def test_get_tool_schemas(self):
|
|
326
|
-
"""Test getting tool schemas."""
|
|
327
|
-
agent = MockMCPAgent()
|
|
328
|
-
|
|
329
|
-
agent._available_tools = [
|
|
330
|
-
types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"}),
|
|
331
|
-
types.Tool(name="setup", description="Setup", inputSchema={"type": "object"}),
|
|
332
|
-
]
|
|
333
|
-
|
|
334
|
-
schemas = agent.get_tool_schemas()
|
|
335
|
-
|
|
336
|
-
# Should include non-lifecycle tools
|
|
337
|
-
assert len(schemas) == 1
|
|
338
|
-
assert schemas[0]["name"] == "tool1"
|
|
339
|
-
|
|
340
332
|
def test_get_tools_by_server(self):
|
|
341
333
|
"""Test getting tools grouped by server."""
|
|
342
334
|
agent = MockMCPAgent()
|
|
@@ -796,33 +796,19 @@ def eval(
|
|
|
796
796
|
help="Comma-separated list of allowed tools",
|
|
797
797
|
),
|
|
798
798
|
max_concurrent: int = typer.Option(
|
|
799
|
-
|
|
799
|
+
30,
|
|
800
800
|
"--max-concurrent",
|
|
801
|
-
help="
|
|
801
|
+
help="Maximum concurrent tasks (1-200 recommended, prevents rate limits)",
|
|
802
802
|
),
|
|
803
803
|
max_steps: int | None = typer.Option(
|
|
804
804
|
None,
|
|
805
805
|
"--max-steps",
|
|
806
806
|
help="Maximum steps per task (default: 10 for single, 50 for full)",
|
|
807
807
|
),
|
|
808
|
-
parallel: bool = typer.Option(
|
|
809
|
-
False,
|
|
810
|
-
"--parallel",
|
|
811
|
-
help="Use process-based parallel execution for large datasets (100+ tasks)",
|
|
812
|
-
),
|
|
813
|
-
max_workers: int | None = typer.Option(
|
|
814
|
-
None,
|
|
815
|
-
"--max-workers",
|
|
816
|
-
help="Number of worker processes for parallel mode (auto-optimized if not set)",
|
|
817
|
-
),
|
|
818
|
-
max_concurrent_per_worker: int = typer.Option(
|
|
819
|
-
20,
|
|
820
|
-
"--max-concurrent-per-worker",
|
|
821
|
-
help="Maximum concurrent tasks per worker in parallel mode",
|
|
822
|
-
),
|
|
823
808
|
verbose: bool = typer.Option(
|
|
824
809
|
False,
|
|
825
810
|
"--verbose",
|
|
811
|
+
"-v",
|
|
826
812
|
help="Enable verbose output from the agent",
|
|
827
813
|
),
|
|
828
814
|
very_verbose: bool = typer.Option(
|
|
@@ -867,14 +853,14 @@ def eval(
|
|
|
867
853
|
|
|
868
854
|
source = find_tasks_file(None, msg="Select a tasks file to run")
|
|
869
855
|
hud_console.success(f"Selected: {source}")
|
|
870
|
-
except Exception as e:
|
|
856
|
+
except (FileNotFoundError, Exception):
|
|
871
857
|
hud_console.error(
|
|
872
858
|
"No source provided and no task/eval JSON files found in current directory"
|
|
873
859
|
)
|
|
874
860
|
hud_console.info(
|
|
875
861
|
"Usage: hud eval <source> or create a task JSON file (e.g., task.json, tasks.jsonl)"
|
|
876
862
|
)
|
|
877
|
-
raise typer.Exit(1) from e
|
|
863
|
+
raise typer.Exit(1) from None
|
|
878
864
|
|
|
879
865
|
# Import eval_command lazily to avoid importing agent dependencies
|
|
880
866
|
try:
|
|
@@ -950,9 +936,6 @@ def eval(
|
|
|
950
936
|
allowed_tools=allowed_tools,
|
|
951
937
|
max_concurrent=max_concurrent,
|
|
952
938
|
max_steps=max_steps,
|
|
953
|
-
parallel=parallel,
|
|
954
|
-
max_workers=max_workers,
|
|
955
|
-
max_concurrent_per_worker=max_concurrent_per_worker,
|
|
956
939
|
verbose=verbose,
|
|
957
940
|
very_verbose=very_verbose,
|
|
958
941
|
vllm_base_url=vllm_base_url,
|
|
@@ -1126,6 +1109,13 @@ def set(
|
|
|
1126
1109
|
|
|
1127
1110
|
def main() -> None:
|
|
1128
1111
|
"""Main entry point for the CLI."""
|
|
1112
|
+
# Check for updates (including on --version command)
|
|
1113
|
+
# Skip only on help-only commands
|
|
1114
|
+
if not (len(sys.argv) == 1 or (len(sys.argv) == 2 and sys.argv[1] in ["--help", "-h"])):
|
|
1115
|
+
from .utils.version_check import display_update_prompt
|
|
1116
|
+
|
|
1117
|
+
display_update_prompt()
|
|
1118
|
+
|
|
1129
1119
|
# Handle --version flag before Typer parses args
|
|
1130
1120
|
if "--version" in sys.argv:
|
|
1131
1121
|
try:
|
|
@@ -300,6 +300,7 @@ async def run_single_task(
|
|
|
300
300
|
agent_config = {
|
|
301
301
|
"model": model or "claude-sonnet-4-20250514",
|
|
302
302
|
"verbose": verbose,
|
|
303
|
+
"validate_api_key": False,
|
|
303
304
|
}
|
|
304
305
|
if allowed_tools:
|
|
305
306
|
agent_config["allowed_tools"] = allowed_tools
|
|
@@ -345,24 +346,18 @@ async def run_full_dataset(
|
|
|
345
346
|
allowed_tools: list[str] | None = None,
|
|
346
347
|
max_concurrent: int = 30,
|
|
347
348
|
max_steps: int = 10,
|
|
348
|
-
parallel: bool = False,
|
|
349
|
-
max_workers: int | None = None,
|
|
350
|
-
max_concurrent_per_worker: int = 25,
|
|
351
349
|
verbose: bool = False,
|
|
352
350
|
vllm_base_url: str | None = None,
|
|
353
351
|
group_size: int = 1,
|
|
354
352
|
) -> list[Any]:
|
|
355
|
-
"""Run evaluation across the entire dataset.
|
|
356
|
-
|
|
357
|
-
Uses either asyncio-based run_dataset or process-based parallel execution
|
|
358
|
-
depending on the parallel flag."""
|
|
353
|
+
"""Run evaluation across the entire dataset using asyncio-based concurrency."""
|
|
359
354
|
|
|
360
355
|
# Provide early feedback to user
|
|
361
356
|
hud_console.info("🔧 Initializing evaluation...")
|
|
362
357
|
|
|
363
358
|
# Import run_dataset lazily
|
|
364
359
|
try:
|
|
365
|
-
from hud.datasets import run_dataset
|
|
360
|
+
from hud.datasets import run_dataset
|
|
366
361
|
from hud.utils.tasks import load_tasks
|
|
367
362
|
except ImportError as e:
|
|
368
363
|
hud_console.error(
|
|
@@ -434,7 +429,7 @@ async def run_full_dataset(
|
|
|
434
429
|
)
|
|
435
430
|
raise typer.Exit(1) from e
|
|
436
431
|
|
|
437
|
-
agent_config = {"verbose": verbose}
|
|
432
|
+
agent_config = {"verbose": verbose, "validate_api_key": False}
|
|
438
433
|
if allowed_tools:
|
|
439
434
|
agent_config["allowed_tools"] = allowed_tools
|
|
440
435
|
|
|
@@ -472,6 +467,7 @@ async def run_full_dataset(
|
|
|
472
467
|
agent_config = {
|
|
473
468
|
"model": model or "claude-sonnet-4-20250514",
|
|
474
469
|
"verbose": verbose,
|
|
470
|
+
"validate_api_key": False,
|
|
475
471
|
}
|
|
476
472
|
if allowed_tools:
|
|
477
473
|
agent_config["allowed_tools"] = allowed_tools
|
|
@@ -505,9 +501,7 @@ async def run_full_dataset(
|
|
|
505
501
|
agent_class=agent_class,
|
|
506
502
|
agent_config=agent_config,
|
|
507
503
|
group_size=group_size,
|
|
508
|
-
max_parallel_episodes=max_concurrent
|
|
509
|
-
if not parallel
|
|
510
|
-
else max_concurrent_per_worker * (max_workers or 4),
|
|
504
|
+
max_parallel_episodes=max_concurrent,
|
|
511
505
|
max_steps=max_steps,
|
|
512
506
|
verbose=verbose,
|
|
513
507
|
job_id=job.id,
|
|
@@ -519,48 +513,18 @@ async def run_full_dataset(
|
|
|
519
513
|
# Return stats for consistency with other modes
|
|
520
514
|
return stats
|
|
521
515
|
|
|
522
|
-
#
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
max_concurrent=max_concurrent,
|
|
535
|
-
metadata={"dataset": source, "parallel": True},
|
|
536
|
-
max_steps=max_steps,
|
|
537
|
-
auto_respond=True,
|
|
538
|
-
)
|
|
539
|
-
else:
|
|
540
|
-
# Use manual configuration
|
|
541
|
-
return await run_dataset_parallel_manual(
|
|
542
|
-
name=f"Evaluation {dataset_name}",
|
|
543
|
-
dataset=dataset_or_tasks,
|
|
544
|
-
agent_class=agent_class,
|
|
545
|
-
agent_config=agent_config,
|
|
546
|
-
max_workers=max_workers,
|
|
547
|
-
max_concurrent_per_worker=max_concurrent_per_worker,
|
|
548
|
-
max_concurrent=max_concurrent,
|
|
549
|
-
metadata={"dataset": source, "parallel": True},
|
|
550
|
-
max_steps=max_steps,
|
|
551
|
-
auto_respond=True,
|
|
552
|
-
)
|
|
553
|
-
else:
|
|
554
|
-
hud_console.info(f"🚀 Running evaluation (max_concurrent: {max_concurrent})…")
|
|
555
|
-
return await run_dataset(
|
|
556
|
-
name=f"Evaluation {dataset_name}",
|
|
557
|
-
dataset=dataset_or_tasks,
|
|
558
|
-
agent_class=agent_class,
|
|
559
|
-
agent_config=agent_config,
|
|
560
|
-
max_concurrent=max_concurrent,
|
|
561
|
-
metadata={"dataset": source},
|
|
562
|
-
max_steps=max_steps,
|
|
563
|
-
)
|
|
516
|
+
# Run evaluation with asyncio-based concurrency
|
|
517
|
+
hud_console.info(f"🚀 Running evaluation (max_concurrent: {max_concurrent})…")
|
|
518
|
+
return await run_dataset(
|
|
519
|
+
name=f"Evaluation {dataset_name}",
|
|
520
|
+
dataset=dataset_or_tasks,
|
|
521
|
+
agent_class=agent_class,
|
|
522
|
+
agent_config=agent_config,
|
|
523
|
+
max_concurrent=max_concurrent,
|
|
524
|
+
metadata={"dataset": source},
|
|
525
|
+
max_steps=max_steps,
|
|
526
|
+
auto_respond=True,
|
|
527
|
+
)
|
|
564
528
|
|
|
565
529
|
|
|
566
530
|
def eval_command(
|
|
@@ -591,31 +555,20 @@ def eval_command(
|
|
|
591
555
|
max_concurrent: int = typer.Option(
|
|
592
556
|
30,
|
|
593
557
|
"--max-concurrent",
|
|
594
|
-
help=
|
|
558
|
+
help=(
|
|
559
|
+
"Maximum concurrent tasks (1-200 recommended, prevents rate limits "
|
|
560
|
+
"and resource exhaustion)"
|
|
561
|
+
),
|
|
595
562
|
),
|
|
596
563
|
max_steps: int | None = typer.Option(
|
|
597
564
|
None,
|
|
598
565
|
"--max-steps",
|
|
599
566
|
help="Maximum steps per task (default: 10 for single, 50 for full)",
|
|
600
567
|
),
|
|
601
|
-
parallel: bool = typer.Option(
|
|
602
|
-
False,
|
|
603
|
-
"--parallel",
|
|
604
|
-
help="Use process-based parallel execution for large datasets (100+ tasks)",
|
|
605
|
-
),
|
|
606
|
-
max_workers: int | None = typer.Option(
|
|
607
|
-
None,
|
|
608
|
-
"--max-workers",
|
|
609
|
-
help="Number of worker processes for parallel mode (auto-optimized if not set)",
|
|
610
|
-
),
|
|
611
|
-
max_concurrent_per_worker: int = typer.Option(
|
|
612
|
-
20,
|
|
613
|
-
"--max-concurrent-per-worker",
|
|
614
|
-
help="Maximum concurrent tasks per worker in parallel mode",
|
|
615
|
-
),
|
|
616
568
|
verbose: bool = typer.Option(
|
|
617
569
|
False,
|
|
618
570
|
"--verbose",
|
|
571
|
+
"-v",
|
|
619
572
|
help="Enable verbose output from the agent",
|
|
620
573
|
),
|
|
621
574
|
very_verbose: bool = typer.Option(
|
|
@@ -650,23 +603,20 @@ def eval_command(
|
|
|
650
603
|
# Evaluate a single task from SheetBench
|
|
651
604
|
hud eval hud-evals/SheetBench-50
|
|
652
605
|
|
|
653
|
-
# Evaluate the FULL SheetBench dataset with Claude
|
|
606
|
+
# Evaluate the FULL SheetBench dataset with Claude
|
|
654
607
|
hud eval hud-evals/SheetBench-50 --full --agent claude
|
|
655
608
|
|
|
656
|
-
# Run
|
|
657
|
-
hud eval hud-evals/OSWorld-Verified-Gold --full --
|
|
658
|
-
|
|
659
|
-
# Parallel mode with manual configuration (16 workers, 25 tasks each)
|
|
660
|
-
hud eval hud-evals/OSWorld-Verified-Gold --full --parallel --max-workers 16
|
|
609
|
+
# Run with higher concurrency for faster evaluation
|
|
610
|
+
hud eval hud-evals/OSWorld-Verified-Gold --full --max-concurrent 100
|
|
661
611
|
|
|
662
|
-
# Limit
|
|
663
|
-
hud eval hud-evals/SheetBench-50 --full --
|
|
612
|
+
# Limit concurrent tasks to prevent rate limits
|
|
613
|
+
hud eval hud-evals/SheetBench-50 --full --max-concurrent 20
|
|
664
614
|
|
|
665
615
|
# Run a single task from a JSON file
|
|
666
616
|
hud eval task.json
|
|
667
617
|
|
|
668
|
-
# Run multiple tasks from a JSON file
|
|
669
|
-
hud eval tasks.json --full
|
|
618
|
+
# Run multiple tasks from a JSON file
|
|
619
|
+
hud eval tasks.json --full
|
|
670
620
|
|
|
671
621
|
# Run with OpenAI Operator agent
|
|
672
622
|
hud eval hud-evals/OSWorld-Gold-Beta --agent openai
|
|
@@ -736,7 +686,11 @@ def eval_command(
|
|
|
736
686
|
|
|
737
687
|
# Run evaluation
|
|
738
688
|
if full:
|
|
739
|
-
|
|
689
|
+
import time
|
|
690
|
+
|
|
691
|
+
start_time = time.time()
|
|
692
|
+
|
|
693
|
+
results = asyncio.run(
|
|
740
694
|
run_full_dataset(
|
|
741
695
|
source,
|
|
742
696
|
agent_type=agent,
|
|
@@ -744,14 +698,29 @@ def eval_command(
|
|
|
744
698
|
allowed_tools=allowed_tools_list,
|
|
745
699
|
max_concurrent=max_concurrent,
|
|
746
700
|
max_steps=max_steps,
|
|
747
|
-
parallel=parallel,
|
|
748
|
-
max_workers=max_workers,
|
|
749
|
-
max_concurrent_per_worker=max_concurrent_per_worker,
|
|
750
701
|
verbose=very_verbose or verbose,
|
|
751
702
|
vllm_base_url=vllm_base_url,
|
|
752
703
|
group_size=group_size,
|
|
753
704
|
)
|
|
754
705
|
)
|
|
706
|
+
|
|
707
|
+
elapsed = time.time() - start_time
|
|
708
|
+
|
|
709
|
+
# Print statistics (only for non-grouped mode)
|
|
710
|
+
if group_size == 1 and results:
|
|
711
|
+
hud_console.info("\n" + "=" * 50)
|
|
712
|
+
hud_console.success("📊 Evaluation Complete!")
|
|
713
|
+
hud_console.info("=" * 50)
|
|
714
|
+
hud_console.info(f"Total tasks: {len(results)}")
|
|
715
|
+
hud_console.info(f"Time elapsed: {elapsed:.2f} seconds")
|
|
716
|
+
hud_console.info(f"Throughput: {len(results) / elapsed:.2f} tasks/second")
|
|
717
|
+
hud_console.info(f"Execution mode: ASYNCIO (max_concurrent: {max_concurrent})")
|
|
718
|
+
|
|
719
|
+
# Count successes
|
|
720
|
+
successful = sum(1 for r in results if getattr(r, "reward", 0) > 0.7)
|
|
721
|
+
success_rate = 100 * successful / len(results)
|
|
722
|
+
hud_console.info(f"Successful tasks: {successful}/{len(results)} ({success_rate:.1f}%)")
|
|
723
|
+
hud_console.info("=" * 50)
|
|
755
724
|
else:
|
|
756
725
|
asyncio.run(
|
|
757
726
|
run_single_task(
|
|
@@ -373,7 +373,8 @@ ENV API_KEY
|
|
|
373
373
|
with open(lock_file) as f:
|
|
374
374
|
lock_data = yaml.safe_load(f)
|
|
375
375
|
|
|
376
|
-
assert lock_data["
|
|
376
|
+
assert lock_data["images"]["full"] == "test-env:0.1.0@sha256:abc123"
|
|
377
|
+
assert lock_data["images"]["local"] == "test-env:0.1.0"
|
|
377
378
|
assert lock_data["build"]["version"] == "0.1.0"
|
|
378
379
|
assert lock_data["environment"]["toolCount"] == 2
|
|
379
380
|
assert len(lock_data["tools"]) == 2
|
|
@@ -332,6 +332,7 @@ class TestRunDatasetToolFiltering:
|
|
|
332
332
|
patch.object(ClaudeAgent, "_run_context", mock_run_context),
|
|
333
333
|
patch.object(ClaudeAgent, "call_tools", mock_call_tools),
|
|
334
334
|
patch("hud.clients.MCPClient", return_value=mock_client_instance),
|
|
335
|
+
patch("hud.settings.settings.anthropic_api_key", "sk-test-key"),
|
|
335
336
|
):
|
|
336
337
|
# Run the dataset
|
|
337
338
|
await run_dataset(
|
|
@@ -400,6 +401,7 @@ class TestRunDatasetToolFiltering:
|
|
|
400
401
|
patch.object(ClaudeAgent, "_run_context", mock_run_context),
|
|
401
402
|
patch.object(ClaudeAgent, "call_tools", mock_call_tools),
|
|
402
403
|
patch("hud.clients.MCPClient", return_value=mock_client_instance),
|
|
404
|
+
patch("hud.settings.settings.anthropic_api_key", "sk-test-key"),
|
|
403
405
|
):
|
|
404
406
|
# Run the dataset
|
|
405
407
|
await run_dataset(
|
|
@@ -500,6 +502,7 @@ class TestSystemPromptHandling:
|
|
|
500
502
|
patch.object(ClaudeAgent, "_run_context", mock_run_context),
|
|
501
503
|
patch.object(ClaudeAgent, "call_tools", mock_call_tools),
|
|
502
504
|
patch("hud.clients.MCPClient", return_value=mock_mcp_client),
|
|
505
|
+
patch("hud.settings.settings.anthropic_api_key", "sk-test-key"),
|
|
503
506
|
):
|
|
504
507
|
# Run the dataset
|
|
505
508
|
await run_dataset(
|
|
@@ -551,6 +554,7 @@ class TestSystemPromptHandling:
|
|
|
551
554
|
patch.object(ClaudeAgent, "_run_context", mock_run_context),
|
|
552
555
|
patch.object(ClaudeAgent, "call_tools", mock_call_tools),
|
|
553
556
|
patch("hud.clients.MCPClient", return_value=mock_mcp_client),
|
|
557
|
+
patch("hud.settings.settings.anthropic_api_key", "sk-test-key"),
|
|
554
558
|
):
|
|
555
559
|
# Run the dataset
|
|
556
560
|
await run_dataset(
|
|
@@ -19,7 +19,7 @@ class TestRunMCPDevServer:
|
|
|
19
19
|
import click
|
|
20
20
|
|
|
21
21
|
with (
|
|
22
|
-
patch("hud.cli.
|
|
22
|
+
patch("hud.cli.utils.environment.image_exists", return_value=False),
|
|
23
23
|
patch("click.confirm", return_value=False),
|
|
24
24
|
pytest.raises(click.Abort),
|
|
25
25
|
):
|
|
@@ -18,9 +18,12 @@ def find_tasks_file(tasks_file: str | None, msg: str = "Select a tasks file") ->
|
|
|
18
18
|
]
|
|
19
19
|
all_files = [file for file in all_files if file[0] != "."] # Remove all config files
|
|
20
20
|
|
|
21
|
+
if not all_files:
|
|
22
|
+
# No task files found - raise a clear exception
|
|
23
|
+
raise FileNotFoundError("No task JSON or JSONL files found in current directory")
|
|
24
|
+
|
|
21
25
|
if len(all_files) == 1:
|
|
22
26
|
return str(all_files[0])
|
|
23
|
-
|
|
24
27
|
else:
|
|
25
28
|
# Prompt user to select a file
|
|
26
29
|
return hud_console.select(msg, choices=all_files)
|