hud-python 0.4.44__tar.gz → 0.4.46__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- {hud_python-0.4.44 → hud_python-0.4.46}/.gitignore +1 -1
- {hud_python-0.4.44 → hud_python-0.4.46}/PKG-INFO +2 -2
- hud_python-0.4.46/hud/agents/misc/__init__.py +8 -0
- hud_python-0.4.46/hud/agents/misc/integration_test_agent.py +56 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/agents/tests/test_openai.py +32 -26
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/__init__.py +17 -4
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/eval.py +85 -64
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/rl/gpu_utils.py +1 -2
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/rl/config.py +1 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/rl/distributed.py +6 -2
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/rl/learner.py +58 -23
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/rl/train.py +19 -8
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/telemetry/trace.py +4 -1
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/base.py +37 -1
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/types.py +2 -1
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/utils/tests/test_version.py +1 -1
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/version.py +1 -1
- {hud_python-0.4.44 → hud_python-0.4.46}/pyproject.toml +2 -2
- hud_python-0.4.44/hud/agents/misc/__init__.py +0 -7
- {hud_python-0.4.44 → hud_python-0.4.46}/LICENSE +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/README.md +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/environments/README.md +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/environments/blank/README.md +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/environments/blank/controller/README.md +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/environments/blank/environment/README.md +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/environments/blank/pyproject.toml +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/environments/browser/README.md +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/environments/browser/environment/2048/README.md +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/environments/browser/environment/2048/backend/pyproject.toml +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/environments/browser/environment/README.md +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/environments/browser/environment/todo/README.md +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/environments/browser/environment/todo/backend/pyproject.toml +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/environments/browser/pyproject.toml +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/environments/deepresearch/pyproject.toml +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/environments/remote_browser/README.md +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/environments/remote_browser/pyproject.toml +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/environments/remote_browser/src/hud_controller/providers/README.md +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/environments/text_2048/README.md +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/environments/text_2048/pyproject.toml +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/examples/README.md +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/__init__.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/__main__.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/agents/__init__.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/agents/base.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/agents/claude.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/agents/grounded_openai.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/agents/langchain.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/agents/lite_llm.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/agents/misc/response_agent.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/agents/openai.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/agents/openai_chat_generic.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/agents/tests/__init__.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/agents/tests/test_base.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/agents/tests/test_claude.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/agents/tests/test_client.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/agents/tests/test_grounded_openai_agent.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/__main__.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/analyze.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/build.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/clone.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/debug.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/dev.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/flows/__init__.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/flows/tasks.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/get.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/init.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/list_func.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/pull.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/push.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/remove.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/rl/__init__.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/rl/celebrate.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/rl/config.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/rl/display.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/rl/gpu.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/rl/local_runner.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/rl/presets.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/rl/remote_runner.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/rl/rl_api.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/rl/viewer.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/rl/vllm.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/rl/wait_utils.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/tests/__init__.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/tests/test_analyze.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/tests/test_analyze_metadata.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/tests/test_build.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/tests/test_cli_init.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/tests/test_cli_main.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/tests/test_clone.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/tests/test_cursor.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/tests/test_debug.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/tests/test_list_func.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/tests/test_main_module.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/tests/test_mcp_server.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/tests/test_pull.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/tests/test_push.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/tests/test_registry.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/tests/test_utils.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/utils/__init__.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/utils/config.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/utils/cursor.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/utils/docker.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/utils/env_check.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/utils/environment.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/utils/interactive.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/utils/local_runner.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/utils/logging.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/utils/metadata.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/utils/package_runner.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/utils/registry.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/utils/remote_runner.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/utils/runner.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/utils/server.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/utils/source_hash.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/cli/utils/tasks.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/clients/README.md +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/clients/__init__.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/clients/base.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/clients/fastmcp.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/clients/mcp_use.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/clients/tests/__init__.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/clients/tests/test_client_integration.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/clients/tests/test_fastmcp.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/clients/tests/test_mcp_use_retry.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/clients/tests/test_protocol.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/clients/utils/__init__.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/clients/utils/mcp_use_retry.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/clients/utils/retry.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/clients/utils/retry_transport.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/datasets/__init__.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/datasets/parallel.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/datasets/runner.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/datasets/utils.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/misc/__init__.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/misc/claude_plays_pokemon.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/native/__init__.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/native/comparator.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/native/tests/__init__.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/native/tests/test_comparator.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/native/tests/test_native_init.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/otel/__init__.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/otel/collector.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/otel/config.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/otel/context.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/otel/exporters.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/otel/instrumentation.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/otel/processors.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/otel/tests/__init__.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/otel/tests/test_processors.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/py.typed +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/rl/README.md +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/rl/__init__.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/rl/actor.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/rl/buffer.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/rl/chat_template.jinja +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/rl/tests/__init__.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/rl/tests/test_learner.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/rl/types.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/rl/utils/start_vllm_server.sh +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/rl/utils.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/rl/vllm_adapter.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/samples/__init__.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/samples/browser.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/server/__init__.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/server/context.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/server/helper/__init__.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/server/low_level.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/server/server.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/server/tests/__init__.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/server/tests/test_add_tool.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/server/tests/test_context.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/server/tests/test_mcp_server_handlers.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/server/tests/test_mcp_server_integration.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/server/tests/test_mcp_server_more.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/server/tests/test_run_wrapper.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/server/tests/test_server_extra.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/server/tests/test_sigterm_runner.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/settings.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/shared/__init__.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/shared/exceptions.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/shared/hints.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/shared/requests.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/shared/tests/__init__.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/shared/tests/test_exceptions.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/shared/tests/test_requests.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/telemetry/__init__.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/telemetry/instrument.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/telemetry/job.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/telemetry/replay.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/telemetry/tests/__init__.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/telemetry/tests/test_replay.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/telemetry/tests/test_trace.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/__init__.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/bash.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/computer/__init__.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/computer/anthropic.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/computer/hud.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/computer/openai.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/computer/settings.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/edit.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/executors/__init__.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/executors/base.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/executors/pyautogui.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/executors/tests/__init__.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/executors/tests/test_base_executor.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/executors/xdo.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/grounding/__init__.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/grounding/config.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/grounding/grounded_tool.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/grounding/grounder.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/grounding/tests/__init__.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/grounding/tests/test_grounded_tool.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/playwright.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/response.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/submit.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/tests/__init__.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/tests/test_base.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/tests/test_bash.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/tests/test_bash_extended.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/tests/test_computer.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/tests/test_computer_actions.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/tests/test_edit.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/tests/test_init.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/tests/test_playwright_tool.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/tests/test_response.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/tests/test_tools.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/tests/test_tools_init.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/tests/test_utils.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/types.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/tools/utils.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/utils/__init__.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/utils/agent_factories.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/utils/async_utils.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/utils/group_eval.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/utils/hud_console.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/utils/mcp.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/utils/pretty_errors.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/utils/progress.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/utils/tasks.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/utils/telemetry.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/utils/tests/__init__.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/utils/tests/test_async_utils.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/utils/tests/test_init.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/utils/tests/test_mcp.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/utils/tests/test_progress.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/utils/tests/test_telemetry.py +0 -0
- {hud_python-0.4.44 → hud_python-0.4.46}/hud/utils/tool_shorthand.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hud-python
|
|
3
|
-
Version: 0.4.44
|
|
3
|
+
Version: 0.4.46
|
|
4
4
|
Summary: SDK for the HUD platform.
|
|
5
5
|
Project-URL: Homepage, https://github.com/hud-evals/hud-python
|
|
6
6
|
Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
|
|
@@ -41,7 +41,7 @@ Requires-Dist: datasets>=2.14.0
|
|
|
41
41
|
Requires-Dist: httpx<1,>=0.23.0
|
|
42
42
|
Requires-Dist: hud-fastmcp-python-sdk>=0.1.2
|
|
43
43
|
Requires-Dist: hud-mcp-python-sdk>=3.13.2
|
|
44
|
-
Requires-Dist: hud-mcp-use-python-sdk==2.3.
|
|
44
|
+
Requires-Dist: hud-mcp-use-python-sdk==2.3.20
|
|
45
45
|
Requires-Dist: numpy>=1.24.0
|
|
46
46
|
Requires-Dist: openai
|
|
47
47
|
Requires-Dist: opentelemetry-api>=1.34.1
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from hud.agents.base import MCPAgent, find_reward
|
|
6
|
+
from hud.types import AgentResponse, Task, Trace
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class IntegrationTestRunner(MCPAgent):
|
|
10
|
+
def __init__(self, **kwargs: Any) -> None:
|
|
11
|
+
kwargs["auto_trace"] = False
|
|
12
|
+
super().__init__(**kwargs)
|
|
13
|
+
self.metadata = {}
|
|
14
|
+
|
|
15
|
+
async def run(self, task: Task, max_steps: int = 10) -> Trace:
|
|
16
|
+
try:
|
|
17
|
+
# Initialize using base to set up client and telemetry correctly
|
|
18
|
+
await self.initialize(task)
|
|
19
|
+
|
|
20
|
+
# Validate task shape
|
|
21
|
+
if not getattr(task, "integration_test_tool", None):
|
|
22
|
+
raise ValueError(
|
|
23
|
+
"--integration-test requires task.integration_test_tool (single call)"
|
|
24
|
+
)
|
|
25
|
+
elif not getattr(task, "evaluate_tool", None):
|
|
26
|
+
raise ValueError("--integration-test requires task.evaluate_tool (single call)")
|
|
27
|
+
|
|
28
|
+
if task.setup_tool:
|
|
29
|
+
_ = await self.call_tools(task.setup_tool)
|
|
30
|
+
|
|
31
|
+
_ = await self.call_tools(task.integration_test_tool)
|
|
32
|
+
evaluate_result = await self.call_tools(task.evaluate_tool)
|
|
33
|
+
|
|
34
|
+
reward = float(find_reward(evaluate_result[0])) if evaluate_result else 0.0
|
|
35
|
+
|
|
36
|
+
return Trace(done=True, reward=reward, info={})
|
|
37
|
+
finally:
|
|
38
|
+
# Ensure resources are cleaned up so the CLI can exit cleanly
|
|
39
|
+
await self._cleanup()
|
|
40
|
+
|
|
41
|
+
# Stub implementations to satisfy abstract base class; not used in --integration-test path
|
|
42
|
+
async def get_system_messages(self) -> list[Any]:
|
|
43
|
+
return []
|
|
44
|
+
|
|
45
|
+
async def get_response(self, messages: list[Any]) -> AgentResponse:
|
|
46
|
+
raise NotImplementedError("IntegrationTestRunner does not implement agent loop")
|
|
47
|
+
|
|
48
|
+
async def format_blocks(self, blocks: list[Any]) -> list[Any]:
|
|
49
|
+
return []
|
|
50
|
+
|
|
51
|
+
async def format_tool_results(
|
|
52
|
+
self,
|
|
53
|
+
tool_calls: list[Any],
|
|
54
|
+
tool_results: list[Any],
|
|
55
|
+
) -> list[Any]:
|
|
56
|
+
return []
|
|
@@ -146,37 +146,43 @@ class TestOperatorAgent:
|
|
|
146
146
|
@pytest.mark.asyncio
|
|
147
147
|
async def test_get_model_response(self, mock_mcp_client, mock_openai):
|
|
148
148
|
"""Test getting model response from OpenAI API."""
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
149
|
+
# Disable telemetry for this test to avoid backend configuration issues
|
|
150
|
+
with patch("hud.settings.settings.telemetry_enabled", False):
|
|
151
|
+
agent = OperatorAgent(
|
|
152
|
+
mcp_client=mock_mcp_client,
|
|
153
|
+
model_client=mock_openai,
|
|
154
|
+
validate_api_key=False, # Skip validation in tests
|
|
155
|
+
)
|
|
156
|
+
|
|
157
|
+
# Set up available tools so agent doesn't return "No computer use tools available"
|
|
158
|
+
agent._available_tools = [
|
|
159
|
+
types.Tool(name="computer_openai", description="Computer tool", inputSchema={})
|
|
160
|
+
]
|
|
154
161
|
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
162
|
+
# Mock OpenAI API response for a successful computer use response
|
|
163
|
+
mock_response = MagicMock()
|
|
164
|
+
mock_response.id = "response_123"
|
|
165
|
+
mock_response.state = "completed"
|
|
166
|
+
# Mock the output message structure
|
|
167
|
+
mock_output_text = MagicMock()
|
|
168
|
+
mock_output_text.type = "output_text"
|
|
169
|
+
mock_output_text.text = "I can see the screen content."
|
|
159
170
|
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
mock_response.state = "completed"
|
|
164
|
-
# Mock the output message structure
|
|
165
|
-
mock_output_text = MagicMock()
|
|
166
|
-
mock_output_text.type = "output_text"
|
|
167
|
-
mock_output_text.text = "I can see the screen content."
|
|
168
|
-
mock_output_message = MagicMock()
|
|
169
|
-
mock_output_message.type = "message"
|
|
170
|
-
mock_output_message.content = [mock_output_text]
|
|
171
|
-
mock_response.output = [mock_output_message]
|
|
171
|
+
mock_output_message = MagicMock()
|
|
172
|
+
mock_output_message.type = "message"
|
|
173
|
+
mock_output_message.content = [mock_output_text]
|
|
172
174
|
|
|
173
|
-
|
|
175
|
+
mock_response.output = [mock_output_message]
|
|
174
176
|
|
|
175
|
-
|
|
176
|
-
|
|
177
|
+
mock_openai.responses.create = AsyncMock(return_value=mock_response)
|
|
178
|
+
|
|
179
|
+
messages = [{"prompt": "What's on the screen?", "screenshot": None}]
|
|
180
|
+
response = await agent.get_response(messages)
|
|
177
181
|
|
|
178
|
-
|
|
179
|
-
|
|
182
|
+
# The test should verify that the response is processed correctly
|
|
183
|
+
# Since the isinstance checks will fail, content will be empty, but done should be True
|
|
184
|
+
assert response.done is True
|
|
185
|
+
assert response.tool_calls == []
|
|
180
186
|
|
|
181
187
|
@pytest.mark.asyncio
|
|
182
188
|
async def test_handle_empty_response(self, mock_mcp_client, mock_openai):
|
|
@@ -144,7 +144,7 @@ def debug(
|
|
|
144
144
|
None,
|
|
145
145
|
help="Docker image, environment directory, or config file followed by optional Docker arguments", # noqa: E501
|
|
146
146
|
),
|
|
147
|
-
config: Path = typer.Option( # noqa: B008
|
|
147
|
+
config: Path | None = typer.Option( # noqa: B008
|
|
148
148
|
None,
|
|
149
149
|
"--config",
|
|
150
150
|
"-c",
|
|
@@ -976,6 +976,15 @@ def eval(
|
|
|
976
976
|
"--group-size",
|
|
977
977
|
help="Number of times to run each task (similar to RL training)",
|
|
978
978
|
),
|
|
979
|
+
integration_test: bool = typer.Option(
|
|
980
|
+
False,
|
|
981
|
+
"--integration-test",
|
|
982
|
+
help=(
|
|
983
|
+
"Run integration_test_tool, where problem is setup, "
|
|
984
|
+
"actions are applied, and evaluation is performed, without "
|
|
985
|
+
"spinning up an agent"
|
|
986
|
+
),
|
|
987
|
+
),
|
|
979
988
|
) -> None:
|
|
980
989
|
"""🚀 Run evaluation on datasets or individual tasks with agents."""
|
|
981
990
|
from hud.settings import settings
|
|
@@ -983,6 +992,9 @@ def eval(
|
|
|
983
992
|
|
|
984
993
|
hud_console = HUDConsole()
|
|
985
994
|
|
|
995
|
+
if integration_test:
|
|
996
|
+
agent = "integration_test"
|
|
997
|
+
|
|
986
998
|
# If no source provided, reuse RL helper to find a tasks file interactively
|
|
987
999
|
if source is None:
|
|
988
1000
|
try:
|
|
@@ -1038,7 +1050,7 @@ def eval(
|
|
|
1038
1050
|
agent = hud_console.select("Select an agent to use:", choices=choices, default=0)
|
|
1039
1051
|
|
|
1040
1052
|
# Handle HUD model selection
|
|
1041
|
-
if agent and agent not in ["claude", "openai", "vllm", "litellm"]:
|
|
1053
|
+
if agent and agent not in ["claude", "openai", "vllm", "litellm", "integration_test"]:
|
|
1042
1054
|
# Find remote model name
|
|
1043
1055
|
model = agent
|
|
1044
1056
|
if not vllm_base_url:
|
|
@@ -1059,7 +1071,7 @@ def eval(
|
|
|
1059
1071
|
hud_console.info(f"Using HUD model: {model} (trained on {base_model})")
|
|
1060
1072
|
|
|
1061
1073
|
# Validate agent choice
|
|
1062
|
-
valid_agents = ["claude", "openai", "vllm", "litellm"]
|
|
1074
|
+
valid_agents = ["claude", "openai", "vllm", "litellm", "integration_test"]
|
|
1063
1075
|
if agent not in valid_agents:
|
|
1064
1076
|
hud_console.error(f"Invalid agent: {agent}. Must be one of: {', '.join(valid_agents)}")
|
|
1065
1077
|
raise typer.Exit(1)
|
|
@@ -1080,6 +1092,7 @@ def eval(
|
|
|
1080
1092
|
very_verbose=very_verbose,
|
|
1081
1093
|
vllm_base_url=vllm_base_url,
|
|
1082
1094
|
group_size=group_size,
|
|
1095
|
+
integration_test=integration_test,
|
|
1083
1096
|
)
|
|
1084
1097
|
|
|
1085
1098
|
|
|
@@ -1105,7 +1118,7 @@ def get(
|
|
|
1105
1118
|
),
|
|
1106
1119
|
) -> None:
|
|
1107
1120
|
"""📥 Download a HuggingFace dataset and save it as JSONL."""
|
|
1108
|
-
from .get import get_command
|
|
1121
|
+
from hud.cli.get import get_command
|
|
1109
1122
|
|
|
1110
1123
|
get_command(
|
|
1111
1124
|
dataset_name=dataset_name,
|
|
@@ -69,7 +69,7 @@ def get_available_models() -> list[dict[str, str | None]]:
|
|
|
69
69
|
|
|
70
70
|
|
|
71
71
|
def build_agent(
|
|
72
|
-
agent_type: Literal["claude", "openai", "vllm", "litellm"],
|
|
72
|
+
agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"],
|
|
73
73
|
*,
|
|
74
74
|
model: str | None = None,
|
|
75
75
|
allowed_tools: list[str] | None = None,
|
|
@@ -79,7 +79,11 @@ def build_agent(
|
|
|
79
79
|
"""Create and return the requested agent type."""
|
|
80
80
|
|
|
81
81
|
# Import agents lazily to avoid dependency issues
|
|
82
|
-
if agent_type == "vllm":
|
|
82
|
+
if agent_type == "integration_test":
|
|
83
|
+
from hud.agents.misc.integration_test_agent import IntegrationTestRunner
|
|
84
|
+
|
|
85
|
+
return IntegrationTestRunner(verbose=verbose)
|
|
86
|
+
elif agent_type == "vllm":
|
|
83
87
|
# Create a generic OpenAI agent for vLLM server
|
|
84
88
|
try:
|
|
85
89
|
from openai import AsyncOpenAI
|
|
@@ -185,7 +189,7 @@ def build_agent(
|
|
|
185
189
|
async def run_single_task(
|
|
186
190
|
source: str,
|
|
187
191
|
*,
|
|
188
|
-
agent_type: Literal["claude", "openai", "vllm", "litellm"] = "claude",
|
|
192
|
+
agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = "claude",
|
|
189
193
|
model: str | None = None,
|
|
190
194
|
allowed_tools: list[str] | None = None,
|
|
191
195
|
max_steps: int = 10,
|
|
@@ -205,12 +209,9 @@ async def run_single_task(
|
|
|
205
209
|
)
|
|
206
210
|
raise typer.Exit(1) from e
|
|
207
211
|
|
|
208
|
-
# Check if it's a file
|
|
209
212
|
path = Path(source)
|
|
210
213
|
if path.exists() and (path.suffix in [".json", ".jsonl"]):
|
|
211
214
|
hud_console.info("📊 Loading task file…")
|
|
212
|
-
|
|
213
|
-
# Use unified loader for both JSON and JSONL
|
|
214
215
|
tasks: list[Task] = load_tasks(str(path)) # type: ignore[assignment]
|
|
215
216
|
|
|
216
217
|
# If tasks reference a local environment (nearby), ensure it's built/up-to-date.
|
|
@@ -218,13 +219,14 @@ async def run_single_task(
|
|
|
218
219
|
env_dir = find_environment_dir(path)
|
|
219
220
|
if env_dir is not None:
|
|
220
221
|
# Non-interactive for eval; warn but don't block
|
|
221
|
-
ensure_built(env_dir, interactive=
|
|
222
|
+
ensure_built(env_dir, interactive=False)
|
|
222
223
|
except Exception as e:
|
|
223
224
|
hud_console.debug(f"Eval preflight env check skipped: {e}")
|
|
224
225
|
|
|
225
226
|
# Single task - use the first (and only) task
|
|
226
227
|
task = tasks[0]
|
|
227
228
|
hud_console.info("Found 1 task, running as single task…")
|
|
229
|
+
|
|
228
230
|
else:
|
|
229
231
|
# Load from HuggingFace dataset or non-file source
|
|
230
232
|
hud_console.info(f"📊 Loading tasks from: {source}…")
|
|
@@ -243,60 +245,67 @@ async def run_single_task(
|
|
|
243
245
|
task_prompt = task.prompt[:50] + "..." if len(task.prompt) > 50 else task.prompt
|
|
244
246
|
|
|
245
247
|
# Use grouped evaluation if group_size > 1
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
248
|
+
agent_config: dict[str, Any] = {}
|
|
249
|
+
if agent_type == "integration_test":
|
|
250
|
+
from hud.agents.misc.integration_test_agent import IntegrationTestRunner
|
|
249
251
|
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
252
|
+
agent_class = IntegrationTestRunner
|
|
253
|
+
agent_config = {"verbose": verbose}
|
|
254
|
+
if allowed_tools:
|
|
255
|
+
agent_config["allowed_tools"] = allowed_tools
|
|
256
|
+
elif agent_type == "vllm":
|
|
257
|
+
# Special handling for vLLM
|
|
258
|
+
sample_agent = build_agent(
|
|
259
|
+
agent_type,
|
|
260
|
+
model=model,
|
|
261
|
+
allowed_tools=allowed_tools,
|
|
262
|
+
verbose=verbose,
|
|
263
|
+
vllm_base_url=vllm_base_url,
|
|
264
|
+
)
|
|
265
|
+
agent_config = {
|
|
266
|
+
"openai_client": sample_agent.oai,
|
|
267
|
+
"model_name": sample_agent.model_name,
|
|
268
|
+
"verbose": verbose,
|
|
269
|
+
"completion_kwargs": sample_agent.completion_kwargs,
|
|
270
|
+
}
|
|
271
|
+
if allowed_tools:
|
|
272
|
+
agent_config["allowed_tools"] = allowed_tools
|
|
268
273
|
|
|
269
|
-
|
|
274
|
+
from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
|
|
270
275
|
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
276
|
+
agent_class = GenericOpenAIChatAgent
|
|
277
|
+
elif agent_type == "openai":
|
|
278
|
+
from hud.agents import OperatorAgent
|
|
274
279
|
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
280
|
+
agent_class = OperatorAgent
|
|
281
|
+
agent_config = {"verbose": verbose}
|
|
282
|
+
if allowed_tools:
|
|
283
|
+
agent_config["allowed_tools"] = allowed_tools
|
|
284
|
+
elif agent_type == "litellm":
|
|
285
|
+
from hud.agents.lite_llm import LiteAgent
|
|
281
286
|
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
287
|
+
agent_class = LiteAgent
|
|
288
|
+
agent_config = {
|
|
289
|
+
"model_name": model or "gpt-4o-mini",
|
|
290
|
+
"verbose": verbose,
|
|
291
|
+
}
|
|
292
|
+
if allowed_tools:
|
|
293
|
+
agent_config["allowed_tools"] = allowed_tools
|
|
294
|
+
elif agent_type == "claude":
|
|
295
|
+
from hud.agents import ClaudeAgent
|
|
291
296
|
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
297
|
+
agent_class = ClaudeAgent
|
|
298
|
+
agent_config = {
|
|
299
|
+
"model": model or "claude-sonnet-4-20250514",
|
|
300
|
+
"verbose": verbose,
|
|
301
|
+
}
|
|
302
|
+
if allowed_tools:
|
|
303
|
+
agent_config["allowed_tools"] = allowed_tools
|
|
304
|
+
else:
|
|
305
|
+
raise ValueError(f"Invalid agent type: {agent_type}")
|
|
299
306
|
|
|
307
|
+
if group_size > 1:
|
|
308
|
+
hud_console.info(f"🔄 Running task with group_size={group_size}")
|
|
300
309
|
# Run with grouping
|
|
301
310
|
stats = await run_tasks_grouped(
|
|
302
311
|
tasks=[task],
|
|
@@ -307,10 +316,7 @@ async def run_single_task(
|
|
|
307
316
|
max_steps=max_steps,
|
|
308
317
|
verbose=verbose,
|
|
309
318
|
)
|
|
310
|
-
|
|
311
|
-
# Display results
|
|
312
319
|
display_group_statistics(stats, show_details=True)
|
|
313
|
-
|
|
314
320
|
else:
|
|
315
321
|
# Original single-run logic
|
|
316
322
|
with hud.trace(name=task_prompt):
|
|
@@ -329,7 +335,7 @@ async def run_single_task(
|
|
|
329
335
|
async def run_full_dataset(
|
|
330
336
|
source: str,
|
|
331
337
|
*,
|
|
332
|
-
agent_type: Literal["claude", "openai", "vllm", "litellm"] = "claude",
|
|
338
|
+
agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = "claude",
|
|
333
339
|
model: str | None = None,
|
|
334
340
|
allowed_tools: list[str] | None = None,
|
|
335
341
|
max_concurrent: int = 30,
|
|
@@ -372,10 +378,13 @@ async def run_full_dataset(
|
|
|
372
378
|
path = Path(source)
|
|
373
379
|
dataset_name = f"Dataset: {path.name}" if path.exists() else source.split("/")[-1]
|
|
374
380
|
|
|
375
|
-
hud_console.info(f"Found {len(tasks)} tasks")
|
|
376
|
-
|
|
377
381
|
# Build agent class + config for run_dataset
|
|
378
|
-
if agent_type == "
|
|
382
|
+
if agent_type == "integration_test": # --integration-test mode
|
|
383
|
+
from hud.agents.misc.integration_test_agent import IntegrationTestRunner
|
|
384
|
+
|
|
385
|
+
agent_class = IntegrationTestRunner
|
|
386
|
+
agent_config = {"verbose": verbose}
|
|
387
|
+
elif agent_type == "vllm":
|
|
379
388
|
try:
|
|
380
389
|
from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
|
|
381
390
|
|
|
@@ -405,7 +414,6 @@ async def run_full_dataset(
|
|
|
405
414
|
}
|
|
406
415
|
if allowed_tools:
|
|
407
416
|
agent_config["allowed_tools"] = allowed_tools
|
|
408
|
-
|
|
409
417
|
elif agent_type == "openai":
|
|
410
418
|
try:
|
|
411
419
|
from hud.agents import OperatorAgent
|
|
@@ -557,7 +565,7 @@ def eval_command(
|
|
|
557
565
|
"--full",
|
|
558
566
|
help="Run the entire dataset (omit for single-task debug mode)",
|
|
559
567
|
),
|
|
560
|
-
agent: Literal["claude", "openai", "vllm", "litellm"] = typer.Option(
|
|
568
|
+
agent: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = typer.Option(
|
|
561
569
|
"claude",
|
|
562
570
|
"--agent",
|
|
563
571
|
help="Agent backend to use (claude, openai, vllm for local server, or litellm)",
|
|
@@ -573,7 +581,7 @@ def eval_command(
|
|
|
573
581
|
help="Comma-separated list of allowed tools",
|
|
574
582
|
),
|
|
575
583
|
max_concurrent: int = typer.Option(
|
|
576
|
-
|
|
584
|
+
30,
|
|
577
585
|
"--max-concurrent",
|
|
578
586
|
help="Concurrency level for asyncio mode (ignored in parallel mode)",
|
|
579
587
|
),
|
|
@@ -618,6 +626,15 @@ def eval_command(
|
|
|
618
626
|
"--group-size",
|
|
619
627
|
help="Number of times to run each task (similar to RL training)",
|
|
620
628
|
),
|
|
629
|
+
integration_test: bool = typer.Option(
|
|
630
|
+
False,
|
|
631
|
+
"--integration-test",
|
|
632
|
+
help=(
|
|
633
|
+
"Run integration_test_tool tool, where problem is setup, "
|
|
634
|
+
"actions are applied, and evaluation is performed, without "
|
|
635
|
+
"spinning up an agent"
|
|
636
|
+
),
|
|
637
|
+
),
|
|
621
638
|
) -> None:
|
|
622
639
|
"""🚀 Run evaluation on datasets or individual tasks with agents.
|
|
623
640
|
|
|
@@ -674,6 +691,10 @@ def eval_command(
|
|
|
674
691
|
logging.getLogger("hud.agents").setLevel(logging.INFO)
|
|
675
692
|
logging.getLogger("hud.agents.base").setLevel(logging.INFO)
|
|
676
693
|
|
|
694
|
+
# We pass integration_test as the agent_type
|
|
695
|
+
if integration_test:
|
|
696
|
+
agent = "integration_test"
|
|
697
|
+
|
|
677
698
|
# Check for required API keys
|
|
678
699
|
if agent == "claude":
|
|
679
700
|
if not settings.anthropic_api_key:
|
|
@@ -7,8 +7,6 @@ import subprocess
|
|
|
7
7
|
import time
|
|
8
8
|
from typing import TYPE_CHECKING, Any
|
|
9
9
|
|
|
10
|
-
import torch
|
|
11
|
-
|
|
12
10
|
from hud.utils.hud_console import HUDConsole
|
|
13
11
|
|
|
14
12
|
if TYPE_CHECKING:
|
|
@@ -87,6 +85,7 @@ def health_check_gpus(gpu_indices: list[int]) -> dict[str, Any]:
|
|
|
87
85
|
- all_healthy: Boolean indicating if all GPUs are healthy
|
|
88
86
|
- memory_issues: Boolean indicating if there are memory issues
|
|
89
87
|
"""
|
|
88
|
+
import torch
|
|
90
89
|
from rich.console import Console
|
|
91
90
|
from rich.table import Table
|
|
92
91
|
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
import os
|
|
6
|
+
from datetime import timedelta
|
|
6
7
|
from typing import Any
|
|
7
8
|
|
|
8
9
|
import torch
|
|
@@ -17,7 +18,10 @@ def setup_distributed() -> None:
|
|
|
17
18
|
torch.cuda.set_device(local_rank)
|
|
18
19
|
|
|
19
20
|
# Initialize process group
|
|
20
|
-
|
|
21
|
+
# Increase watchdog timeout to accommodate long eval/sampling phases
|
|
22
|
+
# and enable clearer NCCL error handling.
|
|
23
|
+
os.environ.setdefault("TORCH_NCCL_ASYNC_ERROR_HANDLING", "1")
|
|
24
|
+
dist.init_process_group("nccl", timeout=timedelta(minutes=20))
|
|
21
25
|
|
|
22
26
|
|
|
23
27
|
def get_local_rank() -> int:
|
|
@@ -77,7 +81,7 @@ def broadcast_object(obj: Any, src: int = 0) -> Any:
|
|
|
77
81
|
return obj
|
|
78
82
|
|
|
79
83
|
obj_list = [obj] if dist.get_rank() == src else [None]
|
|
80
|
-
dist.broadcast_object_list(obj_list, src=src)
|
|
84
|
+
dist.broadcast_object_list(obj_list, src=src, device=torch.device("cpu"))
|
|
81
85
|
return obj_list[0]
|
|
82
86
|
|
|
83
87
|
|