hud-python 0.4.52__tar.gz → 0.4.53__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- {hud_python-0.4.52 → hud_python-0.4.53}/PKG-INFO +47 -48
- {hud_python-0.4.52 → hud_python-0.4.53}/README.md +46 -47
- {hud_python-0.4.52 → hud_python-0.4.53}/environments/blank/README.md +9 -2
- {hud_python-0.4.52 → hud_python-0.4.53}/environments/blank/server/pyproject.toml +1 -1
- {hud_python-0.4.52 → hud_python-0.4.53}/environments/browser/server/pyproject.toml +1 -1
- {hud_python-0.4.52 → hud_python-0.4.53}/environments/deepresearch/server/pyproject.toml +1 -1
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/agents/base.py +9 -2
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/agents/openai_chat_generic.py +15 -3
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/agents/tests/test_base.py +15 -0
- hud_python-0.4.53/hud/agents/tests/test_base_runtime.py +164 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/__init__.py +6 -3
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/build.py +35 -27
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/dev.py +11 -29
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/eval.py +61 -61
- hud_python-0.4.53/hud/cli/tests/test_analyze_module.py +120 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/tests/test_build.py +24 -2
- hud_python-0.4.53/hud/cli/tests/test_build_failure.py +41 -0
- hud_python-0.4.53/hud/cli/tests/test_build_module.py +50 -0
- hud_python-0.4.53/hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud_python-0.4.53/hud/cli/tests/test_cli_root.py +134 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/tests/test_mcp_server.py +8 -7
- hud_python-0.4.53/hud/cli/tests/test_push_happy.py +74 -0
- hud_python-0.4.53/hud/cli/tests/test_push_wrapper.py +23 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/utils/docker.py +120 -1
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/utils/runner.py +1 -1
- hud_python-0.4.53/hud/cli/utils/tests/test_config.py +58 -0
- hud_python-0.4.53/hud/cli/utils/tests/test_docker.py +93 -0
- hud_python-0.4.53/hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud_python-0.4.53/hud/cli/utils/tests/test_env_check.py +74 -0
- hud_python-0.4.53/hud/cli/utils/tests/test_environment.py +42 -0
- hud_python-0.4.53/hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud_python-0.4.53/hud/cli/utils/tests/test_local_runner.py +50 -0
- hud_python-0.4.53/hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud_python-0.4.53/hud/cli/utils/tests/test_metadata.py +49 -0
- hud_python-0.4.53/hud/cli/utils/tests/test_package_runner.py +35 -0
- hud_python-0.4.53/hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud_python-0.4.53/hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud_python-0.4.53/hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud_python-0.4.53/hud/cli/utils/tests/test_source_hash.py +36 -0
- hud_python-0.4.53/hud/cli/utils/tests/test_tasks.py +80 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/utils/version_check.py +2 -2
- hud_python-0.4.53/hud/datasets/tests/test_runner.py +106 -0
- hud_python-0.4.53/hud/datasets/tests/test_utils.py +228 -0
- hud_python-0.4.53/hud/otel/tests/test_instrumentation.py +207 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/server/tests/test_server_extra.py +2 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/shared/exceptions.py +35 -4
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/shared/hints.py +25 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/shared/requests.py +15 -3
- hud_python-0.4.53/hud/shared/tests/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/shared/tests/test_exceptions.py +31 -23
- hud_python-0.4.53/hud/shared/tests/test_hints.py +167 -0
- hud_python-0.4.53/hud/telemetry/tests/__init__.py +0 -0
- hud_python-0.4.53/hud/telemetry/tests/test_async_context.py +242 -0
- hud_python-0.4.53/hud/telemetry/tests/test_instrument.py +414 -0
- hud_python-0.4.53/hud/telemetry/tests/test_job.py +609 -0
- hud_python-0.4.53/hud/telemetry/tests/test_trace.py +241 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/computer/settings.py +2 -2
- hud_python-0.4.53/hud/tools/tests/test_submit.py +85 -0
- hud_python-0.4.53/hud/tools/tests/test_types.py +193 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/types.py +7 -1
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/utils/agent_factories.py +1 -3
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/utils/mcp.py +1 -1
- hud_python-0.4.53/hud/utils/tests/__init__.py +0 -0
- hud_python-0.4.53/hud/utils/tests/test_agent_factories.py +60 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/utils/tests/test_mcp.py +4 -6
- hud_python-0.4.53/hud/utils/tests/test_pretty_errors.py +186 -0
- hud_python-0.4.53/hud/utils/tests/test_tasks.py +187 -0
- hud_python-0.4.53/hud/utils/tests/test_tool_shorthand.py +154 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/utils/tests/test_version.py +1 -1
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/version.py +1 -1
- {hud_python-0.4.52 → hud_python-0.4.53}/pyproject.toml +16 -3
- hud_python-0.4.52/hud/otel/tests/__init__.py +0 -1
- hud_python-0.4.52/hud/telemetry/tests/test_trace.py +0 -63
- {hud_python-0.4.52 → hud_python-0.4.53}/.gitignore +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/LICENSE +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/environments/README.md +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/environments/blank/environment/README.md +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/environments/blank/environment/pyproject.toml +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/environments/blank/server/README.md +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/environments/browser/README.md +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/environments/browser/environment/2048/README.md +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/environments/browser/environment/2048/backend/pyproject.toml +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/environments/browser/environment/README.md +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/environments/browser/environment/pyproject.toml +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/environments/browser/environment/todo/README.md +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/environments/browser/environment/todo/backend/pyproject.toml +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/environments/browser/pyproject.toml +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/environments/deepresearch/README.md +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/environments/deepresearch/environment/pyproject.toml +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/environments/deepresearch/pyproject.toml +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/environments/remote_browser/README.md +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/environments/remote_browser/pyproject.toml +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/environments/remote_browser/src/hud_controller/providers/README.md +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/environments/text_2048/README.md +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/environments/text_2048/pyproject.toml +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/examples/README.md +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/__main__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/agents/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/agents/claude.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/agents/grounded_openai.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/agents/langchain.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/agents/lite_llm.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/agents/misc/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/agents/misc/integration_test_agent.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/agents/misc/response_agent.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/agents/openai.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/agents/tests/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/agents/tests/test_claude.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/agents/tests/test_client.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/agents/tests/test_grounded_openai_agent.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/agents/tests/test_openai.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/__main__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/analyze.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/clone.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/debug.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/flows/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/flows/tasks.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/get.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/init.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/list_func.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/pull.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/push.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/remove.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/rl/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/rl/celebrate.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/rl/config.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/rl/display.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/rl/gpu.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/rl/gpu_utils.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/rl/local_runner.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/rl/presets.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/rl/remote_runner.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/rl/rl_api.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/rl/viewer.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/rl/vllm.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/rl/wait_utils.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/tests/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/tests/test_analyze.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/tests/test_analyze_metadata.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/tests/test_cli_init.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/tests/test_cli_main.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/tests/test_clone.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/tests/test_cursor.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/tests/test_debug.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/tests/test_eval.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/tests/test_list_func.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/tests/test_main_module.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/tests/test_pull.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/tests/test_push.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/tests/test_registry.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/tests/test_utils.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/utils/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/utils/config.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/utils/cursor.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/utils/env_check.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/utils/environment.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/utils/interactive.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/utils/local_runner.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/utils/logging.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/utils/metadata.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/utils/package_runner.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/utils/registry.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/utils/remote_runner.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/utils/server.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/utils/source_hash.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/cli/utils/tasks.py +0 -0
- {hud_python-0.4.52/hud/shared → hud_python-0.4.53/hud/cli/utils}/tests/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/clients/README.md +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/clients/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/clients/base.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/clients/fastmcp.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/clients/mcp_use.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/clients/tests/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/clients/tests/test_client_integration.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/clients/tests/test_fastmcp.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/clients/tests/test_mcp_use_retry.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/clients/tests/test_protocol.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/clients/utils/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/clients/utils/mcp_use_retry.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/clients/utils/retry.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/clients/utils/retry_transport.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/datasets/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/datasets/parallel.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/datasets/runner.py +0 -0
- {hud_python-0.4.52/hud/telemetry → hud_python-0.4.53/hud/datasets}/tests/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/datasets/utils.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/misc/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/misc/claude_plays_pokemon.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/native/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/native/comparator.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/native/tests/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/native/tests/test_comparator.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/native/tests/test_native_init.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/otel/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/otel/collector.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/otel/config.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/otel/context.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/otel/exporters.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/otel/instrumentation.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/otel/processors.py +0 -0
- {hud_python-0.4.52/hud/utils → hud_python-0.4.53/hud/otel}/tests/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/otel/tests/test_processors.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/py.typed +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/rl/README.md +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/rl/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/rl/actor.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/rl/buffer.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/rl/chat_template.jinja +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/rl/config.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/rl/distributed.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/rl/learner.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/rl/tests/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/rl/tests/test_learner.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/rl/train.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/rl/types.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/rl/utils/start_vllm_server.sh +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/rl/utils.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/rl/vllm_adapter.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/samples/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/samples/browser.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/server/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/server/context.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/server/helper/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/server/low_level.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/server/router.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/server/server.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/server/tests/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/server/tests/test_add_tool.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/server/tests/test_context.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/server/tests/test_mcp_server_handlers.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/server/tests/test_mcp_server_integration.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/server/tests/test_mcp_server_more.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/server/tests/test_run_wrapper.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/server/tests/test_sigterm_runner.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/settings.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/shared/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/shared/tests/test_requests.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/telemetry/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/telemetry/async_context.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/telemetry/instrument.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/telemetry/job.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/telemetry/replay.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/telemetry/tests/test_replay.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/telemetry/trace.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/base.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/bash.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/computer/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/computer/anthropic.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/computer/hud.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/computer/openai.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/computer/qwen.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/edit.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/executors/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/executors/base.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/executors/pyautogui.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/executors/tests/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/executors/tests/test_base_executor.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/executors/xdo.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/grounding/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/grounding/config.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/grounding/grounded_tool.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/grounding/grounder.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/grounding/tests/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/grounding/tests/test_grounded_tool.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/playwright.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/response.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/submit.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/tests/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/tests/test_base.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/tests/test_bash.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/tests/test_bash_extended.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/tests/test_computer.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/tests/test_computer_actions.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/tests/test_edit.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/tests/test_init.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/tests/test_playwright_tool.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/tests/test_response.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/tests/test_tools.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/tests/test_tools_init.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/tests/test_utils.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/types.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/tools/utils.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/utils/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/utils/async_utils.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/utils/group_eval.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/utils/hud_console.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/utils/pretty_errors.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/utils/progress.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/utils/task_tracking.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/utils/tasks.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/utils/telemetry.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/utils/tests/test_async_utils.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/utils/tests/test_init.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/utils/tests/test_progress.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/utils/tests/test_telemetry.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.53}/hud/utils/tool_shorthand.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hud-python
|
|
3
|
-
Version: 0.4.52
|
|
3
|
+
Version: 0.4.53
|
|
4
4
|
Summary: SDK for the HUD platform.
|
|
5
5
|
Project-URL: Homepage, https://github.com/hud-evals/hud-python
|
|
6
6
|
Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
|
|
@@ -160,12 +160,12 @@ OSS RL environment + evals toolkit. Wrap software as environments, run benchmark
|
|
|
160
160
|
|
|
161
161
|
## Highlights
|
|
162
162
|
|
|
163
|
-
- 🎓 **[One-click RL](https://hud.so/models)** – Run `hud rl` to get a trained model on any environment.
|
|
164
163
|
- 🚀 **[MCP environment skeleton](https://docs.hud.so/core-concepts/mcp-protocol)** – any agent can call any environment.
|
|
165
164
|
- ⚡️ **[Live telemetry](https://hud.so)** – inspect every tool call, observation, and reward in real time.
|
|
166
165
|
- 🗂️ **[Public benchmarks](https://hud.so/leaderboards)** – OSWorld-Verified, SheetBench-50, and more.
|
|
167
166
|
- 🌐 **[Cloud browsers](environments/remote_browser/)** – AnchorBrowser, Steel, BrowserBase integrations for browser automation.
|
|
168
167
|
- 🛠️ **[Hot-reload dev loop](environments/README.md#phase-5-hot-reload-development-with-cursor-agent)** – `hud dev` for iterating on environments without rebuilds.
|
|
168
|
+
- 🎓 **[One-click RL](https://hud.so/models)** – Run `hud rl` to get a trained model on any environment.
|
|
169
169
|
|
|
170
170
|
> We welcome contributors and feature requests – open an issue or hop on a call to discuss improvements!
|
|
171
171
|
|
|
@@ -186,29 +186,6 @@ uv tool install hud-python
|
|
|
186
186
|
Before starting, get your HUD_API_KEY at [hud.so](https://hud.so).
|
|
187
187
|
|
|
188
188
|
|
|
189
|
-
## Quickstart: Training
|
|
190
|
-
|
|
191
|
-
RL using GRPO a Qwen2.5-VL model on any hud dataset:
|
|
192
|
-
|
|
193
|
-
```bash
|
|
194
|
-
hud get hud-evals/basic-2048 # from HF
|
|
195
|
-
hud rl basic-2048.json
|
|
196
|
-
```
|
|
197
|
-
|
|
198
|
-
> See [agent training docs](https://docs.hud.so/train-agents/quickstart)
|
|
199
|
-
|
|
200
|
-
Or make your own environment and dataset:
|
|
201
|
-
|
|
202
|
-
```bash
|
|
203
|
-
hud init my-env && cd my-env
|
|
204
|
-
hud dev --interactive
|
|
205
|
-
# When ready to run:
|
|
206
|
-
hud rl
|
|
207
|
-
```
|
|
208
|
-
|
|
209
|
-
> See [environment design docs](https://docs.hud.so/build-environments)
|
|
210
|
-
|
|
211
|
-
|
|
212
189
|
## Quickstart: Evals
|
|
213
190
|
|
|
214
191
|
For a tutorial that explains the agent and evaluation design, run:
|
|
@@ -265,38 +242,27 @@ The above example let's the agent play 2048 ([See replay](https://hud.so/trace/6
|
|
|
265
242
|
|
|
266
243
|

|
|
267
244
|
|
|
268
|
-
## Reinforcement Learning with GRPO
|
|
269
|
-
|
|
270
|
-
This is a Qwen‑2.5‑VL‑3B agent training a policy on the 2048-basic browser environment:
|
|
271
|
-
|
|
272
|
-

|
|
245
|
+
## Quickstart: Training
|
|
273
246
|
|
|
274
|
-
|
|
247
|
+
RL using GRPO a Qwen2.5-VL model on any hud dataset:
|
|
275
248
|
|
|
276
249
|
```bash
|
|
277
|
-
# Install CLI
|
|
278
|
-
uv tool install hud-python
|
|
279
|
-
|
|
280
|
-
# Option A: Run directly from a HuggingFace dataset
|
|
281
|
-
hud rl hud-evals/basic-2048
|
|
282
|
-
|
|
283
|
-
# Option B: Download first, modify, then train
|
|
284
|
-
hud get hud-evals/basic-2048
|
|
250
|
+
hud get hud-evals/basic-2048 # from HF
|
|
285
251
|
hud rl basic-2048.json
|
|
286
|
-
|
|
287
|
-
# Optional: baseline evaluation
|
|
288
|
-
hud eval basic-2048.json
|
|
289
252
|
```
|
|
290
253
|
|
|
291
|
-
|
|
292
|
-
- Language‑only models (e.g., `Qwen/Qwen2.5-7B-Instruct`)
|
|
293
|
-
- Vision‑Language models (e.g., `Qwen/Qwen2.5-VL-3B-Instruct`)
|
|
254
|
+
> See [agent training docs](https://docs.hud.so/train-agents/quickstart)
|
|
294
255
|
|
|
295
|
-
|
|
256
|
+
Or make your own environment and dataset:
|
|
296
257
|
|
|
297
|
-
|
|
258
|
+
```bash
|
|
259
|
+
hud init my-env && cd my-env
|
|
260
|
+
hud dev --interactive
|
|
261
|
+
# When ready to run:
|
|
262
|
+
hud rl
|
|
263
|
+
```
|
|
298
264
|
|
|
299
|
-
|
|
265
|
+
> See [environment design docs](https://docs.hud.so/build-environments)
|
|
300
266
|
|
|
301
267
|
## Benchmarking Agents
|
|
302
268
|
|
|
@@ -460,6 +426,39 @@ We highly suggest running 3-5 evaluations per dataset for the most consistent re
|
|
|
460
426
|
|
|
461
427
|
Using the [`run_dataset`](https://docs.hud.so/reference/tasks#run_dataset) function with a HuggingFace dataset automatically assigns your job to that leaderboard page, and allows you to create a scorecard out of it:
|
|
462
428
|
|
|
429
|
+
## Reinforcement Learning with GRPO
|
|
430
|
+
|
|
431
|
+
This is a Qwen‑2.5‑VL‑3B agent training a policy on the 2048-basic browser environment:
|
|
432
|
+
|
|
433
|
+

|
|
434
|
+
|
|
435
|
+
Train with the new interactive `hud rl` flow:
|
|
436
|
+
|
|
437
|
+
```bash
|
|
438
|
+
# Install CLI
|
|
439
|
+
uv tool install hud-python
|
|
440
|
+
|
|
441
|
+
# Option A: Run directly from a HuggingFace dataset
|
|
442
|
+
hud rl hud-evals/basic-2048
|
|
443
|
+
|
|
444
|
+
# Option B: Download first, modify, then train
|
|
445
|
+
hud get hud-evals/basic-2048
|
|
446
|
+
hud rl basic-2048.json
|
|
447
|
+
|
|
448
|
+
# Optional: baseline evaluation
|
|
449
|
+
hud eval basic-2048.json
|
|
450
|
+
```
|
|
451
|
+
|
|
452
|
+
Supports multi‑turn RL for both:
|
|
453
|
+
- Language‑only models (e.g., `Qwen/Qwen2.5-7B-Instruct`)
|
|
454
|
+
- Vision‑Language models (e.g., `Qwen/Qwen2.5-VL-3B-Instruct`)
|
|
455
|
+
|
|
456
|
+
By default, `hud rl` provisions a persistent server and trainer in the cloud, streams telemetry to `hud.so`, and lets you monitor/manage models at `hud.so/models`. Use `--local` to run entirely on your machines (typically 2+ GPUs: one for vLLM, the rest for training).
|
|
457
|
+
|
|
458
|
+
Any HUD MCP environment and evaluation works with our RL pipeline (including remote configurations). See the guided docs: `https://docs.hud.so/train-agents/quickstart`.
|
|
459
|
+
|
|
460
|
+
Pricing: Hosted vLLM and training GPU rates are listed in the [Training Quickstart → Pricing](https://docs.hud.so/train-agents/quickstart#pricing). Manage billing at the [HUD billing dashboard](https://hud.so/project/billing).
|
|
461
|
+
|
|
463
462
|
## Architecture
|
|
464
463
|
|
|
465
464
|
```mermaid
|
|
@@ -22,12 +22,12 @@ OSS RL environment + evals toolkit. Wrap software as environments, run benchmark
|
|
|
22
22
|
|
|
23
23
|
## Highlights
|
|
24
24
|
|
|
25
|
-
- 🎓 **[One-click RL](https://hud.so/models)** – Run `hud rl` to get a trained model on any environment.
|
|
26
25
|
- 🚀 **[MCP environment skeleton](https://docs.hud.so/core-concepts/mcp-protocol)** – any agent can call any environment.
|
|
27
26
|
- ⚡️ **[Live telemetry](https://hud.so)** – inspect every tool call, observation, and reward in real time.
|
|
28
27
|
- 🗂️ **[Public benchmarks](https://hud.so/leaderboards)** – OSWorld-Verified, SheetBench-50, and more.
|
|
29
28
|
- 🌐 **[Cloud browsers](environments/remote_browser/)** – AnchorBrowser, Steel, BrowserBase integrations for browser automation.
|
|
30
29
|
- 🛠️ **[Hot-reload dev loop](environments/README.md#phase-5-hot-reload-development-with-cursor-agent)** – `hud dev` for iterating on environments without rebuilds.
|
|
30
|
+
- 🎓 **[One-click RL](https://hud.so/models)** – Run `hud rl` to get a trained model on any environment.
|
|
31
31
|
|
|
32
32
|
> We welcome contributors and feature requests – open an issue or hop on a call to discuss improvements!
|
|
33
33
|
|
|
@@ -48,29 +48,6 @@ uv tool install hud-python
|
|
|
48
48
|
Before starting, get your HUD_API_KEY at [hud.so](https://hud.so).
|
|
49
49
|
|
|
50
50
|
|
|
51
|
-
## Quickstart: Training
|
|
52
|
-
|
|
53
|
-
RL using GRPO a Qwen2.5-VL model on any hud dataset:
|
|
54
|
-
|
|
55
|
-
```bash
|
|
56
|
-
hud get hud-evals/basic-2048 # from HF
|
|
57
|
-
hud rl basic-2048.json
|
|
58
|
-
```
|
|
59
|
-
|
|
60
|
-
> See [agent training docs](https://docs.hud.so/train-agents/quickstart)
|
|
61
|
-
|
|
62
|
-
Or make your own environment and dataset:
|
|
63
|
-
|
|
64
|
-
```bash
|
|
65
|
-
hud init my-env && cd my-env
|
|
66
|
-
hud dev --interactive
|
|
67
|
-
# When ready to run:
|
|
68
|
-
hud rl
|
|
69
|
-
```
|
|
70
|
-
|
|
71
|
-
> See [environment design docs](https://docs.hud.so/build-environments)
|
|
72
|
-
|
|
73
|
-
|
|
74
51
|
## Quickstart: Evals
|
|
75
52
|
|
|
76
53
|
For a tutorial that explains the agent and evaluation design, run:
|
|
@@ -127,38 +104,27 @@ The above example lets the agent play 2048 ([See replay](https://hud.so/trace/6
|
|
|
127
104
|
|
|
128
105
|

|
|
129
106
|
|
|
130
|
-
## Reinforcement Learning with GRPO
|
|
131
|
-
|
|
132
|
-
This is a Qwen‑2.5‑VL‑3B agent training a policy on the 2048-basic browser environment:
|
|
133
|
-
|
|
134
|
-

|
|
107
|
+
## Quickstart: Training
|
|
135
108
|
|
|
136
|
-
|
|
109
|
+
Train a Qwen2.5-VL model with RL (GRPO) on any hud dataset:
|
|
137
110
|
|
|
138
111
|
```bash
|
|
139
|
-
# Install CLI
|
|
140
|
-
uv tool install hud-python
|
|
141
|
-
|
|
142
|
-
# Option A: Run directly from a HuggingFace dataset
|
|
143
|
-
hud rl hud-evals/basic-2048
|
|
144
|
-
|
|
145
|
-
# Option B: Download first, modify, then train
|
|
146
|
-
hud get hud-evals/basic-2048
|
|
112
|
+
hud get hud-evals/basic-2048 # from HF
|
|
147
113
|
hud rl basic-2048.json
|
|
148
|
-
|
|
149
|
-
# Optional: baseline evaluation
|
|
150
|
-
hud eval basic-2048.json
|
|
151
114
|
```
|
|
152
115
|
|
|
153
|
-
|
|
154
|
-
- Language‑only models (e.g., `Qwen/Qwen2.5-7B-Instruct`)
|
|
155
|
-
- Vision‑Language models (e.g., `Qwen/Qwen2.5-VL-3B-Instruct`)
|
|
116
|
+
> See [agent training docs](https://docs.hud.so/train-agents/quickstart)
|
|
156
117
|
|
|
157
|
-
|
|
118
|
+
Or make your own environment and dataset:
|
|
158
119
|
|
|
159
|
-
|
|
120
|
+
```bash
|
|
121
|
+
hud init my-env && cd my-env
|
|
122
|
+
hud dev --interactive
|
|
123
|
+
# When ready to run:
|
|
124
|
+
hud rl
|
|
125
|
+
```
|
|
160
126
|
|
|
161
|
-
|
|
127
|
+
> See [environment design docs](https://docs.hud.so/build-environments)
|
|
162
128
|
|
|
163
129
|
## Benchmarking Agents
|
|
164
130
|
|
|
@@ -322,6 +288,39 @@ We highly suggest running 3-5 evaluations per dataset for the most consistent re
|
|
|
322
288
|
|
|
323
289
|
Using the [`run_dataset`](https://docs.hud.so/reference/tasks#run_dataset) function with a HuggingFace dataset automatically assigns your job to that leaderboard page, and allows you to create a scorecard out of it:
|
|
324
290
|
|
|
291
|
+
## Reinforcement Learning with GRPO
|
|
292
|
+
|
|
293
|
+
This is a Qwen‑2.5‑VL‑3B agent training a policy on the 2048-basic browser environment:
|
|
294
|
+
|
|
295
|
+

|
|
296
|
+
|
|
297
|
+
Train with the new interactive `hud rl` flow:
|
|
298
|
+
|
|
299
|
+
```bash
|
|
300
|
+
# Install CLI
|
|
301
|
+
uv tool install hud-python
|
|
302
|
+
|
|
303
|
+
# Option A: Run directly from a HuggingFace dataset
|
|
304
|
+
hud rl hud-evals/basic-2048
|
|
305
|
+
|
|
306
|
+
# Option B: Download first, modify, then train
|
|
307
|
+
hud get hud-evals/basic-2048
|
|
308
|
+
hud rl basic-2048.json
|
|
309
|
+
|
|
310
|
+
# Optional: baseline evaluation
|
|
311
|
+
hud eval basic-2048.json
|
|
312
|
+
```
|
|
313
|
+
|
|
314
|
+
Supports multi‑turn RL for both:
|
|
315
|
+
- Language‑only models (e.g., `Qwen/Qwen2.5-7B-Instruct`)
|
|
316
|
+
- Vision‑Language models (e.g., `Qwen/Qwen2.5-VL-3B-Instruct`)
|
|
317
|
+
|
|
318
|
+
By default, `hud rl` provisions a persistent server and trainer in the cloud, streams telemetry to `hud.so`, and lets you monitor/manage models at `hud.so/models`. Use `--local` to run entirely on your machines (typically 2+ GPUs: one for vLLM, the rest for training).
|
|
319
|
+
|
|
320
|
+
Any HUD MCP environment and evaluation works with our RL pipeline (including remote configurations). See the guided docs: `https://docs.hud.so/train-agents/quickstart`.
|
|
321
|
+
|
|
322
|
+
Pricing: Hosted vLLM and training GPU rates are listed in the [Training Quickstart → Pricing](https://docs.hud.so/train-agents/quickstart#pricing). Manage billing at the [HUD billing dashboard](https://hud.so/project/billing).
|
|
323
|
+
|
|
325
324
|
## Architecture
|
|
326
325
|
|
|
327
326
|
```mermaid
|
|
@@ -6,10 +6,12 @@ See [docs](https://docs.hud.so/build-environments) for the complete environment
|
|
|
6
6
|
## Architecture
|
|
7
7
|
|
|
8
8
|
**`environment/`** - Produces structured data
|
|
9
|
+
|
|
9
10
|
- Owns all state (game logic, browser sessions, databases, etc.)
|
|
10
11
|
- Exposes HTTP endpoints `/health`, `/act`, `/reset`, `/state` that return structured information about the environment state
|
|
11
12
|
|
|
12
13
|
**`server/`** - Wraps data in MCP tools
|
|
14
|
+
|
|
13
15
|
- Calls environment endpoints to get structured data for the agent, and environment setup/evaluation
|
|
14
16
|
- Agents and tasks interact only with these tools!
|
|
15
17
|
|
|
@@ -33,12 +35,14 @@ Visit http://localhost:8765/docs to see the new tool appear instantly.
|
|
|
33
35
|
In general, we recommend starting work on the environment backend first, then developing the MCP server to expose the right things to the agent.
|
|
34
36
|
|
|
35
37
|
For complex environments that require many dependencies, we recommend running `hud dev` in the environment root:
|
|
38
|
+
|
|
36
39
|
```bash
|
|
37
40
|
cd ..
|
|
38
41
|
hud dev
|
|
39
42
|
```
|
|
40
43
|
|
|
41
44
|
## Tasks & Evaluation
|
|
45
|
+
|
|
42
46
|
```bash
|
|
43
47
|
# Build first in the global folder with the Dockerfile (creates blank:0.1.0)
|
|
44
48
|
hud build
|
|
@@ -59,6 +63,7 @@ Your `tasks.json` uses `docker run` to launch the environment:
|
|
|
59
63
|
```
|
|
60
64
|
|
|
61
65
|
**Commands:**
|
|
66
|
+
|
|
62
67
|
```bash
|
|
63
68
|
# Build first
|
|
64
69
|
hud build
|
|
@@ -78,6 +83,7 @@ hud rl tasks.json # Auto-converts docker→remote, builds & pushes if needed
|
|
|
78
83
|
Once your environment is ready, you can share it with the community:
|
|
79
84
|
|
|
80
85
|
### 1. Push to Registry
|
|
86
|
+
|
|
81
87
|
```bash
|
|
82
88
|
# Build and push your environment (requires docker hub login and hud api key)
|
|
83
89
|
hud build
|
|
@@ -89,10 +95,12 @@ hud push
|
|
|
89
95
|
Create a dataset on HuggingFace with your tasks:
|
|
90
96
|
|
|
91
97
|
**Option A: Upload manually**
|
|
98
|
+
|
|
92
99
|
1. Upload your `tasks.json` to HuggingFace
|
|
93
100
|
2. Make sure it's **public** to appear on leaderboards
|
|
94
101
|
|
|
95
102
|
**Option B: Use the SDK**
|
|
103
|
+
|
|
96
104
|
```python
|
|
97
105
|
from hud.datasets import save_tasks
|
|
98
106
|
import json
|
|
@@ -109,7 +117,7 @@ save_tasks(tasks, repo_id="your-org/your-dataset")
|
|
|
109
117
|
|
|
110
118
|
```bash
|
|
111
119
|
# Run Claude on your benchmark
|
|
112
|
-
hud eval "your-org/your-dataset"
|
|
120
|
+
hud eval "your-org/your-dataset" claude
|
|
113
121
|
|
|
114
122
|
# View results at:
|
|
115
123
|
# hud.so/leaderboards/your-org/your-dataset
|
|
@@ -118,4 +126,3 @@ hud eval "your-org/your-dataset" --agent claude
|
|
|
118
126
|
**Note**: Only public HuggingFace datasets appear as leaderboards!
|
|
119
127
|
|
|
120
128
|
📚 Learn more: [Creating Benchmarks](https://docs.hud.so/evaluate-agents/create-benchmarks) | [Leaderboards](https://docs.hud.so/evaluate-agents/leaderboards)
|
|
121
|
-
|
|
@@ -137,7 +137,11 @@ class MCPAgent(ABC):
|
|
|
137
137
|
"No MCPClient. Please provide one when initializing the agent or pass a Task with mcp_config." # noqa: E501
|
|
138
138
|
)
|
|
139
139
|
|
|
140
|
-
|
|
140
|
+
try:
|
|
141
|
+
client_cfg = getattr(self.mcp_client, "mcp_config", None)
|
|
142
|
+
except Exception:
|
|
143
|
+
client_cfg = None
|
|
144
|
+
await self._setup_config(client_cfg)
|
|
141
145
|
|
|
142
146
|
# Initialize client if needed
|
|
143
147
|
try:
|
|
@@ -618,8 +622,11 @@ class MCPAgent(ABC):
|
|
|
618
622
|
except Exception as e:
|
|
619
623
|
self.console.error_log(f"Response lifecycle tool failed: {e}")
|
|
620
624
|
|
|
621
|
-
async def _setup_config(self, mcp_config: dict[str, dict[str, Any]]) -> None:
|
|
625
|
+
async def _setup_config(self, mcp_config: dict[str, dict[str, Any]] | None) -> None:
|
|
622
626
|
"""Inject metadata into the metadata of the initialize request."""
|
|
627
|
+
if not isinstance(mcp_config, dict):
|
|
628
|
+
return
|
|
629
|
+
|
|
623
630
|
if self.metadata:
|
|
624
631
|
patch_mcp_config(
|
|
625
632
|
mcp_config,
|
|
@@ -20,6 +20,7 @@ import logging
|
|
|
20
20
|
from typing import TYPE_CHECKING, Any, ClassVar, cast
|
|
21
21
|
|
|
22
22
|
import mcp.types as types
|
|
23
|
+
from openai import AsyncOpenAI
|
|
23
24
|
|
|
24
25
|
from hud import instrument
|
|
25
26
|
from hud.types import AgentResponse, MCPToolCall, MCPToolResult
|
|
@@ -28,7 +29,6 @@ from hud.utils.hud_console import HUDConsole
|
|
|
28
29
|
from .base import MCPAgent
|
|
29
30
|
|
|
30
31
|
if TYPE_CHECKING:
|
|
31
|
-
from openai import AsyncOpenAI
|
|
32
32
|
from openai.types.chat import ChatCompletionToolParam
|
|
33
33
|
|
|
34
34
|
logger = logging.getLogger(__name__)
|
|
@@ -42,14 +42,26 @@ class GenericOpenAIChatAgent(MCPAgent):
|
|
|
42
42
|
def __init__(
|
|
43
43
|
self,
|
|
44
44
|
*,
|
|
45
|
-
openai_client: AsyncOpenAI | None,
|
|
45
|
+
openai_client: AsyncOpenAI | None = None,
|
|
46
|
+
api_key: str | None = None,
|
|
47
|
+
base_url: str | None = None,
|
|
46
48
|
model_name: str = "gpt-4o-mini",
|
|
47
49
|
completion_kwargs: dict[str, Any] | None = None,
|
|
48
50
|
**agent_kwargs: Any,
|
|
49
51
|
) -> None:
|
|
50
52
|
# Accept base-agent settings via **agent_kwargs (e.g., mcp_client, system_prompt, etc.)
|
|
51
53
|
super().__init__(**agent_kwargs)
|
|
52
|
-
|
|
54
|
+
|
|
55
|
+
# Handle client creation - support both patterns
|
|
56
|
+
if openai_client is not None:
|
|
57
|
+
# Use provided client (backward compatibility)
|
|
58
|
+
self.oai = openai_client
|
|
59
|
+
elif api_key is not None or base_url is not None:
|
|
60
|
+
# Create client from config (new pattern, consistent with other agents)
|
|
61
|
+
self.oai = AsyncOpenAI(api_key=api_key, base_url=base_url)
|
|
62
|
+
else:
|
|
63
|
+
raise ValueError("Either openai_client or (api_key and base_url) must be provided")
|
|
64
|
+
|
|
53
65
|
self.model_name = model_name
|
|
54
66
|
self.completion_kwargs: dict[str, Any] = completion_kwargs or {}
|
|
55
67
|
self.mcp_schemas = []
|
|
@@ -329,6 +329,21 @@ class TestBaseMCPAgent:
|
|
|
329
329
|
# call_tools doesn't validate empty names, it will return error
|
|
330
330
|
await agent.call_tools(tool_call)
|
|
331
331
|
|
|
332
|
+
def test_get_tool_schemas(self):
|
|
333
|
+
"""Test getting tool schemas."""
|
|
334
|
+
agent = MockMCPAgent()
|
|
335
|
+
|
|
336
|
+
agent._available_tools = [
|
|
337
|
+
types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"}),
|
|
338
|
+
types.Tool(name="setup", description="Setup", inputSchema={"type": "object"}),
|
|
339
|
+
]
|
|
340
|
+
|
|
341
|
+
schemas = agent.get_tool_schemas()
|
|
342
|
+
|
|
343
|
+
# Should include non-lifecycle tools
|
|
344
|
+
assert len(schemas) == 2
|
|
345
|
+
assert schemas[0]["name"] == "tool1"
|
|
346
|
+
|
|
332
347
|
def test_get_tools_by_server(self):
|
|
333
348
|
"""Test getting tools grouped by server."""
|
|
334
349
|
agent = MockMCPAgent()
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from unittest import mock
|
|
4
|
+
|
|
5
|
+
import mcp.types as types
|
|
6
|
+
import pytest
|
|
7
|
+
|
|
8
|
+
from hud.agents.base import MCPAgent, find_content, find_reward, text_to_blocks
|
|
9
|
+
from hud.types import AgentResponse, MCPToolCall, MCPToolResult
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class DummyAgent(MCPAgent):
|
|
13
|
+
async def get_system_messages(self):
|
|
14
|
+
return [types.TextContent(text="sys", type="text")]
|
|
15
|
+
|
|
16
|
+
async def get_response(self, messages):
|
|
17
|
+
# Single step: no tool calls -> done
|
|
18
|
+
return AgentResponse(content="ok", tool_calls=[], done=True)
|
|
19
|
+
|
|
20
|
+
async def format_blocks(self, blocks):
|
|
21
|
+
# Return as-is
|
|
22
|
+
return blocks
|
|
23
|
+
|
|
24
|
+
async def format_tool_results(self, tool_calls, tool_results):
|
|
25
|
+
return [types.TextContent(text="tools", type="text")]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@pytest.mark.asyncio
|
|
29
|
+
async def test_run_with_string_prompt_auto_client(monkeypatch):
|
|
30
|
+
# Fake MCPClient with required methods
|
|
31
|
+
fake_client = mock.AsyncMock()
|
|
32
|
+
fake_client.initialize.return_value = None
|
|
33
|
+
fake_client.list_tools.return_value = []
|
|
34
|
+
fake_client.shutdown.return_value = None
|
|
35
|
+
|
|
36
|
+
# Patch MCPClient construction inside initialize()
|
|
37
|
+
with mock.patch("hud.clients.MCPClient", return_value=fake_client):
|
|
38
|
+
agent = DummyAgent(mcp_client=fake_client, auto_trace=False)
|
|
39
|
+
result = await agent.run("hello", max_steps=1)
|
|
40
|
+
assert result.done is True and result.isError is False
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def test_find_reward_and_content_extractors():
|
|
44
|
+
# Structured content
|
|
45
|
+
r = MCPToolResult(
|
|
46
|
+
content=text_to_blocks("{}"), isError=False, structuredContent={"reward": 0.7}
|
|
47
|
+
)
|
|
48
|
+
assert find_reward(r) == 0.7
|
|
49
|
+
|
|
50
|
+
# Text JSON
|
|
51
|
+
r2 = MCPToolResult(content=text_to_blocks('{"score": 0.5, "content": "hi"}'), isError=False)
|
|
52
|
+
assert find_reward(r2) == 0.5
|
|
53
|
+
assert find_content(r2) == "hi"
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@pytest.mark.asyncio
|
|
57
|
+
async def test_call_tools_error_paths():
|
|
58
|
+
fake_client = mock.AsyncMock()
|
|
59
|
+
# First call succeeds
|
|
60
|
+
ok_result = MCPToolResult(content=text_to_blocks("ok"), isError=False)
|
|
61
|
+
fake_client.call_tool.side_effect = [ok_result, RuntimeError("boom")]
|
|
62
|
+
agent = DummyAgent(mcp_client=fake_client, auto_trace=False)
|
|
63
|
+
results = await agent.call_tools(
|
|
64
|
+
[MCPToolCall(name="a", arguments={}), MCPToolCall(name="b", arguments={})]
|
|
65
|
+
)
|
|
66
|
+
assert results[0].isError is False
|
|
67
|
+
assert results[1].isError is True
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
@pytest.mark.asyncio
|
|
71
|
+
async def test_initialize_without_client_raises_valueerror():
|
|
72
|
+
agent = DummyAgent(mcp_client=None, auto_trace=False)
|
|
73
|
+
with pytest.raises(ValueError):
|
|
74
|
+
await agent.initialize(None)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def test_get_available_tools_before_initialize_raises():
|
|
78
|
+
agent = DummyAgent(mcp_client=mock.AsyncMock(), auto_trace=False)
|
|
79
|
+
with pytest.raises(RuntimeError):
|
|
80
|
+
agent.get_available_tools()
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
@pytest.mark.asyncio
|
|
84
|
+
async def test_format_message_invalid_type_raises():
|
|
85
|
+
agent = DummyAgent(mcp_client=mock.AsyncMock(), auto_trace=False)
|
|
86
|
+
with pytest.raises(ValueError):
|
|
87
|
+
await agent.format_message({"oops": 1}) # type: ignore
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
@pytest.mark.asyncio
|
|
91
|
+
async def test_call_tools_timeout_error_shutdown_called():
|
|
92
|
+
fake_client = mock.AsyncMock()
|
|
93
|
+
fake_client.call_tool.side_effect = TimeoutError("timeout")
|
|
94
|
+
fake_client.shutdown.return_value = None
|
|
95
|
+
agent = DummyAgent(mcp_client=fake_client, auto_trace=False)
|
|
96
|
+
with pytest.raises(TimeoutError):
|
|
97
|
+
await agent.call_tools(MCPToolCall(name="x", arguments={}))
|
|
98
|
+
fake_client.shutdown.assert_awaited_once()
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def test_text_to_blocks_shapes():
|
|
102
|
+
blocks = text_to_blocks("x")
|
|
103
|
+
assert isinstance(blocks, list) and blocks and isinstance(blocks[0], types.TextContent)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
@pytest.mark.asyncio
|
|
107
|
+
async def test_run_returns_connection_error_trace(monkeypatch):
|
|
108
|
+
fake_client = mock.AsyncMock()
|
|
109
|
+
fake_client.mcp_config = {}
|
|
110
|
+
fake_client.initialize.side_effect = RuntimeError("Connection refused http://localhost:1234")
|
|
111
|
+
fake_client.list_tools.return_value = []
|
|
112
|
+
fake_client.shutdown.return_value = None
|
|
113
|
+
|
|
114
|
+
class DummyCM:
|
|
115
|
+
def __exit__(self, *args, **kwargs):
|
|
116
|
+
return False
|
|
117
|
+
|
|
118
|
+
monkeypatch.setattr("hud.utils.mcp.setup_hud_telemetry", lambda *args, **kwargs: DummyCM())
|
|
119
|
+
|
|
120
|
+
agent = DummyAgent(mcp_client=fake_client, auto_trace=False)
|
|
121
|
+
result = await agent.run("p", max_steps=1)
|
|
122
|
+
assert result.isError is True
|
|
123
|
+
assert "Could not connect" in (result.content or "")
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
@pytest.mark.asyncio
|
|
127
|
+
async def test_run_calls_response_tool_when_configured(monkeypatch):
|
|
128
|
+
fake_client = mock.AsyncMock()
|
|
129
|
+
fake_client.mcp_config = {}
|
|
130
|
+
fake_client.initialize.return_value = None
|
|
131
|
+
fake_client.list_tools.return_value = []
|
|
132
|
+
fake_client.shutdown.return_value = None
|
|
133
|
+
ok = MCPToolResult(content=text_to_blocks("ok"), isError=False)
|
|
134
|
+
fake_client.call_tool.return_value = ok
|
|
135
|
+
|
|
136
|
+
class DummyCM:
|
|
137
|
+
def __exit__(self, *args, **kwargs):
|
|
138
|
+
return False
|
|
139
|
+
|
|
140
|
+
monkeypatch.setattr("hud.utils.mcp.setup_hud_telemetry", lambda *args, **kwargs: DummyCM())
|
|
141
|
+
|
|
142
|
+
agent = DummyAgent(mcp_client=fake_client, auto_trace=False, response_tool_name="submit")
|
|
143
|
+
result = await agent.run("hello", max_steps=1)
|
|
144
|
+
assert result.isError is False
|
|
145
|
+
fake_client.call_tool.assert_awaited()
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
@pytest.mark.asyncio
|
|
149
|
+
async def test_get_available_tools_after_initialize(monkeypatch):
|
|
150
|
+
fake_client = mock.AsyncMock()
|
|
151
|
+
fake_client.mcp_config = {}
|
|
152
|
+
fake_client.initialize.return_value = None
|
|
153
|
+
fake_client.list_tools.return_value = []
|
|
154
|
+
fake_client.shutdown.return_value = None
|
|
155
|
+
|
|
156
|
+
class DummyCM:
|
|
157
|
+
def __exit__(self, *args, **kwargs):
|
|
158
|
+
return False
|
|
159
|
+
|
|
160
|
+
monkeypatch.setattr("hud.utils.mcp.setup_hud_telemetry", lambda *args, **kwargs: DummyCM())
|
|
161
|
+
|
|
162
|
+
agent = DummyAgent(mcp_client=fake_client, auto_trace=False)
|
|
163
|
+
await agent.initialize(None)
|
|
164
|
+
assert agent.get_available_tools() == []
|
|
@@ -242,15 +242,18 @@ def debug(
|
|
|
242
242
|
if build and not build_environment(directory, image_name):
|
|
243
243
|
raise typer.Exit(1)
|
|
244
244
|
|
|
245
|
-
# Build Docker command
|
|
246
|
-
from .utils.docker import
|
|
245
|
+
# Build Docker command with folder-mode envs
|
|
246
|
+
from .utils.docker import create_docker_run_command
|
|
247
247
|
|
|
248
|
-
command =
|
|
248
|
+
command = create_docker_run_command(
|
|
249
|
+
image_name, docker_args=docker_args, env_dir=directory
|
|
250
|
+
)
|
|
249
251
|
else:
|
|
250
252
|
# Assume it's an image name
|
|
251
253
|
image = first_param
|
|
252
254
|
from .utils.docker import build_run_command
|
|
253
255
|
|
|
256
|
+
# Image-only mode: do not auto-inject local .env
|
|
254
257
|
command = build_run_command(image, docker_args)
|
|
255
258
|
else:
|
|
256
259
|
console.print(
|