hud-python 0.4.52__tar.gz → 0.4.54__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- {hud_python-0.4.52 → hud_python-0.4.54}/PKG-INFO +49 -49
- {hud_python-0.4.52 → hud_python-0.4.54}/README.md +47 -48
- {hud_python-0.4.52 → hud_python-0.4.54}/environments/README.md +2 -2
- {hud_python-0.4.52 → hud_python-0.4.54}/environments/blank/README.md +9 -2
- {hud_python-0.4.52 → hud_python-0.4.54}/environments/blank/server/pyproject.toml +1 -1
- {hud_python-0.4.52 → hud_python-0.4.54}/environments/browser/environment/todo/README.md +2 -2
- {hud_python-0.4.52 → hud_python-0.4.54}/environments/browser/server/pyproject.toml +1 -1
- {hud_python-0.4.52 → hud_python-0.4.54}/environments/deepresearch/server/pyproject.toml +1 -1
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/agents/base.py +9 -2
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/agents/openai_chat_generic.py +15 -3
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/agents/tests/test_base.py +15 -0
- hud_python-0.4.54/hud/agents/tests/test_base_runtime.py +164 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/__init__.py +20 -12
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/build.py +35 -27
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/dev.py +13 -31
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/eval.py +85 -84
- hud_python-0.4.54/hud/cli/tests/test_analyze_module.py +120 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/tests/test_build.py +24 -2
- hud_python-0.4.54/hud/cli/tests/test_build_failure.py +41 -0
- hud_python-0.4.54/hud/cli/tests/test_build_module.py +50 -0
- hud_python-0.4.54/hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud_python-0.4.54/hud/cli/tests/test_cli_root.py +134 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/tests/test_eval.py +6 -6
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/tests/test_mcp_server.py +8 -7
- hud_python-0.4.54/hud/cli/tests/test_push_happy.py +74 -0
- hud_python-0.4.54/hud/cli/tests/test_push_wrapper.py +23 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/utils/docker.py +120 -1
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/utils/runner.py +1 -1
- hud_python-0.4.54/hud/cli/utils/tests/test_config.py +58 -0
- hud_python-0.4.54/hud/cli/utils/tests/test_docker.py +93 -0
- hud_python-0.4.54/hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud_python-0.4.54/hud/cli/utils/tests/test_env_check.py +74 -0
- hud_python-0.4.54/hud/cli/utils/tests/test_environment.py +42 -0
- hud_python-0.4.54/hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud_python-0.4.54/hud/cli/utils/tests/test_local_runner.py +50 -0
- hud_python-0.4.54/hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud_python-0.4.54/hud/cli/utils/tests/test_metadata.py +49 -0
- hud_python-0.4.54/hud/cli/utils/tests/test_package_runner.py +35 -0
- hud_python-0.4.54/hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud_python-0.4.54/hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud_python-0.4.54/hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud_python-0.4.54/hud/cli/utils/tests/test_source_hash.py +36 -0
- hud_python-0.4.54/hud/cli/utils/tests/test_tasks.py +80 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/utils/version_check.py +2 -2
- hud_python-0.4.54/hud/datasets/tests/test_runner.py +106 -0
- hud_python-0.4.54/hud/datasets/tests/test_utils.py +228 -0
- hud_python-0.4.54/hud/otel/tests/test_instrumentation.py +207 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/server/tests/test_server_extra.py +2 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/shared/exceptions.py +35 -4
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/shared/hints.py +25 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/shared/requests.py +15 -3
- hud_python-0.4.54/hud/shared/tests/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/shared/tests/test_exceptions.py +31 -23
- hud_python-0.4.54/hud/shared/tests/test_hints.py +167 -0
- hud_python-0.4.54/hud/telemetry/tests/__init__.py +0 -0
- hud_python-0.4.54/hud/telemetry/tests/test_async_context.py +242 -0
- hud_python-0.4.54/hud/telemetry/tests/test_instrument.py +414 -0
- hud_python-0.4.54/hud/telemetry/tests/test_job.py +609 -0
- hud_python-0.4.54/hud/telemetry/tests/test_trace.py +241 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/computer/settings.py +2 -2
- hud_python-0.4.54/hud/tools/tests/test_submit.py +85 -0
- hud_python-0.4.54/hud/tools/tests/test_types.py +193 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/types.py +17 -1
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/utils/agent_factories.py +1 -3
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/utils/mcp.py +1 -1
- hud_python-0.4.54/hud/utils/tests/__init__.py +0 -0
- hud_python-0.4.54/hud/utils/tests/test_agent_factories.py +60 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/utils/tests/test_mcp.py +4 -6
- hud_python-0.4.54/hud/utils/tests/test_pretty_errors.py +186 -0
- hud_python-0.4.54/hud/utils/tests/test_tasks.py +187 -0
- hud_python-0.4.54/hud/utils/tests/test_tool_shorthand.py +154 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/utils/tests/test_version.py +1 -1
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/version.py +1 -1
- {hud_python-0.4.52 → hud_python-0.4.54}/pyproject.toml +17 -3
- hud_python-0.4.52/hud/otel/tests/__init__.py +0 -1
- hud_python-0.4.52/hud/telemetry/tests/test_trace.py +0 -63
- {hud_python-0.4.52 → hud_python-0.4.54}/.gitignore +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/LICENSE +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/environments/blank/environment/README.md +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/environments/blank/environment/pyproject.toml +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/environments/blank/server/README.md +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/environments/browser/README.md +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/environments/browser/environment/2048/README.md +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/environments/browser/environment/2048/backend/pyproject.toml +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/environments/browser/environment/README.md +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/environments/browser/environment/pyproject.toml +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/environments/browser/environment/todo/backend/pyproject.toml +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/environments/browser/pyproject.toml +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/environments/deepresearch/README.md +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/environments/deepresearch/environment/pyproject.toml +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/environments/deepresearch/pyproject.toml +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/environments/remote_browser/README.md +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/environments/remote_browser/pyproject.toml +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/environments/remote_browser/src/hud_controller/providers/README.md +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/environments/text_2048/README.md +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/environments/text_2048/pyproject.toml +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/examples/README.md +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/__main__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/agents/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/agents/claude.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/agents/grounded_openai.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/agents/langchain.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/agents/lite_llm.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/agents/misc/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/agents/misc/integration_test_agent.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/agents/misc/response_agent.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/agents/openai.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/agents/tests/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/agents/tests/test_claude.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/agents/tests/test_client.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/agents/tests/test_grounded_openai_agent.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/agents/tests/test_openai.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/__main__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/analyze.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/clone.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/debug.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/flows/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/flows/tasks.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/get.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/init.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/list_func.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/pull.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/push.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/remove.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/rl/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/rl/celebrate.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/rl/config.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/rl/display.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/rl/gpu.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/rl/gpu_utils.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/rl/local_runner.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/rl/presets.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/rl/remote_runner.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/rl/rl_api.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/rl/viewer.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/rl/vllm.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/rl/wait_utils.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/tests/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/tests/test_analyze.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/tests/test_analyze_metadata.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/tests/test_cli_init.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/tests/test_cli_main.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/tests/test_clone.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/tests/test_cursor.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/tests/test_debug.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/tests/test_list_func.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/tests/test_main_module.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/tests/test_pull.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/tests/test_push.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/tests/test_registry.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/tests/test_utils.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/utils/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/utils/config.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/utils/cursor.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/utils/env_check.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/utils/environment.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/utils/interactive.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/utils/local_runner.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/utils/logging.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/utils/metadata.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/utils/package_runner.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/utils/registry.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/utils/remote_runner.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/utils/server.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/utils/source_hash.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/utils/tasks.py +0 -0
- {hud_python-0.4.52/hud/shared → hud_python-0.4.54/hud/cli/utils}/tests/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/clients/README.md +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/clients/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/clients/base.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/clients/fastmcp.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/clients/mcp_use.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/clients/tests/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/clients/tests/test_client_integration.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/clients/tests/test_fastmcp.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/clients/tests/test_mcp_use_retry.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/clients/tests/test_protocol.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/clients/utils/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/clients/utils/mcp_use_retry.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/clients/utils/retry.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/clients/utils/retry_transport.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/datasets/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/datasets/parallel.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/datasets/runner.py +0 -0
- {hud_python-0.4.52/hud/telemetry → hud_python-0.4.54/hud/datasets}/tests/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/datasets/utils.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/misc/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/misc/claude_plays_pokemon.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/native/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/native/comparator.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/native/tests/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/native/tests/test_comparator.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/native/tests/test_native_init.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/otel/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/otel/collector.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/otel/config.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/otel/context.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/otel/exporters.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/otel/instrumentation.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/otel/processors.py +0 -0
- {hud_python-0.4.52/hud/utils → hud_python-0.4.54/hud/otel}/tests/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/otel/tests/test_processors.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/py.typed +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/rl/README.md +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/rl/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/rl/actor.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/rl/buffer.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/rl/chat_template.jinja +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/rl/config.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/rl/distributed.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/rl/learner.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/rl/tests/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/rl/tests/test_learner.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/rl/train.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/rl/types.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/rl/utils/start_vllm_server.sh +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/rl/utils.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/rl/vllm_adapter.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/samples/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/samples/browser.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/server/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/server/context.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/server/helper/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/server/low_level.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/server/router.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/server/server.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/server/tests/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/server/tests/test_add_tool.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/server/tests/test_context.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/server/tests/test_mcp_server_handlers.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/server/tests/test_mcp_server_integration.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/server/tests/test_mcp_server_more.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/server/tests/test_run_wrapper.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/server/tests/test_sigterm_runner.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/settings.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/shared/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/shared/tests/test_requests.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/telemetry/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/telemetry/async_context.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/telemetry/instrument.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/telemetry/job.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/telemetry/replay.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/telemetry/tests/test_replay.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/telemetry/trace.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/base.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/bash.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/computer/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/computer/anthropic.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/computer/hud.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/computer/openai.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/computer/qwen.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/edit.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/executors/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/executors/base.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/executors/pyautogui.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/executors/tests/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/executors/tests/test_base_executor.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/executors/xdo.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/grounding/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/grounding/config.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/grounding/grounded_tool.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/grounding/grounder.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/grounding/tests/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/grounding/tests/test_grounded_tool.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/playwright.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/response.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/submit.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/tests/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/tests/test_base.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/tests/test_bash.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/tests/test_bash_extended.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/tests/test_computer.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/tests/test_computer_actions.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/tests/test_edit.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/tests/test_init.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/tests/test_playwright_tool.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/tests/test_response.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/tests/test_tools.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/tests/test_tools_init.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/tests/test_utils.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/types.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/utils.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/utils/__init__.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/utils/async_utils.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/utils/group_eval.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/utils/hud_console.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/utils/pretty_errors.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/utils/progress.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/utils/task_tracking.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/utils/tasks.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/utils/telemetry.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/utils/tests/test_async_utils.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/utils/tests/test_init.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/utils/tests/test_progress.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/utils/tests/test_telemetry.py +0 -0
- {hud_python-0.4.52 → hud_python-0.4.54}/hud/utils/tool_shorthand.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hud-python
|
|
3
|
-
Version: 0.4.52
|
|
3
|
+
Version: 0.4.54
|
|
4
4
|
Summary: SDK for the HUD platform.
|
|
5
5
|
Project-URL: Homepage, https://github.com/hud-evals/hud-python
|
|
6
6
|
Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
|
|
@@ -42,6 +42,7 @@ Requires-Dist: httpx<1,>=0.23.0
|
|
|
42
42
|
Requires-Dist: hud-fastmcp-python-sdk>=0.1.2
|
|
43
43
|
Requires-Dist: hud-mcp-python-sdk>=3.13.2
|
|
44
44
|
Requires-Dist: hud-mcp-use-python-sdk==2.3.20
|
|
45
|
+
Requires-Dist: langchain==0.3.27
|
|
45
46
|
Requires-Dist: numpy>=1.24.0
|
|
46
47
|
Requires-Dist: openai
|
|
47
48
|
Requires-Dist: opentelemetry-api>=1.34.1
|
|
@@ -160,12 +161,12 @@ OSS RL environment + evals toolkit. Wrap software as environments, run benchmark
|
|
|
160
161
|
|
|
161
162
|
## Highlights
|
|
162
163
|
|
|
163
|
-
- 🎓 **[One-click RL](https://hud.so/models)** – Run `hud rl` to get a trained model on any environment.
|
|
164
164
|
- 🚀 **[MCP environment skeleton](https://docs.hud.so/core-concepts/mcp-protocol)** – any agent can call any environment.
|
|
165
165
|
- ⚡️ **[Live telemetry](https://hud.so)** – inspect every tool call, observation, and reward in real time.
|
|
166
166
|
- 🗂️ **[Public benchmarks](https://hud.so/leaderboards)** – OSWorld-Verified, SheetBench-50, and more.
|
|
167
167
|
- 🌐 **[Cloud browsers](environments/remote_browser/)** – AnchorBrowser, Steel, BrowserBase integrations for browser automation.
|
|
168
168
|
- 🛠️ **[Hot-reload dev loop](environments/README.md#phase-5-hot-reload-development-with-cursor-agent)** – `hud dev` for iterating on environments without rebuilds.
|
|
169
|
+
- 🎓 **[One-click RL](https://hud.so/models)** – Run `hud rl` to get a trained model on any environment.
|
|
169
170
|
|
|
170
171
|
> We welcome contributors and feature requests – open an issue or hop on a call to discuss improvements!
|
|
171
172
|
|
|
@@ -186,29 +187,6 @@ uv tool install hud-python
|
|
|
186
187
|
Before starting, get your HUD_API_KEY at [hud.so](https://hud.so).
|
|
187
188
|
|
|
188
189
|
|
|
189
|
-
## Quickstart: Training
|
|
190
|
-
|
|
191
|
-
RL using GRPO a Qwen2.5-VL model on any hud dataset:
|
|
192
|
-
|
|
193
|
-
```bash
|
|
194
|
-
hud get hud-evals/basic-2048 # from HF
|
|
195
|
-
hud rl basic-2048.json
|
|
196
|
-
```
|
|
197
|
-
|
|
198
|
-
> See [agent training docs](https://docs.hud.so/train-agents/quickstart)
|
|
199
|
-
|
|
200
|
-
Or make your own environment and dataset:
|
|
201
|
-
|
|
202
|
-
```bash
|
|
203
|
-
hud init my-env && cd my-env
|
|
204
|
-
hud dev --interactive
|
|
205
|
-
# When ready to run:
|
|
206
|
-
hud rl
|
|
207
|
-
```
|
|
208
|
-
|
|
209
|
-
> See [environment design docs](https://docs.hud.so/build-environments)
|
|
210
|
-
|
|
211
|
-
|
|
212
190
|
## Quickstart: Evals
|
|
213
191
|
|
|
214
192
|
For a tutorial that explains the agent and evaluation design, run:
|
|
@@ -265,38 +243,27 @@ The above example lets the agent play 2048 ([See replay](https://hud.so/trace/6
|
|
|
265
243
|
|
|
266
244
|

|
|
267
245
|
|
|
268
|
-
## Reinforcement Learning with GRPO
|
|
269
|
-
|
|
270
|
-
This is a Qwen‑2.5‑VL‑3B agent training a policy on the 2048-basic browser environment:
|
|
271
|
-
|
|
272
|
-

|
|
246
|
+
## Quickstart: Training
|
|
273
247
|
|
|
274
|
-
|
|
248
|
+
RL using GRPO a Qwen2.5-VL model on any hud dataset:
|
|
275
249
|
|
|
276
250
|
```bash
|
|
277
|
-
# Install CLI
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
# Option A: Run directly from a HuggingFace dataset
|
|
281
|
-
hud rl hud-evals/basic-2048
|
|
282
|
-
|
|
283
|
-
# Option B: Download first, modify, then train
|
|
284
|
-
hud get hud-evals/basic-2048
|
|
285
|
-
hud rl basic-2048.json
|
|
286
|
-
|
|
287
|
-
# Optional: baseline evaluation
|
|
288
|
-
hud eval basic-2048.json
|
|
251
|
+
hud get hud-evals/2048-basic # from HF
|
|
252
|
+
hud rl 2048-basic.json
|
|
289
253
|
```
|
|
290
254
|
|
|
291
|
-
|
|
292
|
-
- Language‑only models (e.g., `Qwen/Qwen2.5-7B-Instruct`)
|
|
293
|
-
- Vision‑Language models (e.g., `Qwen/Qwen2.5-VL-3B-Instruct`)
|
|
255
|
+
> See [agent training docs](https://docs.hud.so/train-agents/quickstart)
|
|
294
256
|
|
|
295
|
-
|
|
257
|
+
Or make your own environment and dataset:
|
|
296
258
|
|
|
297
|
-
|
|
259
|
+
```bash
|
|
260
|
+
hud init my-env && cd my-env
|
|
261
|
+
hud dev --interactive
|
|
262
|
+
# When ready to run:
|
|
263
|
+
hud rl
|
|
264
|
+
```
|
|
298
265
|
|
|
299
|
-
|
|
266
|
+
> See [environment design docs](https://docs.hud.so/build-environments)
|
|
300
267
|
|
|
301
268
|
## Benchmarking Agents
|
|
302
269
|
|
|
@@ -460,6 +427,39 @@ We highly suggest running 3-5 evaluations per dataset for the most consistent re
|
|
|
460
427
|
|
|
461
428
|
Using the [`run_dataset`](https://docs.hud.so/reference/tasks#run_dataset) function with a HuggingFace dataset automatically assigns your job to that leaderboard page, and allows you to create a scorecard out of it:
|
|
462
429
|
|
|
430
|
+
## Reinforcement Learning with GRPO
|
|
431
|
+
|
|
432
|
+
This is a Qwen‑2.5‑VL‑3B agent training a policy on the 2048-basic browser environment:
|
|
433
|
+
|
|
434
|
+

|
|
435
|
+
|
|
436
|
+
Train with the new interactive `hud rl` flow:
|
|
437
|
+
|
|
438
|
+
```bash
|
|
439
|
+
# Install CLI
|
|
440
|
+
uv tool install hud-python
|
|
441
|
+
|
|
442
|
+
# Option A: Run directly from a HuggingFace dataset
|
|
443
|
+
hud rl hud-evals/2048-basic
|
|
444
|
+
|
|
445
|
+
# Option B: Download first, modify, then train
|
|
446
|
+
hud get hud-evals/2048-basic
|
|
447
|
+
hud rl 2048-basic.json
|
|
448
|
+
|
|
449
|
+
# Optional: baseline evaluation
|
|
450
|
+
hud eval 2048-basic.json
|
|
451
|
+
```
|
|
452
|
+
|
|
453
|
+
Supports multi‑turn RL for both:
|
|
454
|
+
- Language‑only models (e.g., `Qwen/Qwen2.5-7B-Instruct`)
|
|
455
|
+
- Vision‑Language models (e.g., `Qwen/Qwen2.5-VL-3B-Instruct`)
|
|
456
|
+
|
|
457
|
+
By default, `hud rl` provisions a persistent server and trainer in the cloud, streams telemetry to `hud.so`, and lets you monitor/manage models at `hud.so/models`. Use `--local` to run entirely on your machines (typically 2+ GPUs: one for vLLM, the rest for training).
|
|
458
|
+
|
|
459
|
+
Any HUD MCP environment and evaluation works with our RL pipeline (including remote configurations). See the guided docs: `https://docs.hud.so/train-agents/quickstart`.
|
|
460
|
+
|
|
461
|
+
Pricing: Hosted vLLM and training GPU rates are listed in the [Training Quickstart → Pricing](https://docs.hud.so/train-agents/quickstart#pricing). Manage billing at the [HUD billing dashboard](https://hud.so/project/billing).
|
|
462
|
+
|
|
463
463
|
## Architecture
|
|
464
464
|
|
|
465
465
|
```mermaid
|
|
@@ -22,12 +22,12 @@ OSS RL environment + evals toolkit. Wrap software as environments, run benchmark
|
|
|
22
22
|
|
|
23
23
|
## Highlights
|
|
24
24
|
|
|
25
|
-
- 🎓 **[One-click RL](https://hud.so/models)** – Run `hud rl` to get a trained model on any environment.
|
|
26
25
|
- 🚀 **[MCP environment skeleton](https://docs.hud.so/core-concepts/mcp-protocol)** – any agent can call any environment.
|
|
27
26
|
- ⚡️ **[Live telemetry](https://hud.so)** – inspect every tool call, observation, and reward in real time.
|
|
28
27
|
- 🗂️ **[Public benchmarks](https://hud.so/leaderboards)** – OSWorld-Verified, SheetBench-50, and more.
|
|
29
28
|
- 🌐 **[Cloud browsers](environments/remote_browser/)** – AnchorBrowser, Steel, BrowserBase integrations for browser automation.
|
|
30
29
|
- 🛠️ **[Hot-reload dev loop](environments/README.md#phase-5-hot-reload-development-with-cursor-agent)** – `hud dev` for iterating on environments without rebuilds.
|
|
30
|
+
- 🎓 **[One-click RL](https://hud.so/models)** – Run `hud rl` to get a trained model on any environment.
|
|
31
31
|
|
|
32
32
|
> We welcome contributors and feature requests – open an issue or hop on a call to discuss improvements!
|
|
33
33
|
|
|
@@ -48,29 +48,6 @@ uv tool install hud-python
|
|
|
48
48
|
Before starting, get your HUD_API_KEY at [hud.so](https://hud.so).
|
|
49
49
|
|
|
50
50
|
|
|
51
|
-
## Quickstart: Training
|
|
52
|
-
|
|
53
|
-
RL using GRPO a Qwen2.5-VL model on any hud dataset:
|
|
54
|
-
|
|
55
|
-
```bash
|
|
56
|
-
hud get hud-evals/basic-2048 # from HF
|
|
57
|
-
hud rl basic-2048.json
|
|
58
|
-
```
|
|
59
|
-
|
|
60
|
-
> See [agent training docs](https://docs.hud.so/train-agents/quickstart)
|
|
61
|
-
|
|
62
|
-
Or make your own environment and dataset:
|
|
63
|
-
|
|
64
|
-
```bash
|
|
65
|
-
hud init my-env && cd my-env
|
|
66
|
-
hud dev --interactive
|
|
67
|
-
# When ready to run:
|
|
68
|
-
hud rl
|
|
69
|
-
```
|
|
70
|
-
|
|
71
|
-
> See [environment design docs](https://docs.hud.so/build-environments)
|
|
72
|
-
|
|
73
|
-
|
|
74
51
|
## Quickstart: Evals
|
|
75
52
|
|
|
76
53
|
For a tutorial that explains the agent and evaluation design, run:
|
|
@@ -127,38 +104,27 @@ The above example lets the agent play 2048 ([See replay](https://hud.so/trace/6
|
|
|
127
104
|
|
|
128
105
|

|
|
129
106
|
|
|
130
|
-
##
|
|
131
|
-
|
|
132
|
-
This is a Qwen‑2.5‑VL‑3B agent training a policy on the 2048-basic browser environment:
|
|
133
|
-
|
|
134
|
-

|
|
107
|
+
## Quickstart: Training
|
|
135
108
|
|
|
136
|
-
|
|
109
|
+
Train a Qwen2.5-VL model with RL (GRPO) on any hud dataset:
|
|
137
110
|
|
|
138
111
|
```bash
|
|
139
|
-
#
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
# Option A: Run directly from a HuggingFace dataset
|
|
143
|
-
hud rl hud-evals/basic-2048
|
|
144
|
-
|
|
145
|
-
# Option B: Download first, modify, then train
|
|
146
|
-
hud get hud-evals/basic-2048
|
|
147
|
-
hud rl basic-2048.json
|
|
148
|
-
|
|
149
|
-
# Optional: baseline evaluation
|
|
150
|
-
hud eval basic-2048.json
|
|
112
|
+
hud get hud-evals/2048-basic # from HF
|
|
113
|
+
hud rl 2048-basic.json
|
|
151
114
|
```
|
|
152
115
|
|
|
153
|
-
|
|
154
|
-
- Language‑only models (e.g., `Qwen/Qwen2.5-7B-Instruct`)
|
|
155
|
-
- Vision‑Language models (e.g., `Qwen/Qwen2.5-VL-3B-Instruct`)
|
|
116
|
+
> See [agent training docs](https://docs.hud.so/train-agents/quickstart)
|
|
156
117
|
|
|
157
|
-
|
|
118
|
+
Or make your own environment and dataset:
|
|
158
119
|
|
|
159
|
-
|
|
120
|
+
```bash
|
|
121
|
+
hud init my-env && cd my-env
|
|
122
|
+
hud dev --interactive
|
|
123
|
+
# When ready to run:
|
|
124
|
+
hud rl
|
|
125
|
+
```
|
|
160
126
|
|
|
161
|
-
|
|
127
|
+
> See [environment design docs](https://docs.hud.so/build-environments)
|
|
162
128
|
|
|
163
129
|
## Benchmarking Agents
|
|
164
130
|
|
|
@@ -322,6 +288,39 @@ We highly suggest running 3-5 evaluations per dataset for the most consistent re
|
|
|
322
288
|
|
|
323
289
|
Using the [`run_dataset`](https://docs.hud.so/reference/tasks#run_dataset) function with a HuggingFace dataset automatically assigns your job to that leaderboard page, and allows you to create a scorecard out of it:
|
|
324
290
|
|
|
291
|
+
## Reinforcement Learning with GRPO
|
|
292
|
+
|
|
293
|
+
This is a Qwen‑2.5‑VL‑3B agent training a policy on the 2048-basic browser environment:
|
|
294
|
+
|
|
295
|
+

|
|
296
|
+
|
|
297
|
+
Train with the new interactive `hud rl` flow:
|
|
298
|
+
|
|
299
|
+
```bash
|
|
300
|
+
# Install CLI
|
|
301
|
+
uv tool install hud-python
|
|
302
|
+
|
|
303
|
+
# Option A: Run directly from a HuggingFace dataset
|
|
304
|
+
hud rl hud-evals/2048-basic
|
|
305
|
+
|
|
306
|
+
# Option B: Download first, modify, then train
|
|
307
|
+
hud get hud-evals/2048-basic
|
|
308
|
+
hud rl 2048-basic.json
|
|
309
|
+
|
|
310
|
+
# Optional: baseline evaluation
|
|
311
|
+
hud eval 2048-basic.json
|
|
312
|
+
```
|
|
313
|
+
|
|
314
|
+
Supports multi‑turn RL for both:
|
|
315
|
+
- Language‑only models (e.g., `Qwen/Qwen2.5-7B-Instruct`)
|
|
316
|
+
- Vision‑Language models (e.g., `Qwen/Qwen2.5-VL-3B-Instruct`)
|
|
317
|
+
|
|
318
|
+
By default, `hud rl` provisions a persistent server and trainer in the cloud, streams telemetry to `hud.so`, and lets you monitor/manage models at `hud.so/models`. Use `--local` to run entirely on your machines (typically 2+ GPUs: one for vLLM, the rest for training).
|
|
319
|
+
|
|
320
|
+
Any HUD MCP environment and evaluation works with our RL pipeline (including remote configurations). See the guided docs: `https://docs.hud.so/train-agents/quickstart`.
|
|
321
|
+
|
|
322
|
+
Pricing: Hosted vLLM and training GPU rates are listed in the [Training Quickstart → Pricing](https://docs.hud.so/train-agents/quickstart#pricing). Manage billing at the [HUD billing dashboard](https://hud.so/project/billing).
|
|
323
|
+
|
|
325
324
|
## Architecture
|
|
326
325
|
|
|
327
326
|
```mermaid
|
|
@@ -804,9 +804,9 @@ class TodoCompleted:
|
|
|
804
804
|
@problem("todo_basic", description="Complete two todo items", difficulty="easy")
|
|
805
805
|
class TodoBasic:
|
|
806
806
|
def get_setup(self):
|
|
807
|
-
return {"
|
|
807
|
+
return {"name": "todo_seed", "arguments": {"num_items": 5}}
|
|
808
808
|
def get_evaluation(self):
|
|
809
|
-
return {"
|
|
809
|
+
return {"name": "todo_completed", "arguments": {"expected_count": 2}}
|
|
810
810
|
```
|
|
811
811
|
|
|
812
812
|
Decorators keep registration *next to the implementation* and avoid manual bookkeeping. The server simply exposes the combined metadata through an MCP **resource**. Follow `environments/browser/src/hud_controller/problems/registry.py` as a template and expose the JSON with `@mcp.resource("problems://registry")`.
|
|
@@ -6,10 +6,12 @@ See [docs](https://docs.hud.so/build-environments) for the complete environment
|
|
|
6
6
|
## Architecture
|
|
7
7
|
|
|
8
8
|
**`environment/`** - Produces structured data
|
|
9
|
+
|
|
9
10
|
- Owns all state (game logic, browser sessions, databases, etc.)
|
|
10
11
|
- Exposes HTTP endpoints `/health`, `/act`, `/reset`, `/state` that return structured information about the environment state
|
|
11
12
|
|
|
12
13
|
**`server/`** - Wraps data in MCP tools
|
|
14
|
+
|
|
13
15
|
- Calls environment endpoints to get structured data for the agent, and environment setup/evaluation
|
|
14
16
|
- Agents and tasks interact only with these tools!
|
|
15
17
|
|
|
@@ -33,12 +35,14 @@ Visit http://localhost:8765/docs to see the new tool appear instantly.
|
|
|
33
35
|
In general, we recommend starting work on the environment backend first, then developing the MCP server to expose the right things to the agent.
|
|
34
36
|
|
|
35
37
|
For complex environments that require many dependencies, we recommend running `hud dev` in the environment root:
|
|
38
|
+
|
|
36
39
|
```bash
|
|
37
40
|
cd ..
|
|
38
41
|
hud dev
|
|
39
42
|
```
|
|
40
43
|
|
|
41
44
|
## Tasks & Evaluation
|
|
45
|
+
|
|
42
46
|
```bash
|
|
43
47
|
# Build first in the global folder with the Dockerfile (creates blank:0.1.0)
|
|
44
48
|
hud build
|
|
@@ -59,6 +63,7 @@ Your `tasks.json` uses `docker run` to launch the environment:
|
|
|
59
63
|
```
|
|
60
64
|
|
|
61
65
|
**Commands:**
|
|
66
|
+
|
|
62
67
|
```bash
|
|
63
68
|
# Build first
|
|
64
69
|
hud build
|
|
@@ -78,6 +83,7 @@ hud rl tasks.json # Auto-converts docker→remote, builds & pushes if needed
|
|
|
78
83
|
Once your environment is ready, you can share it with the community:
|
|
79
84
|
|
|
80
85
|
### 1. Push to Registry
|
|
86
|
+
|
|
81
87
|
```bash
|
|
82
88
|
# Build and push your environment (requires docker hub login and hud api key)
|
|
83
89
|
hud build
|
|
@@ -89,10 +95,12 @@ hud push
|
|
|
89
95
|
Create a dataset on HuggingFace with your tasks:
|
|
90
96
|
|
|
91
97
|
**Option A: Upload manually**
|
|
98
|
+
|
|
92
99
|
1. Upload your `tasks.json` to HuggingFace
|
|
93
100
|
2. Make sure it's **public** to appear on leaderboards
|
|
94
101
|
|
|
95
102
|
**Option B: Use the SDK**
|
|
103
|
+
|
|
96
104
|
```python
|
|
97
105
|
from hud.datasets import save_tasks
|
|
98
106
|
import json
|
|
@@ -109,7 +117,7 @@ save_tasks(tasks, repo_id="your-org/your-dataset")
|
|
|
109
117
|
|
|
110
118
|
```bash
|
|
111
119
|
# Run Claude on your benchmark
|
|
112
|
-
hud eval "your-org/your-dataset"
|
|
120
|
+
hud eval "your-org/your-dataset" claude
|
|
113
121
|
|
|
114
122
|
# View results at:
|
|
115
123
|
# hud.so/leaderboards/your-org/your-dataset
|
|
@@ -118,4 +126,3 @@ hud eval "your-org/your-dataset" --agent claude
|
|
|
118
126
|
**Note**: Only public HuggingFace datasets appear as leaderboards!
|
|
119
127
|
|
|
120
128
|
📚 Learn more: [Creating Benchmarks](https://docs.hud.so/evaluate-agents/create-benchmarks) | [Leaderboards](https://docs.hud.so/evaluate-agents/leaderboards)
|
|
121
|
-
|
|
@@ -47,8 +47,8 @@ await setup({"name": "todo_basic_usage"})
|
|
|
47
47
|
await evaluate({"name": "todo_basic_usage"})
|
|
48
48
|
|
|
49
49
|
# Direct function calls
|
|
50
|
-
await setup({"
|
|
51
|
-
await evaluate({"
|
|
50
|
+
await setup({"name": "todo_reset", "arguments": {}})
|
|
51
|
+
await evaluate({"name": "todo_completion_rate", "arguments": {"min_rate": 0.5}})
|
|
52
52
|
|
|
53
53
|
# MCP resource discovery
|
|
54
54
|
todo_evaluators = await client.read_resource("evaluators://todo")
|
|
@@ -137,7 +137,11 @@ class MCPAgent(ABC):
|
|
|
137
137
|
"No MCPClient. Please provide one when initializing the agent or pass a Task with mcp_config." # noqa: E501
|
|
138
138
|
)
|
|
139
139
|
|
|
140
|
-
|
|
140
|
+
try:
|
|
141
|
+
client_cfg = getattr(self.mcp_client, "mcp_config", None)
|
|
142
|
+
except Exception:
|
|
143
|
+
client_cfg = None
|
|
144
|
+
await self._setup_config(client_cfg)
|
|
141
145
|
|
|
142
146
|
# Initialize client if needed
|
|
143
147
|
try:
|
|
@@ -618,8 +622,11 @@ class MCPAgent(ABC):
|
|
|
618
622
|
except Exception as e:
|
|
619
623
|
self.console.error_log(f"Response lifecycle tool failed: {e}")
|
|
620
624
|
|
|
621
|
-
async def _setup_config(self, mcp_config: dict[str, dict[str, Any]]) -> None:
|
|
625
|
+
async def _setup_config(self, mcp_config: dict[str, dict[str, Any]] | None) -> None:
|
|
622
626
|
"""Inject metadata into the metadata of the initialize request."""
|
|
627
|
+
if not isinstance(mcp_config, dict):
|
|
628
|
+
return
|
|
629
|
+
|
|
623
630
|
if self.metadata:
|
|
624
631
|
patch_mcp_config(
|
|
625
632
|
mcp_config,
|
|
@@ -20,6 +20,7 @@ import logging
|
|
|
20
20
|
from typing import TYPE_CHECKING, Any, ClassVar, cast
|
|
21
21
|
|
|
22
22
|
import mcp.types as types
|
|
23
|
+
from openai import AsyncOpenAI
|
|
23
24
|
|
|
24
25
|
from hud import instrument
|
|
25
26
|
from hud.types import AgentResponse, MCPToolCall, MCPToolResult
|
|
@@ -28,7 +29,6 @@ from hud.utils.hud_console import HUDConsole
|
|
|
28
29
|
from .base import MCPAgent
|
|
29
30
|
|
|
30
31
|
if TYPE_CHECKING:
|
|
31
|
-
from openai import AsyncOpenAI
|
|
32
32
|
from openai.types.chat import ChatCompletionToolParam
|
|
33
33
|
|
|
34
34
|
logger = logging.getLogger(__name__)
|
|
@@ -42,14 +42,26 @@ class GenericOpenAIChatAgent(MCPAgent):
|
|
|
42
42
|
def __init__(
|
|
43
43
|
self,
|
|
44
44
|
*,
|
|
45
|
-
openai_client: AsyncOpenAI | None,
|
|
45
|
+
openai_client: AsyncOpenAI | None = None,
|
|
46
|
+
api_key: str | None = None,
|
|
47
|
+
base_url: str | None = None,
|
|
46
48
|
model_name: str = "gpt-4o-mini",
|
|
47
49
|
completion_kwargs: dict[str, Any] | None = None,
|
|
48
50
|
**agent_kwargs: Any,
|
|
49
51
|
) -> None:
|
|
50
52
|
# Accept base-agent settings via **agent_kwargs (e.g., mcp_client, system_prompt, etc.)
|
|
51
53
|
super().__init__(**agent_kwargs)
|
|
52
|
-
|
|
54
|
+
|
|
55
|
+
# Handle client creation - support both patterns
|
|
56
|
+
if openai_client is not None:
|
|
57
|
+
# Use provided client (backward compatibility)
|
|
58
|
+
self.oai = openai_client
|
|
59
|
+
elif api_key is not None or base_url is not None:
|
|
60
|
+
# Create client from config (new pattern, consistent with other agents)
|
|
61
|
+
self.oai = AsyncOpenAI(api_key=api_key, base_url=base_url)
|
|
62
|
+
else:
|
|
63
|
+
raise ValueError("Either openai_client or (api_key and base_url) must be provided")
|
|
64
|
+
|
|
53
65
|
self.model_name = model_name
|
|
54
66
|
self.completion_kwargs: dict[str, Any] = completion_kwargs or {}
|
|
55
67
|
self.mcp_schemas = []
|
|
@@ -329,6 +329,21 @@ class TestBaseMCPAgent:
|
|
|
329
329
|
# call_tools doesn't validate empty names, it will return error
|
|
330
330
|
await agent.call_tools(tool_call)
|
|
331
331
|
|
|
332
|
+
def test_get_tool_schemas(self):
|
|
333
|
+
"""Test getting tool schemas."""
|
|
334
|
+
agent = MockMCPAgent()
|
|
335
|
+
|
|
336
|
+
agent._available_tools = [
|
|
337
|
+
types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"}),
|
|
338
|
+
types.Tool(name="setup", description="Setup", inputSchema={"type": "object"}),
|
|
339
|
+
]
|
|
340
|
+
|
|
341
|
+
schemas = agent.get_tool_schemas()
|
|
342
|
+
|
|
343
|
+
# Should include non-lifecycle tools
|
|
344
|
+
assert len(schemas) == 2
|
|
345
|
+
assert schemas[0]["name"] == "tool1"
|
|
346
|
+
|
|
332
347
|
def test_get_tools_by_server(self):
|
|
333
348
|
"""Test getting tools grouped by server."""
|
|
334
349
|
agent = MockMCPAgent()
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from unittest import mock
|
|
4
|
+
|
|
5
|
+
import mcp.types as types
|
|
6
|
+
import pytest
|
|
7
|
+
|
|
8
|
+
from hud.agents.base import MCPAgent, find_content, find_reward, text_to_blocks
|
|
9
|
+
from hud.types import AgentResponse, MCPToolCall, MCPToolResult
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class DummyAgent(MCPAgent):
|
|
13
|
+
async def get_system_messages(self):
|
|
14
|
+
return [types.TextContent(text="sys", type="text")]
|
|
15
|
+
|
|
16
|
+
async def get_response(self, messages):
|
|
17
|
+
# Single step: no tool calls -> done
|
|
18
|
+
return AgentResponse(content="ok", tool_calls=[], done=True)
|
|
19
|
+
|
|
20
|
+
async def format_blocks(self, blocks):
|
|
21
|
+
# Return as-is
|
|
22
|
+
return blocks
|
|
23
|
+
|
|
24
|
+
async def format_tool_results(self, tool_calls, tool_results):
|
|
25
|
+
return [types.TextContent(text="tools", type="text")]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@pytest.mark.asyncio
|
|
29
|
+
async def test_run_with_string_prompt_auto_client(monkeypatch):
|
|
30
|
+
# Fake MCPClient with required methods
|
|
31
|
+
fake_client = mock.AsyncMock()
|
|
32
|
+
fake_client.initialize.return_value = None
|
|
33
|
+
fake_client.list_tools.return_value = []
|
|
34
|
+
fake_client.shutdown.return_value = None
|
|
35
|
+
|
|
36
|
+
# Patch MCPClient construction inside initialize()
|
|
37
|
+
with mock.patch("hud.clients.MCPClient", return_value=fake_client):
|
|
38
|
+
agent = DummyAgent(mcp_client=fake_client, auto_trace=False)
|
|
39
|
+
result = await agent.run("hello", max_steps=1)
|
|
40
|
+
assert result.done is True and result.isError is False
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def test_find_reward_and_content_extractors():
|
|
44
|
+
# Structured content
|
|
45
|
+
r = MCPToolResult(
|
|
46
|
+
content=text_to_blocks("{}"), isError=False, structuredContent={"reward": 0.7}
|
|
47
|
+
)
|
|
48
|
+
assert find_reward(r) == 0.7
|
|
49
|
+
|
|
50
|
+
# Text JSON
|
|
51
|
+
r2 = MCPToolResult(content=text_to_blocks('{"score": 0.5, "content": "hi"}'), isError=False)
|
|
52
|
+
assert find_reward(r2) == 0.5
|
|
53
|
+
assert find_content(r2) == "hi"
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@pytest.mark.asyncio
|
|
57
|
+
async def test_call_tools_error_paths():
|
|
58
|
+
fake_client = mock.AsyncMock()
|
|
59
|
+
# First call succeeds
|
|
60
|
+
ok_result = MCPToolResult(content=text_to_blocks("ok"), isError=False)
|
|
61
|
+
fake_client.call_tool.side_effect = [ok_result, RuntimeError("boom")]
|
|
62
|
+
agent = DummyAgent(mcp_client=fake_client, auto_trace=False)
|
|
63
|
+
results = await agent.call_tools(
|
|
64
|
+
[MCPToolCall(name="a", arguments={}), MCPToolCall(name="b", arguments={})]
|
|
65
|
+
)
|
|
66
|
+
assert results[0].isError is False
|
|
67
|
+
assert results[1].isError is True
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
@pytest.mark.asyncio
|
|
71
|
+
async def test_initialize_without_client_raises_valueerror():
|
|
72
|
+
agent = DummyAgent(mcp_client=None, auto_trace=False)
|
|
73
|
+
with pytest.raises(ValueError):
|
|
74
|
+
await agent.initialize(None)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def test_get_available_tools_before_initialize_raises():
|
|
78
|
+
agent = DummyAgent(mcp_client=mock.AsyncMock(), auto_trace=False)
|
|
79
|
+
with pytest.raises(RuntimeError):
|
|
80
|
+
agent.get_available_tools()
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
@pytest.mark.asyncio
|
|
84
|
+
async def test_format_message_invalid_type_raises():
|
|
85
|
+
agent = DummyAgent(mcp_client=mock.AsyncMock(), auto_trace=False)
|
|
86
|
+
with pytest.raises(ValueError):
|
|
87
|
+
await agent.format_message({"oops": 1}) # type: ignore
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
@pytest.mark.asyncio
|
|
91
|
+
async def test_call_tools_timeout_error_shutdown_called():
|
|
92
|
+
fake_client = mock.AsyncMock()
|
|
93
|
+
fake_client.call_tool.side_effect = TimeoutError("timeout")
|
|
94
|
+
fake_client.shutdown.return_value = None
|
|
95
|
+
agent = DummyAgent(mcp_client=fake_client, auto_trace=False)
|
|
96
|
+
with pytest.raises(TimeoutError):
|
|
97
|
+
await agent.call_tools(MCPToolCall(name="x", arguments={}))
|
|
98
|
+
fake_client.shutdown.assert_awaited_once()
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def test_text_to_blocks_shapes():
|
|
102
|
+
blocks = text_to_blocks("x")
|
|
103
|
+
assert isinstance(blocks, list) and blocks and isinstance(blocks[0], types.TextContent)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
@pytest.mark.asyncio
|
|
107
|
+
async def test_run_returns_connection_error_trace(monkeypatch):
|
|
108
|
+
fake_client = mock.AsyncMock()
|
|
109
|
+
fake_client.mcp_config = {}
|
|
110
|
+
fake_client.initialize.side_effect = RuntimeError("Connection refused http://localhost:1234")
|
|
111
|
+
fake_client.list_tools.return_value = []
|
|
112
|
+
fake_client.shutdown.return_value = None
|
|
113
|
+
|
|
114
|
+
class DummyCM:
|
|
115
|
+
def __exit__(self, *args, **kwargs):
|
|
116
|
+
return False
|
|
117
|
+
|
|
118
|
+
monkeypatch.setattr("hud.utils.mcp.setup_hud_telemetry", lambda *args, **kwargs: DummyCM())
|
|
119
|
+
|
|
120
|
+
agent = DummyAgent(mcp_client=fake_client, auto_trace=False)
|
|
121
|
+
result = await agent.run("p", max_steps=1)
|
|
122
|
+
assert result.isError is True
|
|
123
|
+
assert "Could not connect" in (result.content or "")
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
@pytest.mark.asyncio
|
|
127
|
+
async def test_run_calls_response_tool_when_configured(monkeypatch):
|
|
128
|
+
fake_client = mock.AsyncMock()
|
|
129
|
+
fake_client.mcp_config = {}
|
|
130
|
+
fake_client.initialize.return_value = None
|
|
131
|
+
fake_client.list_tools.return_value = []
|
|
132
|
+
fake_client.shutdown.return_value = None
|
|
133
|
+
ok = MCPToolResult(content=text_to_blocks("ok"), isError=False)
|
|
134
|
+
fake_client.call_tool.return_value = ok
|
|
135
|
+
|
|
136
|
+
class DummyCM:
|
|
137
|
+
def __exit__(self, *args, **kwargs):
|
|
138
|
+
return False
|
|
139
|
+
|
|
140
|
+
monkeypatch.setattr("hud.utils.mcp.setup_hud_telemetry", lambda *args, **kwargs: DummyCM())
|
|
141
|
+
|
|
142
|
+
agent = DummyAgent(mcp_client=fake_client, auto_trace=False, response_tool_name="submit")
|
|
143
|
+
result = await agent.run("hello", max_steps=1)
|
|
144
|
+
assert result.isError is False
|
|
145
|
+
fake_client.call_tool.assert_awaited()
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
@pytest.mark.asyncio
|
|
149
|
+
async def test_get_available_tools_after_initialize(monkeypatch):
|
|
150
|
+
fake_client = mock.AsyncMock()
|
|
151
|
+
fake_client.mcp_config = {}
|
|
152
|
+
fake_client.initialize.return_value = None
|
|
153
|
+
fake_client.list_tools.return_value = []
|
|
154
|
+
fake_client.shutdown.return_value = None
|
|
155
|
+
|
|
156
|
+
class DummyCM:
|
|
157
|
+
def __exit__(self, *args, **kwargs):
|
|
158
|
+
return False
|
|
159
|
+
|
|
160
|
+
monkeypatch.setattr("hud.utils.mcp.setup_hud_telemetry", lambda *args, **kwargs: DummyCM())
|
|
161
|
+
|
|
162
|
+
agent = DummyAgent(mcp_client=fake_client, auto_trace=False)
|
|
163
|
+
await agent.initialize(None)
|
|
164
|
+
assert agent.get_available_tools() == []
|