hud-python 0.4.51__tar.gz → 0.4.53__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- {hud_python-0.4.51 → hud_python-0.4.53}/PKG-INFO +48 -48
- {hud_python-0.4.51 → hud_python-0.4.53}/README.md +46 -47
- {hud_python-0.4.51 → hud_python-0.4.53}/environments/blank/README.md +9 -2
- {hud_python-0.4.51 → hud_python-0.4.53}/environments/blank/server/pyproject.toml +1 -1
- {hud_python-0.4.51 → hud_python-0.4.53}/environments/browser/server/pyproject.toml +1 -1
- {hud_python-0.4.51 → hud_python-0.4.53}/environments/deepresearch/server/pyproject.toml +1 -1
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/__init__.py +13 -1
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/agents/base.py +14 -3
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/agents/lite_llm.py +1 -1
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/agents/openai_chat_generic.py +15 -3
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/agents/tests/test_base.py +9 -2
- hud_python-0.4.53/hud/agents/tests/test_base_runtime.py +164 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/__init__.py +18 -25
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/build.py +35 -27
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/dev.py +11 -29
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/eval.py +114 -145
- hud_python-0.4.53/hud/cli/tests/test_analyze_module.py +120 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/tests/test_build.py +26 -3
- hud_python-0.4.53/hud/cli/tests/test_build_failure.py +41 -0
- hud_python-0.4.53/hud/cli/tests/test_build_module.py +50 -0
- hud_python-0.4.53/hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud_python-0.4.53/hud/cli/tests/test_cli_root.py +134 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/tests/test_eval.py +4 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/tests/test_mcp_server.py +8 -7
- hud_python-0.4.53/hud/cli/tests/test_push_happy.py +74 -0
- hud_python-0.4.53/hud/cli/tests/test_push_wrapper.py +23 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/utils/docker.py +120 -1
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/utils/runner.py +1 -1
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/utils/tasks.py +4 -1
- hud_python-0.4.53/hud/cli/utils/tests/test_config.py +58 -0
- hud_python-0.4.53/hud/cli/utils/tests/test_docker.py +93 -0
- hud_python-0.4.53/hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud_python-0.4.53/hud/cli/utils/tests/test_env_check.py +74 -0
- hud_python-0.4.53/hud/cli/utils/tests/test_environment.py +42 -0
- hud_python-0.4.53/hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud_python-0.4.53/hud/cli/utils/tests/test_local_runner.py +50 -0
- hud_python-0.4.53/hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud_python-0.4.53/hud/cli/utils/tests/test_metadata.py +49 -0
- hud_python-0.4.53/hud/cli/utils/tests/test_package_runner.py +35 -0
- hud_python-0.4.53/hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud_python-0.4.53/hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud_python-0.4.53/hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud_python-0.4.53/hud/cli/utils/tests/test_source_hash.py +36 -0
- hud_python-0.4.53/hud/cli/utils/tests/test_tasks.py +80 -0
- hud_python-0.4.53/hud/cli/utils/version_check.py +257 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/clients/base.py +1 -1
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/clients/mcp_use.py +3 -1
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/datasets/parallel.py +2 -2
- hud_python-0.4.53/hud/datasets/runner.py +184 -0
- hud_python-0.4.53/hud/datasets/tests/test_runner.py +106 -0
- hud_python-0.4.53/hud/datasets/tests/test_utils.py +228 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/otel/config.py +8 -6
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/otel/context.py +4 -4
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/otel/exporters.py +231 -57
- hud_python-0.4.53/hud/otel/tests/test_instrumentation.py +207 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/rl/learner.py +1 -1
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/server/tests/test_server_extra.py +2 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/shared/exceptions.py +35 -9
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/shared/hints.py +25 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/shared/requests.py +15 -3
- hud_python-0.4.53/hud/shared/tests/__init__.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/shared/tests/test_exceptions.py +39 -30
- hud_python-0.4.53/hud/shared/tests/test_hints.py +167 -0
- hud_python-0.4.53/hud/telemetry/__init__.py +50 -0
- hud_python-0.4.53/hud/telemetry/async_context.py +331 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/telemetry/job.py +51 -12
- hud_python-0.4.53/hud/telemetry/tests/__init__.py +0 -0
- hud_python-0.4.53/hud/telemetry/tests/test_async_context.py +242 -0
- hud_python-0.4.53/hud/telemetry/tests/test_instrument.py +414 -0
- hud_python-0.4.53/hud/telemetry/tests/test_job.py +609 -0
- hud_python-0.4.53/hud/telemetry/tests/test_trace.py +241 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/telemetry/trace.py +16 -17
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/computer/qwen.py +4 -1
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/computer/settings.py +2 -2
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/executors/base.py +4 -2
- hud_python-0.4.53/hud/tools/tests/test_submit.py +85 -0
- hud_python-0.4.53/hud/tools/tests/test_types.py +193 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/types.py +7 -1
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/utils/agent_factories.py +1 -3
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/utils/mcp.py +1 -1
- hud_python-0.4.53/hud/utils/task_tracking.py +223 -0
- hud_python-0.4.53/hud/utils/tests/__init__.py +0 -0
- hud_python-0.4.53/hud/utils/tests/test_agent_factories.py +60 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/utils/tests/test_mcp.py +4 -6
- hud_python-0.4.53/hud/utils/tests/test_pretty_errors.py +186 -0
- hud_python-0.4.53/hud/utils/tests/test_tasks.py +187 -0
- hud_python-0.4.53/hud/utils/tests/test_tool_shorthand.py +154 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/utils/tests/test_version.py +1 -1
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/version.py +1 -1
- {hud_python-0.4.51 → hud_python-0.4.53}/pyproject.toml +17 -3
- hud_python-0.4.51/hud/datasets/runner.py +0 -123
- hud_python-0.4.51/hud/otel/tests/__init__.py +0 -1
- hud_python-0.4.51/hud/telemetry/__init__.py +0 -26
- hud_python-0.4.51/hud/telemetry/tests/test_trace.py +0 -63
- {hud_python-0.4.51 → hud_python-0.4.53}/.gitignore +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/LICENSE +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/environments/README.md +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/environments/blank/environment/README.md +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/environments/blank/environment/pyproject.toml +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/environments/blank/server/README.md +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/environments/browser/README.md +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/environments/browser/environment/2048/README.md +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/environments/browser/environment/2048/backend/pyproject.toml +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/environments/browser/environment/README.md +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/environments/browser/environment/pyproject.toml +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/environments/browser/environment/todo/README.md +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/environments/browser/environment/todo/backend/pyproject.toml +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/environments/browser/pyproject.toml +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/environments/deepresearch/README.md +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/environments/deepresearch/environment/pyproject.toml +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/environments/deepresearch/pyproject.toml +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/environments/remote_browser/README.md +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/environments/remote_browser/pyproject.toml +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/environments/remote_browser/src/hud_controller/providers/README.md +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/environments/text_2048/README.md +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/environments/text_2048/pyproject.toml +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/examples/README.md +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/__main__.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/agents/__init__.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/agents/claude.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/agents/grounded_openai.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/agents/langchain.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/agents/misc/__init__.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/agents/misc/integration_test_agent.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/agents/misc/response_agent.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/agents/openai.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/agents/tests/__init__.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/agents/tests/test_claude.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/agents/tests/test_client.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/agents/tests/test_grounded_openai_agent.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/agents/tests/test_openai.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/__main__.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/analyze.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/clone.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/debug.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/flows/__init__.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/flows/tasks.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/get.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/init.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/list_func.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/pull.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/push.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/remove.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/rl/__init__.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/rl/celebrate.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/rl/config.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/rl/display.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/rl/gpu.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/rl/gpu_utils.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/rl/local_runner.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/rl/presets.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/rl/remote_runner.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/rl/rl_api.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/rl/viewer.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/rl/vllm.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/rl/wait_utils.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/tests/__init__.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/tests/test_analyze.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/tests/test_analyze_metadata.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/tests/test_cli_init.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/tests/test_cli_main.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/tests/test_clone.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/tests/test_cursor.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/tests/test_debug.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/tests/test_list_func.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/tests/test_main_module.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/tests/test_pull.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/tests/test_push.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/tests/test_registry.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/tests/test_utils.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/utils/__init__.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/utils/config.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/utils/cursor.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/utils/env_check.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/utils/environment.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/utils/interactive.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/utils/local_runner.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/utils/logging.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/utils/metadata.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/utils/package_runner.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/utils/registry.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/utils/remote_runner.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/utils/server.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/utils/source_hash.py +0 -0
- {hud_python-0.4.51/hud/shared → hud_python-0.4.53/hud/cli/utils}/tests/__init__.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/clients/README.md +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/clients/__init__.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/clients/fastmcp.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/clients/tests/__init__.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/clients/tests/test_client_integration.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/clients/tests/test_fastmcp.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/clients/tests/test_mcp_use_retry.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/clients/tests/test_protocol.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/clients/utils/__init__.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/clients/utils/mcp_use_retry.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/clients/utils/retry.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/clients/utils/retry_transport.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/datasets/__init__.py +0 -0
- {hud_python-0.4.51/hud/telemetry → hud_python-0.4.53/hud/datasets}/tests/__init__.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/datasets/utils.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/misc/__init__.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/misc/claude_plays_pokemon.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/native/__init__.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/native/comparator.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/native/tests/__init__.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/native/tests/test_comparator.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/native/tests/test_native_init.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/otel/__init__.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/otel/collector.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/otel/instrumentation.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/otel/processors.py +0 -0
- {hud_python-0.4.51/hud/utils → hud_python-0.4.53/hud/otel}/tests/__init__.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/otel/tests/test_processors.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/py.typed +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/rl/README.md +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/rl/__init__.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/rl/actor.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/rl/buffer.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/rl/chat_template.jinja +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/rl/config.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/rl/distributed.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/rl/tests/__init__.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/rl/tests/test_learner.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/rl/train.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/rl/types.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/rl/utils/start_vllm_server.sh +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/rl/utils.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/rl/vllm_adapter.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/samples/__init__.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/samples/browser.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/server/__init__.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/server/context.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/server/helper/__init__.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/server/low_level.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/server/router.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/server/server.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/server/tests/__init__.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/server/tests/test_add_tool.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/server/tests/test_context.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/server/tests/test_mcp_server_handlers.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/server/tests/test_mcp_server_integration.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/server/tests/test_mcp_server_more.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/server/tests/test_run_wrapper.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/server/tests/test_sigterm_runner.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/settings.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/shared/__init__.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/shared/tests/test_requests.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/telemetry/instrument.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/telemetry/replay.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/telemetry/tests/test_replay.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/__init__.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/base.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/bash.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/computer/__init__.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/computer/anthropic.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/computer/hud.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/computer/openai.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/edit.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/executors/__init__.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/executors/pyautogui.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/executors/tests/__init__.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/executors/tests/test_base_executor.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/executors/xdo.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/grounding/__init__.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/grounding/config.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/grounding/grounded_tool.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/grounding/grounder.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/grounding/tests/__init__.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/grounding/tests/test_grounded_tool.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/playwright.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/response.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/submit.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/tests/__init__.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/tests/test_base.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/tests/test_bash.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/tests/test_bash_extended.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/tests/test_computer.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/tests/test_computer_actions.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/tests/test_edit.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/tests/test_init.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/tests/test_playwright_tool.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/tests/test_response.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/tests/test_tools.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/tests/test_tools_init.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/tests/test_utils.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/types.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/utils.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/utils/__init__.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/utils/async_utils.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/utils/group_eval.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/utils/hud_console.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/utils/pretty_errors.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/utils/progress.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/utils/tasks.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/utils/telemetry.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/utils/tests/test_async_utils.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/utils/tests/test_init.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/utils/tests/test_progress.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/utils/tests/test_telemetry.py +0 -0
- {hud_python-0.4.51 → hud_python-0.4.53}/hud/utils/tool_shorthand.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hud-python
|
|
3
|
-
Version: 0.4.51
|
|
3
|
+
Version: 0.4.53
|
|
4
4
|
Summary: SDK for the HUD platform.
|
|
5
5
|
Project-URL: Homepage, https://github.com/hud-evals/hud-python
|
|
6
6
|
Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
|
|
@@ -48,6 +48,7 @@ Requires-Dist: opentelemetry-api>=1.34.1
|
|
|
48
48
|
Requires-Dist: opentelemetry-exporter-otlp-proto-http>=1.34.1
|
|
49
49
|
Requires-Dist: opentelemetry-instrumentation-mcp==0.47.0
|
|
50
50
|
Requires-Dist: opentelemetry-sdk>=1.34.1
|
|
51
|
+
Requires-Dist: packaging>=21.0
|
|
51
52
|
Requires-Dist: pathspec>=0.12.1
|
|
52
53
|
Requires-Dist: pillow>=11.1.0
|
|
53
54
|
Requires-Dist: prompt-toolkit==3.0.51
|
|
@@ -159,12 +160,12 @@ OSS RL environment + evals toolkit. Wrap software as environments, run benchmark
|
|
|
159
160
|
|
|
160
161
|
## Highlights
|
|
161
162
|
|
|
162
|
-
- 🎓 **[One-click RL](https://hud.so/models)** – Run `hud rl` to get a trained model on any environment.
|
|
163
163
|
- 🚀 **[MCP environment skeleton](https://docs.hud.so/core-concepts/mcp-protocol)** – any agent can call any environment.
|
|
164
164
|
- ⚡️ **[Live telemetry](https://hud.so)** – inspect every tool call, observation, and reward in real time.
|
|
165
165
|
- 🗂️ **[Public benchmarks](https://hud.so/leaderboards)** – OSWorld-Verified, SheetBench-50, and more.
|
|
166
166
|
- 🌐 **[Cloud browsers](environments/remote_browser/)** – AnchorBrowser, Steel, BrowserBase integrations for browser automation.
|
|
167
167
|
- 🛠️ **[Hot-reload dev loop](environments/README.md#phase-5-hot-reload-development-with-cursor-agent)** – `hud dev` for iterating on environments without rebuilds.
|
|
168
|
+
- 🎓 **[One-click RL](https://hud.so/models)** – Run `hud rl` to get a trained model on any environment.
|
|
168
169
|
|
|
169
170
|
> We welcome contributors and feature requests – open an issue or hop on a call to discuss improvements!
|
|
170
171
|
|
|
@@ -185,29 +186,6 @@ uv tool install hud-python
|
|
|
185
186
|
Before starting, get your HUD_API_KEY at [hud.so](https://hud.so).
|
|
186
187
|
|
|
187
188
|
|
|
188
|
-
## Quickstart: Training
|
|
189
|
-
|
|
190
|
-
RL using GRPO a Qwen2.5-VL model on any hud dataset:
|
|
191
|
-
|
|
192
|
-
```bash
|
|
193
|
-
hud get hud-evals/basic-2048 # from HF
|
|
194
|
-
hud rl basic-2048.json
|
|
195
|
-
```
|
|
196
|
-
|
|
197
|
-
> See [agent training docs](https://docs.hud.so/train-agents/quickstart)
|
|
198
|
-
|
|
199
|
-
Or make your own environment and dataset:
|
|
200
|
-
|
|
201
|
-
```bash
|
|
202
|
-
hud init my-env && cd my-env
|
|
203
|
-
hud dev --interactive
|
|
204
|
-
# When ready to run:
|
|
205
|
-
hud rl
|
|
206
|
-
```
|
|
207
|
-
|
|
208
|
-
> See [environment design docs](https://docs.hud.so/build-environments)
|
|
209
|
-
|
|
210
|
-
|
|
211
189
|
## Quickstart: Evals
|
|
212
190
|
|
|
213
191
|
For a tutorial that explains the agent and evaluation design, run:
|
|
@@ -264,38 +242,27 @@ The above example let's the agent play 2048 ([See replay](https://hud.so/trace/6
|
|
|
264
242
|
|
|
265
243
|

|
|
266
244
|
|
|
267
|
-
## Reinforcement Learning with GRPO
|
|
268
|
-
|
|
269
|
-
This is a Qwen‑2.5‑VL‑3B agent training a policy on the 2048-basic browser environment:
|
|
270
|
-
|
|
271
|
-

|
|
245
|
+
## Quickstart: Training
|
|
272
246
|
|
|
273
|
-
|
|
247
|
+
RL using GRPO a Qwen2.5-VL model on any hud dataset:
|
|
274
248
|
|
|
275
249
|
```bash
|
|
276
|
-
# Install CLI
|
|
277
|
-
uv tool install hud-python
|
|
278
|
-
|
|
279
|
-
# Option A: Run directly from a HuggingFace dataset
|
|
280
|
-
hud rl hud-evals/basic-2048
|
|
281
|
-
|
|
282
|
-
# Option B: Download first, modify, then train
|
|
283
|
-
hud get hud-evals/basic-2048
|
|
250
|
+
hud get hud-evals/basic-2048 # from HF
|
|
284
251
|
hud rl basic-2048.json
|
|
285
|
-
|
|
286
|
-
# Optional: baseline evaluation
|
|
287
|
-
hud eval basic-2048.json
|
|
288
252
|
```
|
|
289
253
|
|
|
290
|
-
|
|
291
|
-
- Language‑only models (e.g., `Qwen/Qwen2.5-7B-Instruct`)
|
|
292
|
-
- Vision‑Language models (e.g., `Qwen/Qwen2.5-VL-3B-Instruct`)
|
|
254
|
+
> See [agent training docs](https://docs.hud.so/train-agents/quickstart)
|
|
293
255
|
|
|
294
|
-
|
|
256
|
+
Or make your own environment and dataset:
|
|
295
257
|
|
|
296
|
-
|
|
258
|
+
```bash
|
|
259
|
+
hud init my-env && cd my-env
|
|
260
|
+
hud dev --interactive
|
|
261
|
+
# When ready to run:
|
|
262
|
+
hud rl
|
|
263
|
+
```
|
|
297
264
|
|
|
298
|
-
|
|
265
|
+
> See [environment design docs](https://docs.hud.so/build-environments)
|
|
299
266
|
|
|
300
267
|
## Benchmarking Agents
|
|
301
268
|
|
|
@@ -459,6 +426,39 @@ We highly suggest running 3-5 evaluations per dataset for the most consistent re
|
|
|
459
426
|
|
|
460
427
|
Using the [`run_dataset`](https://docs.hud.so/reference/tasks#run_dataset) function with a HuggingFace dataset automatically assigns your job to that leaderboard page, and allows you to create a scorecard out of it:
|
|
461
428
|
|
|
429
|
+
## Reinforcement Learning with GRPO
|
|
430
|
+
|
|
431
|
+
This is a Qwen‑2.5‑VL‑3B agent training a policy on the 2048-basic browser environment:
|
|
432
|
+
|
|
433
|
+

|
|
434
|
+
|
|
435
|
+
Train with the new interactive `hud rl` flow:
|
|
436
|
+
|
|
437
|
+
```bash
|
|
438
|
+
# Install CLI
|
|
439
|
+
uv tool install hud-python
|
|
440
|
+
|
|
441
|
+
# Option A: Run directly from a HuggingFace dataset
|
|
442
|
+
hud rl hud-evals/basic-2048
|
|
443
|
+
|
|
444
|
+
# Option B: Download first, modify, then train
|
|
445
|
+
hud get hud-evals/basic-2048
|
|
446
|
+
hud rl basic-2048.json
|
|
447
|
+
|
|
448
|
+
# Optional: baseline evaluation
|
|
449
|
+
hud eval basic-2048.json
|
|
450
|
+
```
|
|
451
|
+
|
|
452
|
+
Supports multi‑turn RL for both:
|
|
453
|
+
- Language‑only models (e.g., `Qwen/Qwen2.5-7B-Instruct`)
|
|
454
|
+
- Vision‑Language models (e.g., `Qwen/Qwen2.5-VL-3B-Instruct`)
|
|
455
|
+
|
|
456
|
+
By default, `hud rl` provisions a persistent server and trainer in the cloud, streams telemetry to `hud.so`, and lets you monitor/manage models at `hud.so/models`. Use `--local` to run entirely on your machines (typically 2+ GPUs: one for vLLM, the rest for training).
|
|
457
|
+
|
|
458
|
+
Any HUD MCP environment and evaluation works with our RL pipeline (including remote configurations). See the guided docs: `https://docs.hud.so/train-agents/quickstart`.
|
|
459
|
+
|
|
460
|
+
Pricing: Hosted vLLM and training GPU rates are listed in the [Training Quickstart → Pricing](https://docs.hud.so/train-agents/quickstart#pricing). Manage billing at the [HUD billing dashboard](https://hud.so/project/billing).
|
|
461
|
+
|
|
462
462
|
## Architecture
|
|
463
463
|
|
|
464
464
|
```mermaid
|
|
@@ -22,12 +22,12 @@ OSS RL environment + evals toolkit. Wrap software as environments, run benchmark
|
|
|
22
22
|
|
|
23
23
|
## Highlights
|
|
24
24
|
|
|
25
|
-
- 🎓 **[One-click RL](https://hud.so/models)** – Run `hud rl` to get a trained model on any environment.
|
|
26
25
|
- 🚀 **[MCP environment skeleton](https://docs.hud.so/core-concepts/mcp-protocol)** – any agent can call any environment.
|
|
27
26
|
- ⚡️ **[Live telemetry](https://hud.so)** – inspect every tool call, observation, and reward in real time.
|
|
28
27
|
- 🗂️ **[Public benchmarks](https://hud.so/leaderboards)** – OSWorld-Verified, SheetBench-50, and more.
|
|
29
28
|
- 🌐 **[Cloud browsers](environments/remote_browser/)** – AnchorBrowser, Steel, BrowserBase integrations for browser automation.
|
|
30
29
|
- 🛠️ **[Hot-reload dev loop](environments/README.md#phase-5-hot-reload-development-with-cursor-agent)** – `hud dev` for iterating on environments without rebuilds.
|
|
30
|
+
- 🎓 **[One-click RL](https://hud.so/models)** – Run `hud rl` to get a trained model on any environment.
|
|
31
31
|
|
|
32
32
|
> We welcome contributors and feature requests – open an issue or hop on a call to discuss improvements!
|
|
33
33
|
|
|
@@ -48,29 +48,6 @@ uv tool install hud-python
|
|
|
48
48
|
Before starting, get your HUD_API_KEY at [hud.so](https://hud.so).
|
|
49
49
|
|
|
50
50
|
|
|
51
|
-
## Quickstart: Training
|
|
52
|
-
|
|
53
|
-
RL using GRPO a Qwen2.5-VL model on any hud dataset:
|
|
54
|
-
|
|
55
|
-
```bash
|
|
56
|
-
hud get hud-evals/basic-2048 # from HF
|
|
57
|
-
hud rl basic-2048.json
|
|
58
|
-
```
|
|
59
|
-
|
|
60
|
-
> See [agent training docs](https://docs.hud.so/train-agents/quickstart)
|
|
61
|
-
|
|
62
|
-
Or make your own environment and dataset:
|
|
63
|
-
|
|
64
|
-
```bash
|
|
65
|
-
hud init my-env && cd my-env
|
|
66
|
-
hud dev --interactive
|
|
67
|
-
# When ready to run:
|
|
68
|
-
hud rl
|
|
69
|
-
```
|
|
70
|
-
|
|
71
|
-
> See [environment design docs](https://docs.hud.so/build-environments)
|
|
72
|
-
|
|
73
|
-
|
|
74
51
|
## Quickstart: Evals
|
|
75
52
|
|
|
76
53
|
For a tutorial that explains the agent and evaluation design, run:
|
|
@@ -127,38 +104,27 @@ The above example lets the agent play 2048 ([See replay](https://hud.so/trace/6
|
|
|
127
104
|
|
|
128
105
|

|
|
129
106
|
|
|
130
|
-
##
|
|
131
|
-
|
|
132
|
-
This is a Qwen‑2.5‑VL‑3B agent training a policy on the 2048-basic browser environment:
|
|
133
|
-
|
|
134
|
-

|
|
107
|
+
## Quickstart: Training
|
|
135
108
|
|
|
136
|
-
|
|
109
|
+
Train a Qwen2.5-VL model with RL (GRPO) on any hud dataset:
|
|
137
110
|
|
|
138
111
|
```bash
|
|
139
|
-
#
|
|
140
|
-
uv tool install hud-python
|
|
141
|
-
|
|
142
|
-
# Option A: Run directly from a HuggingFace dataset
|
|
143
|
-
hud rl hud-evals/basic-2048
|
|
144
|
-
|
|
145
|
-
# Option B: Download first, modify, then train
|
|
146
|
-
hud get hud-evals/basic-2048
|
|
112
|
+
hud get hud-evals/basic-2048 # from HF
|
|
147
113
|
hud rl basic-2048.json
|
|
148
|
-
|
|
149
|
-
# Optional: baseline evaluation
|
|
150
|
-
hud eval basic-2048.json
|
|
151
114
|
```
|
|
152
115
|
|
|
153
|
-
|
|
154
|
-
- Language‑only models (e.g., `Qwen/Qwen2.5-7B-Instruct`)
|
|
155
|
-
- Vision‑Language models (e.g., `Qwen/Qwen2.5-VL-3B-Instruct`)
|
|
116
|
+
> See [agent training docs](https://docs.hud.so/train-agents/quickstart)
|
|
156
117
|
|
|
157
|
-
|
|
118
|
+
Or make your own environment and dataset:
|
|
158
119
|
|
|
159
|
-
|
|
120
|
+
```bash
|
|
121
|
+
hud init my-env && cd my-env
|
|
122
|
+
hud dev --interactive
|
|
123
|
+
# When ready to run:
|
|
124
|
+
hud rl
|
|
125
|
+
```
|
|
160
126
|
|
|
161
|
-
|
|
127
|
+
> See [environment design docs](https://docs.hud.so/build-environments)
|
|
162
128
|
|
|
163
129
|
## Benchmarking Agents
|
|
164
130
|
|
|
@@ -322,6 +288,39 @@ We highly suggest running 3-5 evaluations per dataset for the most consistent re
|
|
|
322
288
|
|
|
323
289
|
Using the [`run_dataset`](https://docs.hud.so/reference/tasks#run_dataset) function with a HuggingFace dataset automatically assigns your job to that leaderboard page, and allows you to create a scorecard out of it:
|
|
324
290
|
|
|
291
|
+
## Reinforcement Learning with GRPO
|
|
292
|
+
|
|
293
|
+
This is a Qwen‑2.5‑VL‑3B agent training a policy on the 2048-basic browser environment:
|
|
294
|
+
|
|
295
|
+

|
|
296
|
+
|
|
297
|
+
Train with the new interactive `hud rl` flow:
|
|
298
|
+
|
|
299
|
+
```bash
|
|
300
|
+
# Install CLI
|
|
301
|
+
uv tool install hud-python
|
|
302
|
+
|
|
303
|
+
# Option A: Run directly from a HuggingFace dataset
|
|
304
|
+
hud rl hud-evals/basic-2048
|
|
305
|
+
|
|
306
|
+
# Option B: Download first, modify, then train
|
|
307
|
+
hud get hud-evals/basic-2048
|
|
308
|
+
hud rl basic-2048.json
|
|
309
|
+
|
|
310
|
+
# Optional: baseline evaluation
|
|
311
|
+
hud eval basic-2048.json
|
|
312
|
+
```
|
|
313
|
+
|
|
314
|
+
Supports multi‑turn RL for both:
|
|
315
|
+
- Language‑only models (e.g., `Qwen/Qwen2.5-7B-Instruct`)
|
|
316
|
+
- Vision‑Language models (e.g., `Qwen/Qwen2.5-VL-3B-Instruct`)
|
|
317
|
+
|
|
318
|
+
By default, `hud rl` provisions a persistent server and trainer in the cloud, streams telemetry to `hud.so`, and lets you monitor/manage models at `hud.so/models`. Use `--local` to run entirely on your machines (typically 2+ GPUs: one for vLLM, the rest for training).
|
|
319
|
+
|
|
320
|
+
Any HUD MCP environment and evaluation works with our RL pipeline (including remote configurations). See the guided docs: `https://docs.hud.so/train-agents/quickstart`.
|
|
321
|
+
|
|
322
|
+
Pricing: Hosted vLLM and training GPU rates are listed in the [Training Quickstart → Pricing](https://docs.hud.so/train-agents/quickstart#pricing). Manage billing at the [HUD billing dashboard](https://hud.so/project/billing).
|
|
323
|
+
|
|
325
324
|
## Architecture
|
|
326
325
|
|
|
327
326
|
```mermaid
|
|
@@ -6,10 +6,12 @@ See [docs](https://docs.hud.so/build-environments) for the complete environment
|
|
|
6
6
|
## Architecture
|
|
7
7
|
|
|
8
8
|
**`environment/`** - Produces structured data
|
|
9
|
+
|
|
9
10
|
- Owns all state (game logic, browser sessions, databases, etc.)
|
|
10
11
|
- Exposes HTTP endpoints `/health`, `/act`, `/reset`, `/state` that return structured information about the environment state
|
|
11
12
|
|
|
12
13
|
**`server/`** - Wraps data in MCP tools
|
|
14
|
+
|
|
13
15
|
- Calls environment endpoints to get structured data for the agent, and environment setup/evaluation
|
|
14
16
|
- Agents and tasks interact only with these tools!
|
|
15
17
|
|
|
@@ -33,12 +35,14 @@ Visit http://localhost:8765/docs to see the new tool appear instantly.
|
|
|
33
35
|
In general, we recommend starting work on the environment backend first, then developing the MCP server to expose the right things to the agent.
|
|
34
36
|
|
|
35
37
|
For complex environments that require many dependencies, we recommend running `hud dev` in the environment root:
|
|
38
|
+
|
|
36
39
|
```bash
|
|
37
40
|
cd ..
|
|
38
41
|
hud dev
|
|
39
42
|
```
|
|
40
43
|
|
|
41
44
|
## Tasks & Evaluation
|
|
45
|
+
|
|
42
46
|
```bash
|
|
43
47
|
# Build first in the global folder with the Dockerfile (creates blank:0.1.0)
|
|
44
48
|
hud build
|
|
@@ -59,6 +63,7 @@ Your `tasks.json` uses `docker run` to launch the environment:
|
|
|
59
63
|
```
|
|
60
64
|
|
|
61
65
|
**Commands:**
|
|
66
|
+
|
|
62
67
|
```bash
|
|
63
68
|
# Build first
|
|
64
69
|
hud build
|
|
@@ -78,6 +83,7 @@ hud rl tasks.json # Auto-converts docker→remote, builds & pushes if needed
|
|
|
78
83
|
Once your environment is ready, you can share it with the community:
|
|
79
84
|
|
|
80
85
|
### 1. Push to Registry
|
|
86
|
+
|
|
81
87
|
```bash
|
|
82
88
|
# Build and push your environment (requires docker hub login and hud api key)
|
|
83
89
|
hud build
|
|
@@ -89,10 +95,12 @@ hud push
|
|
|
89
95
|
Create a dataset on HuggingFace with your tasks:
|
|
90
96
|
|
|
91
97
|
**Option A: Upload manually**
|
|
98
|
+
|
|
92
99
|
1. Upload your `tasks.json` to HuggingFace
|
|
93
100
|
2. Make sure it's **public** to appear on leaderboards
|
|
94
101
|
|
|
95
102
|
**Option B: Use the SDK**
|
|
103
|
+
|
|
96
104
|
```python
|
|
97
105
|
from hud.datasets import save_tasks
|
|
98
106
|
import json
|
|
@@ -109,7 +117,7 @@ save_tasks(tasks, repo_id="your-org/your-dataset")
|
|
|
109
117
|
|
|
110
118
|
```bash
|
|
111
119
|
# Run Claude on your benchmark
|
|
112
|
-
hud eval "your-org/your-dataset"
|
|
120
|
+
hud eval "your-org/your-dataset" claude
|
|
113
121
|
|
|
114
122
|
# View results at:
|
|
115
123
|
# hud.so/leaderboards/your-org/your-dataset
|
|
@@ -118,4 +126,3 @@ hud eval "your-org/your-dataset" --agent claude
|
|
|
118
126
|
**Note**: Only public HuggingFace datasets appear as leaderboards!
|
|
119
127
|
|
|
120
128
|
📚 Learn more: [Creating Benchmarks](https://docs.hud.so/evaluate-agents/create-benchmarks) | [Leaderboards](https://docs.hud.so/evaluate-agents/leaderboards)
|
|
121
|
-
|
|
@@ -5,10 +5,22 @@ tools for building, evaluating, and training AI agents.
|
|
|
5
5
|
|
|
6
6
|
from __future__ import annotations
|
|
7
7
|
|
|
8
|
-
from .telemetry import
|
|
8
|
+
from .telemetry import (
|
|
9
|
+
Trace,
|
|
10
|
+
async_job,
|
|
11
|
+
async_trace,
|
|
12
|
+
clear_trace,
|
|
13
|
+
create_job,
|
|
14
|
+
get_trace,
|
|
15
|
+
instrument,
|
|
16
|
+
job,
|
|
17
|
+
trace,
|
|
18
|
+
)
|
|
9
19
|
|
|
10
20
|
__all__ = [
|
|
11
21
|
"Trace",
|
|
22
|
+
"async_job",
|
|
23
|
+
"async_trace",
|
|
12
24
|
"clear_trace",
|
|
13
25
|
"create_job",
|
|
14
26
|
"get_trace",
|
|
@@ -55,6 +55,7 @@ class MCPAgent(ABC):
|
|
|
55
55
|
# Filtering
|
|
56
56
|
allowed_tools: list[str] | None = None,
|
|
57
57
|
disallowed_tools: list[str] | None = None,
|
|
58
|
+
response_tool_name: str | None = None,
|
|
58
59
|
# Messages
|
|
59
60
|
system_prompt: str = GLOBAL_SYSTEM_PROMPT,
|
|
60
61
|
append_setup_output: bool = True,
|
|
@@ -74,6 +75,7 @@ class MCPAgent(ABC):
|
|
|
74
75
|
that provides `mcp_config`.
|
|
75
76
|
allowed_tools: Names of tools to allow (None means allow all).
|
|
76
77
|
disallowed_tools: Names of tools to always exclude.
|
|
78
|
+
response_tool_name: Name of the tool to use for response.
|
|
77
79
|
system_prompt: System prompt to seed the conversation.
|
|
78
80
|
append_setup_output: Whether to append setup tool output to the
|
|
79
81
|
first turn's messages.
|
|
@@ -108,7 +110,7 @@ class MCPAgent(ABC):
|
|
|
108
110
|
|
|
109
111
|
# Initialize these here so methods can be called before initialize()
|
|
110
112
|
self._tool_map: dict[str, types.Tool] = {} # Simplified: just name to tool
|
|
111
|
-
self.response_tool_name =
|
|
113
|
+
self.response_tool_name = response_tool_name
|
|
112
114
|
|
|
113
115
|
# Trace
|
|
114
116
|
self._auto_trace = auto_trace
|
|
@@ -135,7 +137,11 @@ class MCPAgent(ABC):
|
|
|
135
137
|
"No MCPClient. Please provide one when initializing the agent or pass a Task with mcp_config." # noqa: E501
|
|
136
138
|
)
|
|
137
139
|
|
|
138
|
-
|
|
140
|
+
try:
|
|
141
|
+
client_cfg = getattr(self.mcp_client, "mcp_config", None)
|
|
142
|
+
except Exception:
|
|
143
|
+
client_cfg = None
|
|
144
|
+
await self._setup_config(client_cfg)
|
|
139
145
|
|
|
140
146
|
# Initialize client if needed
|
|
141
147
|
try:
|
|
@@ -168,6 +174,8 @@ class MCPAgent(ABC):
|
|
|
168
174
|
self.disallowed_tools.extend(task.agent_config["disallowed_tools"])
|
|
169
175
|
else: # If disallowed_tools is None, we overwrite it
|
|
170
176
|
self.disallowed_tools = task.agent_config["disallowed_tools"]
|
|
177
|
+
if "response_tool_name" in task.agent_config:
|
|
178
|
+
self.response_tool_name = task.agent_config["response_tool_name"]
|
|
171
179
|
|
|
172
180
|
all_tools = await self.mcp_client.list_tools()
|
|
173
181
|
self._available_tools = []
|
|
@@ -614,8 +622,11 @@ class MCPAgent(ABC):
|
|
|
614
622
|
except Exception as e:
|
|
615
623
|
self.console.error_log(f"Response lifecycle tool failed: {e}")
|
|
616
624
|
|
|
617
|
-
async def _setup_config(self, mcp_config: dict[str, dict[str, Any]]) -> None:
|
|
625
|
+
async def _setup_config(self, mcp_config: dict[str, dict[str, Any]] | None) -> None:
|
|
618
626
|
"""Inject metadata into the metadata of the initialize request."""
|
|
627
|
+
if not isinstance(mcp_config, dict):
|
|
628
|
+
return
|
|
629
|
+
|
|
619
630
|
if self.metadata:
|
|
620
631
|
patch_mcp_config(
|
|
621
632
|
mcp_config,
|
|
@@ -47,7 +47,7 @@ class LiteAgent(GenericOpenAIChatAgent):
|
|
|
47
47
|
**agent_kwargs,
|
|
48
48
|
)
|
|
49
49
|
|
|
50
|
-
def get_tool_schemas(self) -> list[
|
|
50
|
+
def get_tool_schemas(self) -> list[Any]:
|
|
51
51
|
# Prefer LiteLLM's stricter transformer (handles Bedrock & friends)
|
|
52
52
|
if transform_mcp_tool_to_openai_tool is not None:
|
|
53
53
|
return [
|
|
@@ -20,6 +20,7 @@ import logging
|
|
|
20
20
|
from typing import TYPE_CHECKING, Any, ClassVar, cast
|
|
21
21
|
|
|
22
22
|
import mcp.types as types
|
|
23
|
+
from openai import AsyncOpenAI
|
|
23
24
|
|
|
24
25
|
from hud import instrument
|
|
25
26
|
from hud.types import AgentResponse, MCPToolCall, MCPToolResult
|
|
@@ -28,7 +29,6 @@ from hud.utils.hud_console import HUDConsole
|
|
|
28
29
|
from .base import MCPAgent
|
|
29
30
|
|
|
30
31
|
if TYPE_CHECKING:
|
|
31
|
-
from openai import AsyncOpenAI
|
|
32
32
|
from openai.types.chat import ChatCompletionToolParam
|
|
33
33
|
|
|
34
34
|
logger = logging.getLogger(__name__)
|
|
@@ -42,14 +42,26 @@ class GenericOpenAIChatAgent(MCPAgent):
|
|
|
42
42
|
def __init__(
|
|
43
43
|
self,
|
|
44
44
|
*,
|
|
45
|
-
openai_client: AsyncOpenAI | None,
|
|
45
|
+
openai_client: AsyncOpenAI | None = None,
|
|
46
|
+
api_key: str | None = None,
|
|
47
|
+
base_url: str | None = None,
|
|
46
48
|
model_name: str = "gpt-4o-mini",
|
|
47
49
|
completion_kwargs: dict[str, Any] | None = None,
|
|
48
50
|
**agent_kwargs: Any,
|
|
49
51
|
) -> None:
|
|
50
52
|
# Accept base-agent settings via **agent_kwargs (e.g., mcp_client, system_prompt, etc.)
|
|
51
53
|
super().__init__(**agent_kwargs)
|
|
52
|
-
|
|
54
|
+
|
|
55
|
+
# Handle client creation - support both patterns
|
|
56
|
+
if openai_client is not None:
|
|
57
|
+
# Use provided client (backward compatibility)
|
|
58
|
+
self.oai = openai_client
|
|
59
|
+
elif api_key is not None or base_url is not None:
|
|
60
|
+
# Create client from config (new pattern, consistent with other agents)
|
|
61
|
+
self.oai = AsyncOpenAI(api_key=api_key, base_url=base_url)
|
|
62
|
+
else:
|
|
63
|
+
raise ValueError("Either openai_client or (api_key and base_url) must be provided")
|
|
64
|
+
|
|
53
65
|
self.model_name = model_name
|
|
54
66
|
self.completion_kwargs: dict[str, Any] = completion_kwargs or {}
|
|
55
67
|
self.mcp_schemas = []
|
|
@@ -94,7 +94,7 @@ class TestBaseMCPAgent:
|
|
|
94
94
|
|
|
95
95
|
assert agent.mcp_client is not None
|
|
96
96
|
assert agent.allowed_tools is None
|
|
97
|
-
assert agent.disallowed_tools
|
|
97
|
+
assert agent.disallowed_tools is None
|
|
98
98
|
assert agent.initial_screenshot is True
|
|
99
99
|
assert agent.system_prompt is not None # Default system prompt is set
|
|
100
100
|
|
|
@@ -241,6 +241,13 @@ class TestBaseMCPAgent:
|
|
|
241
241
|
assert "tool2" not in tool_names # Not in allowed list
|
|
242
242
|
assert "tool3" not in tool_names # In disallowed list
|
|
243
243
|
|
|
244
|
+
# Make sure tool schemas are correct
|
|
245
|
+
schemas = agent.get_tool_schemas()
|
|
246
|
+
assert len(schemas) == 1
|
|
247
|
+
assert schemas[0]["name"] == "tool1"
|
|
248
|
+
assert schemas[0]["description"] == "Tool 1"
|
|
249
|
+
assert schemas[0]["parameters"] == {"type": "object"}
|
|
250
|
+
|
|
244
251
|
@pytest.mark.asyncio
|
|
245
252
|
async def test_call_tool_success(self):
|
|
246
253
|
"""Test successful tool call."""
|
|
@@ -334,7 +341,7 @@ class TestBaseMCPAgent:
|
|
|
334
341
|
schemas = agent.get_tool_schemas()
|
|
335
342
|
|
|
336
343
|
# Should include non-lifecycle tools
|
|
337
|
-
assert len(schemas) ==
|
|
344
|
+
assert len(schemas) == 2
|
|
338
345
|
assert schemas[0]["name"] == "tool1"
|
|
339
346
|
|
|
340
347
|
def test_get_tools_by_server(self):
|