hud-python 0.4.35__tar.gz → 0.4.37__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- {hud_python-0.4.35 → hud_python-0.4.37}/.gitignore +0 -1
- {hud_python-0.4.35 → hud_python-0.4.37}/PKG-INFO +43 -23
- {hud_python-0.4.35 → hud_python-0.4.37}/README.md +11 -11
- {hud_python-0.4.35 → hud_python-0.4.37}/environments/README.md +5 -5
- hud_python-0.4.37/environments/blank/README.md +108 -0
- hud_python-0.4.37/environments/blank/controller/README.md +16 -0
- hud_python-0.4.37/environments/blank/environment/README.md +16 -0
- hud_python-0.4.37/environments/blank/pyproject.toml +19 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/environments/browser/README.md +67 -88
- hud_python-0.4.37/environments/browser/pyproject.toml +22 -0
- hud_python-0.4.37/environments/deepresearch/pyproject.toml +19 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/agents/__init__.py +2 -0
- hud_python-0.4.37/hud/agents/lite_llm.py +72 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/agents/openai_chat_generic.py +21 -7
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/agents/tests/test_claude.py +32 -7
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/agents/tests/test_openai.py +29 -6
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/__init__.py +228 -79
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/build.py +26 -6
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/dev.py +21 -40
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/eval.py +96 -15
- hud_python-0.4.37/hud/cli/flows/tasks.py +388 -0
- hud_python-0.4.37/hud/cli/init.py +270 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/pull.py +6 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/push.py +11 -1
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/rl/__init__.py +14 -4
- hud_python-0.4.37/hud/cli/rl/celebrate.py +187 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/rl/config.py +15 -8
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/rl/local_runner.py +44 -20
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/rl/remote_runner.py +166 -87
- hud_python-0.4.37/hud/cli/rl/viewer.py +141 -0
- hud_python-0.4.37/hud/cli/rl/wait_utils.py +89 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/tests/test_build.py +3 -27
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/tests/test_mcp_server.py +1 -12
- hud_python-0.4.37/hud/cli/utils/config.py +85 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/utils/docker.py +21 -39
- hud_python-0.4.37/hud/cli/utils/env_check.py +196 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/utils/environment.py +4 -3
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/utils/interactive.py +2 -1
- hud_python-0.4.37/hud/cli/utils/local_runner.py +204 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/utils/metadata.py +3 -1
- hud_python-0.4.37/hud/cli/utils/package_runner.py +292 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/utils/remote_runner.py +4 -1
- hud_python-0.4.37/hud/cli/utils/source_hash.py +108 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/clients/base.py +1 -1
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/clients/fastmcp.py +1 -1
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/clients/mcp_use.py +30 -7
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/datasets/parallel.py +3 -1
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/datasets/runner.py +4 -1
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/otel/config.py +1 -1
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/otel/context.py +40 -6
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/rl/buffer.py +3 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/rl/tests/test_learner.py +1 -1
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/rl/vllm_adapter.py +1 -1
- hud_python-0.4.37/hud/server/server.py +471 -0
- hud_python-0.4.37/hud/server/tests/test_add_tool.py +60 -0
- hud_python-0.4.37/hud/server/tests/test_context.py +128 -0
- hud_python-0.4.37/hud/server/tests/test_mcp_server_handlers.py +44 -0
- hud_python-0.4.37/hud/server/tests/test_mcp_server_integration.py +405 -0
- hud_python-0.4.37/hud/server/tests/test_mcp_server_more.py +247 -0
- hud_python-0.4.37/hud/server/tests/test_run_wrapper.py +53 -0
- hud_python-0.4.37/hud/server/tests/test_server_extra.py +166 -0
- hud_python-0.4.37/hud/server/tests/test_sigterm_runner.py +78 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/settings.py +38 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/shared/hints.py +2 -2
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/telemetry/job.py +2 -2
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/types.py +9 -2
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/utils/tasks.py +32 -24
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/utils/tests/test_version.py +1 -1
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/version.py +1 -1
- {hud_python-0.4.35 → hud_python-0.4.37}/pyproject.toml +22 -22
- hud_python-0.4.35/environments/browser/pyproject.toml +0 -22
- hud_python-0.4.35/hud/cli/flows/tasks.py +0 -255
- hud_python-0.4.35/hud/cli/init.py +0 -677
- hud_python-0.4.35/hud/server/server.py +0 -244
- {hud_python-0.4.35 → hud_python-0.4.37}/LICENSE +0 -0
- {hud_python-0.4.35/environments/browser/apps → hud_python-0.4.37/environments/browser/environment}/2048/README.md +0 -0
- {hud_python-0.4.35/environments/browser/apps → hud_python-0.4.37/environments/browser/environment}/2048/backend/pyproject.toml +0 -0
- {hud_python-0.4.35/environments/browser/apps → hud_python-0.4.37/environments/browser/environment}/README.md +0 -0
- {hud_python-0.4.35/environments/browser/apps → hud_python-0.4.37/environments/browser/environment}/todo/README.md +0 -0
- {hud_python-0.4.35/environments/browser/apps → hud_python-0.4.37/environments/browser/environment}/todo/backend/pyproject.toml +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/environments/remote_browser/README.md +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/environments/remote_browser/pyproject.toml +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/environments/remote_browser/src/hud_controller/providers/README.md +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/environments/text_2048/README.md +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/environments/text_2048/pyproject.toml +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/examples/README.md +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/__init__.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/__main__.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/agents/base.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/agents/claude.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/agents/grounded_openai.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/agents/langchain.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/agents/misc/__init__.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/agents/misc/response_agent.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/agents/openai.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/agents/tests/__init__.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/agents/tests/test_base.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/agents/tests/test_client.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/agents/tests/test_grounded_openai_agent.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/__main__.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/analyze.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/clone.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/debug.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/flows/__init__.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/get.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/list_func.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/remove.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/rl/display.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/rl/gpu.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/rl/gpu_utils.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/rl/presets.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/rl/rl_api.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/rl/vllm.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/tests/__init__.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/tests/test_analyze.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/tests/test_analyze_metadata.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/tests/test_cli_init.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/tests/test_cli_main.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/tests/test_clone.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/tests/test_cursor.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/tests/test_debug.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/tests/test_list_func.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/tests/test_main_module.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/tests/test_pull.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/tests/test_push.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/tests/test_registry.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/tests/test_utils.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/utils/__init__.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/utils/cursor.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/utils/logging.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/utils/registry.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/utils/runner.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/utils/server.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/utils/tasks.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/clients/README.md +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/clients/__init__.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/clients/tests/__init__.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/clients/tests/test_client_integration.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/clients/tests/test_fastmcp.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/clients/tests/test_mcp_use_retry.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/clients/tests/test_protocol.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/clients/utils/__init__.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/clients/utils/mcp_use_retry.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/clients/utils/retry.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/clients/utils/retry_transport.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/datasets/__init__.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/datasets/utils.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/misc/__init__.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/misc/claude_plays_pokemon.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/native/__init__.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/native/comparator.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/native/tests/__init__.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/native/tests/test_comparator.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/native/tests/test_native_init.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/otel/__init__.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/otel/collector.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/otel/exporters.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/otel/instrumentation.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/otel/processors.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/otel/tests/__init__.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/otel/tests/test_processors.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/py.typed +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/rl/README.md +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/rl/__init__.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/rl/actor.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/rl/chat_template.jinja +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/rl/config.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/rl/distributed.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/rl/learner.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/rl/tests/__init__.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/rl/train.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/rl/types.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/rl/utils/start_vllm_server.sh +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/rl/utils.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/samples/__init__.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/samples/browser.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/server/__init__.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/server/context.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/server/helper/__init__.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/server/low_level.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/server/tests/__init__.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/shared/__init__.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/shared/exceptions.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/shared/requests.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/shared/tests/__init__.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/shared/tests/test_exceptions.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/shared/tests/test_requests.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/telemetry/__init__.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/telemetry/instrument.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/telemetry/replay.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/telemetry/tests/__init__.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/telemetry/tests/test_replay.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/telemetry/tests/test_trace.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/telemetry/trace.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/__init__.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/base.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/bash.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/computer/__init__.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/computer/anthropic.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/computer/hud.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/computer/openai.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/computer/settings.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/edit.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/executors/__init__.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/executors/base.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/executors/pyautogui.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/executors/tests/__init__.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/executors/tests/test_base_executor.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/executors/xdo.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/grounding/__init__.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/grounding/config.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/grounding/grounded_tool.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/grounding/grounder.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/grounding/tests/__init__.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/grounding/tests/test_grounded_tool.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/playwright.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/response.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/submit.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/tests/__init__.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/tests/test_base.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/tests/test_bash.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/tests/test_bash_extended.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/tests/test_computer.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/tests/test_computer_actions.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/tests/test_edit.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/tests/test_init.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/tests/test_playwright_tool.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/tests/test_response.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/tests/test_tools.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/tests/test_tools_init.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/tests/test_utils.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/types.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/utils.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/utils/__init__.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/utils/agent_factories.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/utils/async_utils.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/utils/group_eval.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/utils/hud_console.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/utils/mcp.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/utils/pretty_errors.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/utils/progress.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/utils/telemetry.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/utils/tests/__init__.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/utils/tests/test_async_utils.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/utils/tests/test_init.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/utils/tests/test_mcp.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/utils/tests/test_progress.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/utils/tests/test_telemetry.py +0 -0
- {hud_python-0.4.35 → hud_python-0.4.37}/hud/utils/tool_shorthand.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hud-python
|
|
3
|
-
Version: 0.4.35
|
|
3
|
+
Version: 0.4.37
|
|
4
4
|
Summary: SDK for the HUD platform.
|
|
5
5
|
Project-URL: Homepage, https://github.com/hud-evals/hud-python
|
|
6
6
|
Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
|
|
@@ -36,11 +36,13 @@ Classifier: Programming Language :: Python :: 3.12
|
|
|
36
36
|
Classifier: Programming Language :: Python :: 3.13
|
|
37
37
|
Requires-Python: <3.13,>=3.11
|
|
38
38
|
Requires-Dist: anthropic
|
|
39
|
+
Requires-Dist: blessed>=1.20.0
|
|
39
40
|
Requires-Dist: datasets>=2.14.0
|
|
40
41
|
Requires-Dist: httpx<1,>=0.23.0
|
|
41
42
|
Requires-Dist: hud-fastmcp-python-sdk>=0.1.2
|
|
42
43
|
Requires-Dist: hud-mcp-python-sdk>=3.13.2
|
|
43
|
-
Requires-Dist: hud-mcp-use-python-sdk
|
|
44
|
+
Requires-Dist: hud-mcp-use-python-sdk==2.3.19
|
|
45
|
+
Requires-Dist: litellm>=1.55.0
|
|
44
46
|
Requires-Dist: numpy>=1.24.0
|
|
45
47
|
Requires-Dist: openai
|
|
46
48
|
Requires-Dist: opentelemetry-api>=1.34.1
|
|
@@ -50,8 +52,8 @@ Requires-Dist: opentelemetry-sdk>=1.34.1
|
|
|
50
52
|
Requires-Dist: pathspec>=0.12.1
|
|
51
53
|
Requires-Dist: pillow>=11.1.0
|
|
52
54
|
Requires-Dist: prompt-toolkit==3.0.51
|
|
53
|
-
Requires-Dist: pydantic-settings<3,>=2
|
|
54
|
-
Requires-Dist: pydantic<3,>=2
|
|
55
|
+
Requires-Dist: pydantic-settings<3,>=2.2
|
|
56
|
+
Requires-Dist: pydantic<3,>=2.6
|
|
55
57
|
Requires-Dist: questionary==2.1.0
|
|
56
58
|
Requires-Dist: rich>=13.0.0
|
|
57
59
|
Requires-Dist: toml>=0.10.2
|
|
@@ -59,7 +61,9 @@ Requires-Dist: typer>=0.9.0
|
|
|
59
61
|
Requires-Dist: watchfiles>=0.21.0
|
|
60
62
|
Requires-Dist: wrapt>=1.14.0
|
|
61
63
|
Provides-Extra: agent
|
|
64
|
+
Requires-Dist: aiodocker>=0.24.0; extra == 'agent'
|
|
62
65
|
Requires-Dist: dotenv>=0.9.9; extra == 'agent'
|
|
66
|
+
Requires-Dist: inspect-ai>=0.3.80; extra == 'agent'
|
|
63
67
|
Requires-Dist: ipykernel; extra == 'agent'
|
|
64
68
|
Requires-Dist: ipython<9; extra == 'agent'
|
|
65
69
|
Requires-Dist: jupyter-client; extra == 'agent'
|
|
@@ -67,8 +71,21 @@ Requires-Dist: jupyter-core; extra == 'agent'
|
|
|
67
71
|
Requires-Dist: langchain; extra == 'agent'
|
|
68
72
|
Requires-Dist: langchain-anthropic; extra == 'agent'
|
|
69
73
|
Requires-Dist: langchain-openai; extra == 'agent'
|
|
74
|
+
Requires-Dist: pillow>=11.1.0; extra == 'agent'
|
|
75
|
+
Requires-Dist: playwright; extra == 'agent'
|
|
76
|
+
Requires-Dist: pyautogui>=0.9.54; extra == 'agent'
|
|
77
|
+
Requires-Dist: pyright==1.1.401; extra == 'agent'
|
|
78
|
+
Requires-Dist: pytest-asyncio; extra == 'agent'
|
|
79
|
+
Requires-Dist: pytest-cov; extra == 'agent'
|
|
80
|
+
Requires-Dist: pytest-mock; extra == 'agent'
|
|
81
|
+
Requires-Dist: pytest<9,>=8.1.1; extra == 'agent'
|
|
82
|
+
Requires-Dist: ruff>=0.11.8; extra == 'agent'
|
|
83
|
+
Requires-Dist: setuptools; extra == 'agent'
|
|
84
|
+
Requires-Dist: textdistance<5,>=4.5.0; extra == 'agent'
|
|
70
85
|
Provides-Extra: agents
|
|
86
|
+
Requires-Dist: aiodocker>=0.24.0; extra == 'agents'
|
|
71
87
|
Requires-Dist: dotenv>=0.9.9; extra == 'agents'
|
|
88
|
+
Requires-Dist: inspect-ai>=0.3.80; extra == 'agents'
|
|
72
89
|
Requires-Dist: ipykernel; extra == 'agents'
|
|
73
90
|
Requires-Dist: ipython<9; extra == 'agents'
|
|
74
91
|
Requires-Dist: jupyter-client; extra == 'agents'
|
|
@@ -76,6 +93,17 @@ Requires-Dist: jupyter-core; extra == 'agents'
|
|
|
76
93
|
Requires-Dist: langchain; extra == 'agents'
|
|
77
94
|
Requires-Dist: langchain-anthropic; extra == 'agents'
|
|
78
95
|
Requires-Dist: langchain-openai; extra == 'agents'
|
|
96
|
+
Requires-Dist: pillow>=11.1.0; extra == 'agents'
|
|
97
|
+
Requires-Dist: playwright; extra == 'agents'
|
|
98
|
+
Requires-Dist: pyautogui>=0.9.54; extra == 'agents'
|
|
99
|
+
Requires-Dist: pyright==1.1.401; extra == 'agents'
|
|
100
|
+
Requires-Dist: pytest-asyncio; extra == 'agents'
|
|
101
|
+
Requires-Dist: pytest-cov; extra == 'agents'
|
|
102
|
+
Requires-Dist: pytest-mock; extra == 'agents'
|
|
103
|
+
Requires-Dist: pytest<9,>=8.1.1; extra == 'agents'
|
|
104
|
+
Requires-Dist: ruff>=0.11.8; extra == 'agents'
|
|
105
|
+
Requires-Dist: setuptools; extra == 'agents'
|
|
106
|
+
Requires-Dist: textdistance<5,>=4.5.0; extra == 'agents'
|
|
79
107
|
Provides-Extra: dev
|
|
80
108
|
Requires-Dist: aiodocker>=0.24.0; extra == 'dev'
|
|
81
109
|
Requires-Dist: dotenv>=0.9.9; extra == 'dev'
|
|
@@ -100,14 +128,6 @@ Requires-Dist: setuptools; extra == 'dev'
|
|
|
100
128
|
Requires-Dist: textdistance<5,>=4.5.0; extra == 'dev'
|
|
101
129
|
Provides-Extra: rl
|
|
102
130
|
Requires-Dist: bitsandbytes>=0.41.0; (sys_platform == 'linux') and extra == 'rl'
|
|
103
|
-
Requires-Dist: dotenv>=0.9.9; extra == 'rl'
|
|
104
|
-
Requires-Dist: ipykernel; extra == 'rl'
|
|
105
|
-
Requires-Dist: ipython<9; extra == 'rl'
|
|
106
|
-
Requires-Dist: jupyter-client; extra == 'rl'
|
|
107
|
-
Requires-Dist: jupyter-core; extra == 'rl'
|
|
108
|
-
Requires-Dist: langchain; extra == 'rl'
|
|
109
|
-
Requires-Dist: langchain-anthropic; extra == 'rl'
|
|
110
|
-
Requires-Dist: langchain-openai; extra == 'rl'
|
|
111
131
|
Requires-Dist: liger-kernel>=0.5.0; (sys_platform == 'linux') and extra == 'rl'
|
|
112
132
|
Requires-Dist: peft>=0.17.1; extra == 'rl'
|
|
113
133
|
Requires-Dist: vllm==0.10.1.1; extra == 'rl'
|
|
@@ -138,8 +158,8 @@ OSS RL environment + evals toolkit. Wrap software as environments, run benchmark
|
|
|
138
158
|
## Highlights
|
|
139
159
|
|
|
140
160
|
- 🚀 **[MCP environment skeleton](https://docs.hud.so/core-concepts/mcp-protocol)** – any agent can call any environment.
|
|
141
|
-
- ⚡️ **[Live telemetry](https://
|
|
142
|
-
- 🗂️ **[Public benchmarks](https://
|
|
161
|
+
- ⚡️ **[Live telemetry](https://hud.so)** – inspect every tool call, observation, and reward in real time.
|
|
162
|
+
- 🗂️ **[Public benchmarks](https://hud.so/leaderboards)** – OSWorld-Verified, SheetBench-50, and more.
|
|
143
163
|
- 🌱 **[Reinforcement learning built-in](rl/)** – Verifiers gym pipelines for GRPO on any environment.
|
|
144
164
|
- 🌐 **[Cloud browsers](environments/remote_browser/)** – AnchorBrowser, Steel, BrowserBase integrations for browser automation.
|
|
145
165
|
- 🛠️ **[Hot-reload dev loop](environments/README.md#phase-5-hot-reload-development-with-cursor-agent)** – `hud dev` for iterating on environments without rebuilds.
|
|
@@ -185,14 +205,14 @@ from hud.agents import ClaudeAgent
|
|
|
185
205
|
from hud.datasets import Task # See docs: https://docs.hud.so/reference/tasks
|
|
186
206
|
|
|
187
207
|
async def main() -> None:
|
|
188
|
-
with hud.trace("Quick Start 2048"): # All telemetry works for any MCP-based agent (see https://
|
|
208
|
+
with hud.trace("Quick Start 2048"): # All telemetry works for any MCP-based agent (see https://hud.so)
|
|
189
209
|
task = {
|
|
190
210
|
"prompt": "Reach 64 in 2048.",
|
|
191
211
|
"mcp_config": {
|
|
192
212
|
"hud": {
|
|
193
213
|
"url": "https://mcp.hud.so/v3/mcp", # HUD's cloud MCP server (see https://docs.hud.so/core-concepts/architecture)
|
|
194
214
|
"headers": {
|
|
195
|
-
"Authorization": f"Bearer {settings.api_key}", # Get your key at https://
|
|
215
|
+
"Authorization": f"Bearer {settings.api_key}", # Get your key at https://hud.so
|
|
196
216
|
"Mcp-Image": "hudpython/hud-text-2048:v1.2" # Docker image from https://hub.docker.com/u/hudpython
|
|
197
217
|
}
|
|
198
218
|
}
|
|
@@ -219,7 +239,7 @@ async def main() -> None:
|
|
|
219
239
|
asyncio.run(main())
|
|
220
240
|
```
|
|
221
241
|
|
|
222
|
-
The above example let's the agent play 2048 ([See replay](https://
|
|
242
|
+
The above example lets the agent play 2048 ([See replay](https://hud.so/trace/6feed7bd-5f67-4d66-b77f-eb1e3164604f))
|
|
223
243
|
|
|
224
244
|

|
|
225
245
|
|
|
@@ -250,7 +270,7 @@ Supports multi‑turn RL for both:
|
|
|
250
270
|
- Language‑only models (e.g., `Qwen/Qwen2.5-7B-Instruct`)
|
|
251
271
|
- Vision‑Language models (e.g., `Qwen/Qwen2.5-VL-3B-Instruct`)
|
|
252
272
|
|
|
253
|
-
By default, `hud rl` provisions a persistant server and trainer in the cloud, streams telemetry to `
|
|
273
|
+
By default, `hud rl` provisions a persistent server and trainer in the cloud, streams telemetry to `hud.so`, and lets you monitor/manage models at `hud.so/models`. Use `--local` to run entirely on your machines (typically 2+ GPUs: one for vLLM, the rest for training).
|
|
254
274
|
|
|
255
275
|
Any HUD MCP environment and evaluation works with our RL pipeline (including remote configurations). See the guided docs: `https://docs.hud.so/train-agents/quickstart`.
|
|
256
276
|
|
|
@@ -260,7 +280,7 @@ This is Claude Computer Use running on our proprietary financial analyst benchma
|
|
|
260
280
|
|
|
261
281
|

|
|
262
282
|
|
|
263
|
-
> [See this trace on
|
|
283
|
+
> [See this trace on _hud.so_](https://hud.so/trace/9e212e9e-3627-4f1f-9eb5-c6d03c59070a)
|
|
264
284
|
|
|
265
285
|
This example runs the full dataset (only takes ~20 minutes) using [run_evaluation.py](examples/run_evaluation.py):
|
|
266
286
|
|
|
@@ -286,7 +306,7 @@ results = await run_dataset(
|
|
|
286
306
|
print(f"Average reward: {sum(r.reward for r in results) / len(results):.2f}")
|
|
287
307
|
```
|
|
288
308
|
|
|
289
|
-
> Running a dataset creates a job and streams results to the [
|
|
309
|
+
> Running a dataset creates a job and streams results to the [hud.so](https://hud.so) platform for analysis and [leaderboard submission](https://docs.hud.so/evaluate-agents/leaderboards).
|
|
290
310
|
|
|
291
311
|
## Building Environments (MCP)
|
|
292
312
|
|
|
@@ -377,7 +397,7 @@ Tools
|
|
|
377
397
|
hud push # needs docker login, hud api key
|
|
378
398
|
```
|
|
379
399
|
|
|
380
|
-
5. Now you can use `mcp.hud.so` to launch 100s of instances of this environment in parallel with any agent, and see everything live on [
|
|
400
|
+
5. Now you can use `mcp.hud.so` to launch 100s of instances of this environment in parallel with any agent, and see everything live on [hud.so](https://hud.so):
|
|
381
401
|
|
|
382
402
|
```python
|
|
383
403
|
from hud.agents import ClaudeAgent
|
|
@@ -408,7 +428,7 @@ result = await ClaudeAgent().run({ # See all agents: https://docs.hud.so/refere
|
|
|
408
428
|
|
|
409
429
|
## Leaderboards & benchmarks
|
|
410
430
|
|
|
411
|
-
All leaderboards are publicly available on [
|
|
431
|
+
All leaderboards are publicly available on [hud.so/leaderboards](https://hud.so/leaderboards) (see [docs](https://docs.hud.so/evaluate-agents/leaderboards))
|
|
412
432
|
|
|
413
433
|

|
|
414
434
|
|
|
@@ -422,7 +442,7 @@ Using the [`run_dataset`](https://docs.hud.so/reference/tasks#run_dataset) funct
|
|
|
422
442
|
%%{init: {"theme": "neutral", "themeVariables": {"fontSize": "14px"}} }%%
|
|
423
443
|
graph LR
|
|
424
444
|
subgraph "Platform"
|
|
425
|
-
Dashboard["📊
|
|
445
|
+
Dashboard["📊 hud.so"]
|
|
426
446
|
API["🔌 mcp.hud.so"]
|
|
427
447
|
end
|
|
428
448
|
|
|
@@ -23,8 +23,8 @@ OSS RL environment + evals toolkit. Wrap software as environments, run benchmark
|
|
|
23
23
|
## Highlights
|
|
24
24
|
|
|
25
25
|
- 🚀 **[MCP environment skeleton](https://docs.hud.so/core-concepts/mcp-protocol)** – any agent can call any environment.
|
|
26
|
-
- ⚡️ **[Live telemetry](https://
|
|
27
|
-
- 🗂️ **[Public benchmarks](https://
|
|
26
|
+
- ⚡️ **[Live telemetry](https://hud.so)** – inspect every tool call, observation, and reward in real time.
|
|
27
|
+
- 🗂️ **[Public benchmarks](https://hud.so/leaderboards)** – OSWorld-Verified, SheetBench-50, and more.
|
|
28
28
|
- 🌱 **[Reinforcement learning built-in](rl/)** – Verifiers gym pipelines for GRPO on any environment.
|
|
29
29
|
- 🌐 **[Cloud browsers](environments/remote_browser/)** – AnchorBrowser, Steel, BrowserBase integrations for browser automation.
|
|
30
30
|
- 🛠️ **[Hot-reload dev loop](environments/README.md#phase-5-hot-reload-development-with-cursor-agent)** – `hud dev` for iterating on environments without rebuilds.
|
|
@@ -70,14 +70,14 @@ from hud.agents import ClaudeAgent
|
|
|
70
70
|
from hud.datasets import Task # See docs: https://docs.hud.so/reference/tasks
|
|
71
71
|
|
|
72
72
|
async def main() -> None:
|
|
73
|
-
with hud.trace("Quick Start 2048"): # All telemetry works for any MCP-based agent (see https://
|
|
73
|
+
with hud.trace("Quick Start 2048"): # All telemetry works for any MCP-based agent (see https://hud.so)
|
|
74
74
|
task = {
|
|
75
75
|
"prompt": "Reach 64 in 2048.",
|
|
76
76
|
"mcp_config": {
|
|
77
77
|
"hud": {
|
|
78
78
|
"url": "https://mcp.hud.so/v3/mcp", # HUD's cloud MCP server (see https://docs.hud.so/core-concepts/architecture)
|
|
79
79
|
"headers": {
|
|
80
|
-
"Authorization": f"Bearer {settings.api_key}", # Get your key at https://
|
|
80
|
+
"Authorization": f"Bearer {settings.api_key}", # Get your key at https://hud.so
|
|
81
81
|
"Mcp-Image": "hudpython/hud-text-2048:v1.2" # Docker image from https://hub.docker.com/u/hudpython
|
|
82
82
|
}
|
|
83
83
|
}
|
|
@@ -104,7 +104,7 @@ async def main() -> None:
|
|
|
104
104
|
asyncio.run(main())
|
|
105
105
|
```
|
|
106
106
|
|
|
107
|
-
The above example let's the agent play 2048 ([See replay](https://
|
|
107
|
+
The above example lets the agent play 2048 ([See replay](https://hud.so/trace/6feed7bd-5f67-4d66-b77f-eb1e3164604f))
|
|
108
108
|
|
|
109
109
|

|
|
110
110
|
|
|
@@ -135,7 +135,7 @@ Supports multi‑turn RL for both:
|
|
|
135
135
|
- Language‑only models (e.g., `Qwen/Qwen2.5-7B-Instruct`)
|
|
136
136
|
- Vision‑Language models (e.g., `Qwen/Qwen2.5-VL-3B-Instruct`)
|
|
137
137
|
|
|
138
|
-
By default, `hud rl` provisions a persistant server and trainer in the cloud, streams telemetry to `
|
|
138
|
+
By default, `hud rl` provisions a persistent server and trainer in the cloud, streams telemetry to `hud.so`, and lets you monitor/manage models at `hud.so/models`. Use `--local` to run entirely on your machines (typically 2+ GPUs: one for vLLM, the rest for training).
|
|
139
139
|
|
|
140
140
|
Any HUD MCP environment and evaluation works with our RL pipeline (including remote configurations). See the guided docs: `https://docs.hud.so/train-agents/quickstart`.
|
|
141
141
|
|
|
@@ -145,7 +145,7 @@ This is Claude Computer Use running on our proprietary financial analyst benchma
|
|
|
145
145
|
|
|
146
146
|

|
|
147
147
|
|
|
148
|
-
> [See this trace on
|
|
148
|
+
> [See this trace on _hud.so_](https://hud.so/trace/9e212e9e-3627-4f1f-9eb5-c6d03c59070a)
|
|
149
149
|
|
|
150
150
|
This example runs the full dataset (only takes ~20 minutes) using [run_evaluation.py](examples/run_evaluation.py):
|
|
151
151
|
|
|
@@ -171,7 +171,7 @@ results = await run_dataset(
|
|
|
171
171
|
print(f"Average reward: {sum(r.reward for r in results) / len(results):.2f}")
|
|
172
172
|
```
|
|
173
173
|
|
|
174
|
-
> Running a dataset creates a job and streams results to the [
|
|
174
|
+
> Running a dataset creates a job and streams results to the [hud.so](https://hud.so) platform for analysis and [leaderboard submission](https://docs.hud.so/evaluate-agents/leaderboards).
|
|
175
175
|
|
|
176
176
|
## Building Environments (MCP)
|
|
177
177
|
|
|
@@ -262,7 +262,7 @@ Tools
|
|
|
262
262
|
hud push # needs docker login, hud api key
|
|
263
263
|
```
|
|
264
264
|
|
|
265
|
-
5. Now you can use `mcp.hud.so` to launch 100s of instances of this environment in parallel with any agent, and see everything live on [
|
|
265
|
+
5. Now you can use `mcp.hud.so` to launch 100s of instances of this environment in parallel with any agent, and see everything live on [hud.so](https://hud.so):
|
|
266
266
|
|
|
267
267
|
```python
|
|
268
268
|
from hud.agents import ClaudeAgent
|
|
@@ -293,7 +293,7 @@ result = await ClaudeAgent().run({ # See all agents: https://docs.hud.so/refere
|
|
|
293
293
|
|
|
294
294
|
## Leaderboards & benchmarks
|
|
295
295
|
|
|
296
|
-
All leaderboards are publicly available on [
|
|
296
|
+
All leaderboards are publicly available on [hud.so/leaderboards](https://hud.so/leaderboards) (see [docs](https://docs.hud.so/evaluate-agents/leaderboards))
|
|
297
297
|
|
|
298
298
|

|
|
299
299
|
|
|
@@ -307,7 +307,7 @@ Using the [`run_dataset`](https://docs.hud.so/reference/tasks#run_dataset) funct
|
|
|
307
307
|
%%{init: {"theme": "neutral", "themeVariables": {"fontSize": "14px"}} }%%
|
|
308
308
|
graph LR
|
|
309
309
|
subgraph "Platform"
|
|
310
|
-
Dashboard["📊
|
|
310
|
+
Dashboard["📊 hud.so"]
|
|
311
311
|
API["🔌 mcp.hud.so"]
|
|
312
312
|
end
|
|
313
313
|
|
|
@@ -495,7 +495,7 @@ from hud.agents import ClaudeAgent
|
|
|
495
495
|
from hud.clients import MCPClient
|
|
496
496
|
|
|
497
497
|
async def main():
|
|
498
|
-
# `trace` captures *everything* that happens and sends it to
|
|
498
|
+
# `trace` captures *everything* that happens and sends it to hud.so
|
|
499
499
|
with hud.trace("local_test"):
|
|
500
500
|
task = Task(
|
|
501
501
|
prompt="Complete the task",
|
|
@@ -524,7 +524,7 @@ async def main():
|
|
|
524
524
|
asyncio.run(main())
|
|
525
525
|
```
|
|
526
526
|
|
|
527
|
-
The `trace` context manager sends a full timeline of agent actions, tool calls, and rewards to
|
|
527
|
+
The `trace` context manager sends a full timeline of agent actions, tool calls, and rewards to hud.so – perfect for debugging.
|
|
528
528
|
|
|
529
529
|
See `examples/01_hello_2048.py` and `examples/task_with_setup_eval.py` for larger end-to-end demos.
|
|
530
530
|
|
|
@@ -532,7 +532,7 @@ See `examples/01_hello_2048.py` and `examples/task_with_setup_eval.py` for large
|
|
|
532
532
|
|
|
533
533
|
## Phase 4 – Remote Deployment & HUD Runner
|
|
534
534
|
|
|
535
|
-
**Goal →** the exact same image runs in parallel on hundreds of instances, and exposes more telemetry so the
|
|
535
|
+
**Goal →** the exact same image runs in parallel on hundreds of instances, and exposes more telemetry so that hud.so can visualise the whole lifecycle.
|
|
536
536
|
|
|
537
537
|
### 1. Publish your image
|
|
538
538
|
|
|
@@ -595,11 +595,11 @@ async def initialize_environment(session=None, progress_token=None):
|
|
|
595
595
|
await send(100, "ready")
|
|
596
596
|
```
|
|
597
597
|
|
|
598
|
-
Those messages are displayed live on
|
|
598
|
+
Those messages are displayed live on hud.so alongside resource graphs – perfect feedback while you wait.
|
|
599
599
|
|
|
600
600
|
### 4. Live telemetry (`telemetry://live`) (Optional)
|
|
601
601
|
|
|
602
|
-
Expose a resource named `telemetry://live` exactly like in `environments/browser/src/hud_controller/server.py` to return live url to be displayed on
|
|
602
|
+
Expose a resource named `telemetry://live` exactly like in `environments/browser/src/hud_controller/server.py` to return a live URL to be displayed on hud.so.
|
|
603
603
|
|
|
604
604
|
Once all of the above works you can unleash *hundreds* of concurrent agents on your new environment.
|
|
605
605
|
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
# test-test
|
|
2
|
+
|
|
3
|
+
## Environment design pattern
|
|
4
|
+
- Controller (Think of this as a frontend in web development)
|
|
5
|
+
- Creates the UX and manages the lifecycle of an app (in this case for an agent)
|
|
6
|
+
- Define `mcp = MCPServer()` and register `@mcp.tool` as tools the agent can interact with
|
|
7
|
+
- Environment (Think of this as a backend in web development)
|
|
8
|
+
- Owns all long‑lived states of the environment and exposes the environment data structure
|
|
9
|
+
- Expose simple HTTP endpoints (`/health`, `/act`, `/reset`, `/state`)
|
|
10
|
+
|
|
11
|
+
IMPORTANT: Make sure all logs are going to stderr instead of stdout, which is reserved for MCP communication
|
|
12
|
+
|
|
13
|
+
### Testing your environment
|
|
14
|
+
```bash
|
|
15
|
+
# 1. Configure your API keys (optional - only needed for evaluation)
|
|
16
|
+
# Edit .env file to add your HUD_API_KEY and ANTHROPIC_API_KEY
|
|
17
|
+
|
|
18
|
+
# 2. Start the environment (optional: with --inspector or --interactive)
|
|
19
|
+
hud dev --build --interactive
|
|
20
|
+
|
|
21
|
+
# 3. Choose your preferred way to test:
|
|
22
|
+
|
|
23
|
+
# Option A: Run the task with Claude (requires ANTHROPIC_API_KEY)
|
|
24
|
+
hud eval tasks.json --agent claude
|
|
25
|
+
|
|
26
|
+
# Option B: Interactive notebook test_env.ipynb (great for learning!)
|
|
27
|
+
|
|
28
|
+
# Option C: Simple Python script (runs all tasks from tasks.json)
|
|
29
|
+
python test_task.py
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Iterating on your environment
|
|
33
|
+
This is usually the process for making any environment better:
|
|
34
|
+
```bash
|
|
35
|
+
# 1. Start the environment and interact with it directly (or give MCP server to an agent):
|
|
36
|
+
hud dev --build --interactive
|
|
37
|
+
|
|
38
|
+
# 2. If the environment cannot start or fails inexplicably:
|
|
39
|
+
hud debug test_env:dev # Or your env name that appears when you run hud dev
|
|
40
|
+
# After fixing the error, go back to 1.
|
|
41
|
+
|
|
42
|
+
# 3. When the environment is in a stable state:
|
|
43
|
+
hud build
|
|
44
|
+
hud push # Requires docker login
|
|
45
|
+
|
|
46
|
+
# 4. As soon as it's pushed to the newest version, make sure tasks have it updated and run:
|
|
47
|
+
hud rl
|
|
48
|
+
# This is a good test to see if your environment and tasks are high quality!
```
|
|
49
|
+
|
|
50
|
+
## Layout
|
|
51
|
+
```
|
|
52
|
+
controller/
|
|
53
|
+
__init__.py # mcp + shared HTTP client
|
|
54
|
+
__main__.py # python -m controller → mcp.run()
|
|
55
|
+
hooks.py # @mcp.initialize / @mcp.shutdown
|
|
56
|
+
tools.py # @mcp.tool act / setup / evaluate
|
|
57
|
+
|
|
58
|
+
./environment
|
|
59
|
+
├── __init__.py
|
|
60
|
+
└── server.py # FastAPI app: /health, /act, /reset, /state
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Publishing Your Environment
|
|
64
|
+
|
|
65
|
+
Once your environment is ready, you can share it with the community:
|
|
66
|
+
|
|
67
|
+
### 1. Push to Registry
|
|
68
|
+
```bash
|
|
69
|
+
# Build and push your environment (requires docker hub login and hud api key)
|
|
70
|
+
hud build
|
|
71
|
+
hud push
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
### 2. Create a Dataset
|
|
75
|
+
|
|
76
|
+
Create a dataset on HuggingFace with your tasks:
|
|
77
|
+
|
|
78
|
+
**Option A: Upload manually**
|
|
79
|
+
1. Upload your `tasks.json` to HuggingFace
|
|
80
|
+
2. Make sure it's **public** to appear on leaderboards
|
|
81
|
+
|
|
82
|
+
**Option B: Use the SDK**
|
|
83
|
+
```python
|
|
84
|
+
from hud.datasets import save_tasks
|
|
85
|
+
import json
|
|
86
|
+
|
|
87
|
+
# Load your tasks
|
|
88
|
+
with open("tasks.json") as f:
|
|
89
|
+
tasks = json.load(f)
|
|
90
|
+
|
|
91
|
+
# Push to HuggingFace
|
|
92
|
+
save_tasks(tasks, repo_id="your-org/your-dataset")
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### 3. Run and Track Performance
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
# Run Claude on your benchmark
|
|
99
|
+
hud eval "your-org/your-dataset" --agent claude
|
|
100
|
+
|
|
101
|
+
# View results at:
|
|
102
|
+
# hud.so/leaderboards/your-org/your-dataset
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
**Note**: Only public HuggingFace datasets appear as leaderboards!
|
|
106
|
+
|
|
107
|
+
📚 Learn more: [Creating Benchmarks](https://docs.hud.so/evaluate-agents/create-benchmarks) | [Leaderboards](https://docs.hud.so/evaluate-agents/leaderboards)
|
|
108
|
+
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# Controller
|
|
2
|
+
|
|
3
|
+
Frontend for the agent: defines tools, minimal state, calls the environment over HTTP.
|
|
4
|
+
|
|
5
|
+
What to implement
|
|
6
|
+
- Shared client in `__init__.py` (one `httpx.AsyncClient`)
|
|
7
|
+
- Lifecycle in `hooks.py` (`@mcp.initialize`/`@mcp.shutdown`)
|
|
8
|
+
- Tools in `tools.py` (`@mcp.tool`) — keep logic thin; docstrings = descriptions
|
|
9
|
+
|
|
10
|
+
Run
|
|
11
|
+
```bash
|
|
12
|
+
hud run controller --transport http --reload
|
|
13
|
+
# Helper endpoints: http://localhost:8765/hud and /hud/tools
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
Principle: the controller is UX, not state. Keep long‑lived state in the environment.
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# Environment
|
|
2
|
+
|
|
3
|
+
Backend service: owns state and exposes HTTP APIs the controller calls.
|
|
4
|
+
|
|
5
|
+
Endpoints (FastAPI)
|
|
6
|
+
- `GET /health` → {status: ok}
|
|
7
|
+
- `POST /act` → increments counter and returns {count}
|
|
8
|
+
- `POST /reset` → resets counter
|
|
9
|
+
- `GET /state` → returns {count}
|
|
10
|
+
|
|
11
|
+
Run (dev)
|
|
12
|
+
```bash
|
|
13
|
+
uv run uvicorn environment.server:app --reload --port 8005
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
Principle: treat like a backend. Keep long‑lived state here; add endpoints as tools need them.
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "test_test"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "A minimal HUD environment"
|
|
5
|
+
requires-python = ">=3.11"
|
|
6
|
+
dependencies = [ "hud-python==0.4.37", "fastapi", "uvicorn[standard]", "httpx>=0.28.1",]
|
|
7
|
+
|
|
8
|
+
[build-system]
|
|
9
|
+
requires = [ "hatchling",]
|
|
10
|
+
build-backend = "hatchling.build"
|
|
11
|
+
|
|
12
|
+
[tool.hud]
|
|
13
|
+
image = "test_test:dev"
|
|
14
|
+
|
|
15
|
+
[tool.hatch.metadata]
|
|
16
|
+
allow-direct-references = true
|
|
17
|
+
|
|
18
|
+
[tool.hatch.build.targets.wheel]
|
|
19
|
+
packages = [ "controller", "environment",]
|