hud-python 0.4.36__tar.gz → 0.4.37__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- {hud_python-0.4.36 → hud_python-0.4.37}/PKG-INFO +14 -12
- {hud_python-0.4.36 → hud_python-0.4.37}/README.md +11 -11
- {hud_python-0.4.36 → hud_python-0.4.37}/environments/README.md +5 -5
- {hud_python-0.4.36 → hud_python-0.4.37}/environments/blank/README.md +20 -4
- {hud_python-0.4.36 → hud_python-0.4.37}/environments/blank/pyproject.toml +1 -1
- {hud_python-0.4.36 → hud_python-0.4.37}/environments/browser/README.md +1 -1
- {hud_python-0.4.36 → hud_python-0.4.37}/environments/browser/pyproject.toml +2 -2
- {hud_python-0.4.36 → hud_python-0.4.37}/environments/deepresearch/pyproject.toml +1 -1
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/agents/__init__.py +2 -0
- hud_python-0.4.37/hud/agents/lite_llm.py +72 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/agents/openai_chat_generic.py +21 -7
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/__init__.py +19 -4
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/build.py +17 -2
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/dev.py +1 -1
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/eval.py +93 -13
- hud_python-0.4.37/hud/cli/flows/tasks.py +388 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/push.py +9 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/rl/__init__.py +14 -4
- hud_python-0.4.37/hud/cli/rl/celebrate.py +187 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/rl/config.py +15 -8
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/rl/local_runner.py +44 -20
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/rl/remote_runner.py +163 -86
- hud_python-0.4.37/hud/cli/rl/viewer.py +141 -0
- hud_python-0.4.37/hud/cli/rl/wait_utils.py +89 -0
- hud_python-0.4.37/hud/cli/utils/env_check.py +196 -0
- hud_python-0.4.37/hud/cli/utils/source_hash.py +108 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/clients/base.py +1 -1
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/clients/fastmcp.py +1 -1
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/otel/config.py +1 -1
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/otel/context.py +2 -2
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/rl/vllm_adapter.py +1 -1
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/server/server.py +84 -13
- hud_python-0.4.37/hud/server/tests/test_add_tool.py +60 -0
- hud_python-0.4.37/hud/server/tests/test_context.py +128 -0
- hud_python-0.4.37/hud/server/tests/test_mcp_server_handlers.py +44 -0
- hud_python-0.4.37/hud/server/tests/test_mcp_server_integration.py +405 -0
- hud_python-0.4.37/hud/server/tests/test_mcp_server_more.py +247 -0
- hud_python-0.4.37/hud/server/tests/test_run_wrapper.py +53 -0
- hud_python-0.4.37/hud/server/tests/test_server_extra.py +166 -0
- hud_python-0.4.37/hud/server/tests/test_sigterm_runner.py +78 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/shared/hints.py +1 -1
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/telemetry/job.py +2 -2
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/types.py +9 -2
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/utils/tasks.py +32 -24
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/utils/tests/test_version.py +1 -1
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/version.py +1 -1
- {hud_python-0.4.36 → hud_python-0.4.37}/pyproject.toml +4 -1
- hud_python-0.4.36/environments/browser/environment/pyproject.toml +0 -20
- hud_python-0.4.36/hud/cli/flows/tasks.py +0 -256
- {hud_python-0.4.36 → hud_python-0.4.37}/.gitignore +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/LICENSE +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/environments/blank/controller/README.md +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/environments/blank/environment/README.md +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/environments/browser/environment/2048/README.md +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/environments/browser/environment/2048/backend/pyproject.toml +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/environments/browser/environment/README.md +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/environments/browser/environment/todo/README.md +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/environments/browser/environment/todo/backend/pyproject.toml +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/environments/remote_browser/README.md +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/environments/remote_browser/pyproject.toml +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/environments/remote_browser/src/hud_controller/providers/README.md +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/environments/text_2048/README.md +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/environments/text_2048/pyproject.toml +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/examples/README.md +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/__init__.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/__main__.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/agents/base.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/agents/claude.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/agents/grounded_openai.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/agents/langchain.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/agents/misc/__init__.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/agents/misc/response_agent.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/agents/openai.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/agents/tests/__init__.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/agents/tests/test_base.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/agents/tests/test_claude.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/agents/tests/test_client.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/agents/tests/test_grounded_openai_agent.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/agents/tests/test_openai.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/__main__.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/analyze.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/clone.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/debug.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/flows/__init__.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/get.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/init.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/list_func.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/pull.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/remove.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/rl/display.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/rl/gpu.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/rl/gpu_utils.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/rl/presets.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/rl/rl_api.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/rl/vllm.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/tests/__init__.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/tests/test_analyze.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/tests/test_analyze_metadata.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/tests/test_build.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/tests/test_cli_init.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/tests/test_cli_main.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/tests/test_clone.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/tests/test_cursor.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/tests/test_debug.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/tests/test_list_func.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/tests/test_main_module.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/tests/test_mcp_server.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/tests/test_pull.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/tests/test_push.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/tests/test_registry.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/tests/test_utils.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/utils/__init__.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/utils/config.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/utils/cursor.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/utils/docker.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/utils/environment.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/utils/interactive.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/utils/local_runner.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/utils/logging.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/utils/metadata.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/utils/package_runner.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/utils/registry.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/utils/remote_runner.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/utils/runner.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/utils/server.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/cli/utils/tasks.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/clients/README.md +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/clients/__init__.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/clients/mcp_use.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/clients/tests/__init__.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/clients/tests/test_client_integration.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/clients/tests/test_fastmcp.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/clients/tests/test_mcp_use_retry.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/clients/tests/test_protocol.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/clients/utils/__init__.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/clients/utils/mcp_use_retry.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/clients/utils/retry.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/clients/utils/retry_transport.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/datasets/__init__.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/datasets/parallel.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/datasets/runner.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/datasets/utils.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/misc/__init__.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/misc/claude_plays_pokemon.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/native/__init__.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/native/comparator.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/native/tests/__init__.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/native/tests/test_comparator.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/native/tests/test_native_init.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/otel/__init__.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/otel/collector.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/otel/exporters.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/otel/instrumentation.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/otel/processors.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/otel/tests/__init__.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/otel/tests/test_processors.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/py.typed +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/rl/README.md +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/rl/__init__.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/rl/actor.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/rl/buffer.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/rl/chat_template.jinja +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/rl/config.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/rl/distributed.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/rl/learner.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/rl/tests/__init__.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/rl/tests/test_learner.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/rl/train.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/rl/types.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/rl/utils/start_vllm_server.sh +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/rl/utils.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/samples/__init__.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/samples/browser.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/server/__init__.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/server/context.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/server/helper/__init__.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/server/low_level.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/server/tests/__init__.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/settings.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/shared/__init__.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/shared/exceptions.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/shared/requests.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/shared/tests/__init__.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/shared/tests/test_exceptions.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/shared/tests/test_requests.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/telemetry/__init__.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/telemetry/instrument.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/telemetry/replay.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/telemetry/tests/__init__.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/telemetry/tests/test_replay.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/telemetry/tests/test_trace.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/telemetry/trace.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/__init__.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/base.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/bash.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/computer/__init__.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/computer/anthropic.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/computer/hud.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/computer/openai.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/computer/settings.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/edit.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/executors/__init__.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/executors/base.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/executors/pyautogui.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/executors/tests/__init__.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/executors/tests/test_base_executor.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/executors/xdo.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/grounding/__init__.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/grounding/config.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/grounding/grounded_tool.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/grounding/grounder.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/grounding/tests/__init__.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/grounding/tests/test_grounded_tool.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/playwright.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/response.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/submit.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/tests/__init__.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/tests/test_base.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/tests/test_bash.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/tests/test_bash_extended.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/tests/test_computer.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/tests/test_computer_actions.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/tests/test_edit.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/tests/test_init.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/tests/test_playwright_tool.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/tests/test_response.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/tests/test_tools.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/tests/test_tools_init.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/tests/test_utils.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/types.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/tools/utils.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/utils/__init__.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/utils/agent_factories.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/utils/async_utils.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/utils/group_eval.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/utils/hud_console.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/utils/mcp.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/utils/pretty_errors.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/utils/progress.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/utils/telemetry.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/utils/tests/__init__.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/utils/tests/test_async_utils.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/utils/tests/test_init.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/utils/tests/test_mcp.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/utils/tests/test_progress.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/utils/tests/test_telemetry.py +0 -0
- {hud_python-0.4.36 → hud_python-0.4.37}/hud/utils/tool_shorthand.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hud-python
|
|
3
|
-
Version: 0.4.36
|
|
3
|
+
Version: 0.4.37
|
|
4
4
|
Summary: SDK for the HUD platform.
|
|
5
5
|
Project-URL: Homepage, https://github.com/hud-evals/hud-python
|
|
6
6
|
Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
|
|
@@ -36,11 +36,13 @@ Classifier: Programming Language :: Python :: 3.12
|
|
|
36
36
|
Classifier: Programming Language :: Python :: 3.13
|
|
37
37
|
Requires-Python: <3.13,>=3.11
|
|
38
38
|
Requires-Dist: anthropic
|
|
39
|
+
Requires-Dist: blessed>=1.20.0
|
|
39
40
|
Requires-Dist: datasets>=2.14.0
|
|
40
41
|
Requires-Dist: httpx<1,>=0.23.0
|
|
41
42
|
Requires-Dist: hud-fastmcp-python-sdk>=0.1.2
|
|
42
43
|
Requires-Dist: hud-mcp-python-sdk>=3.13.2
|
|
43
44
|
Requires-Dist: hud-mcp-use-python-sdk==2.3.19
|
|
45
|
+
Requires-Dist: litellm>=1.55.0
|
|
44
46
|
Requires-Dist: numpy>=1.24.0
|
|
45
47
|
Requires-Dist: openai
|
|
46
48
|
Requires-Dist: opentelemetry-api>=1.34.1
|
|
@@ -156,8 +158,8 @@ OSS RL environment + evals toolkit. Wrap software as environments, run benchmark
|
|
|
156
158
|
## Highlights
|
|
157
159
|
|
|
158
160
|
- 🚀 **[MCP environment skeleton](https://docs.hud.so/core-concepts/mcp-protocol)** – any agent can call any environment.
|
|
159
|
-
- ⚡️ **[Live telemetry](https://
|
|
160
|
-
- 🗂️ **[Public benchmarks](https://
|
|
161
|
+
- ⚡️ **[Live telemetry](https://hud.so)** – inspect every tool call, observation, and reward in real time.
|
|
162
|
+
- 🗂️ **[Public benchmarks](https://hud.so/leaderboards)** – OSWorld-Verified, SheetBench-50, and more.
|
|
161
163
|
- 🌱 **[Reinforcement learning built-in](rl/)** – Verifiers gym pipelines for GRPO on any environment.
|
|
162
164
|
- 🌐 **[Cloud browsers](environments/remote_browser/)** – AnchorBrowser, Steel, BrowserBase integrations for browser automation.
|
|
163
165
|
- 🛠️ **[Hot-reload dev loop](environments/README.md#phase-5-hot-reload-development-with-cursor-agent)** – `hud dev` for iterating on environments without rebuilds.
|
|
@@ -203,14 +205,14 @@ from hud.agents import ClaudeAgent
|
|
|
203
205
|
from hud.datasets import Task # See docs: https://docs.hud.so/reference/tasks
|
|
204
206
|
|
|
205
207
|
async def main() -> None:
|
|
206
|
-
with hud.trace("Quick Start 2048"): # All telemetry works for any MCP-based agent (see https://
|
|
208
|
+
with hud.trace("Quick Start 2048"): # All telemetry works for any MCP-based agent (see https://hud.so)
|
|
207
209
|
task = {
|
|
208
210
|
"prompt": "Reach 64 in 2048.",
|
|
209
211
|
"mcp_config": {
|
|
210
212
|
"hud": {
|
|
211
213
|
"url": "https://mcp.hud.so/v3/mcp", # HUD's cloud MCP server (see https://docs.hud.so/core-concepts/architecture)
|
|
212
214
|
"headers": {
|
|
213
|
-
"Authorization": f"Bearer {settings.api_key}", # Get your key at https://
|
|
215
|
+
"Authorization": f"Bearer {settings.api_key}", # Get your key at https://hud.so
|
|
214
216
|
"Mcp-Image": "hudpython/hud-text-2048:v1.2" # Docker image from https://hub.docker.com/u/hudpython
|
|
215
217
|
}
|
|
216
218
|
}
|
|
@@ -237,7 +239,7 @@ async def main() -> None:
|
|
|
237
239
|
asyncio.run(main())
|
|
238
240
|
```
|
|
239
241
|
|
|
240
|
-
The above example lets the agent play 2048 ([See replay](https://
|
|
242
|
+
The above example lets the agent play 2048 ([See replay](https://hud.so/trace/6feed7bd-5f67-4d66-b77f-eb1e3164604f))
|
|
241
243
|
|
|
242
244
|

|
|
243
245
|
|
|
@@ -268,7 +270,7 @@ Supports multi‑turn RL for both:
|
|
|
268
270
|
- Language‑only models (e.g., `Qwen/Qwen2.5-7B-Instruct`)
|
|
269
271
|
- Vision‑Language models (e.g., `Qwen/Qwen2.5-VL-3B-Instruct`)
|
|
270
272
|
|
|
271
|
-
By default, `hud rl` provisions a persistent server and trainer in the cloud, streams telemetry to `
|
|
273
|
+
By default, `hud rl` provisions a persistent server and trainer in the cloud, streams telemetry to `hud.so`, and lets you monitor/manage models at `hud.so/models`. Use `--local` to run entirely on your machines (typically 2+ GPUs: one for vLLM, the rest for training).
|
|
272
274
|
|
|
273
275
|
Any HUD MCP environment and evaluation works with our RL pipeline (including remote configurations). See the guided docs: `https://docs.hud.so/train-agents/quickstart`.
|
|
274
276
|
|
|
@@ -278,7 +280,7 @@ This is Claude Computer Use running on our proprietary financial analyst benchma
|
|
|
278
280
|
|
|
279
281
|

|
|
280
282
|
|
|
281
|
-
> [See this trace on
|
|
283
|
+
> [See this trace on _hud.so_](https://hud.so/trace/9e212e9e-3627-4f1f-9eb5-c6d03c59070a)
|
|
282
284
|
|
|
283
285
|
This example runs the full dataset (only takes ~20 minutes) using [run_evaluation.py](examples/run_evaluation.py):
|
|
284
286
|
|
|
@@ -304,7 +306,7 @@ results = await run_dataset(
|
|
|
304
306
|
print(f"Average reward: {sum(r.reward for r in results) / len(results):.2f}")
|
|
305
307
|
```
|
|
306
308
|
|
|
307
|
-
> Running a dataset creates a job and streams results to the [
|
|
309
|
+
> Running a dataset creates a job and streams results to the [hud.so](https://hud.so) platform for analysis and [leaderboard submission](https://docs.hud.so/evaluate-agents/leaderboards).
|
|
308
310
|
|
|
309
311
|
## Building Environments (MCP)
|
|
310
312
|
|
|
@@ -395,7 +397,7 @@ Tools
|
|
|
395
397
|
hud push # needs docker login, hud api key
|
|
396
398
|
```
|
|
397
399
|
|
|
398
|
-
5. Now you can use `mcp.hud.so` to launch 100s of instances of this environment in parallel with any agent, and see everything live on [
|
|
400
|
+
5. Now you can use `mcp.hud.so` to launch 100s of instances of this environment in parallel with any agent, and see everything live on [hud.so](https://hud.so):
|
|
399
401
|
|
|
400
402
|
```python
|
|
401
403
|
from hud.agents import ClaudeAgent
|
|
@@ -426,7 +428,7 @@ result = await ClaudeAgent().run({ # See all agents: https://docs.hud.so/refere
|
|
|
426
428
|
|
|
427
429
|
## Leaderboards & benchmarks
|
|
428
430
|
|
|
429
|
-
All leaderboards are publicly available on [
|
|
431
|
+
All leaderboards are publicly available on [hud.so/leaderboards](https://hud.so/leaderboards) (see [docs](https://docs.hud.so/evaluate-agents/leaderboards))
|
|
430
432
|
|
|
431
433
|

|
|
432
434
|
|
|
@@ -440,7 +442,7 @@ Using the [`run_dataset`](https://docs.hud.so/reference/tasks#run_dataset) funct
|
|
|
440
442
|
%%{init: {"theme": "neutral", "themeVariables": {"fontSize": "14px"}} }%%
|
|
441
443
|
graph LR
|
|
442
444
|
subgraph "Platform"
|
|
443
|
-
Dashboard["📊
|
|
445
|
+
Dashboard["📊 hud.so"]
|
|
444
446
|
API["🔌 mcp.hud.so"]
|
|
445
447
|
end
|
|
446
448
|
|
|
@@ -23,8 +23,8 @@ OSS RL environment + evals toolkit. Wrap software as environments, run benchmark
|
|
|
23
23
|
## Highlights
|
|
24
24
|
|
|
25
25
|
- 🚀 **[MCP environment skeleton](https://docs.hud.so/core-concepts/mcp-protocol)** – any agent can call any environment.
|
|
26
|
-
- ⚡️ **[Live telemetry](https://
|
|
27
|
-
- 🗂️ **[Public benchmarks](https://
|
|
26
|
+
- ⚡️ **[Live telemetry](https://hud.so)** – inspect every tool call, observation, and reward in real time.
|
|
27
|
+
- 🗂️ **[Public benchmarks](https://hud.so/leaderboards)** – OSWorld-Verified, SheetBench-50, and more.
|
|
28
28
|
- 🌱 **[Reinforcement learning built-in](rl/)** – Verifiers gym pipelines for GRPO on any environment.
|
|
29
29
|
- 🌐 **[Cloud browsers](environments/remote_browser/)** – AnchorBrowser, Steel, BrowserBase integrations for browser automation.
|
|
30
30
|
- 🛠️ **[Hot-reload dev loop](environments/README.md#phase-5-hot-reload-development-with-cursor-agent)** – `hud dev` for iterating on environments without rebuilds.
|
|
@@ -70,14 +70,14 @@ from hud.agents import ClaudeAgent
|
|
|
70
70
|
from hud.datasets import Task # See docs: https://docs.hud.so/reference/tasks
|
|
71
71
|
|
|
72
72
|
async def main() -> None:
|
|
73
|
-
with hud.trace("Quick Start 2048"): # All telemetry works for any MCP-based agent (see https://
|
|
73
|
+
with hud.trace("Quick Start 2048"): # All telemetry works for any MCP-based agent (see https://hud.so)
|
|
74
74
|
task = {
|
|
75
75
|
"prompt": "Reach 64 in 2048.",
|
|
76
76
|
"mcp_config": {
|
|
77
77
|
"hud": {
|
|
78
78
|
"url": "https://mcp.hud.so/v3/mcp", # HUD's cloud MCP server (see https://docs.hud.so/core-concepts/architecture)
|
|
79
79
|
"headers": {
|
|
80
|
-
"Authorization": f"Bearer {settings.api_key}", # Get your key at https://
|
|
80
|
+
"Authorization": f"Bearer {settings.api_key}", # Get your key at https://hud.so
|
|
81
81
|
"Mcp-Image": "hudpython/hud-text-2048:v1.2" # Docker image from https://hub.docker.com/u/hudpython
|
|
82
82
|
}
|
|
83
83
|
}
|
|
@@ -104,7 +104,7 @@ async def main() -> None:
|
|
|
104
104
|
asyncio.run(main())
|
|
105
105
|
```
|
|
106
106
|
|
|
107
|
-
The above example lets the agent play 2048 ([See replay](https://
|
|
107
|
+
The above example lets the agent play 2048 ([See replay](https://hud.so/trace/6feed7bd-5f67-4d66-b77f-eb1e3164604f))
|
|
108
108
|
|
|
109
109
|

|
|
110
110
|
|
|
@@ -135,7 +135,7 @@ Supports multi‑turn RL for both:
|
|
|
135
135
|
- Language‑only models (e.g., `Qwen/Qwen2.5-7B-Instruct`)
|
|
136
136
|
- Vision‑Language models (e.g., `Qwen/Qwen2.5-VL-3B-Instruct`)
|
|
137
137
|
|
|
138
|
-
By default, `hud rl` provisions a persistent server and trainer in the cloud, streams telemetry to `
|
|
138
|
+
By default, `hud rl` provisions a persistent server and trainer in the cloud, streams telemetry to `hud.so`, and lets you monitor/manage models at `hud.so/models`. Use `--local` to run entirely on your machines (typically 2+ GPUs: one for vLLM, the rest for training).
|
|
139
139
|
|
|
140
140
|
Any HUD MCP environment and evaluation works with our RL pipeline (including remote configurations). See the guided docs: `https://docs.hud.so/train-agents/quickstart`.
|
|
141
141
|
|
|
@@ -145,7 +145,7 @@ This is Claude Computer Use running on our proprietary financial analyst benchma
|
|
|
145
145
|
|
|
146
146
|

|
|
147
147
|
|
|
148
|
-
> [See this trace on
|
|
148
|
+
> [See this trace on _hud.so_](https://hud.so/trace/9e212e9e-3627-4f1f-9eb5-c6d03c59070a)
|
|
149
149
|
|
|
150
150
|
This example runs the full dataset (only takes ~20 minutes) using [run_evaluation.py](examples/run_evaluation.py):
|
|
151
151
|
|
|
@@ -171,7 +171,7 @@ results = await run_dataset(
|
|
|
171
171
|
print(f"Average reward: {sum(r.reward for r in results) / len(results):.2f}")
|
|
172
172
|
```
|
|
173
173
|
|
|
174
|
-
> Running a dataset creates a job and streams results to the [
|
|
174
|
+
> Running a dataset creates a job and streams results to the [hud.so](https://hud.so) platform for analysis and [leaderboard submission](https://docs.hud.so/evaluate-agents/leaderboards).
|
|
175
175
|
|
|
176
176
|
## Building Environments (MCP)
|
|
177
177
|
|
|
@@ -262,7 +262,7 @@ Tools
|
|
|
262
262
|
hud push # needs docker login, hud api key
|
|
263
263
|
```
|
|
264
264
|
|
|
265
|
-
5. Now you can use `mcp.hud.so` to launch 100s of instances of this environment in parallel with any agent, and see everything live on [
|
|
265
|
+
5. Now you can use `mcp.hud.so` to launch 100s of instances of this environment in parallel with any agent, and see everything live on [hud.so](https://hud.so):
|
|
266
266
|
|
|
267
267
|
```python
|
|
268
268
|
from hud.agents import ClaudeAgent
|
|
@@ -293,7 +293,7 @@ result = await ClaudeAgent().run({ # See all agents: https://docs.hud.so/refere
|
|
|
293
293
|
|
|
294
294
|
## Leaderboards & benchmarks
|
|
295
295
|
|
|
296
|
-
All leaderboards are publicly available on [
|
|
296
|
+
All leaderboards are publicly available on [hud.so/leaderboards](https://hud.so/leaderboards) (see [docs](https://docs.hud.so/evaluate-agents/leaderboards))
|
|
297
297
|
|
|
298
298
|

|
|
299
299
|
|
|
@@ -307,7 +307,7 @@ Using the [`run_dataset`](https://docs.hud.so/reference/tasks#run_dataset) funct
|
|
|
307
307
|
%%{init: {"theme": "neutral", "themeVariables": {"fontSize": "14px"}} }%%
|
|
308
308
|
graph LR
|
|
309
309
|
subgraph "Platform"
|
|
310
|
-
Dashboard["📊
|
|
310
|
+
Dashboard["📊 hud.so"]
|
|
311
311
|
API["🔌 mcp.hud.so"]
|
|
312
312
|
end
|
|
313
313
|
|
|
@@ -495,7 +495,7 @@ from hud.agents import ClaudeAgent
|
|
|
495
495
|
from hud.clients import MCPClient
|
|
496
496
|
|
|
497
497
|
async def main():
|
|
498
|
-
# `trace` captures *everything* that happens and sends it to
|
|
498
|
+
# `trace` captures *everything* that happens and sends it to hud.so
|
|
499
499
|
with hud.trace("local_test"):
|
|
500
500
|
task = Task(
|
|
501
501
|
prompt="Complete the task",
|
|
@@ -524,7 +524,7 @@ async def main():
|
|
|
524
524
|
asyncio.run(main())
|
|
525
525
|
```
|
|
526
526
|
|
|
527
|
-
The `trace` context manager sends a full timeline of agent actions, tool calls, and rewards to
|
|
527
|
+
The `trace` context manager sends a full timeline of agent actions, tool calls, and rewards to hud.so – perfect for debugging.
|
|
528
528
|
|
|
529
529
|
See `examples/01_hello_2048.py` and `examples/task_with_setup_eval.py` for larger end-to-end demos.
|
|
530
530
|
|
|
@@ -532,7 +532,7 @@ See `examples/01_hello_2048.py` and `examples/task_with_setup_eval.py` for large
|
|
|
532
532
|
|
|
533
533
|
## Phase 4 – Remote Deployment & HUD Runner
|
|
534
534
|
|
|
535
|
-
**Goal →** the exact same image runs in parallel on hundreds of instances, and exposes more telemetry so the
|
|
535
|
+
**Goal →** the exact same image runs in parallel on hundreds of instances, and exposes more telemetry so that hud.so can visualise the whole lifecycle.
|
|
536
536
|
|
|
537
537
|
### 1. Publish your image
|
|
538
538
|
|
|
@@ -595,11 +595,11 @@ async def initialize_environment(session=None, progress_token=None):
|
|
|
595
595
|
await send(100, "ready")
|
|
596
596
|
```
|
|
597
597
|
|
|
598
|
-
Those messages are displayed live on
|
|
598
|
+
Those messages are displayed live on hud.so alongside resource graphs – perfect feedback while you wait.
|
|
599
599
|
|
|
600
600
|
### 4. Live telemetry (`telemetry://live`) (Optional)
|
|
601
601
|
|
|
602
|
-
Expose a resource named `telemetry://live` exactly like in `environments/browser/src/hud_controller/server.py` to return live url to be displayed on
|
|
602
|
+
Expose a resource named `telemetry://live` exactly like in `environments/browser/src/hud_controller/server.py` to return a live URL to be displayed on hud.so.
|
|
603
603
|
|
|
604
604
|
Once all of the above works you can unleash *hundreds* of concurrent agents on your new environment.
|
|
605
605
|
|
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
|
|
11
11
|
IMPORTANT: Make sure all logs go to stderr instead of stdout, which is reserved for MCP communication
|
|
12
12
|
|
|
13
|
-
###
|
|
13
|
+
### Testing your environment
|
|
14
14
|
```bash
|
|
15
15
|
# 1. Configure your API keys (optional - only needed for evaluation)
|
|
16
16
|
# Edit .env file to add your HUD_API_KEY and ANTHROPIC_API_KEY
|
|
@@ -24,13 +24,29 @@ hud dev --build --interactive
|
|
|
24
24
|
hud eval tasks.json --agent claude
|
|
25
25
|
|
|
26
26
|
# Option B: Interactive notebook test_env.ipynb (great for learning!)
|
|
27
|
-
# Requires installation:
|
|
28
|
-
pip install hud-python[agents]
|
|
29
27
|
|
|
30
28
|
# Option C: Simple Python script (runs all tasks from tasks.json)
|
|
31
29
|
python test_task.py
|
|
32
30
|
```
|
|
33
31
|
|
|
32
|
+
## Iterating on your environment
|
|
33
|
+
This is usually the process for making any environment better:
|
|
34
|
+
```bash
|
|
35
|
+
# 1. Start the environment and interact with it directly (or give the MCP server to an agent):
|
|
36
|
+
hud dev --build --interactive
|
|
37
|
+
|
|
38
|
+
# 2. If the environment cannot start or fails inexplicably:
|
|
39
|
+
hud debug test_env:dev # Or your env name that appears when you run hud dev
|
|
40
|
+
# After fixing the error, go back to 1.
|
|
41
|
+
|
|
42
|
+
# 3. When the environment is in a stable state:
|
|
43
|
+
hud build
|
|
44
|
+
hud push # Requires docker login
|
|
45
|
+
|
|
46
|
+
# 4. Once the newest version has been pushed, make sure your tasks reference it, then run:
|
|
47
|
+
hud rl
|
|
48
|
+
# This is a good test to see if your environment and tasks are high quality!
|
|
49
|
+
|
|
34
50
|
## Layout
|
|
35
51
|
```
|
|
36
52
|
controller/
|
|
@@ -83,7 +99,7 @@ save_tasks(tasks, repo_id="your-org/your-dataset")
|
|
|
83
99
|
hud eval "your-org/your-dataset" --agent claude
|
|
84
100
|
|
|
85
101
|
# View results at:
|
|
86
|
-
#
|
|
102
|
+
# hud.so/leaderboards/your-org/your-dataset
|
|
87
103
|
```
|
|
88
104
|
|
|
89
105
|
**Note**: Only public HuggingFace datasets appear as leaderboards!
|
|
@@ -3,7 +3,7 @@ name = "test_test"
|
|
|
3
3
|
version = "0.1.0"
|
|
4
4
|
description = "A minimal HUD environment"
|
|
5
5
|
requires-python = ">=3.11"
|
|
6
|
-
dependencies = [ "hud-python==0.4.
|
|
6
|
+
dependencies = [ "hud-python==0.4.37", "fastapi", "uvicorn[standard]", "httpx>=0.28.1",]
|
|
7
7
|
|
|
8
8
|
[build-system]
|
|
9
9
|
requires = [ "hatchling",]
|
|
@@ -75,7 +75,7 @@ save_tasks(tasks, repo_id="your-org/your-dataset")
|
|
|
75
75
|
hud eval "your-org/your-dataset" --agent claude
|
|
76
76
|
|
|
77
77
|
# View results at:
|
|
78
|
-
#
|
|
78
|
+
# hud.so/leaderboards/your-org/your-dataset
|
|
79
79
|
```
|
|
80
80
|
|
|
81
81
|
**Note**: Only public HuggingFace datasets appear as leaderboards!
|
|
@@ -3,7 +3,7 @@ name = "hud-browser-controller"
|
|
|
3
3
|
version = "0.1.0"
|
|
4
4
|
description = "HUD Browser Controller - MCP interface for browser environments"
|
|
5
5
|
requires-python = ">=3.11,<3.14"
|
|
6
|
-
dependencies = [ "pydantic>=2.6,<3", "pydantic-settings>=2.2,<3", "hud-python@git+https://github.com/hud-evals/hud-python@env-cli-improvements", "playwright", "pyautogui", "httpx", "typer", "fastapi", "uvicorn",]
|
|
6
|
+
dependencies = [ "pydantic>=2.6,<3", "pydantic-settings>=2.2,<3", "hud-python@git+https://github.com/hud-evals/hud-python@env-cli-improvements", "playwright", "pyautogui", "httpx", "typer", "fastapi>=0.104.1", "uvicorn[standard]>=0.24.0", "python-multipart>=0.0.6",]
|
|
7
7
|
|
|
8
8
|
[build-system]
|
|
9
9
|
requires = [ "hatchling",]
|
|
@@ -19,4 +19,4 @@ image = "hud-browser:dev"
|
|
|
19
19
|
allow-direct-references = true
|
|
20
20
|
|
|
21
21
|
[tool.hatch.build.targets.wheel]
|
|
22
|
-
packages = [ "controller", "
|
|
22
|
+
packages = [ "controller", "environment",]
|
|
@@ -3,7 +3,7 @@ name = "deepresearch"
|
|
|
3
3
|
version = "0.1.0"
|
|
4
4
|
description = "DeepResearch HUD environment with HTTP backend (EXA on server)"
|
|
5
5
|
requires-python = ">=3.11"
|
|
6
|
-
dependencies = [ "hud-python==0.4.
|
|
6
|
+
dependencies = [ "hud-python==0.4.37", "fastapi>=0.104.1", "uvicorn[standard]>=0.24.0", "httpx>=0.24.0",]
|
|
7
7
|
|
|
8
8
|
[build-system]
|
|
9
9
|
requires = [ "hatchling",]
|
|
@@ -2,12 +2,14 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
from .base import MCPAgent
|
|
4
4
|
from .claude import ClaudeAgent
|
|
5
|
+
from .lite_llm import LiteAgent
|
|
5
6
|
from .openai import OperatorAgent
|
|
6
7
|
from .openai_chat_generic import GenericOpenAIChatAgent
|
|
7
8
|
|
|
8
9
|
__all__ = [
|
|
9
10
|
"ClaudeAgent",
|
|
10
11
|
"GenericOpenAIChatAgent",
|
|
12
|
+
"LiteAgent",
|
|
11
13
|
"MCPAgent",
|
|
12
14
|
"OperatorAgent",
|
|
13
15
|
]
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""LiteLLM MCP Agent implementation.
|
|
2
|
+
|
|
3
|
+
Same OpenAI chat-completions shape + MCP tool plumbing,
|
|
4
|
+
but transport is LiteLLM and (optionally) tools are shaped by LiteLLM's MCP transformer.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import logging
|
|
10
|
+
from typing import Any, ClassVar
|
|
11
|
+
|
|
12
|
+
import litellm
|
|
13
|
+
|
|
14
|
+
from .openai_chat_generic import GenericOpenAIChatAgent
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
# Prefer LiteLLM's built-in MCP -> OpenAI tool transformer (handles Bedrock nuances)
|
|
19
|
+
try:
|
|
20
|
+
from litellm.experimental_mcp_client.tools import (
|
|
21
|
+
transform_mcp_tool_to_openai_tool,
|
|
22
|
+
)
|
|
23
|
+
except Exception: # pragma: no cover - optional dependency
|
|
24
|
+
transform_mcp_tool_to_openai_tool = None # type: ignore
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class LiteAgent(GenericOpenAIChatAgent):
|
|
28
|
+
"""
|
|
29
|
+
Same OpenAI chat-completions shape + MCP tool plumbing,
|
|
30
|
+
but transport is LiteLLM and (optionally) tools are shaped by LiteLLM's MCP transformer.
|
|
31
|
+
"""
|
|
32
|
+
|
|
33
|
+
metadata: ClassVar[dict[str, Any]] = {}
|
|
34
|
+
|
|
35
|
+
def __init__(
|
|
36
|
+
self,
|
|
37
|
+
*,
|
|
38
|
+
model_name: str = "gpt-4o-mini",
|
|
39
|
+
completion_kwargs: dict[str, Any] | None = None,
|
|
40
|
+
**agent_kwargs: Any,
|
|
41
|
+
) -> None:
|
|
42
|
+
# We don't need an OpenAI client; pass None
|
|
43
|
+
super().__init__(
|
|
44
|
+
openai_client=None,
|
|
45
|
+
model_name=model_name,
|
|
46
|
+
completion_kwargs=completion_kwargs,
|
|
47
|
+
**agent_kwargs,
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
def get_tool_schemas(self) -> list[dict]:
|
|
51
|
+
# Prefer LiteLLM's stricter transformer (handles Bedrock & friends)
|
|
52
|
+
if transform_mcp_tool_to_openai_tool is not None:
|
|
53
|
+
return [
|
|
54
|
+
transform_mcp_tool_to_openai_tool(t) # returns ChatCompletionToolParam-like dict
|
|
55
|
+
for t in self.get_available_tools()
|
|
56
|
+
]
|
|
57
|
+
# Fallback to the generic OpenAI sanitizer
|
|
58
|
+
return GenericOpenAIChatAgent.get_tool_schemas(self)
|
|
59
|
+
|
|
60
|
+
async def _invoke_chat_completion(
|
|
61
|
+
self,
|
|
62
|
+
*,
|
|
63
|
+
messages: list[Any],
|
|
64
|
+
tools: list[dict] | None,
|
|
65
|
+
extra: dict[str, Any],
|
|
66
|
+
):
|
|
67
|
+
return await litellm.acompletion(
|
|
68
|
+
model=self.model_name,
|
|
69
|
+
messages=messages,
|
|
70
|
+
tools=tools or None, # LiteLLM tolerates None better than []
|
|
71
|
+
**extra,
|
|
72
|
+
)
|
|
@@ -42,7 +42,7 @@ class GenericOpenAIChatAgent(MCPAgent):
|
|
|
42
42
|
def __init__(
|
|
43
43
|
self,
|
|
44
44
|
*,
|
|
45
|
-
openai_client: AsyncOpenAI,
|
|
45
|
+
openai_client: AsyncOpenAI | None,
|
|
46
46
|
model_name: str = "gpt-4o-mini",
|
|
47
47
|
completion_kwargs: dict[str, Any] | None = None,
|
|
48
48
|
**agent_kwargs: Any,
|
|
@@ -171,6 +171,23 @@ class GenericOpenAIChatAgent(MCPAgent):
|
|
|
171
171
|
openai_tools.append(openai_tool)
|
|
172
172
|
return openai_tools
|
|
173
173
|
|
|
174
|
+
async def _invoke_chat_completion(
|
|
175
|
+
self,
|
|
176
|
+
*,
|
|
177
|
+
messages: list[Any],
|
|
178
|
+
tools: list[dict] | None,
|
|
179
|
+
extra: dict[str, Any],
|
|
180
|
+
):
|
|
181
|
+
if self.oai is None:
|
|
182
|
+
raise ValueError("openai_client is required for GenericOpenAIChatAgent")
|
|
183
|
+
# default transport = OpenAI SDK
|
|
184
|
+
return await self.oai.chat.completions.create(
|
|
185
|
+
model=self.model_name,
|
|
186
|
+
messages=messages,
|
|
187
|
+
tools=tools, # already ChatCompletionToolParam-shaped
|
|
188
|
+
**extra,
|
|
189
|
+
)
|
|
190
|
+
|
|
174
191
|
@instrument(
|
|
175
192
|
span_type="agent",
|
|
176
193
|
record_args=False,
|
|
@@ -180,17 +197,14 @@ class GenericOpenAIChatAgent(MCPAgent):
|
|
|
180
197
|
"""Send chat request to OpenAI and convert the response."""
|
|
181
198
|
|
|
182
199
|
# Convert MCP tool schemas to OpenAI format
|
|
183
|
-
|
|
200
|
+
tools = cast("list[ChatCompletionToolParam]", self.get_tool_schemas())
|
|
184
201
|
|
|
185
202
|
protected_keys = {"model", "messages", "tools"}
|
|
186
203
|
extra = {k: v for k, v in (self.completion_kwargs or {}).items() if k not in protected_keys}
|
|
187
204
|
|
|
188
205
|
try:
|
|
189
|
-
response = await self.
|
|
190
|
-
|
|
191
|
-
messages=messages,
|
|
192
|
-
tools=cast("list[ChatCompletionToolParam]", mcp_schemas),
|
|
193
|
-
**extra,
|
|
206
|
+
response = await self._invoke_chat_completion(
|
|
207
|
+
messages=messages, tools=tools, extra=extra
|
|
194
208
|
)
|
|
195
209
|
except Exception as e:
|
|
196
210
|
error_content = f"Error getting response {e}"
|
|
@@ -912,7 +912,7 @@ def eval(
|
|
|
912
912
|
agent: str | None = typer.Argument(
|
|
913
913
|
None,
|
|
914
914
|
help=(
|
|
915
|
-
"Agent backend to use (claude, openai, or
|
|
915
|
+
"Agent backend to use (claude, openai, vllm, or litellm). If not provided, will prompt interactively." # noqa: E501
|
|
916
916
|
),
|
|
917
917
|
),
|
|
918
918
|
full: bool = typer.Option(
|
|
@@ -960,6 +960,12 @@ def eval(
|
|
|
960
960
|
"--verbose",
|
|
961
961
|
help="Enable verbose output from the agent",
|
|
962
962
|
),
|
|
963
|
+
very_verbose: bool = typer.Option(
|
|
964
|
+
False,
|
|
965
|
+
"--very-verbose",
|
|
966
|
+
"-vv",
|
|
967
|
+
help="Enable debug-level logs for maximum visibility",
|
|
968
|
+
),
|
|
963
969
|
vllm_base_url: str | None = typer.Option(
|
|
964
970
|
None,
|
|
965
971
|
"--vllm-base-url",
|
|
@@ -1025,13 +1031,14 @@ def eval(
|
|
|
1025
1031
|
{"name": "Claude 4 Sonnet", "value": "claude"},
|
|
1026
1032
|
{"name": "OpenAI Computer Use", "value": "openai"},
|
|
1027
1033
|
{"name": "vLLM (Local Server)", "value": "vllm"},
|
|
1034
|
+
{"name": "LiteLLM (Multi-provider)", "value": "litellm"},
|
|
1028
1035
|
]
|
|
1029
1036
|
)
|
|
1030
1037
|
|
|
1031
1038
|
agent = hud_console.select("Select an agent to use:", choices=choices, default=0)
|
|
1032
1039
|
|
|
1033
1040
|
# Handle HUD model selection
|
|
1034
|
-
if agent and agent not in ["claude", "openai", "vllm"]:
|
|
1041
|
+
if agent and agent not in ["claude", "openai", "vllm", "litellm"]:
|
|
1035
1042
|
# Find remote model name
|
|
1036
1043
|
model = agent
|
|
1037
1044
|
if not vllm_base_url:
|
|
@@ -1052,7 +1059,7 @@ def eval(
|
|
|
1052
1059
|
hud_console.info(f"Using HUD model: {model} (trained on {base_model})")
|
|
1053
1060
|
|
|
1054
1061
|
# Validate agent choice
|
|
1055
|
-
valid_agents = ["claude", "openai", "vllm"]
|
|
1062
|
+
valid_agents = ["claude", "openai", "vllm", "litellm"]
|
|
1056
1063
|
if agent not in valid_agents:
|
|
1057
1064
|
hud_console.error(f"Invalid agent: {agent}. Must be one of: {', '.join(valid_agents)}")
|
|
1058
1065
|
raise typer.Exit(1)
|
|
@@ -1070,6 +1077,7 @@ def eval(
|
|
|
1070
1077
|
max_workers=max_workers,
|
|
1071
1078
|
max_concurrent_per_worker=max_concurrent_per_worker,
|
|
1072
1079
|
verbose=verbose,
|
|
1080
|
+
very_verbose=very_verbose,
|
|
1073
1081
|
vllm_base_url=vllm_base_url,
|
|
1074
1082
|
group_size=group_size,
|
|
1075
1083
|
)
|
|
@@ -1119,7 +1127,7 @@ def rl(
|
|
|
1119
1127
|
),
|
|
1120
1128
|
model: str | None = typer.Argument(
|
|
1121
1129
|
None,
|
|
1122
|
-
help="Model to train (default: interactive selection)",
|
|
1130
|
+
help="Model to train from https://hud.so/models (default: interactive selection)",
|
|
1123
1131
|
),
|
|
1124
1132
|
config_file: Path | None = typer.Option( # noqa: B008
|
|
1125
1133
|
None,
|
|
@@ -1159,6 +1167,12 @@ def rl(
|
|
|
1159
1167
|
"--ddp-gpus",
|
|
1160
1168
|
help="Specific GPUs for DDP (e.g., '0,1,2,3')",
|
|
1161
1169
|
),
|
|
1170
|
+
yes: bool = typer.Option(
|
|
1171
|
+
False,
|
|
1172
|
+
"--yes",
|
|
1173
|
+
"-y",
|
|
1174
|
+
help="Auto-accept all prompts and use defaults (lazy mode)",
|
|
1175
|
+
),
|
|
1162
1176
|
vllm_gpu: int | None = typer.Option(
|
|
1163
1177
|
None,
|
|
1164
1178
|
"--vllm-gpu",
|
|
@@ -1180,6 +1194,7 @@ def rl(
|
|
|
1180
1194
|
no_ddp=no_ddp,
|
|
1181
1195
|
ddp_gpus=ddp_gpus,
|
|
1182
1196
|
vllm_gpu=vllm_gpu,
|
|
1197
|
+
yes=yes,
|
|
1183
1198
|
)
|
|
1184
1199
|
|
|
1185
1200
|
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
import asyncio
|
|
6
|
+
import contextlib
|
|
6
7
|
import hashlib
|
|
7
8
|
import subprocess
|
|
8
9
|
import time
|
|
@@ -13,6 +14,7 @@ from typing import Any
|
|
|
13
14
|
import typer
|
|
14
15
|
import yaml
|
|
15
16
|
|
|
17
|
+
from hud.cli.utils.source_hash import compute_source_hash, list_source_files
|
|
16
18
|
from hud.clients import MCPClient
|
|
17
19
|
from hud.utils.hud_console import HUDConsole
|
|
18
20
|
from hud.version import __version__ as hud_version
|
|
@@ -341,10 +343,11 @@ def build_environment(
|
|
|
341
343
|
required_env, optional_env = extract_env_vars_from_dockerfile(dockerfile_path)
|
|
342
344
|
|
|
343
345
|
# Merge user-provided env vars with detected ones
|
|
344
|
-
provided_env_vars = {}
|
|
346
|
+
provided_env_vars: dict[str, str] = {}
|
|
345
347
|
missing_required = []
|
|
346
348
|
if env_vars:
|
|
347
|
-
|
|
349
|
+
# Use placeholders in lock file for any provided values to avoid storing secrets
|
|
350
|
+
provided_env_vars = {k: f"${{{k}}}" for k in env_vars}
|
|
348
351
|
# Track which required vars are still missing
|
|
349
352
|
missing_required = [e for e in required_env if e not in env_vars]
|
|
350
353
|
|
|
@@ -384,6 +387,8 @@ def build_environment(
|
|
|
384
387
|
"hudVersion": hud_version,
|
|
385
388
|
"directory": str(env_dir.name),
|
|
386
389
|
"version": new_version, # Internal environment version
|
|
390
|
+
# Fast source fingerprint for change detection
|
|
391
|
+
"sourceHash": compute_source_hash(env_dir),
|
|
387
392
|
},
|
|
388
393
|
"environment": {
|
|
389
394
|
"initializeMs": analysis["initializeMs"],
|
|
@@ -424,6 +429,16 @@ def build_environment(
|
|
|
424
429
|
with open(lock_path, "w") as f:
|
|
425
430
|
yaml.dump(lock_content, f, default_flow_style=False, sort_keys=False)
|
|
426
431
|
|
|
432
|
+
# Also write the file list we hashed for transparency (non-essential)
|
|
433
|
+
with contextlib.suppress(Exception):
|
|
434
|
+
files = [
|
|
435
|
+
str(p.resolve().relative_to(env_dir)).replace("\\", "/")
|
|
436
|
+
for p in list_source_files(env_dir)
|
|
437
|
+
]
|
|
438
|
+
lock_content["build"]["sourceFiles"] = files
|
|
439
|
+
with open(lock_path, "w") as f:
|
|
440
|
+
yaml.dump(lock_content, f, default_flow_style=False, sort_keys=False)
|
|
441
|
+
|
|
427
442
|
hud_console.success("Created lock file: hud.lock.yaml")
|
|
428
443
|
|
|
429
444
|
# Calculate lock file hash
|
|
@@ -530,7 +530,7 @@ async def start_mcp_proxy(
|
|
|
530
530
|
stderr=asyncio.subprocess.DEVNULL,
|
|
531
531
|
)
|
|
532
532
|
await stop_result.communicate()
|
|
533
|
-
hud_console.success("
|
|
533
|
+
hud_console.success("Container stopped successfully")
|
|
534
534
|
container_stopped = True
|
|
535
535
|
except Exception as e:
|
|
536
536
|
hud_console.warning(f"Failed to stop container: {e}")
|