hud-python 0.4.48__tar.gz → 0.4.49__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- {hud_python-0.4.48 → hud_python-0.4.49}/PKG-INFO +1 -1
- {hud_python-0.4.48 → hud_python-0.4.49}/environments/README.md +56 -45
- hud_python-0.4.49/environments/blank/README.md +121 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/environments/blank/environment/README.md +1 -1
- hud_python-0.4.49/environments/blank/environment/pyproject.toml +16 -0
- hud_python-0.4.49/environments/blank/server/README.md +21 -0
- hud_python-0.4.49/environments/blank/server/pyproject.toml +19 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/environments/browser/README.md +24 -25
- hud_python-0.4.49/environments/browser/environment/pyproject.toml +23 -0
- hud_python-0.4.49/environments/browser/server/pyproject.toml +21 -0
- hud_python-0.4.49/environments/deepresearch/README.md +165 -0
- hud_python-0.4.49/environments/deepresearch/environment/pyproject.toml +17 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/environments/deepresearch/pyproject.toml +1 -1
- hud_python-0.4.49/environments/deepresearch/server/pyproject.toml +19 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/agents/base.py +40 -34
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/agents/grounded_openai.py +1 -1
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/__init__.py +78 -213
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/build.py +105 -45
- hud_python-0.4.49/hud/cli/dev.py +699 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/flows/tasks.py +98 -17
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/init.py +18 -14
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/push.py +27 -9
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/rl/local_runner.py +3 -3
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/tests/test_eval.py +168 -119
- hud_python-0.4.49/hud/cli/tests/test_mcp_server.py +36 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/utils/env_check.py +9 -9
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/utils/source_hash.py +1 -1
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/server/__init__.py +2 -1
- hud_python-0.4.49/hud/server/router.py +160 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/server/server.py +246 -79
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/base.py +9 -1
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/utils/hud_console.py +43 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/utils/tests/test_version.py +1 -1
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/version.py +1 -1
- {hud_python-0.4.48 → hud_python-0.4.49}/pyproject.toml +1 -1
- hud_python-0.4.48/environments/blank/README.md +0 -108
- hud_python-0.4.48/environments/blank/controller/README.md +0 -16
- hud_python-0.4.48/environments/blank/pyproject.toml +0 -19
- hud_python-0.4.48/hud/cli/dev.py +0 -828
- hud_python-0.4.48/hud/cli/tests/test_mcp_server.py +0 -125
- {hud_python-0.4.48 → hud_python-0.4.49}/.gitignore +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/LICENSE +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/README.md +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/environments/browser/environment/2048/README.md +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/environments/browser/environment/2048/backend/pyproject.toml +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/environments/browser/environment/README.md +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/environments/browser/environment/todo/README.md +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/environments/browser/environment/todo/backend/pyproject.toml +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/environments/browser/pyproject.toml +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/environments/remote_browser/README.md +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/environments/remote_browser/pyproject.toml +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/environments/remote_browser/src/hud_controller/providers/README.md +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/environments/text_2048/README.md +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/environments/text_2048/pyproject.toml +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/examples/README.md +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/__init__.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/__main__.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/agents/__init__.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/agents/claude.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/agents/langchain.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/agents/lite_llm.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/agents/misc/__init__.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/agents/misc/integration_test_agent.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/agents/misc/response_agent.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/agents/openai.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/agents/openai_chat_generic.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/agents/tests/__init__.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/agents/tests/test_base.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/agents/tests/test_claude.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/agents/tests/test_client.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/agents/tests/test_grounded_openai_agent.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/agents/tests/test_openai.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/__main__.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/analyze.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/clone.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/debug.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/eval.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/flows/__init__.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/get.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/list_func.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/pull.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/remove.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/rl/__init__.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/rl/celebrate.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/rl/config.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/rl/display.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/rl/gpu.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/rl/gpu_utils.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/rl/presets.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/rl/remote_runner.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/rl/rl_api.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/rl/viewer.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/rl/vllm.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/rl/wait_utils.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/tests/__init__.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/tests/test_analyze.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/tests/test_analyze_metadata.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/tests/test_build.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/tests/test_cli_init.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/tests/test_cli_main.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/tests/test_clone.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/tests/test_cursor.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/tests/test_debug.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/tests/test_list_func.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/tests/test_main_module.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/tests/test_pull.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/tests/test_push.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/tests/test_registry.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/tests/test_utils.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/utils/__init__.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/utils/config.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/utils/cursor.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/utils/docker.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/utils/environment.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/utils/interactive.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/utils/local_runner.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/utils/logging.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/utils/metadata.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/utils/package_runner.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/utils/registry.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/utils/remote_runner.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/utils/runner.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/utils/server.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/cli/utils/tasks.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/clients/README.md +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/clients/__init__.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/clients/base.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/clients/fastmcp.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/clients/mcp_use.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/clients/tests/__init__.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/clients/tests/test_client_integration.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/clients/tests/test_fastmcp.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/clients/tests/test_mcp_use_retry.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/clients/tests/test_protocol.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/clients/utils/__init__.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/clients/utils/mcp_use_retry.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/clients/utils/retry.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/clients/utils/retry_transport.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/datasets/__init__.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/datasets/parallel.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/datasets/runner.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/datasets/utils.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/misc/__init__.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/misc/claude_plays_pokemon.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/native/__init__.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/native/comparator.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/native/tests/__init__.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/native/tests/test_comparator.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/native/tests/test_native_init.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/otel/__init__.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/otel/collector.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/otel/config.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/otel/context.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/otel/exporters.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/otel/instrumentation.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/otel/processors.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/otel/tests/__init__.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/otel/tests/test_processors.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/py.typed +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/rl/README.md +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/rl/__init__.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/rl/actor.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/rl/buffer.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/rl/chat_template.jinja +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/rl/config.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/rl/distributed.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/rl/learner.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/rl/tests/__init__.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/rl/tests/test_learner.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/rl/train.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/rl/types.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/rl/utils/start_vllm_server.sh +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/rl/utils.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/rl/vllm_adapter.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/samples/__init__.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/samples/browser.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/server/context.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/server/helper/__init__.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/server/low_level.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/server/tests/__init__.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/server/tests/test_add_tool.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/server/tests/test_context.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/server/tests/test_mcp_server_handlers.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/server/tests/test_mcp_server_integration.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/server/tests/test_mcp_server_more.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/server/tests/test_run_wrapper.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/server/tests/test_server_extra.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/server/tests/test_sigterm_runner.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/settings.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/shared/__init__.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/shared/exceptions.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/shared/hints.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/shared/requests.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/shared/tests/__init__.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/shared/tests/test_exceptions.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/shared/tests/test_requests.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/telemetry/__init__.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/telemetry/instrument.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/telemetry/job.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/telemetry/replay.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/telemetry/tests/__init__.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/telemetry/tests/test_replay.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/telemetry/tests/test_trace.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/telemetry/trace.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/__init__.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/bash.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/computer/__init__.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/computer/anthropic.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/computer/hud.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/computer/openai.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/computer/qwen.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/computer/settings.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/edit.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/executors/__init__.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/executors/base.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/executors/pyautogui.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/executors/tests/__init__.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/executors/tests/test_base_executor.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/executors/xdo.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/grounding/__init__.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/grounding/config.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/grounding/grounded_tool.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/grounding/grounder.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/grounding/tests/__init__.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/grounding/tests/test_grounded_tool.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/playwright.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/response.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/submit.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/tests/__init__.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/tests/test_base.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/tests/test_bash.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/tests/test_bash_extended.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/tests/test_computer.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/tests/test_computer_actions.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/tests/test_edit.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/tests/test_init.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/tests/test_playwright_tool.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/tests/test_response.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/tests/test_tools.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/tests/test_tools_init.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/tests/test_utils.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/types.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/tools/utils.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/types.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/utils/__init__.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/utils/agent_factories.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/utils/async_utils.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/utils/group_eval.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/utils/mcp.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/utils/pretty_errors.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/utils/progress.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/utils/tasks.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/utils/telemetry.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/utils/tests/__init__.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/utils/tests/test_async_utils.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/utils/tests/test_init.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/utils/tests/test_mcp.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/utils/tests/test_progress.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/utils/tests/test_telemetry.py +0 -0
- {hud_python-0.4.48 → hud_python-0.4.49}/hud/utils/tool_shorthand.py +0 -0
|
@@ -156,24 +156,24 @@ For Python-based MCP environments, use this standard structure:
|
|
|
156
156
|
```
|
|
157
157
|
my-environment/
|
|
158
158
|
├── Dockerfile
|
|
159
|
-
├──
|
|
160
|
-
├──
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
159
|
+
├── README.md
|
|
160
|
+
├── server/ # MCP server package
|
|
161
|
+
│ ├── pyproject.toml # MCP dependencies (hud-python, etc.)
|
|
162
|
+
│ ├── __init__.py # Empty package marker
|
|
163
|
+
│ ├── main.py # mcp = MCPServer() + lifecycle hooks
|
|
164
|
+
│ ├── tools.py # router = MCPRouter() + @router.tool decorators
|
|
165
|
+
│ ├── setup/ # Setup router (modular approach)
|
|
166
|
+
│ │ ├── __init__.py
|
|
167
|
+
│ │ ├── basic.py # Basic setup functions
|
|
168
|
+
│ │ └── advanced.py # Advanced setup functions
|
|
169
|
+
│ └── evaluate/ # Evaluate router (modular approach)
|
|
170
|
+
│ ├── __init__.py
|
|
171
|
+
│ ├── checks.py # Basic evaluation checks
|
|
172
|
+
│ └── metrics.py # Advanced metrics evaluators
|
|
173
|
+
└── environment/ # Backend service package
|
|
174
|
+
├── pyproject.toml # Backend dependencies (fastapi, uvicorn)
|
|
175
|
+
├── __init__.py
|
|
176
|
+
└── server.py # FastAPI app with /health, /act, /reset, /state
|
|
177
177
|
```
|
|
178
178
|
|
|
179
179
|
This structure enables:
|
|
@@ -607,51 +607,62 @@ Once all of the above works you can unleash *hundreds* of concurrent agents on y
|
|
|
607
607
|
|
|
608
608
|
## Phase 5 – Hot-Reload Development
|
|
609
609
|
|
|
610
|
-
|
|
610
|
+
For rapid local development, run the MCP server and the environment backend separately. This enables instant code updates without Docker rebuilds.
|
|
611
611
|
|
|
612
|
+
### Development Setup
|
|
613
|
+
|
|
614
|
+
You'll need **two terminal windows** for local development:
|
|
615
|
+
|
|
616
|
+
#### Terminal 1: MCP Server
|
|
612
617
|
```bash
|
|
613
|
-
|
|
614
|
-
|
|
618
|
+
cd environments/my-environment/server
|
|
619
|
+
hud dev # Auto-detects and runs with hot-reload
|
|
615
620
|
|
|
616
|
-
#
|
|
617
|
-
hud dev --
|
|
621
|
+
# Optional flags:
|
|
622
|
+
hud dev --inspector # Launch MCP Inspector
|
|
623
|
+
hud dev --interactive # Launch interactive testing mode
|
|
624
|
+
hud dev --stdio # Use stdio transport (default: HTTP)
|
|
625
|
+
hud dev --watch ../shared # Watch additional directories
|
|
626
|
+
```
|
|
627
|
+
|
|
628
|
+
The `hud dev` command:
|
|
629
|
+
- Auto-detects the MCP module in the current directory
|
|
630
|
+
- Watches for file changes and reloads automatically
|
|
631
|
+
- Runs on HTTP by default (http://localhost:8765/mcp)
|
|
632
|
+
- Can launch MCP Inspector for testing tools
|
|
633
|
+
- Can launch interactive mode for manual testing
|
|
618
634
|
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
# }
|
|
624
|
-
# ✨ Add to Cursor: cursor://anysphere.cursor-deeplink/mcp/install?name=...
|
|
625
|
-
# 🌐 Reloading proxy live, press Ctrl+C to stop
|
|
635
|
+
#### Terminal 2: Environment Server (Backend)
|
|
636
|
+
```bash
|
|
637
|
+
cd environments/my-environment/environment
|
|
638
|
+
uvicorn server:app --reload # Standard uvicorn with hot-reload
|
|
626
639
|
```
|
|
627
640
|
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
641
|
+
For the backend, we simply use `uvicorn` directly since it already provides excellent hot-reload capabilities.
|
|
642
|
+
|
|
643
|
+
### Development Workflow
|
|
644
|
+
|
|
645
|
+
1. Start both servers in separate terminals
|
|
646
|
+
2. Edit code in either `server/` or `environment/` - changes reload automatically
|
|
647
|
+
3. Test changes immediately without rebuilding Docker images
|
|
648
|
+
4. Use MCP Inspector or interactive mode to test tools
|
|
649
|
+
5. When ready, build the complete Docker image: `hud build`
|
|
634
650
|
|
|
635
|
-
|
|
651
|
+
### Quick Cursor Setup
|
|
636
652
|
|
|
637
|
-
|
|
653
|
+
Add to `.cursor/mcp.json` (or use the deeplink from `hud dev` output):
|
|
638
654
|
|
|
639
655
|
```json
|
|
640
656
|
{
|
|
641
657
|
"mcpServers": {
|
|
642
|
-
"
|
|
658
|
+
"my-environment-dev": {
|
|
643
659
|
"url": "http://localhost:8765/mcp"
|
|
644
660
|
}
|
|
645
661
|
}
|
|
646
662
|
}
|
|
647
663
|
```
|
|
648
664
|
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
1. Keep `hud dev` running in one terminal - it automatically handles reloads
|
|
652
|
-
2. Edit your code in `src/` - changes take effect immediately
|
|
653
|
-
3. Test changes in another terminal with `hud analyze` or the interactive mode
|
|
654
|
-
4. Use Cursor/Claude to iterate quickly on your environment
|
|
665
|
+
**Note**: Make sure both the MCP server and the environment backend are running when connecting from Cursor or running agents.
|
|
655
666
|
|
|
656
667
|
### Process Separation for Stateful Environments
|
|
657
668
|
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
# Blank Environment
|
|
2
|
+
|
|
3
|
+
Minimal starter template for building HUD environments.
|
|
4
|
+
See [docs](https://docs.hud.so/build-environments) for the complete environment design workflow.
|
|
5
|
+
|
|
6
|
+
## Architecture
|
|
7
|
+
|
|
8
|
+
**`environment/`** - Produces structured data
|
|
9
|
+
- Owns all state (game logic, browser sessions, databases, etc.)
|
|
10
|
+
- Exposes HTTP endpoints `/health`, `/act`, `/reset`, `/state` that return structured information about the environment state
|
|
11
|
+
|
|
12
|
+
**`server/`** - Wraps data in MCP tools
|
|
13
|
+
- Calls environment endpoints to get structured data for the agent and to perform environment setup/evaluation
|
|
14
|
+
- Agents and tasks interact only with these tools!
|
|
15
|
+
|
|
16
|
+
**Why separate?** Edit tools for the agent or tasks without restarting the heavy environment backend.
|
|
17
|
+
|
|
18
|
+
## Development
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
# Terminal 1 - Environment backend
|
|
22
|
+
cd environment
|
|
23
|
+
uv run uvicorn server:app --reload
|
|
24
|
+
|
|
25
|
+
# Terminal 2 - MCP server
|
|
26
|
+
cd server
|
|
27
|
+
uv run hud dev
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
Uncomment the `setup` tool in `server/tools.py`, save, and watch it reload.
|
|
31
|
+
Visit http://localhost:8765/docs to see the new tool appear instantly.
|
|
32
|
+
|
|
33
|
+
In general, we recommend starting work on the environment backend first, then developing the MCP server to expose the right things to the agent.
|
|
34
|
+
|
|
35
|
+
For complex environments that require many dependencies, we recommend running `hud dev` in the environment root:
|
|
36
|
+
```bash
|
|
37
|
+
cd ..
|
|
38
|
+
hud dev
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Tasks & Evaluation
|
|
42
|
+
```bash
|
|
43
|
+
# Build first from the environment root folder, which contains the Dockerfile (creates blank:0.1.0)
|
|
44
|
+
hud build
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Your `tasks.json` uses `docker run` to launch the environment:
|
|
48
|
+
|
|
49
|
+
```json
|
|
50
|
+
{
|
|
51
|
+
"prompt": "Your task prompt",
|
|
52
|
+
"mcp_config": {
|
|
53
|
+
"local": {
|
|
54
|
+
"command": "docker",
|
|
55
|
+
"args": ["run", "--rm", "-i", "blank:0.1.0"]
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
**Commands:**
|
|
62
|
+
```bash
|
|
63
|
+
# Build first
|
|
64
|
+
hud build
|
|
65
|
+
|
|
66
|
+
# Test task locally
|
|
67
|
+
hud eval tasks.json
|
|
68
|
+
|
|
69
|
+
# Push environment for remote running
|
|
70
|
+
hud push
|
|
71
|
+
|
|
72
|
+
# Production RL training
|
|
73
|
+
hud rl tasks.json # Auto-converts docker→remote, builds & pushes if needed
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Publishing Your Environment
|
|
77
|
+
|
|
78
|
+
Once your environment is ready, you can share it with the community:
|
|
79
|
+
|
|
80
|
+
### 1. Push to Registry
|
|
81
|
+
```bash
|
|
82
|
+
# Build and push your environment (requires a Docker Hub login and a HUD API key)
|
|
83
|
+
hud build
|
|
84
|
+
hud push
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### 2. Create a Dataset
|
|
88
|
+
|
|
89
|
+
Create a dataset on HuggingFace with your tasks:
|
|
90
|
+
|
|
91
|
+
**Option A: Upload manually**
|
|
92
|
+
1. Upload your `tasks.json` to HuggingFace
|
|
93
|
+
2. Make sure it's **public** to appear on leaderboards
|
|
94
|
+
|
|
95
|
+
**Option B: Use the SDK**
|
|
96
|
+
```python
|
|
97
|
+
from hud.datasets import save_tasks
|
|
98
|
+
import json
|
|
99
|
+
|
|
100
|
+
# Load your tasks
|
|
101
|
+
with open("tasks.json") as f:
|
|
102
|
+
tasks = json.load(f)
|
|
103
|
+
|
|
104
|
+
# Push to HuggingFace
|
|
105
|
+
save_tasks(tasks, repo_id="your-org/your-dataset")
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
### 3. Run and Track Performance
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
# Run Claude on your benchmark
|
|
112
|
+
hud eval "your-org/your-dataset" --agent claude
|
|
113
|
+
|
|
114
|
+
# View results at:
|
|
115
|
+
# hud.so/leaderboards/your-org/your-dataset
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
**Note**: Only public HuggingFace datasets appear as leaderboards!
|
|
119
|
+
|
|
120
|
+
📚 Learn more: [Creating Benchmarks](https://docs.hud.so/evaluate-agents/create-benchmarks) | [Leaderboards](https://docs.hud.so/evaluate-agents/leaderboards)
|
|
121
|
+
|
|
@@ -10,7 +10,7 @@ Endpoints (FastAPI)
|
|
|
10
10
|
|
|
11
11
|
Run (dev)
|
|
12
12
|
```bash
|
|
13
|
-
uv run uvicorn
|
|
13
|
+
uv run uvicorn server:app --reload --port 8005
|
|
14
14
|
```
|
|
15
15
|
|
|
16
16
|
Principle: treat like a backend. Keep long‑lived state here; add endpoints as tools need them.
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "blank-environment"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Backend service for blank environment"
|
|
5
|
+
requires-python = ">=3.11"
|
|
6
|
+
dependencies = [
|
|
7
|
+
"fastapi",
|
|
8
|
+
"uvicorn[standard]",
|
|
9
|
+
]
|
|
10
|
+
|
|
11
|
+
[build-system]
|
|
12
|
+
requires = ["hatchling"]
|
|
13
|
+
build-backend = "hatchling.build"
|
|
14
|
+
|
|
15
|
+
[tool.hatch.build.targets.wheel]
|
|
16
|
+
packages = ["."]
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# MCP Server
|
|
2
|
+
|
|
3
|
+
MCP layer that wraps environment data in tools for agent interaction.
|
|
4
|
+
|
|
5
|
+
## Structure
|
|
6
|
+
|
|
7
|
+
- `main.py` - Server initialization, imports routers
|
|
8
|
+
- `tools.py` - MCP tools that call environment HTTP endpoints
|
|
9
|
+
|
|
10
|
+
## Development
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
# Start MCP server with hot-reload
|
|
14
|
+
uv run hud dev
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Key Principles
|
|
18
|
+
|
|
19
|
+
- Keep tools thin - call environment HTTP endpoints
|
|
20
|
+
- Use routers for organization
|
|
21
|
+
- All long-lived state lives in `environment/`, not here
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "blank-server"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "MCP server for blank environment"
|
|
5
|
+
requires-python = ">=3.11"
|
|
6
|
+
dependencies = [
|
|
7
|
+
"hud-python>=0.4.49",
|
|
8
|
+
"httpx>=0.28.1",
|
|
9
|
+
]
|
|
10
|
+
|
|
11
|
+
[build-system]
|
|
12
|
+
requires = ["hatchling"]
|
|
13
|
+
build-backend = "hatchling.build"
|
|
14
|
+
|
|
15
|
+
[tool.hatch.metadata]
|
|
16
|
+
allow-direct-references = true
|
|
17
|
+
|
|
18
|
+
[tool.hatch.build.targets.wheel]
|
|
19
|
+
packages = ["."]
|
|
@@ -1,40 +1,39 @@
|
|
|
1
1
|
# Browser Environment
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Browser automation environment with GUI access for testing web applications. Includes sample apps (2048, Todo) and supports hot-reload development.
|
|
4
4
|
|
|
5
|
-
##
|
|
5
|
+
## Architecture
|
|
6
6
|
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
# 2. Start the environment (optional: with inspector)
|
|
13
|
-
hud dev --build --inspector
|
|
7
|
+
**`environment/`** - Produces structured data
|
|
8
|
+
- FastAPI backend with X11/VNC services (Linux-only)
|
|
9
|
+
- Launches and manages web apps (Next.js frontends + Python backends)
|
|
10
|
+
- Exposes HTTP endpoints for app control and state
|
|
14
11
|
|
|
15
|
-
|
|
12
|
+
**`server/`** - Wraps data in MCP tools
|
|
13
|
+
- Browser automation tools (Playwright, computer vision)
|
|
14
|
+
- Setup tools (launch apps, seed data)
|
|
15
|
+
- Evaluation tools (check game state, todo completion)
|
|
16
16
|
|
|
17
|
-
|
|
18
|
-
hud eval tasks.json --agent claude
|
|
17
|
+
**Why separate?** The environment backend requires X11/VNC/Chromium (Docker-only). The MCP server tools can be edited with hot-reload, while the heavy environment stays running.
|
|
19
18
|
|
|
20
|
-
|
|
21
|
-
# Requires installation:
|
|
22
|
-
pip install hud-python[agents]
|
|
23
|
-
|
|
24
|
-
# Option C: Simple Python script (runs all tasks from tasks.json)
|
|
25
|
-
python test_task.py
|
|
26
|
-
```
|
|
19
|
+
## Development
|
|
27
20
|
|
|
28
|
-
|
|
21
|
+
This environment **requires Docker** due to X11/VNC dependencies.
|
|
29
22
|
|
|
30
|
-
|
|
23
|
+
```bash
|
|
24
|
+
# Build first (creates hud-browser:0.1.0)
|
|
25
|
+
hud build
|
|
31
26
|
|
|
32
|
-
|
|
33
|
-
|
|
27
|
+
# Start with hot-reload
|
|
28
|
+
hud dev
|
|
29
|
+
```
|
|
34
30
|
|
|
35
|
-
|
|
31
|
+
When you run `hud dev` in an environment with a Dockerfile, it automatically:
|
|
32
|
+
- Detects Docker mode is needed
|
|
33
|
+
- Mounts `server/` and `environment/` as volumes
|
|
34
|
+
- Enables hot-reload for both layers
|
|
36
35
|
|
|
37
|
-
|
|
36
|
+
Edit files in `server/` or `environment/` and they reload inside the container!
|
|
38
37
|
|
|
39
38
|
## Publishing Your Environment
|
|
40
39
|
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "hud-browser-environment"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "HUD Browser Environment Backend"
|
|
5
|
+
requires-python = ">=3.11,<3.14"
|
|
6
|
+
dependencies = [
|
|
7
|
+
"fastapi>=0.104.1",
|
|
8
|
+
"uvicorn[standard]>=0.24.0",
|
|
9
|
+
"python-multipart>=0.0.6",
|
|
10
|
+
"pydantic>=2.6,<3",
|
|
11
|
+
"pydantic-settings>=2.2,<3",
|
|
12
|
+
"httpx",
|
|
13
|
+
]
|
|
14
|
+
|
|
15
|
+
[build-system]
|
|
16
|
+
requires = ["hatchling"]
|
|
17
|
+
build-backend = "hatchling.build"
|
|
18
|
+
|
|
19
|
+
[tool.hatch.metadata]
|
|
20
|
+
allow-direct-references = true
|
|
21
|
+
|
|
22
|
+
[tool.hatch.build.targets.wheel]
|
|
23
|
+
packages = ["environment"]
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "hud-browser-server"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "HUD Browser MCP Server"
|
|
5
|
+
requires-python = ">=3.11,<3.14"
|
|
6
|
+
dependencies = [
|
|
7
|
+
"hud-python@git+https://github.com/hud-evals/hud-python@cli-dev",
|
|
8
|
+
"httpx",
|
|
9
|
+
"playwright",
|
|
10
|
+
"pyautogui",
|
|
11
|
+
]
|
|
12
|
+
|
|
13
|
+
[build-system]
|
|
14
|
+
requires = ["hatchling"]
|
|
15
|
+
build-backend = "hatchling.build"
|
|
16
|
+
|
|
17
|
+
[tool.hatch.metadata]
|
|
18
|
+
allow-direct-references = true
|
|
19
|
+
|
|
20
|
+
[tool.hatch.build.targets.wheel]
|
|
21
|
+
packages = ["server"]
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
# Deep Research Environment
|
|
2
|
+
|
|
3
|
+
Web research environment powered by Exa API for searching and fetching content.
|
|
4
|
+
See [docs](https://docs.hud.so/build-environments) for the complete environment design workflow.
|
|
5
|
+
|
|
6
|
+
## Architecture
|
|
7
|
+
|
|
8
|
+
**`environment/`** - Manages Exa API integration and state
|
|
9
|
+
- Holds the Exa API key server-side
|
|
10
|
+
- Exposes HTTP endpoints `/search`, `/fetch`, `/answer`, `/evaluate` for research workflows
|
|
11
|
+
- Implements exponential backoff for rate limiting
|
|
12
|
+
|
|
13
|
+
**`server/`** - Wraps data in MCP tools
|
|
14
|
+
- Provides `search()`, `fetch()`, `answer()`, `evaluate()` tools for agents
|
|
15
|
+
- Agents and tasks interact only with these tools
|
|
16
|
+
|
|
17
|
+
**Why separate?** Edit tools for the agent or tasks without restarting the environment backend.
|
|
18
|
+
|
|
19
|
+
## Tools
|
|
20
|
+
|
|
21
|
+
- **`search(query: str)`** - Search the web using Exa API, returns list of results with titles and URLs
|
|
22
|
+
- **`fetch(url: str)`** - Fetch full content from a URL, returns summary, highlights, and text
|
|
23
|
+
- **`answer(final_answer: str)`** - Submit the final research answer
|
|
24
|
+
- **`evaluate(expected_answer: str)`** - Evaluate submitted answer against expected result
|
|
25
|
+
|
|
26
|
+
## Setup
|
|
27
|
+
|
|
28
|
+
### Requirements
|
|
29
|
+
- Exa API key (get one at [exa.ai](https://exa.ai))
|
|
30
|
+
|
|
31
|
+
### Environment Variables
|
|
32
|
+
```bash
|
|
33
|
+
export EXA_API_KEY="your_exa_api_key_here"
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Development
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
# Terminal 1 - Environment backend
|
|
40
|
+
cd environment
|
|
41
|
+
export EXA_API_KEY="your_key"
|
|
42
|
+
uv run uvicorn server:app --reload
|
|
43
|
+
|
|
44
|
+
# Terminal 2 - MCP server
|
|
45
|
+
cd server
|
|
46
|
+
uv run hud dev
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
The environment includes exponential backoff for rate limiting, so API calls will automatically retry on 429 errors.
|
|
50
|
+
|
|
51
|
+
In general, we recommend starting work on the environment backend first, then developing the MCP server to expose the functionality the agent needs.
|
|
52
|
+
|
|
53
|
+
For complex environments that require many dependencies, we recommend running `hud dev` in the environment root:
|
|
54
|
+
```bash
|
|
55
|
+
cd ..
|
|
56
|
+
export EXA_API_KEY="your_key"
|
|
57
|
+
hud dev
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Tasks & Evaluation
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
# Build first from the environment root folder that contains the Dockerfile (creates deepresearch:0.1.0)
|
|
64
|
+
hud build
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Your `tasks.json` uses `docker run` to launch the environment:
|
|
68
|
+
|
|
69
|
+
```json
|
|
70
|
+
{
|
|
71
|
+
"prompt": "Research and answer: What is the capital of France?",
|
|
72
|
+
"mcp_config": {
|
|
73
|
+
"local": {
|
|
74
|
+
"command": "docker",
|
|
75
|
+
"args": ["run", "--rm", "-i", "-e", "EXA_API_KEY", "deepresearch:0.1.0"]
|
|
76
|
+
}
|
|
77
|
+
},
|
|
78
|
+
"evaluator": {
|
|
79
|
+
"tool_name": "evaluate",
|
|
80
|
+
"tool_params": {
|
|
81
|
+
"expected_answer": "Paris"
|
|
82
|
+
}
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
**Note:** The `-e EXA_API_KEY` flag passes your local API key to the container.
|
|
88
|
+
|
|
89
|
+
**Commands:**
|
|
90
|
+
```bash
|
|
91
|
+
# Build first
|
|
92
|
+
hud build
|
|
93
|
+
|
|
94
|
+
# Test task locally
|
|
95
|
+
export EXA_API_KEY="your_key"
|
|
96
|
+
hud eval tasks.json
|
|
97
|
+
|
|
98
|
+
# Push environment for remote running
|
|
99
|
+
hud push
|
|
100
|
+
|
|
101
|
+
# Production RL training
|
|
102
|
+
hud rl tasks.json # Auto-converts docker→remote, builds & pushes if needed
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
## Publishing Your Environment
|
|
106
|
+
|
|
107
|
+
Once your environment is ready, you can share it with the community:
|
|
108
|
+
|
|
109
|
+
### 1. Push to Registry
|
|
110
|
+
```bash
|
|
111
|
+
# Build and push your environment (requires a Docker Hub login and a HUD API key)
|
|
112
|
+
hud build
|
|
113
|
+
hud push
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
### 2. Create a Dataset
|
|
117
|
+
|
|
118
|
+
Create a dataset on HuggingFace with your tasks:
|
|
119
|
+
|
|
120
|
+
**Option A: Upload manually**
|
|
121
|
+
1. Upload your `tasks.json` to HuggingFace
|
|
122
|
+
2. Make sure it's **public** to appear on leaderboards
|
|
123
|
+
|
|
124
|
+
**Option B: Use the SDK**
|
|
125
|
+
```python
|
|
126
|
+
from hud.datasets import save_tasks
|
|
127
|
+
import json
|
|
128
|
+
|
|
129
|
+
# Load your tasks
|
|
130
|
+
with open("tasks.json") as f:
|
|
131
|
+
tasks = json.load(f)
|
|
132
|
+
|
|
133
|
+
# Push to HuggingFace
|
|
134
|
+
save_tasks(tasks, repo_id="your-org/your-dataset")
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
### 3. Run and Track Performance
|
|
138
|
+
|
|
139
|
+
```bash
|
|
140
|
+
# Run Claude on your benchmark
|
|
141
|
+
hud eval "your-org/your-dataset" --agent claude
|
|
142
|
+
|
|
143
|
+
# View results at:
|
|
144
|
+
# hud.so/leaderboards/your-org/your-dataset
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
**Note**: Only public HuggingFace datasets appear as leaderboards!
|
|
148
|
+
|
|
149
|
+
📚 Learn more: [Creating Benchmarks](https://docs.hud.so/evaluate-agents/create-benchmarks) | [Leaderboards](https://docs.hud.so/evaluate-agents/leaderboards)
|
|
150
|
+
|
|
151
|
+
## Example Research Workflow
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
# Agent searches for information
|
|
155
|
+
results = search("latest AI developments 2024")
|
|
156
|
+
|
|
157
|
+
# Agent fetches detailed content from top result
|
|
158
|
+
content = fetch(results[0]["url"])
|
|
159
|
+
|
|
160
|
+
# Agent submits final answer
|
|
161
|
+
answer("Based on research, AI developments in 2024 include...")
|
|
162
|
+
|
|
163
|
+
# Evaluate answer
|
|
164
|
+
result = evaluate(expected_answer="AI developments")
|
|
165
|
+
```
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "deepresearch-environment"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Backend service for DeepResearch environment"
|
|
5
|
+
requires-python = ">=3.11"
|
|
6
|
+
dependencies = [
|
|
7
|
+
"fastapi>=0.104.1",
|
|
8
|
+
"uvicorn[standard]>=0.24.0",
|
|
9
|
+
"httpx>=0.24.0",
|
|
10
|
+
]
|
|
11
|
+
|
|
12
|
+
[build-system]
|
|
13
|
+
requires = ["hatchling"]
|
|
14
|
+
build-backend = "hatchling.build"
|
|
15
|
+
|
|
16
|
+
[tool.hatch.build.targets.wheel]
|
|
17
|
+
packages = ["environment"]
|
|
@@ -3,7 +3,7 @@ name = "deepresearch"
|
|
|
3
3
|
version = "0.1.0"
|
|
4
4
|
description = "DeepResearch HUD environment with HTTP backend (EXA on server)"
|
|
5
5
|
requires-python = ">=3.11"
|
|
6
|
-
dependencies = [ "hud-python==0.4.
|
|
6
|
+
dependencies = [ "hud-python==0.4.42", "fastapi>=0.104.1", "uvicorn[standard]>=0.24.0", "httpx>=0.24.0",]
|
|
7
7
|
|
|
8
8
|
[build-system]
|
|
9
9
|
requires = [ "hatchling",]
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "deepresearch-mcp"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "MCP server for DeepResearch environment"
|
|
5
|
+
requires-python = ">=3.11"
|
|
6
|
+
dependencies = [
|
|
7
|
+
"hud-python>=0.4.49",
|
|
8
|
+
"httpx>=0.24.0",
|
|
9
|
+
]
|
|
10
|
+
|
|
11
|
+
[build-system]
|
|
12
|
+
requires = ["hatchling"]
|
|
13
|
+
build-backend = "hatchling.build"
|
|
14
|
+
|
|
15
|
+
[tool.hatch.metadata]
|
|
16
|
+
allow-direct-references = true
|
|
17
|
+
|
|
18
|
+
[tool.hatch.build.targets.wheel]
|
|
19
|
+
packages = ["mcp"]
|