hud-python 0.4.56__tar.gz → 0.4.58__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- {hud_python-0.4.56 → hud_python-0.4.58}/PKG-INFO +1 -1
- {hud_python-0.4.56 → hud_python-0.4.58}/environments/README.md +5 -5
- hud_python-0.4.58/environments/browser/browser-base/README.md +58 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/environments/browser/server/pyproject.toml +1 -1
- hud_python-0.4.58/environments/rubrics/README.md +239 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/environments/rubrics/environment/pyproject.toml +3 -2
- {hud_python-0.4.56 → hud_python-0.4.58}/environments/rubrics/pyproject.toml +1 -1
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/__init__.py +20 -7
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/dev.py +135 -5
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/eval.py +2 -2
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/flows/dev.py +10 -19
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/init.py +14 -18
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/push.py +2 -2
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/rl/__init__.py +1 -1
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/rl/celebrate.py +1 -1
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/rl/remote_runner.py +3 -3
- hud_python-0.4.58/hud/cli/tests/test_convert.py +367 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/utils/version_check.py +7 -6
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/clients/base.py +29 -3
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/clients/fastmcp.py +3 -3
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/clients/mcp_use.py +2 -2
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/clients/tests/test_protocol.py +9 -3
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/otel/config.py +1 -1
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/otel/context.py +2 -2
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/server/server.py +306 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/shared/hints.py +3 -3
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/telemetry/job.py +2 -2
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/playwright.py +8 -1
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/types.py +1 -1
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/utils/tests/test_version.py +1 -1
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/version.py +1 -1
- {hud_python-0.4.56 → hud_python-0.4.58}/pyproject.toml +1 -1
- hud_python-0.4.56/environments/rubrics/README.md +0 -182
- {hud_python-0.4.56 → hud_python-0.4.58}/.gitignore +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/LICENSE +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/README.md +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/environments/blank/README.md +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/environments/blank/environment/README.md +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/environments/blank/environment/pyproject.toml +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/environments/blank/server/README.md +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/environments/blank/server/pyproject.toml +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/environments/browser/README.md +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/environments/browser/environment/2048/README.md +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/environments/browser/environment/2048/backend/pyproject.toml +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/environments/browser/environment/README.md +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/environments/browser/environment/pyproject.toml +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/environments/browser/environment/todo/README.md +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/environments/browser/environment/todo/backend/pyproject.toml +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/environments/browser/pyproject.toml +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/environments/deepresearch/README.md +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/environments/deepresearch/environment/pyproject.toml +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/environments/deepresearch/pyproject.toml +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/environments/deepresearch/server/pyproject.toml +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/environments/remote_browser/README.md +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/environments/remote_browser/pyproject.toml +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/environments/remote_browser/src/hud_controller/providers/README.md +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/environments/rubrics/server/pyproject.toml +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/environments/text_2048/README.md +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/environments/text_2048/pyproject.toml +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/examples/README.md +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/__init__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/__main__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/agents/__init__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/agents/base.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/agents/claude.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/agents/grounded_openai.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/agents/langchain.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/agents/lite_llm.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/agents/misc/__init__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/agents/misc/integration_test_agent.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/agents/misc/response_agent.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/agents/openai.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/agents/openai_chat_generic.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/agents/tests/__init__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/agents/tests/test_base.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/agents/tests/test_base_runtime.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/agents/tests/test_claude.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/agents/tests/test_client.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/agents/tests/test_grounded_openai_agent.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/agents/tests/test_openai.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/agents/utils.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/__main__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/analyze.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/build.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/clone.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/debug.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/flows/__init__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/flows/tasks.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/get.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/list_func.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/pull.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/remove.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/rl/config.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/rl/display.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/rl/gpu.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/rl/gpu_utils.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/rl/local_runner.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/rl/presets.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/rl/rl_api.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/rl/viewer.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/rl/vllm.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/rl/wait_utils.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/tests/__init__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/tests/test_analyze.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/tests/test_analyze_metadata.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/tests/test_analyze_module.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/tests/test_build.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/tests/test_build_failure.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/tests/test_build_module.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/tests/test_cli_init.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/tests/test_cli_main.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/tests/test_cli_more_wrappers.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/tests/test_cli_root.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/tests/test_clone.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/tests/test_cursor.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/tests/test_debug.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/tests/test_eval.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/tests/test_list_func.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/tests/test_main_module.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/tests/test_mcp_server.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/tests/test_pull.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/tests/test_push.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/tests/test_push_happy.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/tests/test_push_wrapper.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/tests/test_registry.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/tests/test_utils.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/utils/__init__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/utils/config.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/utils/cursor.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/utils/docker.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/utils/env_check.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/utils/environment.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/utils/interactive.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/utils/local_runner.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/utils/logging.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/utils/metadata.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/utils/package_runner.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/utils/registry.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/utils/remote_runner.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/utils/runner.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/utils/server.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/utils/source_hash.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/utils/tasks.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/utils/tests/__init__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/utils/tests/test_config.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/utils/tests/test_docker.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/utils/tests/test_docker_hints.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/utils/tests/test_env_check.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/utils/tests/test_environment.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/utils/tests/test_interactive_module.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/utils/tests/test_local_runner.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/utils/tests/test_logging_utils.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/utils/tests/test_metadata.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/utils/tests/test_package_runner.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/utils/tests/test_registry_utils.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/utils/tests/test_remote_runner.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/utils/tests/test_runner_modules.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/utils/tests/test_source_hash.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/cli/utils/tests/test_tasks.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/clients/README.md +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/clients/__init__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/clients/tests/__init__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/clients/tests/test_client_integration.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/clients/tests/test_fastmcp.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/clients/tests/test_mcp_use_retry.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/clients/utils/__init__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/clients/utils/mcp_use_retry.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/clients/utils/retry.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/clients/utils/retry_transport.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/datasets/__init__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/datasets/parallel.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/datasets/runner.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/datasets/tests/__init__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/datasets/tests/test_runner.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/datasets/tests/test_utils.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/datasets/utils.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/misc/__init__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/misc/claude_plays_pokemon.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/native/__init__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/native/comparator.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/native/tests/__init__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/native/tests/test_comparator.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/native/tests/test_native_init.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/otel/__init__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/otel/collector.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/otel/exporters.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/otel/instrumentation.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/otel/processors.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/otel/tests/__init__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/otel/tests/test_instrumentation.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/otel/tests/test_processors.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/py.typed +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/rl/README.md +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/rl/__init__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/rl/actor.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/rl/buffer.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/rl/chat_template.jinja +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/rl/config.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/rl/distributed.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/rl/learner.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/rl/tests/__init__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/rl/tests/test_learner.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/rl/train.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/rl/types.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/rl/utils/start_vllm_server.sh +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/rl/utils.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/rl/vllm_adapter.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/samples/__init__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/samples/browser.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/server/__init__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/server/context.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/server/helper/__init__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/server/low_level.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/server/router.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/server/tests/__init__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/server/tests/test_add_tool.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/server/tests/test_context.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/server/tests/test_mcp_server_handlers.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/server/tests/test_mcp_server_integration.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/server/tests/test_mcp_server_more.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/server/tests/test_run_wrapper.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/server/tests/test_server_extra.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/server/tests/test_sigterm_runner.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/settings.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/shared/__init__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/shared/exceptions.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/shared/requests.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/shared/tests/__init__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/shared/tests/test_exceptions.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/shared/tests/test_hints.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/shared/tests/test_requests.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/telemetry/__init__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/telemetry/async_context.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/telemetry/instrument.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/telemetry/replay.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/telemetry/tests/__init__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/telemetry/tests/test_async_context.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/telemetry/tests/test_instrument.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/telemetry/tests/test_job.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/telemetry/tests/test_replay.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/telemetry/tests/test_trace.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/telemetry/trace.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/__init__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/base.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/bash.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/computer/__init__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/computer/anthropic.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/computer/hud.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/computer/openai.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/computer/qwen.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/computer/settings.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/edit.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/executors/__init__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/executors/base.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/executors/pyautogui.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/executors/tests/__init__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/executors/tests/test_base_executor.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/executors/xdo.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/grounding/__init__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/grounding/config.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/grounding/grounded_tool.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/grounding/grounder.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/grounding/tests/__init__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/grounding/tests/test_grounded_tool.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/response.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/submit.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/tests/__init__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/tests/test_base.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/tests/test_bash.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/tests/test_bash_extended.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/tests/test_computer.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/tests/test_computer_actions.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/tests/test_edit.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/tests/test_init.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/tests/test_playwright_tool.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/tests/test_response.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/tests/test_submit.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/tests/test_tools.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/tests/test_tools_init.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/tests/test_types.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/tests/test_utils.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/types.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/tools/utils.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/utils/__init__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/utils/agent_factories.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/utils/async_utils.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/utils/group_eval.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/utils/hud_console.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/utils/mcp.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/utils/pretty_errors.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/utils/progress.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/utils/task_tracking.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/utils/tasks.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/utils/telemetry.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/utils/tests/__init__.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/utils/tests/test_agent_factories.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/utils/tests/test_async_utils.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/utils/tests/test_init.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/utils/tests/test_mcp.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/utils/tests/test_pretty_errors.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/utils/tests/test_progress.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/utils/tests/test_tasks.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/utils/tests/test_telemetry.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/utils/tests/test_tool_shorthand.py +0 -0
- {hud_python-0.4.56 → hud_python-0.4.58}/hud/utils/tool_shorthand.py +0 -0
|
@@ -495,7 +495,7 @@ from hud.agents import ClaudeAgent
|
|
|
495
495
|
from hud.clients import MCPClient
|
|
496
496
|
|
|
497
497
|
async def main():
|
|
498
|
-
# `trace` captures *everything* that happens and sends it to hud.
|
|
498
|
+
# `trace` captures *everything* that happens and sends it to hud.ai
|
|
499
499
|
with hud.trace("local_test"):
|
|
500
500
|
task = Task(
|
|
501
501
|
prompt="Complete the task",
|
|
@@ -524,7 +524,7 @@ async def main():
|
|
|
524
524
|
asyncio.run(main())
|
|
525
525
|
```
|
|
526
526
|
|
|
527
|
-
The `trace` context manager sends a full timeline of agent actions, tool calls, and rewards to hud.
|
|
527
|
+
The `trace` context manager sends a full timeline of agent actions, tool calls, and rewards to hud.ai – perfect for debugging.
|
|
528
528
|
|
|
529
529
|
See `examples/01_hello_2048.py` and `examples/task_with_setup_eval.py` for larger end-to-end demos.
|
|
530
530
|
|
|
@@ -532,7 +532,7 @@ See `examples/01_hello_2048.py` and `examples/task_with_setup_eval.py` for large
|
|
|
532
532
|
|
|
533
533
|
## Phase 4 – Remote Deployment & HUD Runner
|
|
534
534
|
|
|
535
|
-
**Goal →** the exact same image runs in parallel on hundreds of instances, and exposes more telemetry so the hud.
|
|
535
|
+
**Goal →** the exact same image runs in parallel on hundreds of instances, and exposes more telemetry so that hud.ai can visualise the whole lifecycle.
|
|
536
536
|
|
|
537
537
|
### 1. Publish your image
|
|
538
538
|
|
|
@@ -595,11 +595,11 @@ async def initialize_environment(session=None, progress_token=None):
|
|
|
595
595
|
await send(100, "ready")
|
|
596
596
|
```
|
|
597
597
|
|
|
598
|
-
Those messages are displayed live on hud.
|
|
598
|
+
Those messages are displayed live on hud.ai alongside resource graphs – perfect feedback while you wait.
|
|
599
599
|
|
|
600
600
|
### 4. Live telemetry (`telemetry://live`) (Optional)
|
|
601
601
|
|
|
602
|
-
Expose a resource named `telemetry://live` exactly like in `environments/browser/src/hud_controller/server.py` to return live url to be displayed on hud.
|
|
602
|
+
Expose a resource named `telemetry://live`, exactly as in `environments/browser/src/hud_controller/server.py`, to return a live URL to be displayed on hud.ai.
|
|
603
603
|
|
|
604
604
|
Once all of the above works you can unleash *hundreds* of concurrent agents on your new environment.
|
|
605
605
|
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# Browser Base Image
|
|
2
|
+
|
|
3
|
+
Base Docker image for browser environments with Playwright, Chromium, and VNC support.
|
|
4
|
+
|
|
5
|
+
## Build
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
docker build -t browser-base:latest .
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Test with VNC Access
|
|
12
|
+
|
|
13
|
+
### 1. Start the container
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
docker run -it --rm \
|
|
17
|
+
-p 6080:6080 \
|
|
18
|
+
-p 5900:5900 \
|
|
19
|
+
-e DISPLAY=:1 \
|
|
20
|
+
browser-base:latest \
|
|
21
|
+
bash
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
### 2. Inside the container, start display servers
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
Xvfb :1 -screen 0 1920x1080x24 > /dev/null 2>&1 &
|
|
28
|
+
x11vnc -display :1 -nopw -listen 0.0.0.0 -forever > /dev/null 2>&1 &
|
|
29
|
+
/usr/share/novnc/utils/novnc_proxy --vnc localhost:5900 --listen 6080 > /dev/null 2>&1 &
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
### 3. Test Playwright
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
python3 -c "
|
|
36
|
+
from playwright.sync_api import sync_playwright
|
|
37
|
+
with sync_playwright() as p:
|
|
38
|
+
browser = p.chromium.launch(headless=False)
|
|
39
|
+
page = browser.new_page()
|
|
40
|
+
page.goto('https://example.com')
|
|
41
|
+
print('Title:', page.title())
|
|
42
|
+
input('Press Enter to close...')
|
|
43
|
+
browser.close()
|
|
44
|
+
"
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
### 4. View in browser
|
|
48
|
+
|
|
49
|
+
Open `http://localhost:6080/vnc.html` to see Chromium running.
|
|
50
|
+
|
|
51
|
+
## What's Included
|
|
52
|
+
|
|
53
|
+
- Ubuntu 24.04
|
|
54
|
+
- Desktop environment (Xvfb, x11vnc, noVNC, xfce4)
|
|
55
|
+
- Node.js & npm
|
|
56
|
+
- Python 3 with uv package manager
|
|
57
|
+
- Playwright with Chromium
|
|
58
|
+
- Development tools (git, curl, wget, etc.)
|
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
# SEC EDGAR Rubrics Environment
|
|
2
|
+
|
|
3
|
+
SEC filing research environment powered by the SEC EDGAR database for accessing company filings and financial data, with rubric-based evaluation for structured grading provided by [The LLM Data Company](https://llmdata.com).
|
|
4
|
+
|
|
5
|
+
See [docs](https://docs.hud.so/build-environments) for the complete environment design workflow.
|
|
6
|
+
|
|
7
|
+
## Architecture
|
|
8
|
+
|
|
9
|
+
**`environment/`** - Manages SEC EDGAR and web search integration
|
|
10
|
+
- Uses the edgartools Python library to access SEC filing data
|
|
11
|
+
- Integrates with Exa API for supplementary web search capabilities
|
|
12
|
+
- Exposes HTTP endpoints for research workflows with exponential backoff for rate limiting
|
|
13
|
+
|
|
14
|
+
**`server/`** - Wraps data in MCP tools
|
|
15
|
+
- Provides research tools for agents to access SEC filings, financial data, and web search
|
|
16
|
+
- Agents and tasks interact only with these tools
|
|
17
|
+
|
|
18
|
+
**Why separate?** Edit tools for the agent or tasks without restarting the environment backend.
|
|
19
|
+
|
|
20
|
+
## Tools
|
|
21
|
+
|
|
22
|
+
### SEC EDGAR Tools
|
|
23
|
+
- **`setup()`** - Initialize the environment and reset state.
|
|
24
|
+
- **`search_company(query: str)`** - Search for a company by ticker symbol or name. Returns company information including ticker, name, and CIK.
|
|
25
|
+
- **`get_filings(ticker?: str, form_type?: str, limit?: int, cutoff_date?: str)`** - Get SEC filings. When `ticker` is provided, returns company-specific filings. Otherwise, returns global recent filings. Can filter by form type (e.g., "10-K", "10-Q", "8-K"), limit results, and filter by date (YYYY-MM-DD).
|
|
26
|
+
- **`get_filing_content(filing_url: str)`** - Fetch the full text content of a specific SEC filing from its URL.
|
|
27
|
+
- **`get_financial_data(ticker: str, accession_number: str)`** - Extract financial statements and key metrics from a 10-K or 10-Q filing. Returns income statement, balance sheet, cash flow, and other financial data.
|
|
28
|
+
- **`get_segment_data(ticker: str, accession_number: str)`** - Extract segment-level financial data from a 10-K or 10-Q filing for companies with multiple business segments.
|
|
29
|
+
- **`get_filing_sections(ticker: str, accession_number: str)`** - Extract specific sections from a 10-K or 10-Q filing (e.g., Business, Risk Factors, MD&A).
|
|
30
|
+
|
|
31
|
+
### Web Search Tools
|
|
32
|
+
- **`web_search(query: str)`** - Search the web using Exa API. Returns titles and URLs of relevant results.
|
|
33
|
+
- **`web_fetch(url: str)`** - Fetch and extract content from a web URL. Returns summary, highlights, and full content.
|
|
34
|
+
|
|
35
|
+
### Evaluation Tools
|
|
36
|
+
- **`answer(final_answer: str)`** - Submit the final research answer.
|
|
37
|
+
- **`evaluate(rubric: list[dict])`** - Evaluate submitted answer using a structured rubric with weighted requirements.
|
|
38
|
+
|
|
39
|
+
### Rubric-Based Evaluation
|
|
40
|
+
|
|
41
|
+
The `evaluate` tool uses The LLM Data Company's [rubric](https://github.com/The-LLM-Data-Company/rubric/) package to grade answers against structured criteria with autograders.
|
|
42
|
+
|
|
43
|
+
## Setup
|
|
44
|
+
|
|
45
|
+
### Environment Variables
|
|
46
|
+
|
|
47
|
+
The environment requires several API keys and configuration:
|
|
48
|
+
|
|
49
|
+
**Required:**
|
|
50
|
+
- `EDGAR_IDENTITY` - Your identity for SEC EDGAR access (required by SEC regulations)
|
|
51
|
+
- Format: `"Your Name your.email@example.com"`
|
|
52
|
+
|
|
53
|
+
**Optional:**
|
|
54
|
+
- `EXA_API_KEY` - For web search and content fetching capabilities (if using web_search/web_fetch tools)
|
|
55
|
+
- `HUD_API_KEY` - For HUD telemetry and tracing
|
|
56
|
+
- `ANTHROPIC_API_KEY` - For Claude agent (if using Claude)
|
|
57
|
+
- `OPENAI_API_KEY` - For rubric evaluation (if using OpenAI-based autograders)
|
|
58
|
+
|
|
59
|
+
Add these to your `.env` file (or export them in your shell) before running `hud eval`:
|
|
60
|
+
```bash
|
|
61
|
+
export EDGAR_IDENTITY="Your Name your.email@example.com"
|
|
62
|
+
export EXA_API_KEY="your-exa-key" # optional, for web search
|
|
63
|
+
export ANTHROPIC_API_KEY="your-anthropic-key" # only if using an Anthropic model
|
|
64
|
+
export OPENAI_API_KEY="your-openai-key"
|
|
65
|
+
# Optional
|
|
66
|
+
export HUD_API_KEY="your-hud-key"
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Development
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
# Terminal 1 - Environment backend
|
|
73
|
+
cd environment
|
|
74
|
+
export EDGAR_IDENTITY="Your Name your.email@example.com"
|
|
75
|
+
export EXA_API_KEY="your-exa-key" # optional, for web search
|
|
76
|
+
uv run uvicorn server:app --reload
|
|
77
|
+
|
|
78
|
+
# Terminal 2 - MCP server
|
|
79
|
+
cd server
|
|
80
|
+
uv run hud dev
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
The environment includes exponential backoff for rate limiting, so API calls will automatically retry on 429 errors.
|
|
84
|
+
|
|
85
|
+
In general, we recommend starting work on the environment backend first, then developing the MCP server to expose the right tools to the agent.
|
|
86
|
+
|
|
87
|
+
For complex environments that require many dependencies, we recommend running `hud dev` in the environment root:
|
|
88
|
+
```bash
|
|
89
|
+
cd ..
|
|
90
|
+
hud dev
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## Tasks & Evaluation
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
# Build first from the environment root folder that contains the Dockerfile (creates rubrics:latest)
|
|
97
|
+
hud build
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
Your `tasks.json` uses `docker run` to launch the environment:
|
|
101
|
+
|
|
102
|
+
```json
|
|
103
|
+
{
|
|
104
|
+
"prompt": "Analyze Tesla's FY2024 10-K filing...",
|
|
105
|
+
"mcp_config": {
|
|
106
|
+
"local": {
|
|
107
|
+
"command": "docker",
|
|
108
|
+
"args": ["run", "--rm", "-i", "rubrics:latest"]
|
|
109
|
+
}
|
|
110
|
+
},
|
|
111
|
+
"evaluate_tool": {
|
|
112
|
+
"name": "evaluate",
|
|
113
|
+
"arguments": {
|
|
114
|
+
"rubric": [...]
|
|
115
|
+
}
|
|
116
|
+
}
|
|
117
|
+
}
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
**Note:** Export environment variables before running. The Docker container will inherit them from your shell.
|
|
121
|
+
|
|
122
|
+
**Commands:**
|
|
123
|
+
```bash
|
|
124
|
+
# Build first
|
|
125
|
+
hud build
|
|
126
|
+
|
|
127
|
+
# Test task locally
|
|
128
|
+
export EDGAR_IDENTITY="Your Name your.email@example.com"
|
|
129
|
+
export EXA_API_KEY="your-exa-key" # optional, for web search
|
|
130
|
+
export ANTHROPIC_API_KEY="your-anthropic-key"
|
|
131
|
+
export OPENAI_API_KEY="your-openai-key"
|
|
132
|
+
hud eval tasks.json --max-steps 25
|
|
133
|
+
|
|
134
|
+
# Push environment for remote running
|
|
135
|
+
hud push
|
|
136
|
+
|
|
137
|
+
# Production RL training
|
|
138
|
+
hud rl tasks.json # Auto-converts docker→remote, builds & pushes if needed
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
## Publishing Your Environment
|
|
142
|
+
|
|
143
|
+
Once your environment is ready, you can share it with the community:
|
|
144
|
+
|
|
145
|
+
### 1. Push to Registry
|
|
146
|
+
```bash
|
|
147
|
+
# Build and push your environment (requires a Docker Hub login and a HUD API key)
|
|
148
|
+
hud build
|
|
149
|
+
hud push
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
### 2. Create a Dataset
|
|
153
|
+
|
|
154
|
+
Create a dataset on HuggingFace with your tasks:
|
|
155
|
+
|
|
156
|
+
**Option A: Upload manually**
|
|
157
|
+
1. Upload your `tasks.json` to HuggingFace
|
|
158
|
+
2. Make sure it's **public** to appear on leaderboards
|
|
159
|
+
|
|
160
|
+
**Option B: Use the SDK**
|
|
161
|
+
```python
|
|
162
|
+
from hud.datasets import save_tasks
|
|
163
|
+
import json
|
|
164
|
+
|
|
165
|
+
# Load your tasks
|
|
166
|
+
with open("tasks.json") as f:
|
|
167
|
+
tasks = json.load(f)
|
|
168
|
+
|
|
169
|
+
# Push to HuggingFace
|
|
170
|
+
save_tasks(tasks, repo_id="your-org/your-dataset")
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
### 3. Run and Track Performance
|
|
174
|
+
|
|
175
|
+
```bash
|
|
176
|
+
# Run Claude on your benchmark
|
|
177
|
+
hud eval "your-org/your-dataset" --agent claude
|
|
178
|
+
|
|
179
|
+
# View results at:
|
|
180
|
+
# hud.so/leaderboards/your-org/your-dataset
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
**Note**: Only public HuggingFace datasets appear as leaderboards!
|
|
184
|
+
|
|
185
|
+
📚 Learn more: [Creating Benchmarks](https://docs.hud.so/evaluate-agents/create-benchmarks) | [Leaderboards](https://docs.hud.so/evaluate-agents/leaderboards)
|
|
186
|
+
|
|
187
|
+
## Example Research Workflow
|
|
188
|
+
|
|
189
|
+
```python
|
|
190
|
+
# Initialize environment
|
|
191
|
+
setup()
|
|
192
|
+
|
|
193
|
+
# Agent searches for a company
|
|
194
|
+
company_info = search_company("TSLA")
|
|
195
|
+
# Returns: [{"ticker": "TSLA", "name": "Tesla Inc", "cik": "1318605"}]
|
|
196
|
+
|
|
197
|
+
# Agent gets recent filings
|
|
198
|
+
filings = get_filings(ticker="TSLA", form_type="10-K", limit=1)
|
|
199
|
+
# Returns: [{"filing_date": "2024-01-01", "form_type": "10-K", "accession_number": "...", "filing_url": "..."}]
|
|
200
|
+
|
|
201
|
+
# Agent extracts financial data
|
|
202
|
+
financial_data = get_financial_data(ticker="TSLA", accession_number=filings[0]["accession_number"])
|
|
203
|
+
# Returns: {"has_financials": True, "financial_data": {...income statement, balance sheet, etc...}}
|
|
204
|
+
|
|
205
|
+
# Agent gets specific sections from the filing
|
|
206
|
+
sections = get_filing_sections(ticker="TSLA", accession_number=filings[0]["accession_number"])
|
|
207
|
+
# Returns: {"sections": {"business": "...", "risk_factors": "...", "mda": "..."}}
|
|
208
|
+
|
|
209
|
+
# Agent uses web search for additional context
|
|
210
|
+
search_results = web_search("Tesla FY2024 revenue analysis")
|
|
211
|
+
# Returns: [{"title": "...", "url": "..."}]
|
|
212
|
+
|
|
213
|
+
# Agent fetches web content
|
|
214
|
+
web_content = web_fetch(search_results[0]["url"])
|
|
215
|
+
# Returns: "=== SUMMARY ===\n...\n=== KEY HIGHLIGHTS ===\n...\n=== FULL CONTENT ===\n..."
|
|
216
|
+
|
|
217
|
+
# Agent submits final answer
|
|
218
|
+
answer("Based on Tesla's FY2024 10-K, revenue was $96.8B...")
|
|
219
|
+
|
|
220
|
+
# Evaluate answer using rubric
|
|
221
|
+
result = evaluate(rubric=[
|
|
222
|
+
{"requirement": "Correctly states FY2024 revenue", "weight": 15},
|
|
223
|
+
{"requirement": "Provides segment breakdown", "weight": 5},
|
|
224
|
+
])
|
|
225
|
+
# Returns: {"reward": float, "info": {"report": [...]}, "done": True}
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
## Dependencies
|
|
229
|
+
|
|
230
|
+
- **edgartools**: Python library for accessing SEC EDGAR data
|
|
231
|
+
- **fastapi**: Web framework for the environment server
|
|
232
|
+
- **httpx**: HTTP client for API calls
|
|
233
|
+
- **rubric**: The LLM Data Company's rubric evaluation package
|
|
234
|
+
- **Exa API**: Web search and content extraction (optional, for web_search/web_fetch tools)
|
|
235
|
+
|
|
236
|
+
## Acknowledgments
|
|
237
|
+
|
|
238
|
+
* [EdgarTools](https://github.com/dgunning/edgartools) - Python library to access SEC EDGAR
|
|
239
|
+
* [SEC EDGAR MCP](https://github.com/stefanoamorelli/sec-edgar-mcp) - Rich OSS SEC MCP server
|
|
@@ -1,13 +1,14 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "rubrics-environment"
|
|
3
3
|
version = "0.1.0"
|
|
4
|
-
description = "Backend service for Rubrics environment"
|
|
4
|
+
description = "Backend service for Rubrics environment with SEC EDGAR integration"
|
|
5
5
|
requires-python = ">=3.11"
|
|
6
6
|
dependencies = [
|
|
7
7
|
"fastapi>=0.104.1",
|
|
8
8
|
"uvicorn[standard]>=0.24.0",
|
|
9
9
|
"httpx>=0.24.0",
|
|
10
|
-
"rubric
|
|
10
|
+
"rubric==1.1.8",
|
|
11
|
+
"edgartools>=4.21.3",
|
|
11
12
|
]
|
|
12
13
|
|
|
13
14
|
[build-system]
|
|
@@ -253,10 +253,23 @@ def debug(
|
|
|
253
253
|
else:
|
|
254
254
|
# Assume it's an image name
|
|
255
255
|
image = first_param
|
|
256
|
-
from .utils.docker import
|
|
256
|
+
from .utils.docker import create_docker_run_command
|
|
257
|
+
|
|
258
|
+
# For image mode, check if there's a .env file in current directory
|
|
259
|
+
# and use it if available (similar to hud dev behavior)
|
|
260
|
+
cwd = Path.cwd()
|
|
261
|
+
if (cwd / ".env").exists():
|
|
262
|
+
# Use create_docker_run_command to load .env from current directory
|
|
263
|
+
command = create_docker_run_command(
|
|
264
|
+
image,
|
|
265
|
+
docker_args=docker_args,
|
|
266
|
+
env_dir=cwd, # Load .env from current directory
|
|
267
|
+
)
|
|
268
|
+
else:
|
|
269
|
+
# No .env file, use basic command without env loading
|
|
270
|
+
from .utils.docker import build_run_command
|
|
257
271
|
|
|
258
|
-
|
|
259
|
-
command = build_run_command(image, docker_args)
|
|
272
|
+
command = build_run_command(image, docker_args)
|
|
260
273
|
else:
|
|
261
274
|
console.print(
|
|
262
275
|
"[red]Error: Must specify a directory, Docker image, --config, or --cursor[/red]"
|
|
@@ -741,14 +754,14 @@ def remove(
|
|
|
741
754
|
|
|
742
755
|
@app.command()
|
|
743
756
|
def init(
|
|
744
|
-
name: str = typer.Argument(None, help="Environment name (default:
|
|
757
|
+
name: str = typer.Argument(None, help="Environment name (default: chosen preset name)"),
|
|
745
758
|
preset: str | None = typer.Option(
|
|
746
759
|
None,
|
|
747
760
|
"--preset",
|
|
748
761
|
"-p",
|
|
749
762
|
help="Preset to use: blank, deep-research, browser, rubrics. If omitted, you'll choose interactively.", # noqa: E501
|
|
750
763
|
),
|
|
751
|
-
directory: str = typer.Option(".", "--dir", "-d", help="
|
|
764
|
+
directory: str = typer.Option(".", "--dir", "-d", help="Parent directory for the environment"),
|
|
752
765
|
force: bool = typer.Option(False, "--force", "-f", help="Overwrite existing files"),
|
|
753
766
|
) -> None:
|
|
754
767
|
"""🚀 Initialize a new HUD environment with minimal boilerplate.
|
|
@@ -760,8 +773,8 @@ def init(
|
|
|
760
773
|
- Required setup/evaluate tools
|
|
761
774
|
|
|
762
775
|
Examples:
|
|
763
|
-
hud init #
|
|
764
|
-
hud init my-env # Create
|
|
776
|
+
hud init # Choose preset interactively, create ./preset-name/
|
|
777
|
+
hud init my-env # Create new directory ./my-env/
|
|
765
778
|
hud init my-env --dir /tmp # Create in /tmp/my-env/
|
|
766
779
|
"""
|
|
767
780
|
create_environment(name, directory, force, preset)
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
import asyncio
|
|
6
|
+
import contextlib
|
|
6
7
|
import importlib
|
|
7
8
|
import importlib.util
|
|
8
9
|
import logging
|
|
@@ -13,6 +14,8 @@ import threading
|
|
|
13
14
|
from pathlib import Path
|
|
14
15
|
from typing import Any
|
|
15
16
|
|
|
17
|
+
import typer
|
|
18
|
+
|
|
16
19
|
from hud.utils.hud_console import HUDConsole
|
|
17
20
|
|
|
18
21
|
hud_console = HUDConsole()
|
|
@@ -26,6 +29,7 @@ def show_dev_server_info(
|
|
|
26
29
|
interactive: bool,
|
|
27
30
|
env_dir: Path | None = None,
|
|
28
31
|
new: bool = False,
|
|
32
|
+
docker_mode: bool = False,
|
|
29
33
|
) -> str:
|
|
30
34
|
"""Show consistent server info for both Python and Docker modes.
|
|
31
35
|
|
|
@@ -54,7 +58,15 @@ def show_dev_server_info(
|
|
|
54
58
|
if transport == "http":
|
|
55
59
|
hud_console.section_title("Quick Links")
|
|
56
60
|
hud_console.info(f"{hud_console.sym.ITEM} Docs: http://localhost:{port}/docs")
|
|
57
|
-
hud_console.info(f"{hud_console.sym.ITEM} Cursor:
|
|
61
|
+
hud_console.info(f"{hud_console.sym.ITEM} Cursor:")
|
|
62
|
+
# Display the Cursor link on its own line to prevent wrapping
|
|
63
|
+
hud_console.link(cursor_deeplink)
|
|
64
|
+
|
|
65
|
+
# Show eval endpoint if in Docker mode
|
|
66
|
+
if docker_mode:
|
|
67
|
+
hud_console.info(
|
|
68
|
+
f"{hud_console.sym.ITEM} Eval API: http://localhost:{port}/eval (POST)"
|
|
69
|
+
)
|
|
58
70
|
|
|
59
71
|
# Check for VNC (browser environment)
|
|
60
72
|
if env_dir and (env_dir / "environment" / "server.py").exists():
|
|
@@ -237,7 +249,7 @@ async def run_mcp_module(
|
|
|
237
249
|
|
|
238
250
|
from hud.cli.flows.dev import create_dynamic_trace
|
|
239
251
|
|
|
240
|
-
live_trace_url = await create_dynamic_trace(
|
|
252
|
+
_, live_trace_url = await create_dynamic_trace(
|
|
241
253
|
mcp_config=local_mcp_config,
|
|
242
254
|
build_status=False,
|
|
243
255
|
environment_name=mcp_server.name or "mcp-server",
|
|
@@ -510,6 +522,9 @@ def run_docker_dev_server(
|
|
|
510
522
|
new: bool = False,
|
|
511
523
|
) -> None:
|
|
512
524
|
"""Run MCP server in Docker with volume mounts, expose via local HTTP proxy."""
|
|
525
|
+
import atexit
|
|
526
|
+
import signal
|
|
527
|
+
|
|
513
528
|
import typer
|
|
514
529
|
import yaml
|
|
515
530
|
|
|
@@ -522,6 +537,69 @@ def run_docker_dev_server(
|
|
|
522
537
|
|
|
523
538
|
cwd = Path.cwd()
|
|
524
539
|
|
|
540
|
+
# Container name will be set later and used for cleanup
|
|
541
|
+
container_name: str | None = None
|
|
542
|
+
cleanup_done = False
|
|
543
|
+
|
|
544
|
+
def cleanup_container() -> None:
|
|
545
|
+
"""Clean up Docker container on exit."""
|
|
546
|
+
nonlocal cleanup_done
|
|
547
|
+
if cleanup_done or not container_name:
|
|
548
|
+
return
|
|
549
|
+
|
|
550
|
+
cleanup_done = True
|
|
551
|
+
hud_console.debug(f"Cleaning up container: {container_name}")
|
|
552
|
+
|
|
553
|
+
# Check if container is still running
|
|
554
|
+
try:
|
|
555
|
+
result = subprocess.run( # noqa: S603
|
|
556
|
+
["docker", "ps", "-q", "-f", f"name={container_name}"], # noqa: S607
|
|
557
|
+
stdout=subprocess.PIPE,
|
|
558
|
+
stderr=subprocess.DEVNULL,
|
|
559
|
+
text=True,
|
|
560
|
+
timeout=5,
|
|
561
|
+
)
|
|
562
|
+
if not result.stdout.strip():
|
|
563
|
+
# Container is not running, just try to remove it
|
|
564
|
+
subprocess.run( # noqa: S603
|
|
565
|
+
["docker", "rm", "-f", container_name], # noqa: S607
|
|
566
|
+
stdout=subprocess.DEVNULL,
|
|
567
|
+
stderr=subprocess.DEVNULL,
|
|
568
|
+
timeout=5,
|
|
569
|
+
)
|
|
570
|
+
return
|
|
571
|
+
except Exception: # noqa: S110
|
|
572
|
+
pass
|
|
573
|
+
|
|
574
|
+
try:
|
|
575
|
+
# First try to stop gracefully
|
|
576
|
+
subprocess.run( # noqa: S603
|
|
577
|
+
["docker", "stop", container_name], # noqa: S607
|
|
578
|
+
stdout=subprocess.DEVNULL,
|
|
579
|
+
stderr=subprocess.DEVNULL,
|
|
580
|
+
timeout=10,
|
|
581
|
+
)
|
|
582
|
+
hud_console.debug(f"Container {container_name} stopped successfully")
|
|
583
|
+
except subprocess.TimeoutExpired:
|
|
584
|
+
# Force kill if stop times out
|
|
585
|
+
hud_console.debug(f"Container {container_name} stop timeout, forcing kill")
|
|
586
|
+
with contextlib.suppress(Exception):
|
|
587
|
+
subprocess.run( # noqa: S603
|
|
588
|
+
["docker", "kill", container_name], # noqa: S607
|
|
589
|
+
stdout=subprocess.DEVNULL,
|
|
590
|
+
stderr=subprocess.DEVNULL,
|
|
591
|
+
timeout=5,
|
|
592
|
+
)
|
|
593
|
+
|
|
594
|
+
# Set up signal handlers for cleanup
|
|
595
|
+
def signal_handler(signum: int, frame: Any) -> None:
|
|
596
|
+
cleanup_container()
|
|
597
|
+
sys.exit(0)
|
|
598
|
+
|
|
599
|
+
signal.signal(signal.SIGTERM, signal_handler)
|
|
600
|
+
if sys.platform != "win32":
|
|
601
|
+
signal.signal(signal.SIGHUP, signal_handler)
|
|
602
|
+
|
|
525
603
|
# Find environment directory (current or parent with hud.lock.yaml)
|
|
526
604
|
env_dir = cwd
|
|
527
605
|
lock_path = env_dir / "hud.lock.yaml"
|
|
@@ -562,10 +640,14 @@ def run_docker_dev_server(
|
|
|
562
640
|
base_name = image_name.replace(":", "-").replace("/", "-")
|
|
563
641
|
container_name = f"{base_name}-dev-{pid}"
|
|
564
642
|
|
|
643
|
+
# Register cleanup function with atexit
|
|
644
|
+
atexit.register(cleanup_container)
|
|
645
|
+
|
|
565
646
|
# Build docker run command with volume mounts and folder-mode envs
|
|
566
647
|
from .utils.docker import create_docker_run_command
|
|
567
648
|
|
|
568
649
|
base_args = [
|
|
650
|
+
"--rm", # Automatically remove container when it stops
|
|
569
651
|
"--name",
|
|
570
652
|
container_name,
|
|
571
653
|
"-v",
|
|
@@ -608,7 +690,7 @@ def run_docker_dev_server(
|
|
|
608
690
|
"headers": {},
|
|
609
691
|
}
|
|
610
692
|
}
|
|
611
|
-
live_trace_url = _asy.run(
|
|
693
|
+
_, live_trace_url = _asy.run(
|
|
612
694
|
create_dynamic_trace(
|
|
613
695
|
mcp_config=local_mcp_config,
|
|
614
696
|
build_status=True,
|
|
@@ -643,6 +725,7 @@ def run_docker_dev_server(
|
|
|
643
725
|
interactive=interactive,
|
|
644
726
|
env_dir=env_dir,
|
|
645
727
|
new=new,
|
|
728
|
+
docker_mode=True,
|
|
646
729
|
)
|
|
647
730
|
hud_console.dim_info(
|
|
648
731
|
"",
|
|
@@ -661,13 +744,38 @@ def run_docker_dev_server(
|
|
|
661
744
|
# Create and run proxy with HUD helpers
|
|
662
745
|
async def run_proxy() -> None:
|
|
663
746
|
from fastmcp import FastMCP
|
|
747
|
+
from fastmcp.server.proxy import ProxyClient
|
|
748
|
+
|
|
749
|
+
# Create ProxyClient without custom log handler since we capture Docker logs directly
|
|
750
|
+
proxy_client = ProxyClient(mcp_config, name="HUD Docker Dev Proxy")
|
|
751
|
+
|
|
752
|
+
# Extract container name from docker args and store for logs endpoint
|
|
753
|
+
docker_cmd = mcp_config["docker"]["args"]
|
|
754
|
+
container_name = None
|
|
755
|
+
for i, arg in enumerate(docker_cmd):
|
|
756
|
+
if arg == "--name" and i + 1 < len(docker_cmd):
|
|
757
|
+
container_name = docker_cmd[i + 1]
|
|
758
|
+
break
|
|
664
759
|
|
|
665
|
-
|
|
666
|
-
|
|
760
|
+
if container_name:
|
|
761
|
+
# Store container name for logs endpoint to use
|
|
762
|
+
os.environ["_HUD_DEV_DOCKER_CONTAINER"] = container_name
|
|
763
|
+
hud_console.debug(f"Docker container: {container_name}")
|
|
764
|
+
|
|
765
|
+
# Store the docker mcp_config for the eval endpoint
|
|
766
|
+
import json
|
|
767
|
+
|
|
768
|
+
os.environ["_HUD_DEV_DOCKER_MCP_CONFIG"] = json.dumps(mcp_config)
|
|
769
|
+
|
|
770
|
+
# Create FastMCP proxy using the ProxyClient
|
|
771
|
+
fastmcp_proxy = FastMCP.as_proxy(proxy_client)
|
|
667
772
|
|
|
668
773
|
# Wrap in MCPServer to get /docs and REST wrappers
|
|
669
774
|
proxy = MCPServer(name="HUD Docker Dev Proxy")
|
|
670
775
|
|
|
776
|
+
# Enable logs endpoint on HTTP server
|
|
777
|
+
os.environ["_HUD_DEV_LOGS_PROVIDER"] = "enabled"
|
|
778
|
+
|
|
671
779
|
# Import all tools from the FastMCP proxy
|
|
672
780
|
await proxy.import_server(fastmcp_proxy)
|
|
673
781
|
|
|
@@ -693,7 +801,15 @@ def run_docker_dev_server(
|
|
|
693
801
|
asyncio.run(run_proxy())
|
|
694
802
|
except KeyboardInterrupt:
|
|
695
803
|
hud_console.info("\n\nStopping...")
|
|
804
|
+
cleanup_container()
|
|
696
805
|
raise typer.Exit(0) from None
|
|
806
|
+
except Exception:
|
|
807
|
+
# Ensure cleanup happens on any exception
|
|
808
|
+
cleanup_container()
|
|
809
|
+
raise
|
|
810
|
+
finally:
|
|
811
|
+
# Final cleanup attempt
|
|
812
|
+
cleanup_container()
|
|
697
813
|
|
|
698
814
|
|
|
699
815
|
def run_mcp_dev_server(
|
|
@@ -712,6 +828,20 @@ def run_mcp_dev_server(
|
|
|
712
828
|
docker_args = docker_args or []
|
|
713
829
|
cwd = Path.cwd()
|
|
714
830
|
|
|
831
|
+
# Find an available port if not using stdio transport
|
|
832
|
+
if not stdio:
|
|
833
|
+
from hud.cli.utils.logging import find_free_port
|
|
834
|
+
|
|
835
|
+
actual_port = find_free_port(port)
|
|
836
|
+
if actual_port is None:
|
|
837
|
+
hud_console.error(f"No available ports found starting from {port}")
|
|
838
|
+
raise typer.Exit(1)
|
|
839
|
+
|
|
840
|
+
if actual_port != port:
|
|
841
|
+
hud_console.info(f"Port {port} is in use, using port {actual_port} instead")
|
|
842
|
+
|
|
843
|
+
port = actual_port
|
|
844
|
+
|
|
715
845
|
# Auto-detect Docker mode if Dockerfile present and no module specified
|
|
716
846
|
if not docker and module is None and should_use_docker_mode(cwd):
|
|
717
847
|
hud_console.note("Detected Dockerfile - using Docker mode with volume mounts")
|