hud-python 0.4.54__tar.gz → 0.4.56__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- {hud_python-0.4.54 → hud_python-0.4.56}/PKG-INFO +1 -1
- hud_python-0.4.56/environments/rubrics/README.md +182 -0
- hud_python-0.4.56/environments/rubrics/environment/pyproject.toml +18 -0
- hud_python-0.4.56/environments/rubrics/pyproject.toml +19 -0
- hud_python-0.4.56/environments/rubrics/server/pyproject.toml +19 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/agents/base.py +8 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/agents/claude.py +4 -3
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/agents/openai.py +2 -1
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/agents/openai_chat_generic.py +3 -2
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/agents/tests/test_claude.py +2 -2
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/agents/tests/test_openai.py +1 -1
- hud_python-0.4.56/hud/agents/utils.py +50 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/__init__.py +52 -1
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/build.py +185 -25
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/dev.py +129 -39
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/eval.py +99 -1
- hud_python-0.4.56/hud/cli/flows/dev.py +155 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/flows/tasks.py +29 -9
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/init.py +3 -1
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/docker.py +6 -3
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/clients/base.py +2 -2
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/otel/context.py +42 -1
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/server/server.py +29 -3
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/settings.py +6 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/telemetry/async_context.py +16 -2
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/telemetry/trace.py +6 -1
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/group_eval.py +14 -2
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/tests/test_agent_factories.py +2 -1
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/tests/test_version.py +1 -1
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/version.py +1 -1
- {hud_python-0.4.54 → hud_python-0.4.56}/pyproject.toml +1 -1
- {hud_python-0.4.54 → hud_python-0.4.56}/.gitignore +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/LICENSE +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/README.md +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/environments/README.md +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/environments/blank/README.md +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/environments/blank/environment/README.md +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/environments/blank/environment/pyproject.toml +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/environments/blank/server/README.md +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/environments/blank/server/pyproject.toml +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/environments/browser/README.md +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/environments/browser/environment/2048/README.md +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/environments/browser/environment/2048/backend/pyproject.toml +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/environments/browser/environment/README.md +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/environments/browser/environment/pyproject.toml +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/environments/browser/environment/todo/README.md +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/environments/browser/environment/todo/backend/pyproject.toml +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/environments/browser/pyproject.toml +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/environments/browser/server/pyproject.toml +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/environments/deepresearch/README.md +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/environments/deepresearch/environment/pyproject.toml +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/environments/deepresearch/pyproject.toml +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/environments/deepresearch/server/pyproject.toml +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/environments/remote_browser/README.md +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/environments/remote_browser/pyproject.toml +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/environments/remote_browser/src/hud_controller/providers/README.md +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/environments/text_2048/README.md +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/environments/text_2048/pyproject.toml +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/examples/README.md +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/__init__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/__main__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/agents/__init__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/agents/grounded_openai.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/agents/langchain.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/agents/lite_llm.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/agents/misc/__init__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/agents/misc/integration_test_agent.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/agents/misc/response_agent.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/agents/tests/__init__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/agents/tests/test_base.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/agents/tests/test_base_runtime.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/agents/tests/test_client.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/agents/tests/test_grounded_openai_agent.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/__main__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/analyze.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/clone.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/debug.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/flows/__init__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/get.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/list_func.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/pull.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/push.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/remove.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/rl/__init__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/rl/celebrate.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/rl/config.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/rl/display.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/rl/gpu.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/rl/gpu_utils.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/rl/local_runner.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/rl/presets.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/rl/remote_runner.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/rl/rl_api.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/rl/viewer.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/rl/vllm.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/rl/wait_utils.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/__init__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_analyze.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_analyze_metadata.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_analyze_module.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_build.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_build_failure.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_build_module.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_cli_init.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_cli_main.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_cli_more_wrappers.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_cli_root.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_clone.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_cursor.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_debug.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_eval.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_list_func.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_main_module.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_mcp_server.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_pull.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_push.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_push_happy.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_push_wrapper.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_registry.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_utils.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/__init__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/config.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/cursor.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/env_check.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/environment.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/interactive.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/local_runner.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/logging.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/metadata.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/package_runner.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/registry.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/remote_runner.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/runner.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/server.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/source_hash.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/tasks.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/tests/__init__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/tests/test_config.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/tests/test_docker.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/tests/test_docker_hints.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/tests/test_env_check.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/tests/test_environment.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/tests/test_interactive_module.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/tests/test_local_runner.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/tests/test_logging_utils.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/tests/test_metadata.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/tests/test_package_runner.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/tests/test_registry_utils.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/tests/test_remote_runner.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/tests/test_runner_modules.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/tests/test_source_hash.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/tests/test_tasks.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/version_check.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/clients/README.md +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/clients/__init__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/clients/fastmcp.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/clients/mcp_use.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/clients/tests/__init__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/clients/tests/test_client_integration.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/clients/tests/test_fastmcp.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/clients/tests/test_mcp_use_retry.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/clients/tests/test_protocol.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/clients/utils/__init__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/clients/utils/mcp_use_retry.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/clients/utils/retry.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/clients/utils/retry_transport.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/datasets/__init__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/datasets/parallel.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/datasets/runner.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/datasets/tests/__init__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/datasets/tests/test_runner.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/datasets/tests/test_utils.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/datasets/utils.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/misc/__init__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/misc/claude_plays_pokemon.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/native/__init__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/native/comparator.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/native/tests/__init__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/native/tests/test_comparator.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/native/tests/test_native_init.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/otel/__init__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/otel/collector.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/otel/config.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/otel/exporters.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/otel/instrumentation.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/otel/processors.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/otel/tests/__init__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/otel/tests/test_instrumentation.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/otel/tests/test_processors.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/py.typed +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/rl/README.md +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/rl/__init__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/rl/actor.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/rl/buffer.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/rl/chat_template.jinja +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/rl/config.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/rl/distributed.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/rl/learner.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/rl/tests/__init__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/rl/tests/test_learner.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/rl/train.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/rl/types.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/rl/utils/start_vllm_server.sh +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/rl/utils.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/rl/vllm_adapter.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/samples/__init__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/samples/browser.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/server/__init__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/server/context.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/server/helper/__init__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/server/low_level.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/server/router.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/server/tests/__init__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/server/tests/test_add_tool.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/server/tests/test_context.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/server/tests/test_mcp_server_handlers.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/server/tests/test_mcp_server_integration.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/server/tests/test_mcp_server_more.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/server/tests/test_run_wrapper.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/server/tests/test_server_extra.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/server/tests/test_sigterm_runner.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/shared/__init__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/shared/exceptions.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/shared/hints.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/shared/requests.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/shared/tests/__init__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/shared/tests/test_exceptions.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/shared/tests/test_hints.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/shared/tests/test_requests.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/telemetry/__init__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/telemetry/instrument.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/telemetry/job.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/telemetry/replay.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/telemetry/tests/__init__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/telemetry/tests/test_async_context.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/telemetry/tests/test_instrument.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/telemetry/tests/test_job.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/telemetry/tests/test_replay.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/telemetry/tests/test_trace.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/__init__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/base.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/bash.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/computer/__init__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/computer/anthropic.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/computer/hud.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/computer/openai.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/computer/qwen.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/computer/settings.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/edit.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/executors/__init__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/executors/base.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/executors/pyautogui.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/executors/tests/__init__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/executors/tests/test_base_executor.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/executors/xdo.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/grounding/__init__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/grounding/config.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/grounding/grounded_tool.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/grounding/grounder.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/grounding/tests/__init__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/grounding/tests/test_grounded_tool.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/playwright.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/response.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/submit.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/tests/__init__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/tests/test_base.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/tests/test_bash.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/tests/test_bash_extended.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/tests/test_computer.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/tests/test_computer_actions.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/tests/test_edit.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/tests/test_init.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/tests/test_playwright_tool.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/tests/test_response.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/tests/test_submit.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/tests/test_tools.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/tests/test_tools_init.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/tests/test_types.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/tests/test_utils.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/types.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/utils.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/types.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/__init__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/agent_factories.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/async_utils.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/hud_console.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/mcp.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/pretty_errors.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/progress.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/task_tracking.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/tasks.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/telemetry.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/tests/__init__.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/tests/test_async_utils.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/tests/test_init.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/tests/test_mcp.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/tests/test_pretty_errors.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/tests/test_progress.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/tests/test_tasks.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/tests/test_telemetry.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/tests/test_tool_shorthand.py +0 -0
- {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/tool_shorthand.py +0 -0
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
# Rubrics Environment
|
|
2
|
+
|
|
3
|
+
Web research environment powered by Exa API for searching and fetching content, with rubric-based evaluation for structured grading.
|
|
4
|
+
See [docs](https://docs.hud.so/build-environments) for the complete environment design workflow.
|
|
5
|
+
|
|
6
|
+
## Architecture
|
|
7
|
+
|
|
8
|
+
**`environment/`** - Manages Exa API integration and state
|
|
9
|
+
- Holds the Exa API key server-side
|
|
10
|
+
- Exposes HTTP endpoints `/search`, `/fetch`, `/answer`, `/evaluate` for research workflows
|
|
11
|
+
- Implements exponential backoff for rate limiting
|
|
12
|
+
|
|
13
|
+
**`server/`** - Wraps data in MCP tools
|
|
14
|
+
- Provides `search()`, `fetch()`, `answer()`, `evaluate()` tools for agents
|
|
15
|
+
- Agents and tasks interact only with these tools
|
|
16
|
+
|
|
17
|
+
**Why separate?** Keeping the MCP server apart from the backend lets you edit the agent-facing tools or tasks without restarting the environment backend.
|
|
18
|
+
|
|
19
|
+
## Tools
|
|
20
|
+
|
|
21
|
+
- **`search(query: str)`** - Search the web using Exa API, returns list of results with titles and URLs
|
|
22
|
+
- **`fetch(url: str)`** - Fetch full content from a URL, returns summary, highlights, and text
|
|
23
|
+
- **`answer(final_answer: str)`** - Submit the final research answer
|
|
24
|
+
- **`evaluate(rubric: list[dict])`** - Evaluate submitted answer using a structured rubric with weighted requirements
|
|
25
|
+
|
|
26
|
+
### Rubric-Based Evaluation
|
|
27
|
+
|
|
28
|
+
The `evaluate` tool uses The LLM Data Company's [rubric](https://github.com/The-LLM-Data-Company/rubric/) package to grade answers against structured criteria with autograders.
|
|
29
|
+
|
|
30
|
+
## Setup
|
|
31
|
+
|
|
32
|
+
### Requirements
|
|
33
|
+
- Exa API key (get one at [exa.ai](https://exa.ai))
|
|
34
|
+
|
|
35
|
+
### Environment Variables
|
|
36
|
+
```bash
|
|
37
|
+
export EXA_API_KEY="your_exa_api_key_here"
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Development
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
# Terminal 1 - Environment backend
|
|
44
|
+
cd environment
|
|
45
|
+
export EXA_API_KEY="your_key"
|
|
46
|
+
uv run uvicorn server:app --reload
|
|
47
|
+
|
|
48
|
+
# Terminal 2 - MCP server
|
|
49
|
+
cd server
|
|
50
|
+
uv run hud dev
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
The environment includes exponential backoff for rate limiting, so API calls will automatically retry on 429 errors.
|
|
54
|
+
|
|
55
|
+
In general, we recommend starting with the environment backend, then developing the MCP server to expose the appropriate tools to the agent.
|
|
56
|
+
|
|
57
|
+
For complex environments that require many dependencies, we recommend running `hud dev` in the environment root:
|
|
58
|
+
```bash
|
|
59
|
+
cd ..
|
|
60
|
+
export EXA_API_KEY="your_key"
|
|
61
|
+
hud dev
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Tasks & Evaluation
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
# Build first from the environment root folder, where the Dockerfile lives (creates rubrics:0.1.0)
|
|
68
|
+
hud build
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Your `tasks.json` uses `docker run` to launch the environment:
|
|
72
|
+
|
|
73
|
+
```json
|
|
74
|
+
{
|
|
75
|
+
"prompt": "Research and answer: What is the capital of France?",
|
|
76
|
+
"mcp_config": {
|
|
77
|
+
"local": {
|
|
78
|
+
"command": "docker",
|
|
79
|
+
"args": ["run", "--rm", "-i", "-e", "EXA_API_KEY", "rubrics:latest"]
|
|
80
|
+
}
|
|
81
|
+
},
|
|
82
|
+
"evaluate_tool": {
|
|
83
|
+
"name": "evaluate",
|
|
84
|
+
"arguments": {
|
|
85
|
+
"rubric": [
|
|
86
|
+
{
|
|
87
|
+
"requirement": "Correctly identifies Paris as the capital of France",
|
|
88
|
+
"weight": 5
|
|
89
|
+
},
|
|
90
|
+
{
|
|
91
|
+
"requirement": "Provides additional context about Paris (population, history, or geography)",
|
|
92
|
+
"weight": 10
|
|
93
|
+
}
|
|
94
|
+
]
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
}
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
**Note:** The `-e EXA_API_KEY` flag passes your local API key to the container.
|
|
101
|
+
|
|
102
|
+
**Commands:**
|
|
103
|
+
```bash
|
|
104
|
+
# Build first
|
|
105
|
+
hud build
|
|
106
|
+
|
|
107
|
+
# Test task locally
|
|
108
|
+
export EXA_API_KEY="your_key"
|
|
109
|
+
hud eval tasks.json
|
|
110
|
+
|
|
111
|
+
# Push environment for remote running
|
|
112
|
+
hud push
|
|
113
|
+
|
|
114
|
+
# Production RL training
|
|
115
|
+
hud rl tasks.json # Auto-converts docker→remote, builds & pushes if needed
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
## Publishing Your Environment
|
|
119
|
+
|
|
120
|
+
Once your environment is ready, you can share it with the community:
|
|
121
|
+
|
|
122
|
+
### 1. Push to Registry
|
|
123
|
+
```bash
|
|
124
|
+
# Build and push your environment (requires a Docker Hub login and a HUD API key)
|
|
125
|
+
hud build
|
|
126
|
+
hud push
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
### 2. Create a Dataset
|
|
130
|
+
|
|
131
|
+
Create a dataset on HuggingFace with your tasks:
|
|
132
|
+
|
|
133
|
+
**Option A: Upload manually**
|
|
134
|
+
1. Upload your `tasks.json` to HuggingFace
|
|
135
|
+
2. Make sure it's **public** to appear on leaderboards
|
|
136
|
+
|
|
137
|
+
**Option B: Use the SDK**
|
|
138
|
+
```python
|
|
139
|
+
from hud.datasets import save_tasks
|
|
140
|
+
import json
|
|
141
|
+
|
|
142
|
+
# Load your tasks
|
|
143
|
+
with open("tasks.json") as f:
|
|
144
|
+
tasks = json.load(f)
|
|
145
|
+
|
|
146
|
+
# Push to HuggingFace
|
|
147
|
+
save_tasks(tasks, repo_id="your-org/your-dataset")
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
### 3. Run and Track Performance
|
|
151
|
+
|
|
152
|
+
```bash
|
|
153
|
+
# Run Claude on your benchmark
|
|
154
|
+
hud eval "your-org/your-dataset" --agent claude
|
|
155
|
+
|
|
156
|
+
# View results at:
|
|
157
|
+
# hud.so/leaderboards/your-org/your-dataset
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
**Note**: Only public HuggingFace datasets appear as leaderboards!
|
|
161
|
+
|
|
162
|
+
📚 Learn more: [Creating Benchmarks](https://docs.hud.so/evaluate-agents/create-benchmarks) | [Leaderboards](https://docs.hud.so/evaluate-agents/leaderboards)
|
|
163
|
+
|
|
164
|
+
## Example Research Workflow
|
|
165
|
+
|
|
166
|
+
```python
|
|
167
|
+
# Agent searches for information
|
|
168
|
+
results = search("latest AI developments 2024")
|
|
169
|
+
|
|
170
|
+
# Agent fetches detailed content from top result
|
|
171
|
+
content = fetch(results[0]["url"])
|
|
172
|
+
|
|
173
|
+
# Agent submits final answer
|
|
174
|
+
answer("Based on research, AI developments in 2024 include...")
|
|
175
|
+
|
|
176
|
+
# Evaluate answer using rubric
|
|
177
|
+
result = evaluate(rubric=[
|
|
178
|
+
{"requirement": "Mentions at least 3 specific AI developments", "weight": 15},
|
|
179
|
+
{"requirement": "Includes dates or timeframes for developments", "weight": 5},
|
|
180
|
+
])
|
|
181
|
+
# Returns: {"reward": float, "info": {"report": [...]}, "done": True}
|
|
182
|
+
```
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "rubrics-environment"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Backend service for Rubrics environment"
|
|
5
|
+
requires-python = ">=3.11"
|
|
6
|
+
dependencies = [
|
|
7
|
+
"fastapi>=0.104.1",
|
|
8
|
+
"uvicorn[standard]>=0.24.0",
|
|
9
|
+
"httpx>=0.24.0",
|
|
10
|
+
"rubric>=1.1.7",
|
|
11
|
+
]
|
|
12
|
+
|
|
13
|
+
[build-system]
|
|
14
|
+
requires = ["hatchling"]
|
|
15
|
+
build-backend = "hatchling.build"
|
|
16
|
+
|
|
17
|
+
[tool.hatch.build.targets.wheel]
|
|
18
|
+
packages = ["environment"]
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "rubrics"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Rubrics HUD environment with HTTP backend (EXA on server)"
|
|
5
|
+
requires-python = ">=3.11"
|
|
6
|
+
dependencies = [ "hud-python==0.4.42", "fastapi>=0.104.1", "uvicorn[standard]>=0.24.0", "httpx>=0.24.0",]
|
|
7
|
+
|
|
8
|
+
[build-system]
|
|
9
|
+
requires = [ "hatchling",]
|
|
10
|
+
build-backend = "hatchling.build"
|
|
11
|
+
|
|
12
|
+
[tool.hud]
|
|
13
|
+
image = "rubrics:dev"
|
|
14
|
+
|
|
15
|
+
[tool.hatch.metadata]
|
|
16
|
+
allow-direct-references = true
|
|
17
|
+
|
|
18
|
+
[tool.hatch.build.targets.wheel]
|
|
19
|
+
packages = [ "controller", "environment",]
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "rubrics-mcp"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "MCP server for Rubrics environment"
|
|
5
|
+
requires-python = ">=3.11"
|
|
6
|
+
dependencies = [
|
|
7
|
+
"hud-python>=0.4.54",
|
|
8
|
+
"httpx>=0.24.0",
|
|
9
|
+
]
|
|
10
|
+
|
|
11
|
+
[build-system]
|
|
12
|
+
requires = ["hatchling"]
|
|
13
|
+
build-backend = "hatchling.build"
|
|
14
|
+
|
|
15
|
+
[tool.hatch.metadata]
|
|
16
|
+
allow-direct-references = true
|
|
17
|
+
|
|
18
|
+
[tool.hatch.build.targets.wheel]
|
|
19
|
+
packages = ["mcp"]
|
|
@@ -11,6 +11,7 @@ from typing import TYPE_CHECKING, Any, ClassVar, Literal
|
|
|
11
11
|
|
|
12
12
|
import mcp.types as types
|
|
13
13
|
|
|
14
|
+
from hud.agents.utils import log_agent_metadata_to_status, log_task_config_to_current_trace
|
|
14
15
|
from hud.types import AgentResponse, MCPToolCall, MCPToolResult, Trace
|
|
15
16
|
from hud.utils.hud_console import HUDConsole
|
|
16
17
|
from hud.utils.mcp import MCPConfigPatch, patch_mcp_config, setup_hud_telemetry
|
|
@@ -62,6 +63,7 @@ class MCPAgent(ABC):
|
|
|
62
63
|
initial_screenshot: bool = True,
|
|
63
64
|
# Misc
|
|
64
65
|
model_name: str = "mcp-agent",
|
|
66
|
+
checkpoint_name: str | None = None,
|
|
65
67
|
response_agent: ResponseAgent | None = None,
|
|
66
68
|
auto_trace: bool = True,
|
|
67
69
|
verbose: bool = False,
|
|
@@ -92,6 +94,7 @@ class MCPAgent(ABC):
|
|
|
92
94
|
self._auto_created_client = False # Track if we created the client
|
|
93
95
|
|
|
94
96
|
self.model_name = model_name
|
|
97
|
+
self.checkpoint_name = checkpoint_name
|
|
95
98
|
self.console = HUDConsole(logger=logger)
|
|
96
99
|
|
|
97
100
|
# Set verbose mode if requested
|
|
@@ -198,6 +201,8 @@ class MCPAgent(ABC):
|
|
|
198
201
|
f"Agent initialized with {len(self.get_available_tools())} tools: {', '.join([t.name for t in self.get_available_tools()])}" # noqa: E501
|
|
199
202
|
)
|
|
200
203
|
|
|
204
|
+
await log_agent_metadata_to_status(self.model_name, self.checkpoint_name)
|
|
205
|
+
|
|
201
206
|
async def run(self, prompt_or_task: str | Task | dict[str, Any], max_steps: int = 10) -> Trace:
|
|
202
207
|
"""
|
|
203
208
|
Run the agent with the given prompt or task.
|
|
@@ -223,6 +228,9 @@ class MCPAgent(ABC):
|
|
|
223
228
|
|
|
224
229
|
# Handle Task objects with full lifecycle
|
|
225
230
|
if isinstance(prompt_or_task, Task):
|
|
231
|
+
# Log a compact summary of task config to the current trace (async)
|
|
232
|
+
await log_task_config_to_current_trace(prompt_or_task)
|
|
233
|
+
|
|
226
234
|
return await self.run_task(prompt_or_task, max_steps)
|
|
227
235
|
|
|
228
236
|
# Handle simple string prompts
|
|
@@ -89,7 +89,8 @@ class ClaudeAgent(MCPAgent):
|
|
|
89
89
|
self.use_computer_beta = use_computer_beta
|
|
90
90
|
self.hud_console = HUDConsole(logger=logger)
|
|
91
91
|
|
|
92
|
-
self.model_name =
|
|
92
|
+
self.model_name = "Claude"
|
|
93
|
+
self.checkpoint_name = self.model
|
|
93
94
|
|
|
94
95
|
# Track mapping from Claude tool names to MCP tool names
|
|
95
96
|
self._claude_to_mcp_tool_map: dict[str, str] = {}
|
|
@@ -98,14 +99,14 @@ class ClaudeAgent(MCPAgent):
|
|
|
98
99
|
# Append Claude-specific instructions to the base system prompt
|
|
99
100
|
claude_instructions = """
|
|
100
101
|
You are Claude, an AI assistant created by Anthropic. You are helpful, harmless, and honest.
|
|
101
|
-
|
|
102
|
+
|
|
102
103
|
When working on tasks:
|
|
103
104
|
1. Be thorough and systematic in your approach
|
|
104
105
|
2. Complete tasks autonomously without asking for confirmation
|
|
105
106
|
3. Use available tools efficiently to accomplish your goals
|
|
106
107
|
4. Verify your actions and ensure task completion
|
|
107
108
|
5. Be precise and accurate in all operations
|
|
108
|
-
|
|
109
|
+
|
|
109
110
|
Remember: You are expected to complete tasks autonomously. The user trusts you to accomplish what they asked.
|
|
110
111
|
""".strip() # noqa: E501
|
|
111
112
|
|
|
@@ -70,6 +70,7 @@ class OperatorAgent(MCPAgent):
|
|
|
70
70
|
|
|
71
71
|
self.openai_client = model_client
|
|
72
72
|
self.model = model
|
|
73
|
+
self.checkpoint_name = self.model
|
|
73
74
|
self.environment = environment
|
|
74
75
|
|
|
75
76
|
# State tracking for OpenAI's stateful API
|
|
@@ -84,7 +85,7 @@ class OperatorAgent(MCPAgent):
|
|
|
84
85
|
except Exception as e:
|
|
85
86
|
raise ValueError(f"OpenAI API key is invalid: {e}") from e
|
|
86
87
|
|
|
87
|
-
self.model_name = "
|
|
88
|
+
self.model_name = "Operator"
|
|
88
89
|
|
|
89
90
|
# Append OpenAI-specific instructions to the base system prompt
|
|
90
91
|
openai_instructions = """
|
|
@@ -62,7 +62,8 @@ class GenericOpenAIChatAgent(MCPAgent):
|
|
|
62
62
|
else:
|
|
63
63
|
raise ValueError("Either openai_client or (api_key and base_url) must be provided")
|
|
64
64
|
|
|
65
|
-
self.model_name =
|
|
65
|
+
self.model_name = "GenericOpenAI"
|
|
66
|
+
self.checkpoint_name = model_name
|
|
66
67
|
self.completion_kwargs: dict[str, Any] = completion_kwargs or {}
|
|
67
68
|
self.mcp_schemas = []
|
|
68
69
|
self.hud_console = HUDConsole(logger=logger)
|
|
@@ -194,7 +195,7 @@ class GenericOpenAIChatAgent(MCPAgent):
|
|
|
194
195
|
raise ValueError("openai_client is required for GenericOpenAIChatAgent")
|
|
195
196
|
# default transport = OpenAI SDK
|
|
196
197
|
return await self.oai.chat.completions.create(
|
|
197
|
-
model=self.
|
|
198
|
+
model=self.checkpoint_name,
|
|
198
199
|
messages=messages,
|
|
199
200
|
tools=tools, # type: ignore ready ChatCompletionToolParam-shaped
|
|
200
201
|
**extra,
|
|
@@ -89,7 +89,7 @@ class TestClaudeAgent:
|
|
|
89
89
|
validate_api_key=False, # Skip validation in tests
|
|
90
90
|
)
|
|
91
91
|
|
|
92
|
-
assert agent.model_name == "
|
|
92
|
+
assert agent.model_name == "Claude"
|
|
93
93
|
assert agent.max_tokens == 1000
|
|
94
94
|
assert agent.anthropic_client == mock_model_client
|
|
95
95
|
|
|
@@ -103,7 +103,7 @@ class TestClaudeAgent:
|
|
|
103
103
|
validate_api_key=False, # Skip validation in tests
|
|
104
104
|
)
|
|
105
105
|
|
|
106
|
-
assert agent.model_name == "
|
|
106
|
+
assert agent.model_name == "Claude"
|
|
107
107
|
assert agent.anthropic_client is not None
|
|
108
108
|
|
|
109
109
|
@pytest.mark.asyncio
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import contextlib
|
|
4
|
+
from typing import TYPE_CHECKING
|
|
5
|
+
|
|
6
|
+
from hud.otel.context import (
|
|
7
|
+
_update_task_status_async,
|
|
8
|
+
get_current_task_run_id,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from hud.datasets import Task
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
async def log_task_config_to_current_trace(task: Task) -> None:
    """Send the task's dumped configuration as status metadata for the current task run.

    This is a best-effort telemetry hook: when no current task run id is
    available nothing is sent, and any exception raised while dumping the
    task or updating the status is suppressed so the caller is never affected.

    Args:
        task: The task whose ``model_dump()`` output is attached under the
            ``"task_config"`` metadata key.
    """
    with contextlib.suppress(Exception):
        run_id = get_current_task_run_id()
        if run_id:
            # Snapshot the config first so a failing dump skips the status update.
            config_snapshot = task.model_dump()
            status_metadata = {"task_config": config_snapshot}
            await _update_task_status_async(
                run_id,
                "running",
                task_id=task.id,
                extra_metadata=status_metadata,
            )
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
async def log_agent_metadata_to_status(
    model_name: str | None = None, checkpoint_name: str | None = None
) -> None:
    """Attach agent metadata (model/checkpoint) to current trace status metadata.

    Best-effort: nothing is sent when there is no current task run id or when
    both names are empty/None, and any exception raised along the way is
    suppressed.

    Args:
        model_name: Agent model label; included when it is not None.
        checkpoint_name: Checkpoint label; included when it is not None.
    """
    with contextlib.suppress(Exception):
        run_id = get_current_task_run_id()
        if not run_id:
            return
        if not (model_name or checkpoint_name):
            # Neither field carries a usable value; skip the update entirely.
            return

        candidates = {"model_name": model_name, "checkpoint_name": checkpoint_name}
        agent_info = {key: value for key, value in candidates.items() if value is not None}

        await _update_task_status_async(
            run_id,
            "running",
            extra_metadata={"agent": agent_info},
        )
|
|
@@ -382,6 +382,11 @@ def dev(
|
|
|
382
382
|
"--watch",
|
|
383
383
|
help="Additional directories to watch for changes (default: current directory)",
|
|
384
384
|
),
|
|
385
|
+
new: bool = typer.Option(
|
|
386
|
+
False,
|
|
387
|
+
"--new",
|
|
388
|
+
help="Show Cursor installation link for new server setup",
|
|
389
|
+
),
|
|
385
390
|
) -> None:
|
|
386
391
|
"""🔥 Development mode - run MCP server with hot-reload.
|
|
387
392
|
|
|
@@ -422,6 +427,7 @@ def dev(
|
|
|
422
427
|
watch,
|
|
423
428
|
docker=docker,
|
|
424
429
|
docker_args=docker_args,
|
|
430
|
+
new=new,
|
|
425
431
|
)
|
|
426
432
|
|
|
427
433
|
|
|
@@ -740,7 +746,7 @@ def init(
|
|
|
740
746
|
None,
|
|
741
747
|
"--preset",
|
|
742
748
|
"-p",
|
|
743
|
-
help="Preset to use: blank, deep-research, browser. If omitted, you'll choose interactively.", # noqa: E501
|
|
749
|
+
help="Preset to use: blank, deep-research, browser, rubrics. If omitted, you'll choose interactively.", # noqa: E501
|
|
744
750
|
),
|
|
745
751
|
directory: str = typer.Option(".", "--dir", "-d", help="Target directory"),
|
|
746
752
|
force: bool = typer.Option(False, "--force", "-f", help="Overwrite existing files"),
|
|
@@ -1079,6 +1085,51 @@ def rl(
|
|
|
1079
1085
|
)
|
|
1080
1086
|
|
|
1081
1087
|
|
|
1088
|
+
@app.command()
def convert(
    tasks_file: str = typer.Argument(
        ..., help="Path to tasks file (JSON/JSONL) to convert to remote MCP configuration"
    ),
) -> None:
    """Convert local MCP task configs to remote (mcp.hud.so) format.

    This mirrors the implicit conversion flow used by 'hud rl' and writes a new
    remote_<name>.json next to the source file when needed.
    """
    from pathlib import Path

    from hud.utils.hud_console import HUDConsole

    console = HUDConsole()

    try:
        # Imported inside the try so an import failure is reported like any other error.
        from .flows.tasks import convert_tasks_to_remote

        result_path = convert_tasks_to_remote(tasks_file)

        # When the converter hands back the input path itself, nothing was rewritten.
        try:
            converted_target = Path(result_path).resolve()
            original_source = Path(tasks_file).resolve()
            unchanged = converted_target == original_source
            if unchanged:
                console.success("Tasks already reference remote MCP URLs. No conversion needed.")
                console.hint("You can run them directly with: hud eval <tasks_file> --full")
                return
        except Exception as e:
            # The comparison is only advisory; fall through to the normal success output.
            console.debug(f"Path comparison failed, continuing: {e}")

        console.success(f"Converted tasks written to: {result_path}")
        console.hint(
            "You can now run remote flows: hud rl <converted_file> or hud eval <converted_file>"
        )
    except typer.Exit:
        # Let explicit exits pass through instead of being re-reported as failures below.
        raise
    except Exception as e:
        console.error(f"Failed to convert tasks: {e}")
        raise typer.Exit(1) from e
|
|
1131
|
+
|
|
1132
|
+
|
|
1082
1133
|
@app.command()
|
|
1083
1134
|
def set(
|
|
1084
1135
|
assignments: list[str] = typer.Argument( # type: ignore[arg-type] # noqa: B008
|