hud-python 0.6.3__tar.gz → 0.6.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hud_python-0.6.3 → hud_python-0.6.4}/PKG-INFO +1 -1
- hud_python-0.6.4/cookbooks/connect4-selfplay/README.md +57 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/__init__.py +11 -3
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai_compatible/agent.py +15 -4
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/tests/test_base.py +38 -2
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/tests/test_provider_native_tools.py +4 -4
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/types.py +7 -3
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/__init__.py +4 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/eval.py +26 -7
- hud_python-0.6.4/hud/cli/jobs.py +146 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/models.py +21 -3
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/tests/test_eval_config.py +40 -0
- hud_python-0.6.4/hud/cli/trace.py +215 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/eval/run.py +23 -5
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/eval/runtime.py +51 -8
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/eval/tests/test_hosted.py +48 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/eval/tests/test_rollout.py +26 -1
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/settings.py +2 -2
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/train/__init__.py +2 -0
- hud_python-0.6.4/hud/train/base.py +159 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/train/client.py +41 -17
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/train/types.py +38 -4
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/utils/gateway.py +23 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/version.py +1 -1
- {hud_python-0.6.3 → hud_python-0.6.4}/pyproject.toml +1 -1
- hud_python-0.6.3/hud/train/base.py +0 -102
- {hud_python-0.6.3 → hud_python-0.6.4}/.gitignore +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/LICENSE +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/README.md +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/cookbooks/a2a-chat/README.md +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/cookbooks/a2a-chat/pyproject.toml +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/cookbooks/codex-coding/README.md +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/cookbooks/codex-coding/pyproject.toml +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/cookbooks/fireworks-rl-training/README.md +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/cookbooks/fireworks-rl-training/pyproject.toml +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/cookbooks/rl-training/README.md +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/cookbooks/rl-training/pyproject.toml +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/__init__.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/__main__.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/_legacy.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/base.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/browser_use/__init__.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/browser_use/agent.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/claude/__init__.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/claude/agent.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/claude/sdk/__init__.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/claude/sdk/agent.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/claude/sdk/computer_mcp.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/claude/tools/__init__.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/claude/tools/base.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/claude/tools/coding.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/claude/tools/computer.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/claude/tools/hosted.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/claude/tools/mcp_proxy.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/claude/tools/settings.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/claude/tools/tests/__init__.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/claude/tools/tests/test_computer.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/gemini/__init__.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/gemini/agent.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/gemini/settings.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/gemini/tools/__init__.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/gemini/tools/base.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/gemini/tools/coding.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/gemini/tools/computer.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/gemini/tools/filesystem.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/gemini/tools/hosted.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/gemini/tools/mcp_proxy.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/gemini/tools/tests/__init__.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/gemini/tools/tests/test_computer.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/misc/__init__.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/misc/response_automation.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai/__init__.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai/agent.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai/tools/__init__.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai/tools/apply_patch.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai/tools/base.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai/tools/coding.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai/tools/computer.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai/tools/hosted.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai/tools/mcp_proxy.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai/tools/strict_schema.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai/tools/tests/__init__.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai/tools/tests/test_computer.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai/tools/tests/test_strict_schema.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai_compatible/__init__.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai_compatible/tools/__init__.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai_compatible/tools/base.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai_compatible/tools/filesystem.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai_compatible/tools/mcp_proxy.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/robot/__init__.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/robot/_types.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/robot/adapter.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/robot/agent.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/robot/model.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/tests/__init__.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/tests/test_apply_patch.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/tests/test_claude_agent.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/tests/test_claude_sdk_agent.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/tests/test_gemini_agent.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/tests/test_openai_agent.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/tests/test_openai_compatible_agent.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/tests/test_tool_agent.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/tests/test_trace.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/tool_agent.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/tools/__init__.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/tools/base.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/tools/hosted.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/tools/mcp.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/tools/rfb.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/tools/ssh.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/capabilities/__init__.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/capabilities/base.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/capabilities/cdp.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/capabilities/filetracking.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/capabilities/mcp.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/capabilities/rfb.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/capabilities/robot.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/capabilities/ssh.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/__main__.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/cancel.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/client.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/deploy.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/init.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/login.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/presets.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/serve.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/sync.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/task.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/templates.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/tests/__init__.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/tests/test_cli_init.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/tests/test_cli_main.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/tests/test_cli_more_wrappers.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/tests/test_deploy.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/tests/test_eval_bedrock.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/tests/test_init.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/tests/test_main_module.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/tests/test_sync_export.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/__init__.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/api.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/build_display.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/build_logs.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/config.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/context.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/display.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/jobs.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/registry.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/source.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/tasks.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/tests/__init__.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/tests/test_build_display.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/tests/test_config.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/tests/test_context.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/tests/test_registry.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/tests/test_source.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/tests/test_tasks.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/tests/test_version_check.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/version_check.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/clients/__init__.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/clients/client.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/clients/tests/__init__.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/clients/tests/test_connect.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/conftest.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/__init__.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/env.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/file_tracker.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/file_tracking.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/legacy.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/robot/__init__.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/robot/bridge.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/robot/endpoint.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/robot/sim_runner.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/server.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/tests/__init__.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/tests/conftest.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/tests/test_capability_backing.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/tests/test_file_tracker.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/tests/test_file_tracking.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/tests/test_legacy.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/tests/test_loader.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/tests/test_manifest.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/tests/test_server.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/tests/test_tunnel.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/utils.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/workspace.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/eval/__init__.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/eval/chat.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/eval/file_tracking.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/eval/job.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/eval/sync.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/eval/task.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/eval/taskset.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/eval/tests/__init__.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/eval/tests/test_chat.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/eval/tests/test_docker_provider.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/eval/tests/test_file_tracking_observer.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/eval/tests/test_job.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/eval/tests/test_sync.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/eval/tests/test_task.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/graders/__init__.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/graders/base.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/graders/bash.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/graders/combine.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/graders/judge.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/graders/results.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/graders/text.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/patches/__init__.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/patches/mcp_patches.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/patches/tests/__init__.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/patches/tests/test_warnings.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/patches/warnings.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/py.typed +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/server.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/telemetry/__init__.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/telemetry/context.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/telemetry/exporter.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/telemetry/filetracking.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/telemetry/instrument.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/telemetry/span.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/telemetry/tests/__init__.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/telemetry/tests/test_exporter.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/telemetry/tests/test_filetracking.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/telemetry/tests/test_instrument.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/types.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/utils/__init__.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/utils/exceptions.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/utils/hints.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/utils/hud_console.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/utils/modules.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/utils/platform.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/utils/requests.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/utils/serialization.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/utils/tests/__init__.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/utils/tests/test_exceptions.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/utils/tests/test_hints.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/utils/tests/test_hud_console.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/utils/tests/test_platform.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/utils/tests/test_requests.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/utils/tests/test_serialization.py +0 -0
- {hud_python-0.6.3 → hud_python-0.6.4}/hud/utils/time.py +0 -0
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# Connect Four self-play
|
|
2
|
+
|
|
3
|
+
Symmetric self-play RL on a 6×7 Connect Four board. Draws are rare (you need a
|
|
4
|
+
full 42-cell board with no four-in-a-row), so the win/loss reward signal
|
|
5
|
+
persists as the policy improves and the GRPO advantage stays non-zero.
|
|
6
|
+
|
|
7
|
+
## How it works
|
|
8
|
+
|
|
9
|
+
- One agent ("outer") plays a full game against an inner model on the **same
|
|
10
|
+
slug** — true self-play. `seed % 2` decides who drops first, for symmetric
|
|
11
|
+
first-move coverage.
|
|
12
|
+
- Each game trains **both sides at once**: the outer agent's `Run` (reward from
|
|
13
|
+
its perspective) plus a hand-built `TrajectoryPayload` for the inner model
|
|
14
|
+
with the flipped reward (`1 - outer_reward`).
|
|
15
|
+
- `group_size=2` pairs each game's two trajectories so the GRPO advantage is
|
|
16
|
+
`reward - 0.5` per game.
|
|
17
|
+
- `loss_fn="ppo"` clips the importance-sampling ratio, so a single lucky game
|
|
18
|
+
can't blow up the update.
|
|
19
|
+
|
|
20
|
+
The training loop uses the public API directly — `forward_backward` accepts
|
|
21
|
+
`Run` and `TrajectoryPayload` mixed, so no private helpers are needed.
|
|
22
|
+
|
|
23
|
+
## Setup
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
hud models fork Qwen/Qwen3.5-4B --name c4-selfplay # prints a slug like c4-selfplay-<id>
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
Put your `HUD_API_KEY` in a `.env` here (or the environment).
|
|
30
|
+
|
|
31
|
+
## Run
|
|
32
|
+
|
|
33
|
+
Local sanity check (one game, cheap external model as the outer agent):
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
hud eval env.py claude --model claude-haiku-4-5
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
Train:
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
python train.py --model c4-selfplay-<id> --steps 20 --group 4 --lr 1e-5
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Tuning notes
|
|
46
|
+
|
|
47
|
+
- **Memory scales with `tasks × group`.** Each task×rollout is a fresh `env.py`
|
|
48
|
+
subprocess. With 8 tasks and `--group 4` that's 32 concurrent games. Connect
|
|
49
|
+
Four games can run up to 42 plies, so they cost more tokens and time per game —
|
|
50
|
+
start at `--group 4` and raise only if you have RAM headroom.
|
|
51
|
+
- **Watch the server-side metrics.** The loop prints local win/draw/loss counts
|
|
52
|
+
each step and the last few checkpoints' `mean_reward` / `reward_std` via
|
|
53
|
+
`trainer.checkpoints()` at the end. A healthy run keeps non-trivial
|
|
54
|
+
`reward_std` (within-group spread); if it collapses, the policy has saturated.
|
|
55
|
+
- **Reset on changes.** If you edit the reward or the board, roll the head back
|
|
56
|
+
to a clean checkpoint (`hud models head <slug> --set <id>`) or fork fresh —
|
|
57
|
+
don't keep training a policy shaped by the old objective.
|
|
@@ -8,7 +8,12 @@ from __future__ import annotations
|
|
|
8
8
|
from typing import TYPE_CHECKING, Any, cast
|
|
9
9
|
|
|
10
10
|
from hud.types import AgentType
|
|
11
|
-
from hud.utils.gateway import
|
|
11
|
+
from hud.utils.gateway import (
|
|
12
|
+
build_gateway_client,
|
|
13
|
+
gateway_model_aliases,
|
|
14
|
+
list_gateway_models,
|
|
15
|
+
normalize_gateway_model_id,
|
|
16
|
+
)
|
|
12
17
|
|
|
13
18
|
if TYPE_CHECKING:
|
|
14
19
|
from typing import TypeAlias
|
|
@@ -27,6 +32,8 @@ def create_agent(model: str, **kwargs: Any) -> GatewayAgent:
|
|
|
27
32
|
|
|
28
33
|
For direct API access with provider API keys, instantiate the agent classes directly.
|
|
29
34
|
"""
|
|
35
|
+
requested_model = model
|
|
36
|
+
model = normalize_gateway_model_id(model)
|
|
30
37
|
agent_type = next((candidate for candidate in AgentType if candidate.value == model), None)
|
|
31
38
|
if agent_type is not None:
|
|
32
39
|
model_id = model
|
|
@@ -73,7 +80,8 @@ def create_agent(model: str, **kwargs: Any) -> GatewayAgent:
|
|
|
73
80
|
for n in (gm.id, gm.name, gm.model_name)
|
|
74
81
|
if isinstance(n, str)
|
|
75
82
|
]
|
|
76
|
-
|
|
83
|
+
known.extend(gateway_model_aliases())
|
|
84
|
+
near = difflib.get_close_matches(requested_model, known, n=3, cutoff=0.5)
|
|
77
85
|
hint = (
|
|
78
86
|
f" Did you mean: {', '.join(near)}?"
|
|
79
87
|
if near
|
|
@@ -84,7 +92,7 @@ def create_agent(model: str, **kwargs: Any) -> GatewayAgent:
|
|
|
84
92
|
if gateway_models
|
|
85
93
|
else "the HUD gateway registry (empty — is HUD_API_KEY set?)"
|
|
86
94
|
)
|
|
87
|
-
raise ValueError(f"Model {
|
|
95
|
+
raise ValueError(f"Model {requested_model!r} not found in {source}.{hint}")
|
|
88
96
|
|
|
89
97
|
kwargs.setdefault("model", model_id)
|
|
90
98
|
kwargs.setdefault("model_client", build_gateway_client(provider_name))
|
|
@@ -193,16 +193,27 @@ class OpenAIChatAgent(ToolAgent[ChatCompletionMessageParam, OpenAIChatConfig]):
|
|
|
193
193
|
sample: Sample | None = None
|
|
194
194
|
if return_token_ids:
|
|
195
195
|
prompt_token_ids = getattr(choice, "prompt_token_ids", None)
|
|
196
|
+
# Multimodal prompt (text + image chunks): the only prompt representation
|
|
197
|
+
# that survives image inputs; flat prompt_token_ids is null in that case.
|
|
198
|
+
prompt_chunks = getattr(choice, "prompt_chunks", None)
|
|
196
199
|
token_ids = getattr(choice, "token_ids", None)
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
chat_state.continuation_message_count = len(messages)
|
|
200
|
+
has_prompt = prompt_token_ids is not None or prompt_chunks is not None
|
|
201
|
+
if token_ids is not None and has_prompt:
|
|
200
202
|
content_lp = choice.logprobs.content if choice.logprobs else None
|
|
201
203
|
sample = Sample(
|
|
202
|
-
prompt_token_ids=list(prompt_token_ids),
|
|
204
|
+
prompt_token_ids=list(prompt_token_ids) if prompt_token_ids is not None else [],
|
|
205
|
+
prompt_chunks=list(prompt_chunks) if prompt_chunks is not None else None,
|
|
203
206
|
output_token_ids=list(token_ids),
|
|
204
207
|
output_logprobs=[tok.logprob for tok in content_lp] if content_lp else [],
|
|
205
208
|
)
|
|
209
|
+
# KV-cache continuation only applies to flat text prompts; clear any
|
|
210
|
+
# stale state when the gateway returns chunks-only (multimodal turn).
|
|
211
|
+
if prompt_token_ids is not None:
|
|
212
|
+
chat_state.continuation_token_ids = list(prompt_token_ids) + list(token_ids)
|
|
213
|
+
chat_state.continuation_message_count = len(messages)
|
|
214
|
+
else:
|
|
215
|
+
chat_state.continuation_token_ids = None
|
|
216
|
+
chat_state.continuation_message_count = None
|
|
206
217
|
|
|
207
218
|
tool_calls: list[MCPToolCall] = []
|
|
208
219
|
for tc in function_calls:
|
|
@@ -108,7 +108,7 @@ def test_create_agent_resolves_gateway_model_metadata(
|
|
|
108
108
|
|
|
109
109
|
model = GatewayModelInfo(
|
|
110
110
|
id="ft:custom-123",
|
|
111
|
-
model_name="gpt-5.
|
|
111
|
+
model_name="gpt-5.5",
|
|
112
112
|
sdk_agent_type="openai_compatible",
|
|
113
113
|
provider=GatewayProviderInfo(name="openai"),
|
|
114
114
|
)
|
|
@@ -122,4 +122,40 @@ def test_create_agent_resolves_gateway_model_metadata(
|
|
|
122
122
|
agent = create_agent("ft:custom-123")
|
|
123
123
|
|
|
124
124
|
assert isinstance(agent, OpenAIChatAgent)
|
|
125
|
-
assert agent.config.model == "gpt-5.
|
|
125
|
+
assert agent.config.model == "gpt-5.5" # resolved to the model's real name
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
@pytest.mark.parametrize(
|
|
129
|
+
("alias", "canonical"),
|
|
130
|
+
[
|
|
131
|
+
("deepseek-v4", "deepseek/deepseek-v4-pro"),
|
|
132
|
+
("deepseek-v4-flash", "deepseek/deepseek-v4-flash"),
|
|
133
|
+
("glm-5.2", "z-ai/glm-5.2"),
|
|
134
|
+
("kimi-k2.6", "moonshotai/kimi-k2.6"),
|
|
135
|
+
("minimax-m3", "MiniMax-M3"),
|
|
136
|
+
],
|
|
137
|
+
)
|
|
138
|
+
def test_create_agent_accepts_gateway_model_aliases(
|
|
139
|
+
alias: str,
|
|
140
|
+
canonical: str,
|
|
141
|
+
monkeypatch: pytest.MonkeyPatch,
|
|
142
|
+
) -> None:
|
|
143
|
+
from hud.utils.gateway import GatewayModelInfo, GatewayProviderInfo
|
|
144
|
+
|
|
145
|
+
model = GatewayModelInfo(
|
|
146
|
+
id=canonical,
|
|
147
|
+
model_name=canonical,
|
|
148
|
+
sdk_agent_type="openai_compatible",
|
|
149
|
+
provider=GatewayProviderInfo(name="openai"),
|
|
150
|
+
)
|
|
151
|
+
monkeypatch.setattr("hud.agents.list_gateway_models", lambda: [model])
|
|
152
|
+
|
|
153
|
+
def _build_client(_provider: str) -> object:
|
|
154
|
+
return object()
|
|
155
|
+
|
|
156
|
+
monkeypatch.setattr("hud.agents.build_gateway_client", _build_client)
|
|
157
|
+
|
|
158
|
+
agent = create_agent(alias)
|
|
159
|
+
|
|
160
|
+
assert isinstance(agent, OpenAIChatAgent)
|
|
161
|
+
assert agent.config.model == canonical
|
|
@@ -102,7 +102,7 @@ def _commands(tool: Any) -> list[str]:
|
|
|
102
102
|
|
|
103
103
|
|
|
104
104
|
async def test_openai_shell_wraps_command_with_timeout() -> None:
|
|
105
|
-
tool = OpenAIShellTool(spec=OpenAIShellTool.default_spec("gpt-5.
|
|
105
|
+
tool = OpenAIShellTool(spec=OpenAIShellTool.default_spec("gpt-5.5"), client=_ssh())
|
|
106
106
|
|
|
107
107
|
result = await tool.execute({"commands": ["pwd"], "timeout_ms": 2500})
|
|
108
108
|
|
|
@@ -114,7 +114,7 @@ async def test_openai_shell_wraps_command_with_timeout() -> None:
|
|
|
114
114
|
|
|
115
115
|
|
|
116
116
|
async def test_openai_shell_runs_each_command_without_timeout() -> None:
|
|
117
|
-
tool = OpenAIShellTool(spec=OpenAIShellTool.default_spec("gpt-5.
|
|
117
|
+
tool = OpenAIShellTool(spec=OpenAIShellTool.default_spec("gpt-5.5"), client=_ssh())
|
|
118
118
|
|
|
119
119
|
await tool.execute({"commands": ["echo a", "echo b"]})
|
|
120
120
|
|
|
@@ -122,7 +122,7 @@ async def test_openai_shell_runs_each_command_without_timeout() -> None:
|
|
|
122
122
|
|
|
123
123
|
|
|
124
124
|
async def test_openai_shell_rejects_non_list_commands_without_running() -> None:
|
|
125
|
-
tool = OpenAIShellTool(spec=OpenAIShellTool.default_spec("gpt-5.
|
|
125
|
+
tool = OpenAIShellTool(spec=OpenAIShellTool.default_spec("gpt-5.5"), client=_ssh())
|
|
126
126
|
|
|
127
127
|
result = await tool.execute({"commands": 123})
|
|
128
128
|
|
|
@@ -131,7 +131,7 @@ async def test_openai_shell_rejects_non_list_commands_without_running() -> None:
|
|
|
131
131
|
|
|
132
132
|
|
|
133
133
|
def test_openai_shell_to_params_is_shell_type() -> None:
|
|
134
|
-
tool = OpenAIShellTool(spec=OpenAIShellTool.default_spec("gpt-5.
|
|
134
|
+
tool = OpenAIShellTool(spec=OpenAIShellTool.default_spec("gpt-5.5"), client=_ssh())
|
|
135
135
|
assert tool.to_params()["type"] == "shell"
|
|
136
136
|
|
|
137
137
|
|
|
@@ -99,7 +99,7 @@ class OpenAIConfig(AgentConfig):
|
|
|
99
99
|
"""Configuration for OpenAIAgent."""
|
|
100
100
|
|
|
101
101
|
model_name: str = "OpenAI"
|
|
102
|
-
model: str = Field(default="gpt-5.
|
|
102
|
+
model: str = Field(default="gpt-5.5", validation_alias=_model_alias)
|
|
103
103
|
max_output_tokens: int | None = None
|
|
104
104
|
temperature: float | None = None
|
|
105
105
|
reasoning: Any = None # openai Reasoning
|
|
@@ -113,7 +113,7 @@ class OpenAIChatConfig(AgentConfig):
|
|
|
113
113
|
"""Configuration for OpenAIChatAgent."""
|
|
114
114
|
|
|
115
115
|
model_name: str = "OpenAI Chat"
|
|
116
|
-
model: str = Field(default="gpt-5-mini", validation_alias=_model_alias)
|
|
116
|
+
model: str = Field(default="gpt-5.4-mini", validation_alias=_model_alias)
|
|
117
117
|
checkpoint: str | None = Field(
|
|
118
118
|
default=None,
|
|
119
119
|
description="Specific checkpoint name for inference routing. "
|
|
@@ -139,7 +139,7 @@ class ClaudeSDKConfig(AgentConfig):
|
|
|
139
139
|
"""
|
|
140
140
|
|
|
141
141
|
model_name: str = "Claude Code"
|
|
142
|
-
model: str = Field(default="claude-sonnet-4-
|
|
142
|
+
model: str = Field(default="claude-sonnet-4-6", validation_alias=_model_alias)
|
|
143
143
|
permission_mode: str = "bypassPermissions"
|
|
144
144
|
max_steps: int = -1
|
|
145
145
|
allowed_tools: list[str] = Field(
|
|
@@ -222,6 +222,10 @@ class Sample(BaseModel):
|
|
|
222
222
|
"""
|
|
223
223
|
|
|
224
224
|
prompt_token_ids: list[int] = Field(default_factory=list[int])
|
|
225
|
+
# Multimodal prompt as serialized ``ModelInput`` chunks (text + image), set by
|
|
226
|
+
# vision rollouts where the prompt is not a flat token list. When present it is
|
|
227
|
+
# the authoritative prompt for training; ``prompt_token_ids`` stays empty.
|
|
228
|
+
prompt_chunks: list[dict[str, Any]] | None = None
|
|
225
229
|
output_token_ids: list[int] = Field(default_factory=list[int])
|
|
226
230
|
output_logprobs: list[float] = Field(default_factory=list[float])
|
|
227
231
|
|
|
@@ -35,11 +35,13 @@ from .client import client_app # noqa: E402
|
|
|
35
35
|
from .deploy import deploy_command # noqa: E402
|
|
36
36
|
from .eval import eval_command # noqa: E402
|
|
37
37
|
from .init import init_command # noqa: E402
|
|
38
|
+
from .jobs import jobs_app # noqa: E402
|
|
38
39
|
from .login import login_command # noqa: E402
|
|
39
40
|
from .models import models_app # noqa: E402
|
|
40
41
|
from .serve import serve_command # noqa: E402
|
|
41
42
|
from .sync import sync_app # noqa: E402
|
|
42
43
|
from .task import task_app # noqa: E402
|
|
44
|
+
from .trace import trace_app # noqa: E402
|
|
43
45
|
|
|
44
46
|
app.command(name="serve")(serve_command)
|
|
45
47
|
app.command(name="dev", deprecated=True, hidden=True)(serve_command) # alias for now
|
|
@@ -49,6 +51,8 @@ app.command(name="eval")(eval_command)
|
|
|
49
51
|
app.command(name="init")(init_command)
|
|
50
52
|
app.command(name="cancel")(cancel_command)
|
|
51
53
|
app.add_typer(models_app, name="models")
|
|
54
|
+
app.add_typer(jobs_app, name="jobs")
|
|
55
|
+
app.add_typer(trace_app, name="trace")
|
|
52
56
|
|
|
53
57
|
|
|
54
58
|
@app.command(name="set")
|
|
@@ -43,8 +43,9 @@ def _resolve_model_from_catalog(model_id: str) -> tuple[AgentType, str] | None:
|
|
|
43
43
|
Returns None if the model isn't found or the catalog is unreachable.
|
|
44
44
|
"""
|
|
45
45
|
try:
|
|
46
|
-
from hud.utils.gateway import list_gateway_models
|
|
46
|
+
from hud.utils.gateway import list_gateway_models, normalize_gateway_model_id
|
|
47
47
|
|
|
48
|
+
model_id = normalize_gateway_model_id(model_id)
|
|
48
49
|
models = list_gateway_models()
|
|
49
50
|
except Exception:
|
|
50
51
|
return None
|
|
@@ -117,8 +118,9 @@ class AgentPreset:
|
|
|
117
118
|
|
|
118
119
|
_AGENT_PRESETS: list[AgentPreset] = [
|
|
119
120
|
AgentPreset("Claude Sonnet 4.6", AgentType.CLAUDE, "claude-sonnet-4-6"),
|
|
120
|
-
AgentPreset("
|
|
121
|
-
AgentPreset("
|
|
121
|
+
AgentPreset("Claude Opus 4.8", AgentType.CLAUDE, "claude-opus-4-8"),
|
|
122
|
+
AgentPreset("GPT-5.5", AgentType.OPENAI, "gpt-5.5"),
|
|
123
|
+
AgentPreset("Gemini 3.1 Pro (Preview)", AgentType.GEMINI, "gemini-3.1-pro-preview"),
|
|
122
124
|
AgentPreset(
|
|
123
125
|
"Grok 4-1 Fast (xAI)",
|
|
124
126
|
AgentType.OPENAI_COMPATIBLE,
|
|
@@ -131,10 +133,22 @@ _AGENT_PRESETS: list[AgentPreset] = [
|
|
|
131
133
|
},
|
|
132
134
|
),
|
|
133
135
|
AgentPreset(
|
|
134
|
-
"GLM
|
|
136
|
+
"GLM 5.2 (Z.ai)",
|
|
135
137
|
AgentType.OPENAI_COMPATIBLE,
|
|
136
|
-
"z-ai/glm-
|
|
137
|
-
{"openai_compatible": {"base_url": settings.hud_gateway_url, "model_name": "GLM
|
|
138
|
+
"z-ai/glm-5.2",
|
|
139
|
+
{"openai_compatible": {"base_url": settings.hud_gateway_url, "model_name": "GLM 5.2"}},
|
|
140
|
+
),
|
|
141
|
+
AgentPreset(
|
|
142
|
+
"Kimi K2.6 (Moonshot)",
|
|
143
|
+
AgentType.OPENAI_COMPATIBLE,
|
|
144
|
+
"moonshotai/kimi-k2.6",
|
|
145
|
+
{"openai_compatible": {"base_url": settings.hud_gateway_url, "model_name": "Kimi K2.6"}},
|
|
146
|
+
),
|
|
147
|
+
AgentPreset(
|
|
148
|
+
"MiniMax M3",
|
|
149
|
+
AgentType.OPENAI_COMPATIBLE,
|
|
150
|
+
"MiniMax-M3",
|
|
151
|
+
{"openai_compatible": {"base_url": settings.hud_gateway_url, "model_name": "MiniMax M3"}},
|
|
138
152
|
),
|
|
139
153
|
]
|
|
140
154
|
|
|
@@ -162,7 +176,7 @@ _DEFAULT_CONFIG_TEMPLATE = """# HUD Eval Configuration
|
|
|
162
176
|
# use_computer_beta = true
|
|
163
177
|
|
|
164
178
|
[openai]
|
|
165
|
-
# model = "gpt-
|
|
179
|
+
# model = "gpt-5.5"
|
|
166
180
|
# temperature = 0.7
|
|
167
181
|
# max_output_tokens = 4096
|
|
168
182
|
|
|
@@ -402,6 +416,11 @@ class EvalConfig(BaseModel):
|
|
|
402
416
|
if self.model:
|
|
403
417
|
kwargs["model"] = self.model
|
|
404
418
|
|
|
419
|
+
if isinstance(kwargs.get("model"), str):
|
|
420
|
+
from hud.utils.gateway import normalize_gateway_model_id
|
|
421
|
+
|
|
422
|
+
kwargs["model"] = normalize_gateway_model_id(kwargs["model"])
|
|
423
|
+
|
|
405
424
|
if self.agent_type == AgentType.OPENAI_COMPATIBLE and "api_key" not in kwargs:
|
|
406
425
|
base_url = kwargs.get("base_url", "")
|
|
407
426
|
if settings.hud_gateway_url in base_url and settings.api_key:
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
"""``hud jobs`` — list jobs and their traces."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
|
|
7
|
+
import typer
|
|
8
|
+
from rich.console import Console
|
|
9
|
+
from rich.panel import Panel
|
|
10
|
+
from rich.table import Table
|
|
11
|
+
|
|
12
|
+
console = Console()
|
|
13
|
+
|
|
14
|
+
jobs_app = typer.Typer(
|
|
15
|
+
name="jobs",
|
|
16
|
+
help="List jobs and their traces",
|
|
17
|
+
add_completion=False,
|
|
18
|
+
rich_markup_mode="rich",
|
|
19
|
+
no_args_is_help=False,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@jobs_app.callback(invoke_without_command=True)
def jobs_command(
    ctx: typer.Context,
    job_id: str | None = typer.Argument(None, help="Job ID — omit to list recent jobs"),
    json_output: bool = typer.Option(False, "--json", help="Output as JSON"),
    limit: int = typer.Option(20, "--limit", "-n", help="Max rows to show"),
) -> None:
    """List recent jobs, or show traces for a specific job.

    Without an argument, lists the most recent jobs.
    With a job id, lists all traces for that job.
    """
    # A subcommand was invoked, so this callback has nothing to do itself.
    if ctx.invoked_subcommand is not None:
        return

    # Imported lazily, inside the function, like the other helpers in this module.
    from hud.cli.utils.api import require_api_key

    require_api_key("list jobs")

    # A missing or empty job id selects the job listing; anything else selects
    # the trace listing for that job.
    if not job_id:
        _list_jobs(json_output=json_output, limit=limit)
        return

    _show_job_traces(job_id, json_output=json_output, limit=limit)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# ── job listing ────────────────────────────────────────────────────────────────
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _list_jobs(*, json_output: bool, limit: int) -> None:
    """Fetch recent jobs from the platform and print them.

    Text taken from the API response or from a caught exception is converted
    to ``str`` and escaped before it is embedded in Rich markup or table
    cells, so bracketed fragments (for example a job named ``eval [v2]``) are
    printed literally instead of being parsed as style tags.

    Args:
        json_output: When true, print the job items as JSON and skip the table.
        limit: Sent as the ``limit`` query parameter of the ``/jobs`` request.

    Raises:
        typer.Exit: With exit code 1 when the request raises.
    """
    from rich.markup import escape

    from hud.utils.platform import PlatformClient

    client = PlatformClient.from_settings()
    try:
        data = client.get("/jobs", params={"limit": limit})
    except Exception as e:
        # The exception text is arbitrary; escape it so it cannot break the
        # surrounding [red]...[/red] markup.
        console.print(f"[red]Failed to fetch jobs: {escape(str(e))}[/red]")
        raise typer.Exit(1) from e

    # Accept either a bare list or a mapping carrying the rows under "items".
    items = data if isinstance(data, list) else (data.get("items") or [])

    if json_output:
        console.print_json(json.dumps(items, indent=2, default=str))
        return

    if not items:
        console.print("[yellow]No jobs found.[/yellow]")
        return

    console.print(Panel.fit("[bold cyan]Recent Jobs[/bold cyan]", border_style="cyan"))
    table = Table()
    table.add_column("ID", style="blue", no_wrap=True)
    table.add_column("Name", style="cyan")
    table.add_column("Taskset", style="dim")
    table.add_column("Status", style="yellow")
    table.add_column("Created", style="dim")

    from hud.settings import settings

    web = settings.hud_web_url.rstrip("/")

    for job in items:
        table.add_row(
            escape(str(job.get("id") or "")),
            escape(str(job.get("name") or "-")),
            escape(str(job.get("taskset_name") or "-")),
            escape(str(job.get("status") or "-")),
            # Keep only the first 19 characters of the timestamp text; truncate
            # before escaping so an escape sequence is never cut in half.
            escape(str(job.get("created_at") or "")[:19]),
        )
    console.print(table)
    console.print(f"\n[dim]View: {web}/jobs[/dim]")
    console.print("[dim]Tip: hud jobs <id> to see traces for a specific job[/dim]")
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
# ── job traces ────────────────────────────────────────────────────────────────
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _show_job_traces(job_id: str, *, json_output: bool, limit: int) -> None:
    """Fetch the traces of one job from the platform and print them.

    Text taken from the API response, from a caught exception, or from the
    ``job_id`` argument is converted to ``str`` and escaped before it is
    embedded in Rich markup or table cells, so bracketed fragments in an error
    message are printed literally instead of being parsed as style tags.

    Args:
        job_id: Job whose traces are requested; interpolated into the
            ``/jobs/{job_id}/traces`` request path and the printed web URL.
        json_output: When true, print the trace items as JSON and skip the table.
        limit: Sent as the ``limit`` query parameter of the request.

    Raises:
        typer.Exit: With exit code 1 when the request raises.
    """
    from rich.markup import escape

    from hud.settings import settings
    from hud.utils.platform import PlatformClient

    client = PlatformClient.from_settings()
    try:
        data = client.get(f"/jobs/{job_id}/traces", params={"limit": limit})
    except Exception as e:
        # The exception text is arbitrary; escape it so it cannot break the
        # surrounding [red]...[/red] markup.
        console.print(f"[red]Failed to fetch traces: {escape(str(e))}[/red]")
        raise typer.Exit(1) from e

    # Accept either a bare list or a mapping carrying the rows under "items".
    items = data if isinstance(data, list) else (data.get("items") or [])

    if json_output:
        console.print_json(json.dumps(items, indent=2, default=str))
        return

    web = settings.hud_web_url.rstrip("/")
    # The job id comes straight from the command line; show it literally.
    shown_job_id = escape(job_id)

    if not items:
        console.print("[yellow]No traces found for this job.[/yellow]")
        console.print(f"[dim]View: {web}/jobs/{shown_job_id}[/dim]")
        return

    console.print(
        Panel.fit(
            f"[bold cyan]Job Traces[/bold cyan] [dim]{shown_job_id}[/dim]",
            border_style="cyan",
        )
    )
    table = Table()
    table.add_column("Trace ID", style="blue", no_wrap=True)
    table.add_column("Status", style="yellow")
    table.add_column("Reward", style="green", justify="right")
    table.add_column("Started", style="dim")
    table.add_column("Error", style="red")

    for tr in items:
        reward = tr.get("reward")
        table.add_row(
            escape(str(tr.get("id") or "")),
            escape(str(tr.get("status") or "-")),
            f"{reward:.3f}" if reward is not None else "-",
            # Prefer start_time, fall back to created_at; keep the first 19 characters.
            escape(str(tr.get("start_time") or tr.get("created_at") or "")[:19]),
            # str() guards against a non-string error value; truncate before
            # escaping so an escape sequence is never cut in half.
            escape(str(tr.get("error") or "")[:40]),
        )
    console.print(table)
    console.print(f"\n[dim]View: {web}/jobs/{shown_job_id}[/dim]")
    console.print("[dim]Tip: hud trace <trace_id> to inspect a specific rollout[/dim]")
|
|
@@ -71,6 +71,8 @@ def list_models(
|
|
|
71
71
|
)
|
|
72
72
|
console.print(table)
|
|
73
73
|
console.print(f"\n[dim]Gateway: {settings.hud_gateway_url}[/dim]")
|
|
74
|
+
web = settings.hud_web_url.rstrip("/")
|
|
75
|
+
console.print(f"[dim]View a model in the browser: {web}/models/<id>[/dim]")
|
|
74
76
|
|
|
75
77
|
|
|
76
78
|
@models_app.command("fork")
|
|
@@ -116,6 +118,7 @@ def fork_model(
|
|
|
116
118
|
)
|
|
117
119
|
)
|
|
118
120
|
console.print(f"\n[dim]Train it: hud.TrainingClient({slug!r})[/dim]")
|
|
121
|
+
console.print(f"[dim]View: {_model_url(model['id'])}[/dim]")
|
|
119
122
|
|
|
120
123
|
|
|
121
124
|
@models_app.command("checkpoints")
|
|
@@ -127,13 +130,15 @@ def list_checkpoints(
|
|
|
127
130
|
from hud.cli.utils.api import require_api_key
|
|
128
131
|
|
|
129
132
|
require_api_key("list checkpoints")
|
|
130
|
-
|
|
133
|
+
model_id = _resolve_model_id(model)
|
|
134
|
+
checkpoints = _get_checkpoints(model_id)
|
|
131
135
|
|
|
132
136
|
if json_output:
|
|
133
137
|
console.print_json(json.dumps(checkpoints, indent=2))
|
|
134
138
|
return
|
|
135
139
|
if not checkpoints:
|
|
136
140
|
console.print("[yellow]No checkpoints yet — this model serves its base weights[/yellow]")
|
|
141
|
+
console.print(f"[dim]View: {_model_url(model_id, tab='checkpoints')}[/dim]")
|
|
137
142
|
return
|
|
138
143
|
|
|
139
144
|
checkpoints = sorted(checkpoints, key=lambda c: c.get("created_at") or "")
|
|
@@ -155,6 +160,7 @@ def list_checkpoints(
|
|
|
155
160
|
(ckpt.get("created_at") or "")[:19],
|
|
156
161
|
)
|
|
157
162
|
console.print(table)
|
|
163
|
+
console.print(f"\n[dim]View: {_model_url(model_id, tab='checkpoints')}[/dim]")
|
|
158
164
|
|
|
159
165
|
|
|
160
166
|
@models_app.command("head")
|
|
@@ -170,19 +176,22 @@ def show_head(
|
|
|
170
176
|
from hud.cli.utils.api import require_api_key
|
|
171
177
|
|
|
172
178
|
require_api_key("manage head")
|
|
179
|
+
model_id = _resolve_model_id(model)
|
|
173
180
|
|
|
174
181
|
if set_to is not None:
|
|
175
|
-
_set_head(
|
|
182
|
+
_set_head(model_id, set_to)
|
|
176
183
|
console.print(f"[green]Head set to[/green] [cyan]{set_to}[/cyan]")
|
|
184
|
+
console.print(f"[dim]View: {_model_url(model_id, tab='checkpoints')}[/dim]")
|
|
177
185
|
return
|
|
178
186
|
|
|
179
|
-
head = next((c for c in _get_checkpoints(
|
|
187
|
+
head = next((c for c in _get_checkpoints(model_id) if c.get("is_active")), None)
|
|
180
188
|
|
|
181
189
|
if json_output:
|
|
182
190
|
console.print_json(json.dumps(head, indent=2))
|
|
183
191
|
return
|
|
184
192
|
if head is None:
|
|
185
193
|
console.print("[yellow]No active checkpoint — this model serves its base weights[/yellow]")
|
|
194
|
+
console.print(f"[dim]View: {_model_url(model_id, tab='checkpoints')}[/dim]")
|
|
186
195
|
return
|
|
187
196
|
|
|
188
197
|
reward = head.get("mean_reward")
|
|
@@ -196,6 +205,15 @@ def show_head(
|
|
|
196
205
|
border_style="green",
|
|
197
206
|
)
|
|
198
207
|
)
|
|
208
|
+
console.print(f"[dim]View: {_model_url(model_id, tab='checkpoints')}[/dim]")
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def _model_url(model_id: str, *, tab: str | None = None) -> str:
    """Build the web app URL for a model.

    Args:
        model_id: Identifier placed after ``/models/`` in the URL path.
        tab: Optional tab name (e.g. ``checkpoints``) appended as a ``?tab=``
            query string; ``None`` or an empty string adds nothing.

    Returns:
        The model page URL, rooted at ``settings.hud_web_url`` with any
        trailing slashes removed.
    """
    from hud.settings import settings

    web_root = settings.hud_web_url.rstrip("/")
    model_page = f"{web_root}/models/{model_id}"
    if not tab:
        return model_page
    return f"{model_page}?tab={tab}"
|
|
199
217
|
|
|
200
218
|
|
|
201
219
|
def _resolve_model_id(model: str) -> str:
|
|
@@ -50,6 +50,21 @@ def test_get_agent_kwargs_model_precedence_and_flags() -> None:
|
|
|
50
50
|
assert kwargs["verbose"] is True
|
|
51
51
|
|
|
52
52
|
|
|
53
|
+
def test_get_agent_kwargs_normalizes_gateway_model_alias() -> None:
    """A bare ``glm-5.2`` given as ``model`` comes back as ``z-ai/glm-5.2``."""
    agent_kwargs = EvalConfig(agent_type="openai_compatible", model="glm-5.2").get_agent_kwargs()

    assert agent_kwargs["model"] == "z-ai/glm-5.2"
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def test_get_agent_kwargs_normalizes_config_model_alias() -> None:
    """The alias is also normalized when it is supplied through ``agent_config``."""
    agent_config = {"openai_compatible": {"model": "glm-5.2"}}
    cfg = EvalConfig(agent_type="openai_compatible", agent_config=agent_config)

    resolved_model = cfg.get_agent_kwargs()["model"]

    assert resolved_model == "z-ai/glm-5.2"
|
|
66
|
+
|
|
67
|
+
|
|
53
68
|
def test_get_agent_kwargs_requires_agent_type() -> None:
|
|
54
69
|
with pytest.raises(ValueError, match="agent_type must be set"):
|
|
55
70
|
EvalConfig().get_agent_kwargs()
|
|
@@ -186,6 +201,31 @@ def test_merge_cli_overrides_fields() -> None:
|
|
|
186
201
|
assert merged.max_steps == 7
|
|
187
202
|
|
|
188
203
|
|
|
204
|
+
def test_merge_cli_resolves_gateway_model_alias(monkeypatch: pytest.MonkeyPatch) -> None:
    """``merge_cli(agent="glm-5.2")`` picks the agent type and full model id from the catalog."""
    from hud.utils.gateway import GatewayModelInfo, GatewayProviderInfo

    full_id = "z-ai/glm-5.2"
    catalog_entry = GatewayModelInfo(
        id=full_id,
        model_name=full_id,
        sdk_agent_type="openai_compatible",
        provider=GatewayProviderInfo(name="openai"),
    )

    def fake_catalog() -> list:
        # Stand-in for the gateway catalog: a single known model.
        return [catalog_entry]

    monkeypatch.setattr("hud.utils.gateway.list_gateway_models", fake_catalog)

    merged = EvalConfig().merge_cli(agent="glm-5.2")

    assert merged.agent_type is not None
    assert merged.agent_type.value == "openai_compatible"
    assert merged.model == full_id
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def test_merge_cli_config_model_alias_is_normalized() -> None:
    """A ``--config`` style model override is normalized by ``get_agent_kwargs``."""
    base_cfg = EvalConfig(agent_type="openai_compatible")
    merged = base_cfg.merge_cli(config=["openai_compatible.model=glm-5.2"])

    agent_kwargs = merged.get_agent_kwargs()

    assert agent_kwargs["model"] == "z-ai/glm-5.2"
|
|
227
|
+
|
|
228
|
+
|
|
189
229
|
def test_merge_cli_namespaced_config() -> None:
|
|
190
230
|
merged = EvalConfig().merge_cli(config=["claude.max_tokens=100"])
|
|
191
231
|
assert merged.agent_config["claude"]["max_tokens"] == 100
|