hud-python 0.6.2__tar.gz → 0.6.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {hud_python-0.6.2 → hud_python-0.6.4}/PKG-INFO +1 -1
- hud_python-0.6.4/cookbooks/connect4-selfplay/README.md +57 -0
- hud_python-0.6.4/cookbooks/fireworks-rl-training/README.md +114 -0
- hud_python-0.6.4/cookbooks/fireworks-rl-training/pyproject.toml +19 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/__init__.py +11 -3
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/openai_compatible/agent.py +15 -4
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/tests/test_base.py +38 -2
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/tests/test_provider_native_tools.py +4 -4
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/types.py +7 -3
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/__init__.py +4 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/eval.py +64 -11
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/init.py +3 -3
- hud_python-0.6.4/hud/cli/jobs.py +146 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/models.py +21 -3
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/templates.py +4 -5
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/tests/test_deploy.py +1 -1
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/tests/test_eval_config.py +69 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/tests/test_init.py +8 -0
- hud_python-0.6.4/hud/cli/trace.py +215 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/eval/job.py +33 -9
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/eval/run.py +31 -6
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/eval/runtime.py +51 -8
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/eval/taskset.py +18 -2
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/eval/tests/test_hosted.py +48 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/eval/tests/test_rollout.py +26 -1
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/settings.py +2 -2
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/train/__init__.py +2 -0
- hud_python-0.6.4/hud/train/base.py +159 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/train/client.py +41 -17
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/train/types.py +38 -4
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/utils/gateway.py +23 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/version.py +1 -1
- {hud_python-0.6.2 → hud_python-0.6.4}/pyproject.toml +1 -1
- hud_python-0.6.2/hud/train/base.py +0 -102
- {hud_python-0.6.2 → hud_python-0.6.4}/.gitignore +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/LICENSE +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/README.md +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/cookbooks/a2a-chat/README.md +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/cookbooks/a2a-chat/pyproject.toml +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/cookbooks/codex-coding/README.md +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/cookbooks/codex-coding/pyproject.toml +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/cookbooks/rl-training/README.md +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/cookbooks/rl-training/pyproject.toml +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/__init__.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/__main__.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/_legacy.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/base.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/browser_use/__init__.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/browser_use/agent.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/claude/__init__.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/claude/agent.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/claude/sdk/__init__.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/claude/sdk/agent.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/claude/sdk/computer_mcp.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/claude/tools/__init__.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/claude/tools/base.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/claude/tools/coding.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/claude/tools/computer.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/claude/tools/hosted.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/claude/tools/mcp_proxy.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/claude/tools/settings.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/claude/tools/tests/__init__.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/claude/tools/tests/test_computer.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/gemini/__init__.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/gemini/agent.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/gemini/settings.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/gemini/tools/__init__.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/gemini/tools/base.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/gemini/tools/coding.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/gemini/tools/computer.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/gemini/tools/filesystem.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/gemini/tools/hosted.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/gemini/tools/mcp_proxy.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/gemini/tools/tests/__init__.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/gemini/tools/tests/test_computer.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/misc/__init__.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/misc/response_automation.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/openai/__init__.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/openai/agent.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/openai/tools/__init__.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/openai/tools/apply_patch.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/openai/tools/base.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/openai/tools/coding.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/openai/tools/computer.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/openai/tools/hosted.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/openai/tools/mcp_proxy.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/openai/tools/strict_schema.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/openai/tools/tests/__init__.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/openai/tools/tests/test_computer.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/openai/tools/tests/test_strict_schema.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/openai_compatible/__init__.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/openai_compatible/tools/__init__.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/openai_compatible/tools/base.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/openai_compatible/tools/filesystem.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/openai_compatible/tools/mcp_proxy.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/robot/__init__.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/robot/_types.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/robot/adapter.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/robot/agent.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/robot/model.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/tests/__init__.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/tests/test_apply_patch.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/tests/test_claude_agent.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/tests/test_claude_sdk_agent.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/tests/test_gemini_agent.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/tests/test_openai_agent.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/tests/test_openai_compatible_agent.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/tests/test_tool_agent.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/tests/test_trace.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/tool_agent.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/tools/__init__.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/tools/base.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/tools/hosted.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/tools/mcp.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/tools/rfb.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/tools/ssh.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/capabilities/__init__.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/capabilities/base.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/capabilities/cdp.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/capabilities/filetracking.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/capabilities/mcp.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/capabilities/rfb.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/capabilities/robot.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/capabilities/ssh.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/__main__.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/cancel.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/client.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/deploy.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/login.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/presets.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/serve.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/sync.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/task.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/tests/__init__.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/tests/test_cli_init.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/tests/test_cli_main.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/tests/test_cli_more_wrappers.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/tests/test_eval_bedrock.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/tests/test_main_module.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/tests/test_sync_export.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/__init__.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/api.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/build_display.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/build_logs.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/config.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/context.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/display.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/jobs.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/registry.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/source.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/tasks.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/tests/__init__.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/tests/test_build_display.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/tests/test_config.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/tests/test_context.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/tests/test_registry.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/tests/test_source.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/tests/test_tasks.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/tests/test_version_check.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/version_check.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/clients/__init__.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/clients/client.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/clients/tests/__init__.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/clients/tests/test_connect.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/conftest.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/__init__.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/env.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/file_tracker.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/file_tracking.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/legacy.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/robot/__init__.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/robot/bridge.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/robot/endpoint.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/robot/sim_runner.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/server.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/tests/__init__.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/tests/conftest.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/tests/test_capability_backing.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/tests/test_file_tracker.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/tests/test_file_tracking.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/tests/test_legacy.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/tests/test_loader.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/tests/test_manifest.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/tests/test_server.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/tests/test_tunnel.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/utils.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/workspace.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/eval/__init__.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/eval/chat.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/eval/file_tracking.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/eval/sync.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/eval/task.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/eval/tests/__init__.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/eval/tests/test_chat.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/eval/tests/test_docker_provider.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/eval/tests/test_file_tracking_observer.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/eval/tests/test_job.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/eval/tests/test_sync.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/eval/tests/test_task.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/graders/__init__.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/graders/base.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/graders/bash.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/graders/combine.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/graders/judge.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/graders/results.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/graders/text.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/patches/__init__.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/patches/mcp_patches.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/patches/tests/__init__.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/patches/tests/test_warnings.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/patches/warnings.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/py.typed +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/server.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/telemetry/__init__.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/telemetry/context.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/telemetry/exporter.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/telemetry/filetracking.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/telemetry/instrument.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/telemetry/span.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/telemetry/tests/__init__.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/telemetry/tests/test_exporter.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/telemetry/tests/test_filetracking.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/telemetry/tests/test_instrument.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/types.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/utils/__init__.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/utils/exceptions.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/utils/hints.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/utils/hud_console.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/utils/modules.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/utils/platform.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/utils/requests.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/utils/serialization.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/utils/tests/__init__.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/utils/tests/test_exceptions.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/utils/tests/test_hints.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/utils/tests/test_hud_console.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/utils/tests/test_platform.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/utils/tests/test_requests.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/utils/tests/test_serialization.py +0 -0
- {hud_python-0.6.2 → hud_python-0.6.4}/hud/utils/time.py +0 -0
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# Connect Four self-play
|
|
2
|
+
|
|
3
|
+
Symmetric self-play RL on a 6×7 Connect Four board. Draws are rare (you need a
|
|
4
|
+
full 42-cell board with no four-in-a-row), so the win/loss reward signal
|
|
5
|
+
persists as the policy improves and the GRPO advantage stays non-zero.
|
|
6
|
+
|
|
7
|
+
## How it works
|
|
8
|
+
|
|
9
|
+
- One agent ("outer") plays a full game against an inner model on the **same
|
|
10
|
+
slug** — true self-play. `seed % 2` decides who drops first, for symmetric
|
|
11
|
+
first-move coverage.
|
|
12
|
+
- Each game trains **both sides at once**: the outer agent's `Run` (reward from
|
|
13
|
+
its perspective) plus a hand-built `TrajectoryPayload` for the inner model
|
|
14
|
+
with the flipped reward (`1 - outer_reward`).
|
|
15
|
+
- `group_size=2` pairs each game's two trajectories so the GRPO advantage is
|
|
16
|
+
`reward - 0.5` per game.
|
|
17
|
+
- `loss_fn="ppo"` clips the importance-sampling ratio, so a single lucky game
|
|
18
|
+
can't blow up the update.
|
|
19
|
+
|
|
20
|
+
The training loop uses the public API directly — `forward_backward` accepts
|
|
21
|
+
a mix of `Run` and `TrajectoryPayload` items, so no private helpers are needed.
|
|
22
|
+
|
|
23
|
+
## Setup
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
hud models fork Qwen/Qwen3.5-4B --name c4-selfplay # prints a slug like c4-selfplay-<id>
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
Put your `HUD_API_KEY` in a `.env` file in this directory (or export it in your environment).
|
|
30
|
+
|
|
31
|
+
## Run
|
|
32
|
+
|
|
33
|
+
Local sanity check (one game, cheap external model as the outer agent):
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
hud eval env.py claude --model claude-haiku-4-5
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
Train:
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
python train.py --model c4-selfplay-<id> --steps 20 --group 4 --lr 1e-5
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Tuning notes
|
|
46
|
+
|
|
47
|
+
- **Memory scales with `tasks × group`.** Each task×rollout is a fresh `env.py`
|
|
48
|
+
subprocess. With 8 tasks and `--group 4` that's 32 concurrent games. Connect
|
|
49
|
+
Four games can run up to 42 plies, so they cost more tokens and time per game —
|
|
50
|
+
start at `--group 4` and raise only if you have RAM headroom.
|
|
51
|
+
- **Watch the server-side metrics.** The loop prints local win/draw/loss counts
|
|
52
|
+
each step and the last few checkpoints' `mean_reward` / `reward_std` via
|
|
53
|
+
`trainer.checkpoints()` at the end. A healthy run keeps non-trivial
|
|
54
|
+
`reward_std` (within-group spread); if it collapses, the policy has saturated.
|
|
55
|
+
- **Reset on changes.** If you edit the reward or the board, roll the head back
|
|
56
|
+
to a clean checkpoint (`hud models head <slug> --set <id>`) or fork fresh —
|
|
57
|
+
don't keep training a policy shaped by the old objective.
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
# Fireworks RL Training
|
|
2
|
+
|
|
3
|
+
A direct Fireworks Training API loop over the same arithmetic preview task used by
|
|
4
|
+
`cookbooks/rl-training`.
|
|
5
|
+
|
|
6
|
+
This does **not** use Fireworks native datasets or RFT jobs. It follows the
|
|
7
|
+
Training API service path from the Fireworks docs:
|
|
8
|
+
|
|
9
|
+
1. `FiretitanServiceClient.from_firetitan_config(...)`
|
|
10
|
+
2. `create_deployment_sampler(...)` for high-parallel rollouts
|
|
11
|
+
3. local grading of HUD-style multiplication tasks
|
|
12
|
+
4. `forward_backward_custom(...)` + `optim_step(...)`
|
|
13
|
+
5. `save_weights_for_sampler(...)` + sampler refresh
|
|
14
|
+
|
|
15
|
+
References:
|
|
16
|
+
|
|
17
|
+
- Fireworks Training API introduction: https://docs.fireworks.ai/fine-tuning/training-api/introduction
|
|
18
|
+
- Training and sampling lifecycle: https://docs.fireworks.ai/fine-tuning/training-api/training-and-sampling
|
|
19
|
+
- Loss functions / GRPO reference: https://docs.fireworks.ai/fine-tuning/training-api/loss-functions
|
|
20
|
+
|
|
21
|
+
## Setup
|
|
22
|
+
|
|
23
|
+
The repo-level `.env` is loaded automatically. It must contain:
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
FIREWORKS_API_KEY=...
|
|
27
|
+
FIREWORKS_ACCOUNT_ID=...
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
Install the isolated cookbook environment:
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
uv sync --pre
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Calibrate task difficulty first
|
|
37
|
+
|
|
38
|
+
Calibration defaults to Fireworks' OpenAI-compatible inference API, so it does
|
|
39
|
+
**not** create a trainer, provision a Training API deployment, or call
|
|
40
|
+
`optim_step`. This is the cheap way to tune task difficulty before paying for a
|
|
41
|
+
Training API run.
|
|
42
|
+
|
|
43
|
+
The calibration model is separate from the training base model because the
|
|
44
|
+
`lorenss` key currently exposes only a small serverless inference catalog (no
|
|
45
|
+
Qwen3 8B deployment). Override it with `--inference-model` if you have a closer
|
|
46
|
+
deployed model.
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
uv run train.py --calibrate-only --groups-per-step 8 --rollouts-per-prompt 8 --parallelism 32
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
The goal is a reward distribution with non-zero variance. If every reward is zero, make the
|
|
53
|
+
task easier:
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
uv run train.py --calibrate-only --min-a 10 --max-a 99 --min-b 2 --max-b 9
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
If every reward is one, make the task harder:
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
uv run train.py --calibrate-only --min-a 1000 --max-a 9999 --min-b 11 --max-b 99
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
The current defaults are calibrated for the visible `gpt-oss-120b` inference
|
|
66
|
+
model on the `lorenss` key: 2-digit by 1-digit multiplication with a direct
|
|
67
|
+
"reply only with the integer" prompt. A 32-rollout calibration gave a non-trivial
|
|
68
|
+
baseline (`reward_mean ~= 0.22`, `reward_std ~= 0.42`), while the original
|
|
69
|
+
3-digit by 2-digit range was all-zero.
|
|
70
|
+
|
|
71
|
+
## Train
|
|
72
|
+
|
|
73
|
+
Once calibration has non-trivial rewards:
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
uv run train.py --steps 5 --groups-per-step 8 --rollouts-per-prompt 8 --parallelism 32
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
This uses the direct Training API managed service path. If you want calibration
|
|
80
|
+
to go through the managed deployment sampler too, pass
|
|
81
|
+
`--calibration-backend managed`; this provisions the same resources as training.
|
|
82
|
+
|
|
83
|
+
### Current Fireworks preview account blocker
|
|
84
|
+
|
|
85
|
+
On the `lorenss` preview account, trainer creation currently fails before the
|
|
86
|
+
first train step with:
|
|
87
|
+
|
|
88
|
+
```text
|
|
89
|
+
failed to ensure FIREWORKS_API_KEY secret: unkey inference api id is not configured
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
This happens even with `create_deployment=False`, so it is an account/control
|
|
93
|
+
plane provisioning issue rather than a problem in the rollout or loss code. Once
|
|
94
|
+
Fireworks enables the missing Unkey inference API config for the account, the
|
|
95
|
+
same `uv run train.py ...` command should proceed to trainer startup and the
|
|
96
|
+
first `forward_backward_custom(...)` call.
|
|
97
|
+
|
|
98
|
+
Metrics are written to:
|
|
99
|
+
|
|
100
|
+
- `runs/fireworks-rl-preview/metrics.jsonl`
|
|
101
|
+
- `runs/fireworks-rl-preview/reward_loss.png` if `matplotlib` is installed
|
|
102
|
+
|
|
103
|
+
## Notes
|
|
104
|
+
|
|
105
|
+
- Defaults use Qwen3 8B full-parameter training:
|
|
106
|
+
- `accounts/fireworks/models/qwen3-8b`
|
|
107
|
+
- `Qwen/Qwen3-8B`
|
|
108
|
+
- `accounts/fireworks/trainingShapes/qwen3-8b-128k`
|
|
109
|
+
- LoRA can be tested with `--lora-rank N`, but the validated Qwen3 8B training
|
|
110
|
+
shape currently rejects LoRA mode on the `lorenss` preview account.
|
|
111
|
+
- The first checkpoint sync happens after step 0 and subsequent rollouts sample
|
|
112
|
+
the updated weights through the same deployment.
|
|
113
|
+
- `--keep-trainer` and `--keep-deployment` are available for debugging. By
|
|
114
|
+
default the trainer is cleaned up and the deployment scales to zero on exit.
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "fireworks-rl-training"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Direct Fireworks Training API RL loop over HUD-style arithmetic tasks"
|
|
5
|
+
requires-python = ">=3.11,<3.13"
|
|
6
|
+
dependencies = [
|
|
7
|
+
"fireworks-ai[training]",
|
|
8
|
+
"hud-python",
|
|
9
|
+
"matplotlib",
|
|
10
|
+
"python-dotenv",
|
|
11
|
+
"torch>=2",
|
|
12
|
+
"transformers>=4.55",
|
|
13
|
+
]
|
|
14
|
+
|
|
15
|
+
[tool.uv]
|
|
16
|
+
package = false
|
|
17
|
+
|
|
18
|
+
[tool.uv.sources]
|
|
19
|
+
hud-python = { path = "../..", editable = true }
|
|
@@ -8,7 +8,12 @@ from __future__ import annotations
|
|
|
8
8
|
from typing import TYPE_CHECKING, Any, cast
|
|
9
9
|
|
|
10
10
|
from hud.types import AgentType
|
|
11
|
-
from hud.utils.gateway import
|
|
11
|
+
from hud.utils.gateway import (
|
|
12
|
+
build_gateway_client,
|
|
13
|
+
gateway_model_aliases,
|
|
14
|
+
list_gateway_models,
|
|
15
|
+
normalize_gateway_model_id,
|
|
16
|
+
)
|
|
12
17
|
|
|
13
18
|
if TYPE_CHECKING:
|
|
14
19
|
from typing import TypeAlias
|
|
@@ -27,6 +32,8 @@ def create_agent(model: str, **kwargs: Any) -> GatewayAgent:
|
|
|
27
32
|
|
|
28
33
|
For direct API access with provider API keys, instantiate the agent classes directly.
|
|
29
34
|
"""
|
|
35
|
+
requested_model = model
|
|
36
|
+
model = normalize_gateway_model_id(model)
|
|
30
37
|
agent_type = next((candidate for candidate in AgentType if candidate.value == model), None)
|
|
31
38
|
if agent_type is not None:
|
|
32
39
|
model_id = model
|
|
@@ -73,7 +80,8 @@ def create_agent(model: str, **kwargs: Any) -> GatewayAgent:
|
|
|
73
80
|
for n in (gm.id, gm.name, gm.model_name)
|
|
74
81
|
if isinstance(n, str)
|
|
75
82
|
]
|
|
76
|
-
|
|
83
|
+
known.extend(gateway_model_aliases())
|
|
84
|
+
near = difflib.get_close_matches(requested_model, known, n=3, cutoff=0.5)
|
|
77
85
|
hint = (
|
|
78
86
|
f" Did you mean: {', '.join(near)}?"
|
|
79
87
|
if near
|
|
@@ -84,7 +92,7 @@ def create_agent(model: str, **kwargs: Any) -> GatewayAgent:
|
|
|
84
92
|
if gateway_models
|
|
85
93
|
else "the HUD gateway registry (empty — is HUD_API_KEY set?)"
|
|
86
94
|
)
|
|
87
|
-
raise ValueError(f"Model {
|
|
95
|
+
raise ValueError(f"Model {requested_model!r} not found in {source}.{hint}")
|
|
88
96
|
|
|
89
97
|
kwargs.setdefault("model", model_id)
|
|
90
98
|
kwargs.setdefault("model_client", build_gateway_client(provider_name))
|
|
@@ -193,16 +193,27 @@ class OpenAIChatAgent(ToolAgent[ChatCompletionMessageParam, OpenAIChatConfig]):
|
|
|
193
193
|
sample: Sample | None = None
|
|
194
194
|
if return_token_ids:
|
|
195
195
|
prompt_token_ids = getattr(choice, "prompt_token_ids", None)
|
|
196
|
+
# Multimodal prompt (text + image chunks): the only prompt representation
|
|
197
|
+
# that survives image inputs; flat prompt_token_ids is null in that case.
|
|
198
|
+
prompt_chunks = getattr(choice, "prompt_chunks", None)
|
|
196
199
|
token_ids = getattr(choice, "token_ids", None)
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
chat_state.continuation_message_count = len(messages)
|
|
200
|
+
has_prompt = prompt_token_ids is not None or prompt_chunks is not None
|
|
201
|
+
if token_ids is not None and has_prompt:
|
|
200
202
|
content_lp = choice.logprobs.content if choice.logprobs else None
|
|
201
203
|
sample = Sample(
|
|
202
|
-
prompt_token_ids=list(prompt_token_ids),
|
|
204
|
+
prompt_token_ids=list(prompt_token_ids) if prompt_token_ids is not None else [],
|
|
205
|
+
prompt_chunks=list(prompt_chunks) if prompt_chunks is not None else None,
|
|
203
206
|
output_token_ids=list(token_ids),
|
|
204
207
|
output_logprobs=[tok.logprob for tok in content_lp] if content_lp else [],
|
|
205
208
|
)
|
|
209
|
+
# KV-cache continuation only applies to flat text prompts; clear any
|
|
210
|
+
# stale state when the gateway returns chunks-only (multimodal turn).
|
|
211
|
+
if prompt_token_ids is not None:
|
|
212
|
+
chat_state.continuation_token_ids = list(prompt_token_ids) + list(token_ids)
|
|
213
|
+
chat_state.continuation_message_count = len(messages)
|
|
214
|
+
else:
|
|
215
|
+
chat_state.continuation_token_ids = None
|
|
216
|
+
chat_state.continuation_message_count = None
|
|
206
217
|
|
|
207
218
|
tool_calls: list[MCPToolCall] = []
|
|
208
219
|
for tc in function_calls:
|
|
@@ -108,7 +108,7 @@ def test_create_agent_resolves_gateway_model_metadata(
|
|
|
108
108
|
|
|
109
109
|
model = GatewayModelInfo(
|
|
110
110
|
id="ft:custom-123",
|
|
111
|
-
model_name="gpt-5.
|
|
111
|
+
model_name="gpt-5.5",
|
|
112
112
|
sdk_agent_type="openai_compatible",
|
|
113
113
|
provider=GatewayProviderInfo(name="openai"),
|
|
114
114
|
)
|
|
@@ -122,4 +122,40 @@ def test_create_agent_resolves_gateway_model_metadata(
|
|
|
122
122
|
agent = create_agent("ft:custom-123")
|
|
123
123
|
|
|
124
124
|
assert isinstance(agent, OpenAIChatAgent)
|
|
125
|
-
assert agent.config.model == "gpt-5.
|
|
125
|
+
assert agent.config.model == "gpt-5.5" # resolved to the model's real name
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
@pytest.mark.parametrize(
    ("alias", "canonical"),
    [
        ("deepseek-v4", "deepseek/deepseek-v4-pro"),
        ("deepseek-v4-flash", "deepseek/deepseek-v4-flash"),
        ("glm-5.2", "z-ai/glm-5.2"),
        ("kimi-k2.6", "moonshotai/kimi-k2.6"),
        ("minimax-m3", "MiniMax-M3"),
    ],
)
def test_create_agent_accepts_gateway_model_aliases(
    alias: str,
    canonical: str,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """``create_agent`` given a short alias yields an agent configured with the canonical id.

    The patched catalog contains only the canonical model, so the final
    assertion can only hold if the alias was mapped to that entry.
    """
    from hud.utils.gateway import GatewayModelInfo, GatewayProviderInfo

    # Replace the gateway client factory with a stub that hands back a
    # placeholder object instead of a real client.
    monkeypatch.setattr("hud.agents.build_gateway_client", lambda _provider: object())

    # Serve a one-entry catalog that lists the model under its canonical id only.
    catalog_entry = GatewayModelInfo(
        id=canonical,
        model_name=canonical,
        sdk_agent_type="openai_compatible",
        provider=GatewayProviderInfo(name="openai"),
    )
    monkeypatch.setattr("hud.agents.list_gateway_models", lambda: [catalog_entry])

    resolved_agent = create_agent(alias)

    assert isinstance(resolved_agent, OpenAIChatAgent)
    assert resolved_agent.config.model == canonical
|
|
@@ -102,7 +102,7 @@ def _commands(tool: Any) -> list[str]:
|
|
|
102
102
|
|
|
103
103
|
|
|
104
104
|
async def test_openai_shell_wraps_command_with_timeout() -> None:
|
|
105
|
-
tool = OpenAIShellTool(spec=OpenAIShellTool.default_spec("gpt-5.
|
|
105
|
+
tool = OpenAIShellTool(spec=OpenAIShellTool.default_spec("gpt-5.5"), client=_ssh())
|
|
106
106
|
|
|
107
107
|
result = await tool.execute({"commands": ["pwd"], "timeout_ms": 2500})
|
|
108
108
|
|
|
@@ -114,7 +114,7 @@ async def test_openai_shell_wraps_command_with_timeout() -> None:
|
|
|
114
114
|
|
|
115
115
|
|
|
116
116
|
async def test_openai_shell_runs_each_command_without_timeout() -> None:
|
|
117
|
-
tool = OpenAIShellTool(spec=OpenAIShellTool.default_spec("gpt-5.
|
|
117
|
+
tool = OpenAIShellTool(spec=OpenAIShellTool.default_spec("gpt-5.5"), client=_ssh())
|
|
118
118
|
|
|
119
119
|
await tool.execute({"commands": ["echo a", "echo b"]})
|
|
120
120
|
|
|
@@ -122,7 +122,7 @@ async def test_openai_shell_runs_each_command_without_timeout() -> None:
|
|
|
122
122
|
|
|
123
123
|
|
|
124
124
|
async def test_openai_shell_rejects_non_list_commands_without_running() -> None:
|
|
125
|
-
tool = OpenAIShellTool(spec=OpenAIShellTool.default_spec("gpt-5.
|
|
125
|
+
tool = OpenAIShellTool(spec=OpenAIShellTool.default_spec("gpt-5.5"), client=_ssh())
|
|
126
126
|
|
|
127
127
|
result = await tool.execute({"commands": 123})
|
|
128
128
|
|
|
@@ -131,7 +131,7 @@ async def test_openai_shell_rejects_non_list_commands_without_running() -> None:
|
|
|
131
131
|
|
|
132
132
|
|
|
133
133
|
def test_openai_shell_to_params_is_shell_type() -> None:
    """The shell tool's ``to_params()`` mapping reports ``"shell"`` under ``"type"``."""
    spec = OpenAIShellTool.default_spec("gpt-5.5")
    params = OpenAIShellTool(spec=spec, client=_ssh()).to_params()
    assert params["type"] == "shell"
|
|
136
136
|
|
|
137
137
|
|
|
@@ -99,7 +99,7 @@ class OpenAIConfig(AgentConfig):
|
|
|
99
99
|
"""Configuration for OpenAIAgent."""
|
|
100
100
|
|
|
101
101
|
model_name: str = "OpenAI"
|
|
102
|
-
model: str = Field(default="gpt-5.
|
|
102
|
+
model: str = Field(default="gpt-5.5", validation_alias=_model_alias)
|
|
103
103
|
max_output_tokens: int | None = None
|
|
104
104
|
temperature: float | None = None
|
|
105
105
|
reasoning: Any = None # openai Reasoning
|
|
@@ -113,7 +113,7 @@ class OpenAIChatConfig(AgentConfig):
|
|
|
113
113
|
"""Configuration for OpenAIChatAgent."""
|
|
114
114
|
|
|
115
115
|
model_name: str = "OpenAI Chat"
|
|
116
|
-
model: str = Field(default="gpt-5-mini", validation_alias=_model_alias)
|
|
116
|
+
model: str = Field(default="gpt-5.4-mini", validation_alias=_model_alias)
|
|
117
117
|
checkpoint: str | None = Field(
|
|
118
118
|
default=None,
|
|
119
119
|
description="Specific checkpoint name for inference routing. "
|
|
@@ -139,7 +139,7 @@ class ClaudeSDKConfig(AgentConfig):
|
|
|
139
139
|
"""
|
|
140
140
|
|
|
141
141
|
model_name: str = "Claude Code"
|
|
142
|
-
model: str = Field(default="claude-sonnet-4-
|
|
142
|
+
model: str = Field(default="claude-sonnet-4-6", validation_alias=_model_alias)
|
|
143
143
|
permission_mode: str = "bypassPermissions"
|
|
144
144
|
max_steps: int = -1
|
|
145
145
|
allowed_tools: list[str] = Field(
|
|
@@ -222,6 +222,10 @@ class Sample(BaseModel):
|
|
|
222
222
|
"""
|
|
223
223
|
|
|
224
224
|
prompt_token_ids: list[int] = Field(default_factory=list[int])
|
|
225
|
+
# Multimodal prompt as serialized ``ModelInput`` chunks (text + image), set by
|
|
226
|
+
# vision rollouts where the prompt is not a flat token list. When present it is
|
|
227
|
+
# the authoritative prompt for training; ``prompt_token_ids`` stays empty.
|
|
228
|
+
prompt_chunks: list[dict[str, Any]] | None = None
|
|
225
229
|
output_token_ids: list[int] = Field(default_factory=list[int])
|
|
226
230
|
output_logprobs: list[float] = Field(default_factory=list[float])
|
|
227
231
|
|
|
@@ -35,11 +35,13 @@ from .client import client_app # noqa: E402
|
|
|
35
35
|
from .deploy import deploy_command # noqa: E402
|
|
36
36
|
from .eval import eval_command # noqa: E402
|
|
37
37
|
from .init import init_command # noqa: E402
|
|
38
|
+
from .jobs import jobs_app # noqa: E402
|
|
38
39
|
from .login import login_command # noqa: E402
|
|
39
40
|
from .models import models_app # noqa: E402
|
|
40
41
|
from .serve import serve_command # noqa: E402
|
|
41
42
|
from .sync import sync_app # noqa: E402
|
|
42
43
|
from .task import task_app # noqa: E402
|
|
44
|
+
from .trace import trace_app # noqa: E402
|
|
43
45
|
|
|
44
46
|
app.command(name="serve")(serve_command)
|
|
45
47
|
app.command(name="dev", deprecated=True, hidden=True)(serve_command) # alias for now
|
|
@@ -49,6 +51,8 @@ app.command(name="eval")(eval_command)
|
|
|
49
51
|
app.command(name="init")(init_command)
|
|
50
52
|
app.command(name="cancel")(cancel_command)
|
|
51
53
|
app.add_typer(models_app, name="models")
|
|
54
|
+
app.add_typer(jobs_app, name="jobs")
|
|
55
|
+
app.add_typer(trace_app, name="trace")
|
|
52
56
|
|
|
53
57
|
|
|
54
58
|
@app.command(name="set")
|
|
@@ -5,6 +5,7 @@ Config Override Order: CLI arguments > .hud_eval.toml > defaults
|
|
|
5
5
|
|
|
6
6
|
from __future__ import annotations
|
|
7
7
|
|
|
8
|
+
import ast
|
|
8
9
|
import asyncio
|
|
9
10
|
import logging
|
|
10
11
|
import os
|
|
@@ -42,8 +43,9 @@ def _resolve_model_from_catalog(model_id: str) -> tuple[AgentType, str] | None:
|
|
|
42
43
|
Returns None if the model isn't found or the catalog is unreachable.
|
|
43
44
|
"""
|
|
44
45
|
try:
|
|
45
|
-
from hud.utils.gateway import list_gateway_models
|
|
46
|
+
from hud.utils.gateway import list_gateway_models, normalize_gateway_model_id
|
|
46
47
|
|
|
48
|
+
model_id = normalize_gateway_model_id(model_id)
|
|
47
49
|
models = list_gateway_models()
|
|
48
50
|
except Exception:
|
|
49
51
|
return None
|
|
@@ -116,8 +118,9 @@ class AgentPreset:
|
|
|
116
118
|
|
|
117
119
|
_AGENT_PRESETS: list[AgentPreset] = [
|
|
118
120
|
AgentPreset("Claude Sonnet 4.6", AgentType.CLAUDE, "claude-sonnet-4-6"),
|
|
119
|
-
AgentPreset("
|
|
120
|
-
AgentPreset("
|
|
121
|
+
AgentPreset("Claude Opus 4.8", AgentType.CLAUDE, "claude-opus-4-8"),
|
|
122
|
+
AgentPreset("GPT-5.5", AgentType.OPENAI, "gpt-5.5"),
|
|
123
|
+
AgentPreset("Gemini 3.1 Pro (Preview)", AgentType.GEMINI, "gemini-3.1-pro-preview"),
|
|
121
124
|
AgentPreset(
|
|
122
125
|
"Grok 4-1 Fast (xAI)",
|
|
123
126
|
AgentType.OPENAI_COMPATIBLE,
|
|
@@ -130,10 +133,22 @@ _AGENT_PRESETS: list[AgentPreset] = [
|
|
|
130
133
|
},
|
|
131
134
|
),
|
|
132
135
|
AgentPreset(
|
|
133
|
-
"GLM
|
|
136
|
+
"GLM 5.2 (Z.ai)",
|
|
134
137
|
AgentType.OPENAI_COMPATIBLE,
|
|
135
|
-
"z-ai/glm-
|
|
136
|
-
{"openai_compatible": {"base_url": settings.hud_gateway_url, "model_name": "GLM
|
|
138
|
+
"z-ai/glm-5.2",
|
|
139
|
+
{"openai_compatible": {"base_url": settings.hud_gateway_url, "model_name": "GLM 5.2"}},
|
|
140
|
+
),
|
|
141
|
+
AgentPreset(
|
|
142
|
+
"Kimi K2.6 (Moonshot)",
|
|
143
|
+
AgentType.OPENAI_COMPATIBLE,
|
|
144
|
+
"moonshotai/kimi-k2.6",
|
|
145
|
+
{"openai_compatible": {"base_url": settings.hud_gateway_url, "model_name": "Kimi K2.6"}},
|
|
146
|
+
),
|
|
147
|
+
AgentPreset(
|
|
148
|
+
"MiniMax M3",
|
|
149
|
+
AgentType.OPENAI_COMPATIBLE,
|
|
150
|
+
"MiniMax-M3",
|
|
151
|
+
{"openai_compatible": {"base_url": settings.hud_gateway_url, "model_name": "MiniMax M3"}},
|
|
137
152
|
),
|
|
138
153
|
]
|
|
139
154
|
|
|
@@ -161,7 +176,7 @@ _DEFAULT_CONFIG_TEMPLATE = """# HUD Eval Configuration
|
|
|
161
176
|
# use_computer_beta = true
|
|
162
177
|
|
|
163
178
|
[openai]
|
|
164
|
-
# model = "gpt-
|
|
179
|
+
# model = "gpt-5.5"
|
|
165
180
|
# temperature = 0.7
|
|
166
181
|
# max_output_tokens = 4096
|
|
167
182
|
|
|
@@ -401,6 +416,11 @@ class EvalConfig(BaseModel):
|
|
|
401
416
|
if self.model:
|
|
402
417
|
kwargs["model"] = self.model
|
|
403
418
|
|
|
419
|
+
if isinstance(kwargs.get("model"), str):
|
|
420
|
+
from hud.utils.gateway import normalize_gateway_model_id
|
|
421
|
+
|
|
422
|
+
kwargs["model"] = normalize_gateway_model_id(kwargs["model"])
|
|
423
|
+
|
|
404
424
|
if self.agent_type == AgentType.OPENAI_COMPATIBLE and "api_key" not in kwargs:
|
|
405
425
|
base_url = kwargs.get("base_url", "")
|
|
406
426
|
if settings.hud_gateway_url in base_url and settings.api_key:
|
|
@@ -665,13 +685,46 @@ def _build_agent(cfg: EvalConfig) -> Any:
|
|
|
665
685
|
return cast("Any", cfg.agent_type.cls)(config=config)
|
|
666
686
|
|
|
667
687
|
|
|
688
|
+
def _python_defines_environment(path: Path) -> bool:
|
|
689
|
+
"""Return True when ``path`` constructs a v6 :class:`~hud.environment.Environment`."""
|
|
690
|
+
try:
|
|
691
|
+
tree = ast.parse(path.read_text(encoding="utf-8"))
|
|
692
|
+
except (OSError, SyntaxError):
|
|
693
|
+
return False
|
|
694
|
+
for node in ast.walk(tree):
|
|
695
|
+
if not isinstance(node, ast.Call):
|
|
696
|
+
continue
|
|
697
|
+
callee = node.func
|
|
698
|
+
callee_name = (
|
|
699
|
+
callee.id
|
|
700
|
+
if isinstance(callee, ast.Name)
|
|
701
|
+
else callee.attr
|
|
702
|
+
if isinstance(callee, ast.Attribute)
|
|
703
|
+
else None
|
|
704
|
+
)
|
|
705
|
+
if callee_name == "Environment":
|
|
706
|
+
return True
|
|
707
|
+
return False
|
|
708
|
+
|
|
709
|
+
|
|
668
710
|
def _spawn_target(source: Path) -> Path:
    """Pick the filesystem path handed to the ``LocalRuntime`` provider.

    Resolution rules, applied to the resolved form of ``source``:

    * a directory is returned unchanged;
    * a ``.py`` file that itself constructs an ``Environment`` is returned unchanged;
    * any other ``.py`` file (for example a task-only module) maps to a
      sibling ``env.py`` when that file exists;
    * everything else, including JSON/JSONL task files and ``.py`` files with
      no sibling ``env.py``, maps to the containing directory.
    """
    target = source.resolve()
    if target.is_dir():
        return target

    folder = target.parent
    if target.suffix == ".py":
        if _python_defines_environment(target):
            return target
        sibling_env = folder / "env.py"
        if sibling_env.is_file():
            return sibling_env
    return folder
|
|
676
729
|
|
|
677
730
|
|
|
@@ -76,8 +76,8 @@ def init_command(
|
|
|
76
76
|
None,
|
|
77
77
|
"--preset",
|
|
78
78
|
"-p",
|
|
79
|
-
help="Starter preset to download from GitHub (e.g. blank,
|
|
80
|
-
"deepresearch,
|
|
79
|
+
help="Starter preset to download from GitHub (e.g. blank, browser, "
|
|
80
|
+
"deepresearch, cua, autonomous-businesses, verilog). Omit for an interactive picker; in a "
|
|
81
81
|
"non-interactive shell, omitting it writes the minimal local scaffold.",
|
|
82
82
|
),
|
|
83
83
|
) -> None:
|
|
@@ -89,7 +89,7 @@ def init_command(
|
|
|
89
89
|
|
|
90
90
|
Examples:
|
|
91
91
|
hud init my-env # interactive picker (or local scaffold)
|
|
92
|
-
hud init my-env --preset
|
|
92
|
+
hud init my-env --preset browser # download the browser starter
|
|
93
93
|
hud init my-env --dir envs # create ./envs/my-env[/not dim]
|
|
94
94
|
"""
|
|
95
95
|
hud_console = HUDConsole()
|