hud-python 0.6.3__tar.gz → 0.6.4__tar.gz

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (240)
  1. {hud_python-0.6.3 → hud_python-0.6.4}/PKG-INFO +1 -1
  2. hud_python-0.6.4/cookbooks/connect4-selfplay/README.md +57 -0
  3. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/__init__.py +11 -3
  4. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai_compatible/agent.py +15 -4
  5. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/tests/test_base.py +38 -2
  6. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/tests/test_provider_native_tools.py +4 -4
  7. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/types.py +7 -3
  8. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/__init__.py +4 -0
  9. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/eval.py +26 -7
  10. hud_python-0.6.4/hud/cli/jobs.py +146 -0
  11. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/models.py +21 -3
  12. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/tests/test_eval_config.py +40 -0
  13. hud_python-0.6.4/hud/cli/trace.py +215 -0
  14. {hud_python-0.6.3 → hud_python-0.6.4}/hud/eval/run.py +23 -5
  15. {hud_python-0.6.3 → hud_python-0.6.4}/hud/eval/runtime.py +51 -8
  16. {hud_python-0.6.3 → hud_python-0.6.4}/hud/eval/tests/test_hosted.py +48 -0
  17. {hud_python-0.6.3 → hud_python-0.6.4}/hud/eval/tests/test_rollout.py +26 -1
  18. {hud_python-0.6.3 → hud_python-0.6.4}/hud/settings.py +2 -2
  19. {hud_python-0.6.3 → hud_python-0.6.4}/hud/train/__init__.py +2 -0
  20. hud_python-0.6.4/hud/train/base.py +159 -0
  21. {hud_python-0.6.3 → hud_python-0.6.4}/hud/train/client.py +41 -17
  22. {hud_python-0.6.3 → hud_python-0.6.4}/hud/train/types.py +38 -4
  23. {hud_python-0.6.3 → hud_python-0.6.4}/hud/utils/gateway.py +23 -0
  24. {hud_python-0.6.3 → hud_python-0.6.4}/hud/version.py +1 -1
  25. {hud_python-0.6.3 → hud_python-0.6.4}/pyproject.toml +1 -1
  26. hud_python-0.6.3/hud/train/base.py +0 -102
  27. {hud_python-0.6.3 → hud_python-0.6.4}/.gitignore +0 -0
  28. {hud_python-0.6.3 → hud_python-0.6.4}/LICENSE +0 -0
  29. {hud_python-0.6.3 → hud_python-0.6.4}/README.md +0 -0
  30. {hud_python-0.6.3 → hud_python-0.6.4}/cookbooks/a2a-chat/README.md +0 -0
  31. {hud_python-0.6.3 → hud_python-0.6.4}/cookbooks/a2a-chat/pyproject.toml +0 -0
  32. {hud_python-0.6.3 → hud_python-0.6.4}/cookbooks/codex-coding/README.md +0 -0
  33. {hud_python-0.6.3 → hud_python-0.6.4}/cookbooks/codex-coding/pyproject.toml +0 -0
  34. {hud_python-0.6.3 → hud_python-0.6.4}/cookbooks/fireworks-rl-training/README.md +0 -0
  35. {hud_python-0.6.3 → hud_python-0.6.4}/cookbooks/fireworks-rl-training/pyproject.toml +0 -0
  36. {hud_python-0.6.3 → hud_python-0.6.4}/cookbooks/rl-training/README.md +0 -0
  37. {hud_python-0.6.3 → hud_python-0.6.4}/cookbooks/rl-training/pyproject.toml +0 -0
  38. {hud_python-0.6.3 → hud_python-0.6.4}/hud/__init__.py +0 -0
  39. {hud_python-0.6.3 → hud_python-0.6.4}/hud/__main__.py +0 -0
  40. {hud_python-0.6.3 → hud_python-0.6.4}/hud/_legacy.py +0 -0
  41. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/base.py +0 -0
  42. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/browser_use/__init__.py +0 -0
  43. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/browser_use/agent.py +0 -0
  44. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/claude/__init__.py +0 -0
  45. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/claude/agent.py +0 -0
  46. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/claude/sdk/__init__.py +0 -0
  47. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/claude/sdk/agent.py +0 -0
  48. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/claude/sdk/computer_mcp.py +0 -0
  49. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/claude/tools/__init__.py +0 -0
  50. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/claude/tools/base.py +0 -0
  51. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/claude/tools/coding.py +0 -0
  52. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/claude/tools/computer.py +0 -0
  53. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/claude/tools/hosted.py +0 -0
  54. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/claude/tools/mcp_proxy.py +0 -0
  55. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/claude/tools/settings.py +0 -0
  56. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/claude/tools/tests/__init__.py +0 -0
  57. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/claude/tools/tests/test_computer.py +0 -0
  58. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/gemini/__init__.py +0 -0
  59. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/gemini/agent.py +0 -0
  60. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/gemini/settings.py +0 -0
  61. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/gemini/tools/__init__.py +0 -0
  62. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/gemini/tools/base.py +0 -0
  63. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/gemini/tools/coding.py +0 -0
  64. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/gemini/tools/computer.py +0 -0
  65. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/gemini/tools/filesystem.py +0 -0
  66. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/gemini/tools/hosted.py +0 -0
  67. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/gemini/tools/mcp_proxy.py +0 -0
  68. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/gemini/tools/tests/__init__.py +0 -0
  69. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/gemini/tools/tests/test_computer.py +0 -0
  70. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/misc/__init__.py +0 -0
  71. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/misc/response_automation.py +0 -0
  72. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai/__init__.py +0 -0
  73. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai/agent.py +0 -0
  74. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai/tools/__init__.py +0 -0
  75. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai/tools/apply_patch.py +0 -0
  76. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai/tools/base.py +0 -0
  77. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai/tools/coding.py +0 -0
  78. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai/tools/computer.py +0 -0
  79. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai/tools/hosted.py +0 -0
  80. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai/tools/mcp_proxy.py +0 -0
  81. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai/tools/strict_schema.py +0 -0
  82. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai/tools/tests/__init__.py +0 -0
  83. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai/tools/tests/test_computer.py +0 -0
  84. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai/tools/tests/test_strict_schema.py +0 -0
  85. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai_compatible/__init__.py +0 -0
  86. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai_compatible/tools/__init__.py +0 -0
  87. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai_compatible/tools/base.py +0 -0
  88. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai_compatible/tools/filesystem.py +0 -0
  89. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai_compatible/tools/mcp_proxy.py +0 -0
  90. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/robot/__init__.py +0 -0
  91. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/robot/_types.py +0 -0
  92. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/robot/adapter.py +0 -0
  93. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/robot/agent.py +0 -0
  94. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/robot/model.py +0 -0
  95. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/tests/__init__.py +0 -0
  96. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/tests/test_apply_patch.py +0 -0
  97. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/tests/test_claude_agent.py +0 -0
  98. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/tests/test_claude_sdk_agent.py +0 -0
  99. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/tests/test_gemini_agent.py +0 -0
  100. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/tests/test_openai_agent.py +0 -0
  101. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/tests/test_openai_compatible_agent.py +0 -0
  102. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/tests/test_tool_agent.py +0 -0
  103. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/tests/test_trace.py +0 -0
  104. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/tool_agent.py +0 -0
  105. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/tools/__init__.py +0 -0
  106. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/tools/base.py +0 -0
  107. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/tools/hosted.py +0 -0
  108. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/tools/mcp.py +0 -0
  109. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/tools/rfb.py +0 -0
  110. {hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/tools/ssh.py +0 -0
  111. {hud_python-0.6.3 → hud_python-0.6.4}/hud/capabilities/__init__.py +0 -0
  112. {hud_python-0.6.3 → hud_python-0.6.4}/hud/capabilities/base.py +0 -0
  113. {hud_python-0.6.3 → hud_python-0.6.4}/hud/capabilities/cdp.py +0 -0
  114. {hud_python-0.6.3 → hud_python-0.6.4}/hud/capabilities/filetracking.py +0 -0
  115. {hud_python-0.6.3 → hud_python-0.6.4}/hud/capabilities/mcp.py +0 -0
  116. {hud_python-0.6.3 → hud_python-0.6.4}/hud/capabilities/rfb.py +0 -0
  117. {hud_python-0.6.3 → hud_python-0.6.4}/hud/capabilities/robot.py +0 -0
  118. {hud_python-0.6.3 → hud_python-0.6.4}/hud/capabilities/ssh.py +0 -0
  119. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/__main__.py +0 -0
  120. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/cancel.py +0 -0
  121. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/client.py +0 -0
  122. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/deploy.py +0 -0
  123. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/init.py +0 -0
  124. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/login.py +0 -0
  125. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/presets.py +0 -0
  126. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/serve.py +0 -0
  127. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/sync.py +0 -0
  128. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/task.py +0 -0
  129. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/templates.py +0 -0
  130. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/tests/__init__.py +0 -0
  131. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/tests/test_cli_init.py +0 -0
  132. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/tests/test_cli_main.py +0 -0
  133. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/tests/test_cli_more_wrappers.py +0 -0
  134. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/tests/test_deploy.py +0 -0
  135. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/tests/test_eval_bedrock.py +0 -0
  136. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/tests/test_init.py +0 -0
  137. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/tests/test_main_module.py +0 -0
  138. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/tests/test_sync_export.py +0 -0
  139. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/__init__.py +0 -0
  140. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/api.py +0 -0
  141. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/build_display.py +0 -0
  142. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/build_logs.py +0 -0
  143. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/config.py +0 -0
  144. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/context.py +0 -0
  145. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/display.py +0 -0
  146. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/jobs.py +0 -0
  147. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/registry.py +0 -0
  148. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/source.py +0 -0
  149. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/tasks.py +0 -0
  150. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/tests/__init__.py +0 -0
  151. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/tests/test_build_display.py +0 -0
  152. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/tests/test_config.py +0 -0
  153. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/tests/test_context.py +0 -0
  154. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/tests/test_registry.py +0 -0
  155. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/tests/test_source.py +0 -0
  156. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/tests/test_tasks.py +0 -0
  157. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/tests/test_version_check.py +0 -0
  158. {hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/utils/version_check.py +0 -0
  159. {hud_python-0.6.3 → hud_python-0.6.4}/hud/clients/__init__.py +0 -0
  160. {hud_python-0.6.3 → hud_python-0.6.4}/hud/clients/client.py +0 -0
  161. {hud_python-0.6.3 → hud_python-0.6.4}/hud/clients/tests/__init__.py +0 -0
  162. {hud_python-0.6.3 → hud_python-0.6.4}/hud/clients/tests/test_connect.py +0 -0
  163. {hud_python-0.6.3 → hud_python-0.6.4}/hud/conftest.py +0 -0
  164. {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/__init__.py +0 -0
  165. {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/env.py +0 -0
  166. {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/file_tracker.py +0 -0
  167. {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/file_tracking.py +0 -0
  168. {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/legacy.py +0 -0
  169. {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/robot/__init__.py +0 -0
  170. {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/robot/bridge.py +0 -0
  171. {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/robot/endpoint.py +0 -0
  172. {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/robot/sim_runner.py +0 -0
  173. {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/server.py +0 -0
  174. {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/tests/__init__.py +0 -0
  175. {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/tests/conftest.py +0 -0
  176. {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/tests/test_capability_backing.py +0 -0
  177. {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/tests/test_file_tracker.py +0 -0
  178. {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/tests/test_file_tracking.py +0 -0
  179. {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/tests/test_legacy.py +0 -0
  180. {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/tests/test_loader.py +0 -0
  181. {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/tests/test_manifest.py +0 -0
  182. {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/tests/test_server.py +0 -0
  183. {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/tests/test_tunnel.py +0 -0
  184. {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/utils.py +0 -0
  185. {hud_python-0.6.3 → hud_python-0.6.4}/hud/environment/workspace.py +0 -0
  186. {hud_python-0.6.3 → hud_python-0.6.4}/hud/eval/__init__.py +0 -0
  187. {hud_python-0.6.3 → hud_python-0.6.4}/hud/eval/chat.py +0 -0
  188. {hud_python-0.6.3 → hud_python-0.6.4}/hud/eval/file_tracking.py +0 -0
  189. {hud_python-0.6.3 → hud_python-0.6.4}/hud/eval/job.py +0 -0
  190. {hud_python-0.6.3 → hud_python-0.6.4}/hud/eval/sync.py +0 -0
  191. {hud_python-0.6.3 → hud_python-0.6.4}/hud/eval/task.py +0 -0
  192. {hud_python-0.6.3 → hud_python-0.6.4}/hud/eval/taskset.py +0 -0
  193. {hud_python-0.6.3 → hud_python-0.6.4}/hud/eval/tests/__init__.py +0 -0
  194. {hud_python-0.6.3 → hud_python-0.6.4}/hud/eval/tests/test_chat.py +0 -0
  195. {hud_python-0.6.3 → hud_python-0.6.4}/hud/eval/tests/test_docker_provider.py +0 -0
  196. {hud_python-0.6.3 → hud_python-0.6.4}/hud/eval/tests/test_file_tracking_observer.py +0 -0
  197. {hud_python-0.6.3 → hud_python-0.6.4}/hud/eval/tests/test_job.py +0 -0
  198. {hud_python-0.6.3 → hud_python-0.6.4}/hud/eval/tests/test_sync.py +0 -0
  199. {hud_python-0.6.3 → hud_python-0.6.4}/hud/eval/tests/test_task.py +0 -0
  200. {hud_python-0.6.3 → hud_python-0.6.4}/hud/graders/__init__.py +0 -0
  201. {hud_python-0.6.3 → hud_python-0.6.4}/hud/graders/base.py +0 -0
  202. {hud_python-0.6.3 → hud_python-0.6.4}/hud/graders/bash.py +0 -0
  203. {hud_python-0.6.3 → hud_python-0.6.4}/hud/graders/combine.py +0 -0
  204. {hud_python-0.6.3 → hud_python-0.6.4}/hud/graders/judge.py +0 -0
  205. {hud_python-0.6.3 → hud_python-0.6.4}/hud/graders/results.py +0 -0
  206. {hud_python-0.6.3 → hud_python-0.6.4}/hud/graders/text.py +0 -0
  207. {hud_python-0.6.3 → hud_python-0.6.4}/hud/patches/__init__.py +0 -0
  208. {hud_python-0.6.3 → hud_python-0.6.4}/hud/patches/mcp_patches.py +0 -0
  209. {hud_python-0.6.3 → hud_python-0.6.4}/hud/patches/tests/__init__.py +0 -0
  210. {hud_python-0.6.3 → hud_python-0.6.4}/hud/patches/tests/test_warnings.py +0 -0
  211. {hud_python-0.6.3 → hud_python-0.6.4}/hud/patches/warnings.py +0 -0
  212. {hud_python-0.6.3 → hud_python-0.6.4}/hud/py.typed +0 -0
  213. {hud_python-0.6.3 → hud_python-0.6.4}/hud/server.py +0 -0
  214. {hud_python-0.6.3 → hud_python-0.6.4}/hud/telemetry/__init__.py +0 -0
  215. {hud_python-0.6.3 → hud_python-0.6.4}/hud/telemetry/context.py +0 -0
  216. {hud_python-0.6.3 → hud_python-0.6.4}/hud/telemetry/exporter.py +0 -0
  217. {hud_python-0.6.3 → hud_python-0.6.4}/hud/telemetry/filetracking.py +0 -0
  218. {hud_python-0.6.3 → hud_python-0.6.4}/hud/telemetry/instrument.py +0 -0
  219. {hud_python-0.6.3 → hud_python-0.6.4}/hud/telemetry/span.py +0 -0
  220. {hud_python-0.6.3 → hud_python-0.6.4}/hud/telemetry/tests/__init__.py +0 -0
  221. {hud_python-0.6.3 → hud_python-0.6.4}/hud/telemetry/tests/test_exporter.py +0 -0
  222. {hud_python-0.6.3 → hud_python-0.6.4}/hud/telemetry/tests/test_filetracking.py +0 -0
  223. {hud_python-0.6.3 → hud_python-0.6.4}/hud/telemetry/tests/test_instrument.py +0 -0
  224. {hud_python-0.6.3 → hud_python-0.6.4}/hud/types.py +0 -0
  225. {hud_python-0.6.3 → hud_python-0.6.4}/hud/utils/__init__.py +0 -0
  226. {hud_python-0.6.3 → hud_python-0.6.4}/hud/utils/exceptions.py +0 -0
  227. {hud_python-0.6.3 → hud_python-0.6.4}/hud/utils/hints.py +0 -0
  228. {hud_python-0.6.3 → hud_python-0.6.4}/hud/utils/hud_console.py +0 -0
  229. {hud_python-0.6.3 → hud_python-0.6.4}/hud/utils/modules.py +0 -0
  230. {hud_python-0.6.3 → hud_python-0.6.4}/hud/utils/platform.py +0 -0
  231. {hud_python-0.6.3 → hud_python-0.6.4}/hud/utils/requests.py +0 -0
  232. {hud_python-0.6.3 → hud_python-0.6.4}/hud/utils/serialization.py +0 -0
  233. {hud_python-0.6.3 → hud_python-0.6.4}/hud/utils/tests/__init__.py +0 -0
  234. {hud_python-0.6.3 → hud_python-0.6.4}/hud/utils/tests/test_exceptions.py +0 -0
  235. {hud_python-0.6.3 → hud_python-0.6.4}/hud/utils/tests/test_hints.py +0 -0
  236. {hud_python-0.6.3 → hud_python-0.6.4}/hud/utils/tests/test_hud_console.py +0 -0
  237. {hud_python-0.6.3 → hud_python-0.6.4}/hud/utils/tests/test_platform.py +0 -0
  238. {hud_python-0.6.3 → hud_python-0.6.4}/hud/utils/tests/test_requests.py +0 -0
  239. {hud_python-0.6.3 → hud_python-0.6.4}/hud/utils/tests/test_serialization.py +0 -0
  240. {hud_python-0.6.3 → hud_python-0.6.4}/hud/utils/time.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.6.3
3
+ Version: 0.6.4
4
4
  Summary: SDK for the HUD platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-python
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -0,0 +1,57 @@
1
+ # Connect Four self-play
2
+
3
+ Symmetric self-play RL on a 6×7 Connect Four board. Draws are rare (you need a
4
+ full 42-cell board with no four-in-a-row), so the win/loss reward signal
5
+ persists as the policy improves and the GRPO advantage stays non-zero.
6
+
7
+ ## How it works
8
+
9
+ - One agent ("outer") plays a full game against an inner model on the **same
10
+ slug** — true self-play. `seed % 2` decides who drops first, for symmetric
11
+ first-move coverage.
12
+ - Each game trains **both sides at once**: the outer agent's `Run` (reward from
13
+ its perspective) plus a hand-built `TrajectoryPayload` for the inner model
14
+ with the flipped reward (`1 - outer_reward`).
15
+ - `group_size=2` pairs each game's two trajectories so the GRPO advantage is
16
+ `reward - 0.5` per game.
17
+ - `loss_fn="ppo"` clips the importance-sampling ratio, so a single lucky game
18
+ can't blow up the update.
19
+
20
+ The training loop uses the public API directly — `forward_backward` accepts
21
+ `Run` and `TrajectoryPayload` mixed, so no private helpers are needed.
22
+
23
+ ## Setup
24
+
25
+ ```bash
26
+ hud models fork Qwen/Qwen3.5-4B --name c4-selfplay # prints a slug like c4-selfplay-<id>
27
+ ```
28
+
29
+ Put your `HUD_API_KEY` in a `.env` here (or the environment).
30
+
31
+ ## Run
32
+
33
+ Local sanity check (one game, cheap external model as the outer agent):
34
+
35
+ ```bash
36
+ hud eval env.py claude --model claude-haiku-4-5
37
+ ```
38
+
39
+ Train:
40
+
41
+ ```bash
42
+ python train.py --model c4-selfplay-<id> --steps 20 --group 4 --lr 1e-5
43
+ ```
44
+
45
+ ## Tuning notes
46
+
47
+ - **Memory scales with `tasks × group`.** Each task×rollout is a fresh `env.py`
48
+ subprocess. With 8 tasks and `--group 4` that's 32 concurrent games. Connect
49
+ Four games can run up to 42 plies, so they cost more tokens and time per game —
50
+ start at `--group 4` and raise only if you have RAM headroom.
51
+ - **Watch the server-side metrics.** The loop prints local win/draw/loss counts
52
+ each step and the last few checkpoints' `mean_reward` / `reward_std` via
53
+ `trainer.checkpoints()` at the end. A healthy run keeps non-trivial
54
+ `reward_std` (within-group spread); if it collapses, the policy has saturated.
55
+ - **Reset on changes.** If you edit the reward or the board, roll the head back
56
+ to a clean checkpoint (`hud models head <slug> --set <id>`) or fork fresh —
57
+ don't keep training a policy shaped by the old objective.
@@ -8,7 +8,12 @@ from __future__ import annotations
8
8
  from typing import TYPE_CHECKING, Any, cast
9
9
 
10
10
  from hud.types import AgentType
11
- from hud.utils.gateway import build_gateway_client, list_gateway_models
11
+ from hud.utils.gateway import (
12
+ build_gateway_client,
13
+ gateway_model_aliases,
14
+ list_gateway_models,
15
+ normalize_gateway_model_id,
16
+ )
12
17
 
13
18
  if TYPE_CHECKING:
14
19
  from typing import TypeAlias
@@ -27,6 +32,8 @@ def create_agent(model: str, **kwargs: Any) -> GatewayAgent:
27
32
 
28
33
  For direct API access with provider API keys, instantiate the agent classes directly.
29
34
  """
35
+ requested_model = model
36
+ model = normalize_gateway_model_id(model)
30
37
  agent_type = next((candidate for candidate in AgentType if candidate.value == model), None)
31
38
  if agent_type is not None:
32
39
  model_id = model
@@ -73,7 +80,8 @@ def create_agent(model: str, **kwargs: Any) -> GatewayAgent:
73
80
  for n in (gm.id, gm.name, gm.model_name)
74
81
  if isinstance(n, str)
75
82
  ]
76
- near = difflib.get_close_matches(model, known, n=3, cutoff=0.5)
83
+ known.extend(gateway_model_aliases())
84
+ near = difflib.get_close_matches(requested_model, known, n=3, cutoff=0.5)
77
85
  hint = (
78
86
  f" Did you mean: {', '.join(near)}?"
79
87
  if near
@@ -84,7 +92,7 @@ def create_agent(model: str, **kwargs: Any) -> GatewayAgent:
84
92
  if gateway_models
85
93
  else "the HUD gateway registry (empty — is HUD_API_KEY set?)"
86
94
  )
87
- raise ValueError(f"Model {model!r} not found in {source}.{hint}")
95
+ raise ValueError(f"Model {requested_model!r} not found in {source}.{hint}")
88
96
 
89
97
  kwargs.setdefault("model", model_id)
90
98
  kwargs.setdefault("model_client", build_gateway_client(provider_name))
@@ -193,16 +193,27 @@ class OpenAIChatAgent(ToolAgent[ChatCompletionMessageParam, OpenAIChatConfig]):
193
193
  sample: Sample | None = None
194
194
  if return_token_ids:
195
195
  prompt_token_ids = getattr(choice, "prompt_token_ids", None)
196
+ # Multimodal prompt (text + image chunks): the only prompt representation
197
+ # that survives image inputs; flat prompt_token_ids is null in that case.
198
+ prompt_chunks = getattr(choice, "prompt_chunks", None)
196
199
  token_ids = getattr(choice, "token_ids", None)
197
- if prompt_token_ids is not None and token_ids is not None:
198
- chat_state.continuation_token_ids = list(prompt_token_ids) + list(token_ids)
199
- chat_state.continuation_message_count = len(messages)
200
+ has_prompt = prompt_token_ids is not None or prompt_chunks is not None
201
+ if token_ids is not None and has_prompt:
200
202
  content_lp = choice.logprobs.content if choice.logprobs else None
201
203
  sample = Sample(
202
- prompt_token_ids=list(prompt_token_ids),
204
+ prompt_token_ids=list(prompt_token_ids) if prompt_token_ids is not None else [],
205
+ prompt_chunks=list(prompt_chunks) if prompt_chunks is not None else None,
203
206
  output_token_ids=list(token_ids),
204
207
  output_logprobs=[tok.logprob for tok in content_lp] if content_lp else [],
205
208
  )
209
+ # KV-cache continuation only applies to flat text prompts; clear any
210
+ # stale state when the gateway returns chunks-only (multimodal turn).
211
+ if prompt_token_ids is not None:
212
+ chat_state.continuation_token_ids = list(prompt_token_ids) + list(token_ids)
213
+ chat_state.continuation_message_count = len(messages)
214
+ else:
215
+ chat_state.continuation_token_ids = None
216
+ chat_state.continuation_message_count = None
206
217
 
207
218
  tool_calls: list[MCPToolCall] = []
208
219
  for tc in function_calls:
@@ -108,7 +108,7 @@ def test_create_agent_resolves_gateway_model_metadata(
108
108
 
109
109
  model = GatewayModelInfo(
110
110
  id="ft:custom-123",
111
- model_name="gpt-5.4",
111
+ model_name="gpt-5.5",
112
112
  sdk_agent_type="openai_compatible",
113
113
  provider=GatewayProviderInfo(name="openai"),
114
114
  )
@@ -122,4 +122,40 @@ def test_create_agent_resolves_gateway_model_metadata(
122
122
  agent = create_agent("ft:custom-123")
123
123
 
124
124
  assert isinstance(agent, OpenAIChatAgent)
125
- assert agent.config.model == "gpt-5.4" # resolved to the model's real name
125
+ assert agent.config.model == "gpt-5.5" # resolved to the model's real name
126
+
127
+
128
+ @pytest.mark.parametrize(
129
+ ("alias", "canonical"),
130
+ [
131
+ ("deepseek-v4", "deepseek/deepseek-v4-pro"),
132
+ ("deepseek-v4-flash", "deepseek/deepseek-v4-flash"),
133
+ ("glm-5.2", "z-ai/glm-5.2"),
134
+ ("kimi-k2.6", "moonshotai/kimi-k2.6"),
135
+ ("minimax-m3", "MiniMax-M3"),
136
+ ],
137
+ )
138
+ def test_create_agent_accepts_gateway_model_aliases(
139
+ alias: str,
140
+ canonical: str,
141
+ monkeypatch: pytest.MonkeyPatch,
142
+ ) -> None:
143
+ from hud.utils.gateway import GatewayModelInfo, GatewayProviderInfo
144
+
145
+ model = GatewayModelInfo(
146
+ id=canonical,
147
+ model_name=canonical,
148
+ sdk_agent_type="openai_compatible",
149
+ provider=GatewayProviderInfo(name="openai"),
150
+ )
151
+ monkeypatch.setattr("hud.agents.list_gateway_models", lambda: [model])
152
+
153
+ def _build_client(_provider: str) -> object:
154
+ return object()
155
+
156
+ monkeypatch.setattr("hud.agents.build_gateway_client", _build_client)
157
+
158
+ agent = create_agent(alias)
159
+
160
+ assert isinstance(agent, OpenAIChatAgent)
161
+ assert agent.config.model == canonical
@@ -102,7 +102,7 @@ def _commands(tool: Any) -> list[str]:
102
102
 
103
103
 
104
104
  async def test_openai_shell_wraps_command_with_timeout() -> None:
105
- tool = OpenAIShellTool(spec=OpenAIShellTool.default_spec("gpt-5.4"), client=_ssh())
105
+ tool = OpenAIShellTool(spec=OpenAIShellTool.default_spec("gpt-5.5"), client=_ssh())
106
106
 
107
107
  result = await tool.execute({"commands": ["pwd"], "timeout_ms": 2500})
108
108
 
@@ -114,7 +114,7 @@ async def test_openai_shell_wraps_command_with_timeout() -> None:
114
114
 
115
115
 
116
116
  async def test_openai_shell_runs_each_command_without_timeout() -> None:
117
- tool = OpenAIShellTool(spec=OpenAIShellTool.default_spec("gpt-5.4"), client=_ssh())
117
+ tool = OpenAIShellTool(spec=OpenAIShellTool.default_spec("gpt-5.5"), client=_ssh())
118
118
 
119
119
  await tool.execute({"commands": ["echo a", "echo b"]})
120
120
 
@@ -122,7 +122,7 @@ async def test_openai_shell_runs_each_command_without_timeout() -> None:
122
122
 
123
123
 
124
124
  async def test_openai_shell_rejects_non_list_commands_without_running() -> None:
125
- tool = OpenAIShellTool(spec=OpenAIShellTool.default_spec("gpt-5.4"), client=_ssh())
125
+ tool = OpenAIShellTool(spec=OpenAIShellTool.default_spec("gpt-5.5"), client=_ssh())
126
126
 
127
127
  result = await tool.execute({"commands": 123})
128
128
 
@@ -131,7 +131,7 @@ async def test_openai_shell_rejects_non_list_commands_without_running() -> None:
131
131
 
132
132
 
133
133
  def test_openai_shell_to_params_is_shell_type() -> None:
134
- tool = OpenAIShellTool(spec=OpenAIShellTool.default_spec("gpt-5.4"), client=_ssh())
134
+ tool = OpenAIShellTool(spec=OpenAIShellTool.default_spec("gpt-5.5"), client=_ssh())
135
135
  assert tool.to_params()["type"] == "shell"
136
136
 
137
137
 
@@ -99,7 +99,7 @@ class OpenAIConfig(AgentConfig):
99
99
  """Configuration for OpenAIAgent."""
100
100
 
101
101
  model_name: str = "OpenAI"
102
- model: str = Field(default="gpt-5.4", validation_alias=_model_alias)
102
+ model: str = Field(default="gpt-5.5", validation_alias=_model_alias)
103
103
  max_output_tokens: int | None = None
104
104
  temperature: float | None = None
105
105
  reasoning: Any = None # openai Reasoning
@@ -113,7 +113,7 @@ class OpenAIChatConfig(AgentConfig):
113
113
  """Configuration for OpenAIChatAgent."""
114
114
 
115
115
  model_name: str = "OpenAI Chat"
116
- model: str = Field(default="gpt-5-mini", validation_alias=_model_alias)
116
+ model: str = Field(default="gpt-5.4-mini", validation_alias=_model_alias)
117
117
  checkpoint: str | None = Field(
118
118
  default=None,
119
119
  description="Specific checkpoint name for inference routing. "
@@ -139,7 +139,7 @@ class ClaudeSDKConfig(AgentConfig):
139
139
  """
140
140
 
141
141
  model_name: str = "Claude Code"
142
- model: str = Field(default="claude-sonnet-4-5", validation_alias=_model_alias)
142
+ model: str = Field(default="claude-sonnet-4-6", validation_alias=_model_alias)
143
143
  permission_mode: str = "bypassPermissions"
144
144
  max_steps: int = -1
145
145
  allowed_tools: list[str] = Field(
@@ -222,6 +222,10 @@ class Sample(BaseModel):
222
222
  """
223
223
 
224
224
  prompt_token_ids: list[int] = Field(default_factory=list[int])
225
+ # Multimodal prompt as serialized ``ModelInput`` chunks (text + image), set by
226
+ # vision rollouts where the prompt is not a flat token list. When present it is
227
+ # the authoritative prompt for training; ``prompt_token_ids`` stays empty.
228
+ prompt_chunks: list[dict[str, Any]] | None = None
225
229
  output_token_ids: list[int] = Field(default_factory=list[int])
226
230
  output_logprobs: list[float] = Field(default_factory=list[float])
227
231
 
@@ -35,11 +35,13 @@ from .client import client_app # noqa: E402
35
35
  from .deploy import deploy_command # noqa: E402
36
36
  from .eval import eval_command # noqa: E402
37
37
  from .init import init_command # noqa: E402
38
+ from .jobs import jobs_app # noqa: E402
38
39
  from .login import login_command # noqa: E402
39
40
  from .models import models_app # noqa: E402
40
41
  from .serve import serve_command # noqa: E402
41
42
  from .sync import sync_app # noqa: E402
42
43
  from .task import task_app # noqa: E402
44
+ from .trace import trace_app # noqa: E402
43
45
 
44
46
  app.command(name="serve")(serve_command)
45
47
  app.command(name="dev", deprecated=True, hidden=True)(serve_command) # alias for now
@@ -49,6 +51,8 @@ app.command(name="eval")(eval_command)
49
51
  app.command(name="init")(init_command)
50
52
  app.command(name="cancel")(cancel_command)
51
53
  app.add_typer(models_app, name="models")
54
+ app.add_typer(jobs_app, name="jobs")
55
+ app.add_typer(trace_app, name="trace")
52
56
 
53
57
 
54
58
  @app.command(name="set")
@@ -43,8 +43,9 @@ def _resolve_model_from_catalog(model_id: str) -> tuple[AgentType, str] | None:
43
43
  Returns None if the model isn't found or the catalog is unreachable.
44
44
  """
45
45
  try:
46
- from hud.utils.gateway import list_gateway_models
46
+ from hud.utils.gateway import list_gateway_models, normalize_gateway_model_id
47
47
 
48
+ model_id = normalize_gateway_model_id(model_id)
48
49
  models = list_gateway_models()
49
50
  except Exception:
50
51
  return None
@@ -117,8 +118,9 @@ class AgentPreset:
117
118
 
118
119
  _AGENT_PRESETS: list[AgentPreset] = [
119
120
  AgentPreset("Claude Sonnet 4.6", AgentType.CLAUDE, "claude-sonnet-4-6"),
120
- AgentPreset("GPT-5.4", AgentType.OPENAI, "gpt-5.4"),
121
- AgentPreset("Gemini 3.1 Pro (Preview)", AgentType.GEMINI, "gemini-3-1-pro"),
121
+ AgentPreset("Claude Opus 4.8", AgentType.CLAUDE, "claude-opus-4-8"),
122
+ AgentPreset("GPT-5.5", AgentType.OPENAI, "gpt-5.5"),
123
+ AgentPreset("Gemini 3.1 Pro (Preview)", AgentType.GEMINI, "gemini-3.1-pro-preview"),
122
124
  AgentPreset(
123
125
  "Grok 4-1 Fast (xAI)",
124
126
  AgentType.OPENAI_COMPATIBLE,
@@ -131,10 +133,22 @@ _AGENT_PRESETS: list[AgentPreset] = [
131
133
  },
132
134
  ),
133
135
  AgentPreset(
134
- "GLM-4.6V (Z-AI)",
136
+ "GLM 5.2 (Z.ai)",
135
137
  AgentType.OPENAI_COMPATIBLE,
136
- "z-ai/glm-4.6v",
137
- {"openai_compatible": {"base_url": settings.hud_gateway_url, "model_name": "GLM-4.6V"}},
138
+ "z-ai/glm-5.2",
139
+ {"openai_compatible": {"base_url": settings.hud_gateway_url, "model_name": "GLM 5.2"}},
140
+ ),
141
+ AgentPreset(
142
+ "Kimi K2.6 (Moonshot)",
143
+ AgentType.OPENAI_COMPATIBLE,
144
+ "moonshotai/kimi-k2.6",
145
+ {"openai_compatible": {"base_url": settings.hud_gateway_url, "model_name": "Kimi K2.6"}},
146
+ ),
147
+ AgentPreset(
148
+ "MiniMax M3",
149
+ AgentType.OPENAI_COMPATIBLE,
150
+ "MiniMax-M3",
151
+ {"openai_compatible": {"base_url": settings.hud_gateway_url, "model_name": "MiniMax M3"}},
138
152
  ),
139
153
  ]
140
154
 
@@ -162,7 +176,7 @@ _DEFAULT_CONFIG_TEMPLATE = """# HUD Eval Configuration
162
176
  # use_computer_beta = true
163
177
 
164
178
  [openai]
165
- # model = "gpt-4o"
179
+ # model = "gpt-5.5"
166
180
  # temperature = 0.7
167
181
  # max_output_tokens = 4096
168
182
 
@@ -402,6 +416,11 @@ class EvalConfig(BaseModel):
402
416
  if self.model:
403
417
  kwargs["model"] = self.model
404
418
 
419
+ if isinstance(kwargs.get("model"), str):
420
+ from hud.utils.gateway import normalize_gateway_model_id
421
+
422
+ kwargs["model"] = normalize_gateway_model_id(kwargs["model"])
423
+
405
424
  if self.agent_type == AgentType.OPENAI_COMPATIBLE and "api_key" not in kwargs:
406
425
  base_url = kwargs.get("base_url", "")
407
426
  if settings.hud_gateway_url in base_url and settings.api_key:
@@ -0,0 +1,146 @@
1
+ """``hud jobs`` — list jobs and their traces."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+
7
+ import typer
8
+ from rich.console import Console
9
+ from rich.panel import Panel
10
+ from rich.table import Table
11
+
12
# Single console shared by every output path of the ``hud jobs`` command.
console = Console()

# Typer sub-application for ``hud jobs``. Help-on-no-args is disabled so that a
# bare ``hud jobs`` reaches the callback and lists recent jobs.
jobs_app = typer.Typer(
    no_args_is_help=False,
    rich_markup_mode="rich",
    add_completion=False,
    help="List jobs and their traces",
    name="jobs",
)
21
+
22
+
23
@jobs_app.callback(invoke_without_command=True)
def jobs_command(
    ctx: typer.Context,
    job_id: str | None = typer.Argument(None, help="Job ID — omit to list recent jobs"),
    json_output: bool = typer.Option(False, "--json", help="Output as JSON"),
    limit: int = typer.Option(20, "--limit", "-n", help="Max rows to show"),
) -> None:
    """List recent jobs, or show traces for a specific job.

    Without an argument, lists the most recent jobs.
    With a job id, lists all traces for that job.
    """
    # When a subcommand was selected, leave all output to it.
    if ctx.invoked_subcommand is not None:
        return

    # Imported lazily, matching the other function-scope imports in this module.
    from hud.cli.utils.api import require_api_key

    require_api_key("list jobs")

    # No (or empty) job id: fall back to the recent-jobs listing.
    if not job_id:
        _list_jobs(json_output=json_output, limit=limit)
        return

    _show_job_traces(job_id, json_output=json_output, limit=limit)
46
+
47
+
48
+ # ── job listing ────────────────────────────────────────────────────────────────
49
+
50
+
51
def _list_jobs(*, json_output: bool, limit: int) -> None:
    """Print the most recent jobs as a table, or as JSON.

    Args:
        json_output: When true, dump the job items as JSON and skip the table.
        limit: Sent as the ``limit`` query parameter of the ``/jobs`` request.

    Raises:
        typer.Exit: With exit code 1 when the request raises.
    """
    from rich.markup import escape

    from hud.utils.platform import PlatformClient

    def cell(value: object) -> str:
        # Stringify and escape so non-string values or stray "[...]" sequences
        # in the payload cannot break table rendering.
        return escape(str(value)) if value else "-"

    client = PlatformClient.from_settings()
    try:
        data = client.get("/jobs", params={"limit": limit})
    except Exception as e:
        # Escape the message: brackets in it would otherwise be parsed as markup.
        console.print(f"[red]Failed to fetch jobs: {escape(str(e))}[/red]")
        raise typer.Exit(1) from e

    # Accept either a bare list of rows or an object wrapping them under
    # "items"; any other payload (e.g. ``None``) is treated as "no rows".
    if isinstance(data, list):
        items = data
    elif isinstance(data, dict):
        items = data.get("items") or []
    else:
        items = []

    if json_output:
        console.print_json(json.dumps(items, indent=2, default=str))
        return

    if not items:
        console.print("[yellow]No jobs found.[/yellow]")
        return

    console.print(Panel.fit("[bold cyan]Recent Jobs[/bold cyan]", border_style="cyan"))
    table = Table()
    table.add_column("ID", style="blue", no_wrap=True)
    table.add_column("Name", style="cyan")
    table.add_column("Taskset", style="dim")
    table.add_column("Status", style="yellow")
    table.add_column("Created", style="dim")

    for job in items:
        if not isinstance(job, dict):
            # Skip malformed rows instead of crashing the whole listing.
            continue
        table.add_row(
            escape(str(job.get("id") or "")),
            cell(job.get("name")),
            cell(job.get("taskset_name")),
            cell(job.get("status")),
            # Keep the first 19 characters of the timestamp text, as before.
            escape(str(job.get("created_at") or "")[:19]),
        )
    console.print(table)

    from hud.settings import settings

    web = settings.hud_web_url.rstrip("/")
    console.print(f"\n[dim]View: {web}/jobs[/dim]")
    console.print("[dim]Tip: hud jobs <id> to see traces for a specific job[/dim]")
95
+
96
+
97
+ # ── job traces ────────────────────────────────────────────────────────────────
98
+
99
+
100
def _show_job_traces(job_id: str, *, json_output: bool, limit: int) -> None:
    """Print the traces of one job as a table, or as JSON.

    Args:
        job_id: Job identifier taken from the command line.
        json_output: When true, dump the trace items as JSON and skip the table.
        limit: Sent as the ``limit`` query parameter of the traces request.

    Raises:
        typer.Exit: With exit code 1 when the request raises.
    """
    from urllib.parse import quote

    from rich.markup import escape

    from hud.settings import settings
    from hud.utils.platform import PlatformClient

    def cell(value: object) -> str:
        # Stringify and escape so non-string values or stray "[...]" sequences
        # in the payload cannot break table rendering.
        return escape(str(value)) if value else "-"

    # The id is user input: percent-encode it so characters such as "/" or "?"
    # cannot change which endpoint or page the URL points at.
    url_id = quote(job_id, safe="")

    client = PlatformClient.from_settings()
    try:
        data = client.get(f"/jobs/{url_id}/traces", params={"limit": limit})
    except Exception as e:
        # Escape the message: brackets in it would otherwise be parsed as markup.
        console.print(f"[red]Failed to fetch traces: {escape(str(e))}[/red]")
        raise typer.Exit(1) from e

    # Accept either a bare list of rows or an object wrapping them under
    # "items"; any other payload (e.g. ``None``) is treated as "no rows".
    if isinstance(data, list):
        items = data
    elif isinstance(data, dict):
        items = data.get("items") or []
    else:
        items = []

    if json_output:
        console.print_json(json.dumps(items, indent=2, default=str))
        return

    web = settings.hud_web_url.rstrip("/")

    if not items:
        console.print("[yellow]No traces found for this job.[/yellow]")
        console.print(f"[dim]View: {web}/jobs/{url_id}[/dim]")
        return

    console.print(
        Panel.fit(
            f"[bold cyan]Job Traces[/bold cyan] [dim]{escape(job_id)}[/dim]",
            border_style="cyan",
        )
    )
    table = Table()
    table.add_column("Trace ID", style="blue", no_wrap=True)
    table.add_column("Status", style="yellow")
    table.add_column("Reward", style="green", justify="right")
    table.add_column("Started", style="dim")
    table.add_column("Error", style="red")

    for tr in items:
        if not isinstance(tr, dict):
            # Skip malformed rows instead of crashing the whole listing.
            continue

        reward = tr.get("reward")
        if reward is None:
            reward_text = "-"
        elif isinstance(reward, (int, float)):
            reward_text = f"{reward:.3f}"
        else:
            # A non-numeric reward cannot take the float format; show it raw.
            reward_text = escape(str(reward))

        table.add_row(
            escape(str(tr.get("id") or "")),
            cell(tr.get("status")),
            reward_text,
            escape(str(tr.get("start_time") or tr.get("created_at") or "")[:19]),
            # The error may be structured (not a string); stringify before
            # truncating, and truncate before escaping so no escape is cut.
            escape(str(tr.get("error") or "")[:40]),
        )
    console.print(table)
    console.print(f"\n[dim]View: {web}/jobs/{url_id}[/dim]")
    console.print("[dim]Tip: hud trace <trace_id> to inspect a specific rollout[/dim]")
@@ -71,6 +71,8 @@ def list_models(
71
71
  )
72
72
  console.print(table)
73
73
  console.print(f"\n[dim]Gateway: {settings.hud_gateway_url}[/dim]")
74
+ web = settings.hud_web_url.rstrip("/")
75
+ console.print(f"[dim]View a model in the browser: {web}/models/<id>[/dim]")
74
76
 
75
77
 
76
78
  @models_app.command("fork")
@@ -116,6 +118,7 @@ def fork_model(
116
118
  )
117
119
  )
118
120
  console.print(f"\n[dim]Train it: hud.TrainingClient({slug!r})[/dim]")
121
+ console.print(f"[dim]View: {_model_url(model['id'])}[/dim]")
119
122
 
120
123
 
121
124
  @models_app.command("checkpoints")
@@ -127,13 +130,15 @@ def list_checkpoints(
127
130
  from hud.cli.utils.api import require_api_key
128
131
 
129
132
  require_api_key("list checkpoints")
130
- checkpoints = _get_checkpoints(model)
133
+ model_id = _resolve_model_id(model)
134
+ checkpoints = _get_checkpoints(model_id)
131
135
 
132
136
  if json_output:
133
137
  console.print_json(json.dumps(checkpoints, indent=2))
134
138
  return
135
139
  if not checkpoints:
136
140
  console.print("[yellow]No checkpoints yet — this model serves its base weights[/yellow]")
141
+ console.print(f"[dim]View: {_model_url(model_id, tab='checkpoints')}[/dim]")
137
142
  return
138
143
 
139
144
  checkpoints = sorted(checkpoints, key=lambda c: c.get("created_at") or "")
@@ -155,6 +160,7 @@ def list_checkpoints(
155
160
  (ckpt.get("created_at") or "")[:19],
156
161
  )
157
162
  console.print(table)
163
+ console.print(f"\n[dim]View: {_model_url(model_id, tab='checkpoints')}[/dim]")
158
164
 
159
165
 
160
166
  @models_app.command("head")
@@ -170,19 +176,22 @@ def show_head(
170
176
  from hud.cli.utils.api import require_api_key
171
177
 
172
178
  require_api_key("manage head")
179
+ model_id = _resolve_model_id(model)
173
180
 
174
181
  if set_to is not None:
175
- _set_head(model, set_to)
182
+ _set_head(model_id, set_to)
176
183
  console.print(f"[green]Head set to[/green] [cyan]{set_to}[/cyan]")
184
+ console.print(f"[dim]View: {_model_url(model_id, tab='checkpoints')}[/dim]")
177
185
  return
178
186
 
179
- head = next((c for c in _get_checkpoints(model) if c.get("is_active")), None)
187
+ head = next((c for c in _get_checkpoints(model_id) if c.get("is_active")), None)
180
188
 
181
189
  if json_output:
182
190
  console.print_json(json.dumps(head, indent=2))
183
191
  return
184
192
  if head is None:
185
193
  console.print("[yellow]No active checkpoint — this model serves its base weights[/yellow]")
194
+ console.print(f"[dim]View: {_model_url(model_id, tab='checkpoints')}[/dim]")
186
195
  return
187
196
 
188
197
  reward = head.get("mean_reward")
@@ -196,6 +205,15 @@ def show_head(
196
205
  border_style="green",
197
206
  )
198
207
  )
208
+ console.print(f"[dim]View: {_model_url(model_id, tab='checkpoints')}[/dim]")
209
+
210
+
211
def _model_url(model_id: str, *, tab: str | None = None) -> str:
    """Build the web-app link for a model.

    Args:
        model_id: Identifier placed after ``/models/`` in the link.
        tab: Optional tab name (e.g. ``checkpoints``) appended as ``?tab=...``;
            omitted from the link when empty or ``None``.

    Returns:
        The link, rooted at ``settings.hud_web_url`` without its trailing slash.
    """
    from hud.settings import settings

    base = settings.hud_web_url.rstrip("/")
    link = f"{base}/models/{model_id}"
    if tab:
        link = f"{link}?tab={tab}"
    return link
199
217
 
200
218
 
201
219
  def _resolve_model_id(model: str) -> str:
@@ -50,6 +50,21 @@ def test_get_agent_kwargs_model_precedence_and_flags() -> None:
50
50
  assert kwargs["verbose"] is True
51
51
 
52
52
 
53
def test_get_agent_kwargs_normalizes_gateway_model_alias() -> None:
    """A short alias given as ``model`` comes back as the full gateway id."""
    agent_kwargs = EvalConfig(agent_type="openai_compatible", model="glm-5.2").get_agent_kwargs()

    assert agent_kwargs["model"] == "z-ai/glm-5.2"
57
+
58
+
59
def test_get_agent_kwargs_normalizes_config_model_alias() -> None:
    """An alias set through the per-agent config section is normalized too."""
    per_agent = {"openai_compatible": {"model": "glm-5.2"}}
    cfg = EvalConfig(agent_type="openai_compatible", agent_config=per_agent)

    agent_kwargs = cfg.get_agent_kwargs()

    assert agent_kwargs["model"] == "z-ai/glm-5.2"
66
+
67
+
53
68
  def test_get_agent_kwargs_requires_agent_type() -> None:
54
69
  with pytest.raises(ValueError, match="agent_type must be set"):
55
70
  EvalConfig().get_agent_kwargs()
@@ -186,6 +201,31 @@ def test_merge_cli_overrides_fields() -> None:
186
201
  assert merged.max_steps == 7
187
202
 
188
203
 
204
def test_merge_cli_resolves_gateway_model_alias(monkeypatch: pytest.MonkeyPatch) -> None:
    """``merge_cli(agent=<alias>)`` resolves agent type and model from the catalog."""
    from hud.utils.gateway import GatewayModelInfo, GatewayProviderInfo

    # Stub the catalog with a single entry so no gateway request is made.
    catalog = [
        GatewayModelInfo(
            id="z-ai/glm-5.2",
            model_name="z-ai/glm-5.2",
            sdk_agent_type="openai_compatible",
            provider=GatewayProviderInfo(name="openai"),
        )
    ]
    monkeypatch.setattr("hud.utils.gateway.list_gateway_models", lambda: catalog)

    merged = EvalConfig().merge_cli(agent="glm-5.2")

    assert merged.agent_type is not None
    assert merged.agent_type.value == "openai_compatible"
    assert merged.model == "z-ai/glm-5.2"
219
+
220
+
221
def test_merge_cli_config_model_alias_is_normalized() -> None:
    """An alias passed via a ``--config`` style override is normalized."""
    base = EvalConfig(agent_type="openai_compatible")
    merged = base.merge_cli(config=["openai_compatible.model=glm-5.2"])

    agent_kwargs = merged.get_agent_kwargs()

    assert agent_kwargs["model"] == "z-ai/glm-5.2"
227
+
228
+
189
229
  def test_merge_cli_namespaced_config() -> None:
190
230
  merged = EvalConfig().merge_cli(config=["claude.max_tokens=100"])
191
231
  assert merged.agent_config["claude"]["max_tokens"] == 100