hud-python 0.6.2__tar.gz → 0.6.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (240)
  1. {hud_python-0.6.2 → hud_python-0.6.4}/PKG-INFO +1 -1
  2. hud_python-0.6.4/cookbooks/connect4-selfplay/README.md +57 -0
  3. hud_python-0.6.4/cookbooks/fireworks-rl-training/README.md +114 -0
  4. hud_python-0.6.4/cookbooks/fireworks-rl-training/pyproject.toml +19 -0
  5. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/__init__.py +11 -3
  6. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/openai_compatible/agent.py +15 -4
  7. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/tests/test_base.py +38 -2
  8. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/tests/test_provider_native_tools.py +4 -4
  9. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/types.py +7 -3
  10. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/__init__.py +4 -0
  11. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/eval.py +64 -11
  12. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/init.py +3 -3
  13. hud_python-0.6.4/hud/cli/jobs.py +146 -0
  14. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/models.py +21 -3
  15. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/templates.py +4 -5
  16. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/tests/test_deploy.py +1 -1
  17. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/tests/test_eval_config.py +69 -0
  18. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/tests/test_init.py +8 -0
  19. hud_python-0.6.4/hud/cli/trace.py +215 -0
  20. {hud_python-0.6.2 → hud_python-0.6.4}/hud/eval/job.py +33 -9
  21. {hud_python-0.6.2 → hud_python-0.6.4}/hud/eval/run.py +31 -6
  22. {hud_python-0.6.2 → hud_python-0.6.4}/hud/eval/runtime.py +51 -8
  23. {hud_python-0.6.2 → hud_python-0.6.4}/hud/eval/taskset.py +18 -2
  24. {hud_python-0.6.2 → hud_python-0.6.4}/hud/eval/tests/test_hosted.py +48 -0
  25. {hud_python-0.6.2 → hud_python-0.6.4}/hud/eval/tests/test_rollout.py +26 -1
  26. {hud_python-0.6.2 → hud_python-0.6.4}/hud/settings.py +2 -2
  27. {hud_python-0.6.2 → hud_python-0.6.4}/hud/train/__init__.py +2 -0
  28. hud_python-0.6.4/hud/train/base.py +159 -0
  29. {hud_python-0.6.2 → hud_python-0.6.4}/hud/train/client.py +41 -17
  30. {hud_python-0.6.2 → hud_python-0.6.4}/hud/train/types.py +38 -4
  31. {hud_python-0.6.2 → hud_python-0.6.4}/hud/utils/gateway.py +23 -0
  32. {hud_python-0.6.2 → hud_python-0.6.4}/hud/version.py +1 -1
  33. {hud_python-0.6.2 → hud_python-0.6.4}/pyproject.toml +1 -1
  34. hud_python-0.6.2/hud/train/base.py +0 -102
  35. {hud_python-0.6.2 → hud_python-0.6.4}/.gitignore +0 -0
  36. {hud_python-0.6.2 → hud_python-0.6.4}/LICENSE +0 -0
  37. {hud_python-0.6.2 → hud_python-0.6.4}/README.md +0 -0
  38. {hud_python-0.6.2 → hud_python-0.6.4}/cookbooks/a2a-chat/README.md +0 -0
  39. {hud_python-0.6.2 → hud_python-0.6.4}/cookbooks/a2a-chat/pyproject.toml +0 -0
  40. {hud_python-0.6.2 → hud_python-0.6.4}/cookbooks/codex-coding/README.md +0 -0
  41. {hud_python-0.6.2 → hud_python-0.6.4}/cookbooks/codex-coding/pyproject.toml +0 -0
  42. {hud_python-0.6.2 → hud_python-0.6.4}/cookbooks/rl-training/README.md +0 -0
  43. {hud_python-0.6.2 → hud_python-0.6.4}/cookbooks/rl-training/pyproject.toml +0 -0
  44. {hud_python-0.6.2 → hud_python-0.6.4}/hud/__init__.py +0 -0
  45. {hud_python-0.6.2 → hud_python-0.6.4}/hud/__main__.py +0 -0
  46. {hud_python-0.6.2 → hud_python-0.6.4}/hud/_legacy.py +0 -0
  47. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/base.py +0 -0
  48. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/browser_use/__init__.py +0 -0
  49. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/browser_use/agent.py +0 -0
  50. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/claude/__init__.py +0 -0
  51. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/claude/agent.py +0 -0
  52. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/claude/sdk/__init__.py +0 -0
  53. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/claude/sdk/agent.py +0 -0
  54. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/claude/sdk/computer_mcp.py +0 -0
  55. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/claude/tools/__init__.py +0 -0
  56. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/claude/tools/base.py +0 -0
  57. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/claude/tools/coding.py +0 -0
  58. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/claude/tools/computer.py +0 -0
  59. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/claude/tools/hosted.py +0 -0
  60. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/claude/tools/mcp_proxy.py +0 -0
  61. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/claude/tools/settings.py +0 -0
  62. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/claude/tools/tests/__init__.py +0 -0
  63. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/claude/tools/tests/test_computer.py +0 -0
  64. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/gemini/__init__.py +0 -0
  65. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/gemini/agent.py +0 -0
  66. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/gemini/settings.py +0 -0
  67. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/gemini/tools/__init__.py +0 -0
  68. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/gemini/tools/base.py +0 -0
  69. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/gemini/tools/coding.py +0 -0
  70. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/gemini/tools/computer.py +0 -0
  71. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/gemini/tools/filesystem.py +0 -0
  72. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/gemini/tools/hosted.py +0 -0
  73. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/gemini/tools/mcp_proxy.py +0 -0
  74. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/gemini/tools/tests/__init__.py +0 -0
  75. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/gemini/tools/tests/test_computer.py +0 -0
  76. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/misc/__init__.py +0 -0
  77. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/misc/response_automation.py +0 -0
  78. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/openai/__init__.py +0 -0
  79. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/openai/agent.py +0 -0
  80. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/openai/tools/__init__.py +0 -0
  81. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/openai/tools/apply_patch.py +0 -0
  82. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/openai/tools/base.py +0 -0
  83. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/openai/tools/coding.py +0 -0
  84. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/openai/tools/computer.py +0 -0
  85. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/openai/tools/hosted.py +0 -0
  86. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/openai/tools/mcp_proxy.py +0 -0
  87. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/openai/tools/strict_schema.py +0 -0
  88. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/openai/tools/tests/__init__.py +0 -0
  89. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/openai/tools/tests/test_computer.py +0 -0
  90. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/openai/tools/tests/test_strict_schema.py +0 -0
  91. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/openai_compatible/__init__.py +0 -0
  92. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/openai_compatible/tools/__init__.py +0 -0
  93. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/openai_compatible/tools/base.py +0 -0
  94. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/openai_compatible/tools/filesystem.py +0 -0
  95. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/openai_compatible/tools/mcp_proxy.py +0 -0
  96. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/robot/__init__.py +0 -0
  97. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/robot/_types.py +0 -0
  98. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/robot/adapter.py +0 -0
  99. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/robot/agent.py +0 -0
  100. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/robot/model.py +0 -0
  101. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/tests/__init__.py +0 -0
  102. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/tests/test_apply_patch.py +0 -0
  103. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/tests/test_claude_agent.py +0 -0
  104. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/tests/test_claude_sdk_agent.py +0 -0
  105. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/tests/test_gemini_agent.py +0 -0
  106. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/tests/test_openai_agent.py +0 -0
  107. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/tests/test_openai_compatible_agent.py +0 -0
  108. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/tests/test_tool_agent.py +0 -0
  109. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/tests/test_trace.py +0 -0
  110. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/tool_agent.py +0 -0
  111. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/tools/__init__.py +0 -0
  112. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/tools/base.py +0 -0
  113. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/tools/hosted.py +0 -0
  114. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/tools/mcp.py +0 -0
  115. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/tools/rfb.py +0 -0
  116. {hud_python-0.6.2 → hud_python-0.6.4}/hud/agents/tools/ssh.py +0 -0
  117. {hud_python-0.6.2 → hud_python-0.6.4}/hud/capabilities/__init__.py +0 -0
  118. {hud_python-0.6.2 → hud_python-0.6.4}/hud/capabilities/base.py +0 -0
  119. {hud_python-0.6.2 → hud_python-0.6.4}/hud/capabilities/cdp.py +0 -0
  120. {hud_python-0.6.2 → hud_python-0.6.4}/hud/capabilities/filetracking.py +0 -0
  121. {hud_python-0.6.2 → hud_python-0.6.4}/hud/capabilities/mcp.py +0 -0
  122. {hud_python-0.6.2 → hud_python-0.6.4}/hud/capabilities/rfb.py +0 -0
  123. {hud_python-0.6.2 → hud_python-0.6.4}/hud/capabilities/robot.py +0 -0
  124. {hud_python-0.6.2 → hud_python-0.6.4}/hud/capabilities/ssh.py +0 -0
  125. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/__main__.py +0 -0
  126. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/cancel.py +0 -0
  127. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/client.py +0 -0
  128. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/deploy.py +0 -0
  129. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/login.py +0 -0
  130. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/presets.py +0 -0
  131. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/serve.py +0 -0
  132. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/sync.py +0 -0
  133. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/task.py +0 -0
  134. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/tests/__init__.py +0 -0
  135. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/tests/test_cli_init.py +0 -0
  136. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/tests/test_cli_main.py +0 -0
  137. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/tests/test_cli_more_wrappers.py +0 -0
  138. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/tests/test_eval_bedrock.py +0 -0
  139. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/tests/test_main_module.py +0 -0
  140. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/tests/test_sync_export.py +0 -0
  141. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/__init__.py +0 -0
  142. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/api.py +0 -0
  143. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/build_display.py +0 -0
  144. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/build_logs.py +0 -0
  145. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/config.py +0 -0
  146. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/context.py +0 -0
  147. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/display.py +0 -0
  148. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/jobs.py +0 -0
  149. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/registry.py +0 -0
  150. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/source.py +0 -0
  151. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/tasks.py +0 -0
  152. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/tests/__init__.py +0 -0
  153. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/tests/test_build_display.py +0 -0
  154. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/tests/test_config.py +0 -0
  155. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/tests/test_context.py +0 -0
  156. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/tests/test_registry.py +0 -0
  157. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/tests/test_source.py +0 -0
  158. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/tests/test_tasks.py +0 -0
  159. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/tests/test_version_check.py +0 -0
  160. {hud_python-0.6.2 → hud_python-0.6.4}/hud/cli/utils/version_check.py +0 -0
  161. {hud_python-0.6.2 → hud_python-0.6.4}/hud/clients/__init__.py +0 -0
  162. {hud_python-0.6.2 → hud_python-0.6.4}/hud/clients/client.py +0 -0
  163. {hud_python-0.6.2 → hud_python-0.6.4}/hud/clients/tests/__init__.py +0 -0
  164. {hud_python-0.6.2 → hud_python-0.6.4}/hud/clients/tests/test_connect.py +0 -0
  165. {hud_python-0.6.2 → hud_python-0.6.4}/hud/conftest.py +0 -0
  166. {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/__init__.py +0 -0
  167. {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/env.py +0 -0
  168. {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/file_tracker.py +0 -0
  169. {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/file_tracking.py +0 -0
  170. {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/legacy.py +0 -0
  171. {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/robot/__init__.py +0 -0
  172. {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/robot/bridge.py +0 -0
  173. {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/robot/endpoint.py +0 -0
  174. {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/robot/sim_runner.py +0 -0
  175. {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/server.py +0 -0
  176. {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/tests/__init__.py +0 -0
  177. {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/tests/conftest.py +0 -0
  178. {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/tests/test_capability_backing.py +0 -0
  179. {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/tests/test_file_tracker.py +0 -0
  180. {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/tests/test_file_tracking.py +0 -0
  181. {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/tests/test_legacy.py +0 -0
  182. {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/tests/test_loader.py +0 -0
  183. {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/tests/test_manifest.py +0 -0
  184. {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/tests/test_server.py +0 -0
  185. {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/tests/test_tunnel.py +0 -0
  186. {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/utils.py +0 -0
  187. {hud_python-0.6.2 → hud_python-0.6.4}/hud/environment/workspace.py +0 -0
  188. {hud_python-0.6.2 → hud_python-0.6.4}/hud/eval/__init__.py +0 -0
  189. {hud_python-0.6.2 → hud_python-0.6.4}/hud/eval/chat.py +0 -0
  190. {hud_python-0.6.2 → hud_python-0.6.4}/hud/eval/file_tracking.py +0 -0
  191. {hud_python-0.6.2 → hud_python-0.6.4}/hud/eval/sync.py +0 -0
  192. {hud_python-0.6.2 → hud_python-0.6.4}/hud/eval/task.py +0 -0
  193. {hud_python-0.6.2 → hud_python-0.6.4}/hud/eval/tests/__init__.py +0 -0
  194. {hud_python-0.6.2 → hud_python-0.6.4}/hud/eval/tests/test_chat.py +0 -0
  195. {hud_python-0.6.2 → hud_python-0.6.4}/hud/eval/tests/test_docker_provider.py +0 -0
  196. {hud_python-0.6.2 → hud_python-0.6.4}/hud/eval/tests/test_file_tracking_observer.py +0 -0
  197. {hud_python-0.6.2 → hud_python-0.6.4}/hud/eval/tests/test_job.py +0 -0
  198. {hud_python-0.6.2 → hud_python-0.6.4}/hud/eval/tests/test_sync.py +0 -0
  199. {hud_python-0.6.2 → hud_python-0.6.4}/hud/eval/tests/test_task.py +0 -0
  200. {hud_python-0.6.2 → hud_python-0.6.4}/hud/graders/__init__.py +0 -0
  201. {hud_python-0.6.2 → hud_python-0.6.4}/hud/graders/base.py +0 -0
  202. {hud_python-0.6.2 → hud_python-0.6.4}/hud/graders/bash.py +0 -0
  203. {hud_python-0.6.2 → hud_python-0.6.4}/hud/graders/combine.py +0 -0
  204. {hud_python-0.6.2 → hud_python-0.6.4}/hud/graders/judge.py +0 -0
  205. {hud_python-0.6.2 → hud_python-0.6.4}/hud/graders/results.py +0 -0
  206. {hud_python-0.6.2 → hud_python-0.6.4}/hud/graders/text.py +0 -0
  207. {hud_python-0.6.2 → hud_python-0.6.4}/hud/patches/__init__.py +0 -0
  208. {hud_python-0.6.2 → hud_python-0.6.4}/hud/patches/mcp_patches.py +0 -0
  209. {hud_python-0.6.2 → hud_python-0.6.4}/hud/patches/tests/__init__.py +0 -0
  210. {hud_python-0.6.2 → hud_python-0.6.4}/hud/patches/tests/test_warnings.py +0 -0
  211. {hud_python-0.6.2 → hud_python-0.6.4}/hud/patches/warnings.py +0 -0
  212. {hud_python-0.6.2 → hud_python-0.6.4}/hud/py.typed +0 -0
  213. {hud_python-0.6.2 → hud_python-0.6.4}/hud/server.py +0 -0
  214. {hud_python-0.6.2 → hud_python-0.6.4}/hud/telemetry/__init__.py +0 -0
  215. {hud_python-0.6.2 → hud_python-0.6.4}/hud/telemetry/context.py +0 -0
  216. {hud_python-0.6.2 → hud_python-0.6.4}/hud/telemetry/exporter.py +0 -0
  217. {hud_python-0.6.2 → hud_python-0.6.4}/hud/telemetry/filetracking.py +0 -0
  218. {hud_python-0.6.2 → hud_python-0.6.4}/hud/telemetry/instrument.py +0 -0
  219. {hud_python-0.6.2 → hud_python-0.6.4}/hud/telemetry/span.py +0 -0
  220. {hud_python-0.6.2 → hud_python-0.6.4}/hud/telemetry/tests/__init__.py +0 -0
  221. {hud_python-0.6.2 → hud_python-0.6.4}/hud/telemetry/tests/test_exporter.py +0 -0
  222. {hud_python-0.6.2 → hud_python-0.6.4}/hud/telemetry/tests/test_filetracking.py +0 -0
  223. {hud_python-0.6.2 → hud_python-0.6.4}/hud/telemetry/tests/test_instrument.py +0 -0
  224. {hud_python-0.6.2 → hud_python-0.6.4}/hud/types.py +0 -0
  225. {hud_python-0.6.2 → hud_python-0.6.4}/hud/utils/__init__.py +0 -0
  226. {hud_python-0.6.2 → hud_python-0.6.4}/hud/utils/exceptions.py +0 -0
  227. {hud_python-0.6.2 → hud_python-0.6.4}/hud/utils/hints.py +0 -0
  228. {hud_python-0.6.2 → hud_python-0.6.4}/hud/utils/hud_console.py +0 -0
  229. {hud_python-0.6.2 → hud_python-0.6.4}/hud/utils/modules.py +0 -0
  230. {hud_python-0.6.2 → hud_python-0.6.4}/hud/utils/platform.py +0 -0
  231. {hud_python-0.6.2 → hud_python-0.6.4}/hud/utils/requests.py +0 -0
  232. {hud_python-0.6.2 → hud_python-0.6.4}/hud/utils/serialization.py +0 -0
  233. {hud_python-0.6.2 → hud_python-0.6.4}/hud/utils/tests/__init__.py +0 -0
  234. {hud_python-0.6.2 → hud_python-0.6.4}/hud/utils/tests/test_exceptions.py +0 -0
  235. {hud_python-0.6.2 → hud_python-0.6.4}/hud/utils/tests/test_hints.py +0 -0
  236. {hud_python-0.6.2 → hud_python-0.6.4}/hud/utils/tests/test_hud_console.py +0 -0
  237. {hud_python-0.6.2 → hud_python-0.6.4}/hud/utils/tests/test_platform.py +0 -0
  238. {hud_python-0.6.2 → hud_python-0.6.4}/hud/utils/tests/test_requests.py +0 -0
  239. {hud_python-0.6.2 → hud_python-0.6.4}/hud/utils/tests/test_serialization.py +0 -0
  240. {hud_python-0.6.2 → hud_python-0.6.4}/hud/utils/time.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.6.2
3
+ Version: 0.6.4
4
4
  Summary: SDK for the HUD platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-python
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -0,0 +1,57 @@
1
+ # Connect Four self-play
2
+
3
+ Symmetric self-play RL on a 6×7 Connect Four board. Draws are rare (you need a
4
+ full 42-cell board with no four-in-a-row), so the win/loss reward signal
5
+ persists as the policy improves and the GRPO advantage stays non-zero.
6
+
7
+ ## How it works
8
+
9
+ - One agent ("outer") plays a full game against an inner model on the **same
10
+ slug** — true self-play. `seed % 2` decides who drops first, for symmetric
11
+ first-move coverage.
12
+ - Each game trains **both sides at once**: the outer agent's `Run` (reward from
13
+ its perspective) plus a hand-built `TrajectoryPayload` for the inner model
14
+ with the flipped reward (`1 - outer_reward`).
15
+ - `group_size=2` pairs each game's two trajectories so the GRPO advantage is
16
+ `reward - 0.5` per game.
17
+ - `loss_fn="ppo"` clips the importance-sampling ratio, so a single lucky game
18
+ can't blow up the update.
19
+
20
+ The training loop uses the public API directly — `forward_backward` accepts
21
+ `Run` and `TrajectoryPayload` mixed, so no private helpers are needed.
22
+
23
+ ## Setup
24
+
25
+ ```bash
26
+ hud models fork Qwen/Qwen3.5-4B --name c4-selfplay # prints a slug like c4-selfplay-<id>
27
+ ```
28
+
29
+ Put your `HUD_API_KEY` in a `.env` here (or the environment).
30
+
31
+ ## Run
32
+
33
+ Local sanity check (one game, cheap external model as the outer agent):
34
+
35
+ ```bash
36
+ hud eval env.py claude --model claude-haiku-4-5
37
+ ```
38
+
39
+ Train:
40
+
41
+ ```bash
42
+ python train.py --model c4-selfplay-<id> --steps 20 --group 4 --lr 1e-5
43
+ ```
44
+
45
+ ## Tuning notes
46
+
47
+ - **Memory scales with `tasks × group`.** Each task×rollout is a fresh `env.py`
48
+ subprocess. With 8 tasks and `--group 4` that's 32 concurrent games. Connect
49
+ Four games can run up to 42 plies, so they cost more tokens and time per game —
50
+ start at `--group 4` and raise only if you have RAM headroom.
51
+ - **Watch the server-side metrics.** The loop prints local win/draw/loss counts
52
+ each step and the last few checkpoints' `mean_reward` / `reward_std` via
53
+ `trainer.checkpoints()` at the end. A healthy run keeps non-trivial
54
+ `reward_std` (within-group spread); if it collapses, the policy has saturated.
55
+ - **Reset on changes.** If you edit the reward or the board, roll the head back
56
+ to a clean checkpoint (`hud models head <slug> --set <id>`) or fork fresh —
57
+ don't keep training a policy shaped by the old objective.
@@ -0,0 +1,114 @@
1
+ # Fireworks RL Training
2
+
3
+ Direct Fireworks Training API loop over the same arithmetic preview task used by
4
+ `cookbooks/rl-training`.
5
+
6
+ This does **not** use Fireworks native datasets or RFT jobs. It follows the
7
+ Training API service path from the Fireworks docs:
8
+
9
+ 1. `FiretitanServiceClient.from_firetitan_config(...)`
10
+ 2. `create_deployment_sampler(...)` for high-parallel rollouts
11
+ 3. local grading of HUD-style multiplication tasks
12
+ 4. `forward_backward_custom(...)` + `optim_step(...)`
13
+ 5. `save_weights_for_sampler(...)` + sampler refresh
14
+
15
+ References:
16
+
17
+ - Fireworks Training API introduction: https://docs.fireworks.ai/fine-tuning/training-api/introduction
18
+ - Training and sampling lifecycle: https://docs.fireworks.ai/fine-tuning/training-api/training-and-sampling
19
+ - Loss functions / GRPO reference: https://docs.fireworks.ai/fine-tuning/training-api/loss-functions
20
+
21
+ ## Setup
22
+
23
+ The repo-level `.env` is loaded automatically. It must contain:
24
+
25
+ ```bash
26
+ FIREWORKS_API_KEY=...
27
+ FIREWORKS_ACCOUNT_ID=...
28
+ ```
29
+
30
+ Install the isolated cookbook environment:
31
+
32
+ ```bash
33
+ uv sync --pre
34
+ ```
35
+
36
+ ## Calibrate task difficulty first
37
+
38
+ Calibration defaults to Fireworks' OpenAI-compatible inference API, so it does
39
+ **not** create a trainer, provision a Training API deployment, or call
40
+ `optim_step`. This is the cheap way to tune task difficulty before paying for a
41
+ Training API run.
42
+
43
+ The calibration model is separate from the training base model because the
44
+ `lorenss` key currently exposes only a small serverless inference catalog (no
45
+ Qwen3 8B deployment). Override it with `--inference-model` if you have a closer
46
+ deployed model.
47
+
48
+ ```bash
49
+ uv run train.py --calibrate-only --groups-per-step 8 --rollouts-per-prompt 8 --parallelism 32
50
+ ```
51
+
52
+ The goal is a reward distribution with variance. If reward is all zero, make the
53
+ task easier:
54
+
55
+ ```bash
56
+ uv run train.py --calibrate-only --min-a 10 --max-a 99 --min-b 2 --max-b 9
57
+ ```
58
+
59
+ If reward is all one, make the task harder:
60
+
61
+ ```bash
62
+ uv run train.py --calibrate-only --min-a 1000 --max-a 9999 --min-b 11 --max-b 99
63
+ ```
64
+
65
+ The current defaults are calibrated for the visible `gpt-oss-120b` inference
66
+ model on the `lorenss` key: 2-digit by 1-digit multiplication with a direct
67
+ "reply only with the integer" prompt. A 32-rollout calibration gave a non-trivial
68
+ baseline (`reward_mean ~= 0.22`, `reward_std ~= 0.42`), while the original
69
+ 3-digit by 2-digit range was all-zero.
70
+
71
+ ## Train
72
+
73
+ Once calibration has non-trivial rewards:
74
+
75
+ ```bash
76
+ uv run train.py --steps 5 --groups-per-step 8 --rollouts-per-prompt 8 --parallelism 32
77
+ ```
78
+
79
+ This uses the direct Training API managed service path. If you want calibration
80
+ to go through the managed deployment sampler too, pass
81
+ `--calibration-backend managed`; this provisions the same resources as training.
82
+
83
+ ### Current Fireworks preview account blocker
84
+
85
+ On the `lorenss` preview account, trainer creation currently fails before the
86
+ first train step with:
87
+
88
+ ```text
89
+ failed to ensure FIREWORKS_API_KEY secret: unkey inference api id is not configured
90
+ ```
91
+
92
+ This happens even with `create_deployment=False`, so it is an account/control
93
+ plane provisioning issue rather than a problem in the rollout or loss code. Once
94
+ Fireworks enables the missing Unkey inference API config for the account, the
95
+ same `uv run train.py ...` command should proceed to trainer startup and the
96
+ first `forward_backward_custom(...)` call.
97
+
98
+ Metrics are written to:
99
+
100
+ - `runs/fireworks-rl-preview/metrics.jsonl`
101
+ - `runs/fireworks-rl-preview/reward_loss.png` if `matplotlib` is installed
102
+
103
+ ## Notes
104
+
105
+ - Defaults use Qwen 3 8B full-parameter training:
106
+ - `accounts/fireworks/models/qwen3-8b`
107
+ - `Qwen/Qwen3-8B`
108
+ - `accounts/fireworks/trainingShapes/qwen3-8b-128k`
109
+ - LoRA can be tested with `--lora-rank N`, but the validated Qwen3 8B training
110
+ shape currently rejects LoRA mode on the `lorenss` preview account.
111
+ - The first checkpoint sync happens after step 0 and subsequent rollouts sample
112
+ the updated weights through the same deployment.
113
+ - `--keep-trainer` and `--keep-deployment` are available for debugging. By
114
+ default the trainer is cleaned up and the deployment scales to zero on exit.
@@ -0,0 +1,19 @@
1
+ [project]
2
+ name = "fireworks-rl-training"
3
+ version = "0.1.0"
4
+ description = "Direct Fireworks Training API RL loop over HUD-style arithmetic tasks"
5
+ requires-python = ">=3.11,<3.13"
6
+ dependencies = [
7
+ "fireworks-ai[training]",
8
+ "hud-python",
9
+ "matplotlib",
10
+ "python-dotenv",
11
+ "torch>=2",
12
+ "transformers>=4.55",
13
+ ]
14
+
15
+ [tool.uv]
16
+ package = false
17
+
18
+ [tool.uv.sources]
19
+ hud-python = { path = "../..", editable = true }
@@ -8,7 +8,12 @@ from __future__ import annotations
8
8
  from typing import TYPE_CHECKING, Any, cast
9
9
 
10
10
  from hud.types import AgentType
11
- from hud.utils.gateway import build_gateway_client, list_gateway_models
11
+ from hud.utils.gateway import (
12
+ build_gateway_client,
13
+ gateway_model_aliases,
14
+ list_gateway_models,
15
+ normalize_gateway_model_id,
16
+ )
12
17
 
13
18
  if TYPE_CHECKING:
14
19
  from typing import TypeAlias
@@ -27,6 +32,8 @@ def create_agent(model: str, **kwargs: Any) -> GatewayAgent:
27
32
 
28
33
  For direct API access with provider API keys, instantiate the agent classes directly.
29
34
  """
35
+ requested_model = model
36
+ model = normalize_gateway_model_id(model)
30
37
  agent_type = next((candidate for candidate in AgentType if candidate.value == model), None)
31
38
  if agent_type is not None:
32
39
  model_id = model
@@ -73,7 +80,8 @@ def create_agent(model: str, **kwargs: Any) -> GatewayAgent:
73
80
  for n in (gm.id, gm.name, gm.model_name)
74
81
  if isinstance(n, str)
75
82
  ]
76
- near = difflib.get_close_matches(model, known, n=3, cutoff=0.5)
83
+ known.extend(gateway_model_aliases())
84
+ near = difflib.get_close_matches(requested_model, known, n=3, cutoff=0.5)
77
85
  hint = (
78
86
  f" Did you mean: {', '.join(near)}?"
79
87
  if near
@@ -84,7 +92,7 @@ def create_agent(model: str, **kwargs: Any) -> GatewayAgent:
84
92
  if gateway_models
85
93
  else "the HUD gateway registry (empty — is HUD_API_KEY set?)"
86
94
  )
87
- raise ValueError(f"Model {model!r} not found in {source}.{hint}")
95
+ raise ValueError(f"Model {requested_model!r} not found in {source}.{hint}")
88
96
 
89
97
  kwargs.setdefault("model", model_id)
90
98
  kwargs.setdefault("model_client", build_gateway_client(provider_name))
@@ -193,16 +193,27 @@ class OpenAIChatAgent(ToolAgent[ChatCompletionMessageParam, OpenAIChatConfig]):
193
193
  sample: Sample | None = None
194
194
  if return_token_ids:
195
195
  prompt_token_ids = getattr(choice, "prompt_token_ids", None)
196
+ # Multimodal prompt (text + image chunks): the only prompt representation
197
+ # that survives image inputs; flat prompt_token_ids is null in that case.
198
+ prompt_chunks = getattr(choice, "prompt_chunks", None)
196
199
  token_ids = getattr(choice, "token_ids", None)
197
- if prompt_token_ids is not None and token_ids is not None:
198
- chat_state.continuation_token_ids = list(prompt_token_ids) + list(token_ids)
199
- chat_state.continuation_message_count = len(messages)
200
+ has_prompt = prompt_token_ids is not None or prompt_chunks is not None
201
+ if token_ids is not None and has_prompt:
200
202
  content_lp = choice.logprobs.content if choice.logprobs else None
201
203
  sample = Sample(
202
- prompt_token_ids=list(prompt_token_ids),
204
+ prompt_token_ids=list(prompt_token_ids) if prompt_token_ids is not None else [],
205
+ prompt_chunks=list(prompt_chunks) if prompt_chunks is not None else None,
203
206
  output_token_ids=list(token_ids),
204
207
  output_logprobs=[tok.logprob for tok in content_lp] if content_lp else [],
205
208
  )
209
+ # KV-cache continuation only applies to flat text prompts; clear any
210
+ # stale state when the gateway returns chunks-only (multimodal turn).
211
+ if prompt_token_ids is not None:
212
+ chat_state.continuation_token_ids = list(prompt_token_ids) + list(token_ids)
213
+ chat_state.continuation_message_count = len(messages)
214
+ else:
215
+ chat_state.continuation_token_ids = None
216
+ chat_state.continuation_message_count = None
206
217
 
207
218
  tool_calls: list[MCPToolCall] = []
208
219
  for tc in function_calls:
@@ -108,7 +108,7 @@ def test_create_agent_resolves_gateway_model_metadata(
108
108
 
109
109
  model = GatewayModelInfo(
110
110
  id="ft:custom-123",
111
- model_name="gpt-5.4",
111
+ model_name="gpt-5.5",
112
112
  sdk_agent_type="openai_compatible",
113
113
  provider=GatewayProviderInfo(name="openai"),
114
114
  )
@@ -122,4 +122,40 @@ def test_create_agent_resolves_gateway_model_metadata(
122
122
  agent = create_agent("ft:custom-123")
123
123
 
124
124
  assert isinstance(agent, OpenAIChatAgent)
125
- assert agent.config.model == "gpt-5.4" # resolved to the model's real name
125
+ assert agent.config.model == "gpt-5.5" # resolved to the model's real name
126
+
127
+
128
+ @pytest.mark.parametrize(
129
+ ("alias", "canonical"),
130
+ [
131
+ ("deepseek-v4", "deepseek/deepseek-v4-pro"),
132
+ ("deepseek-v4-flash", "deepseek/deepseek-v4-flash"),
133
+ ("glm-5.2", "z-ai/glm-5.2"),
134
+ ("kimi-k2.6", "moonshotai/kimi-k2.6"),
135
+ ("minimax-m3", "MiniMax-M3"),
136
+ ],
137
+ )
138
+ def test_create_agent_accepts_gateway_model_aliases(
139
+ alias: str,
140
+ canonical: str,
141
+ monkeypatch: pytest.MonkeyPatch,
142
+ ) -> None:
143
+ from hud.utils.gateway import GatewayModelInfo, GatewayProviderInfo
144
+
145
+ model = GatewayModelInfo(
146
+ id=canonical,
147
+ model_name=canonical,
148
+ sdk_agent_type="openai_compatible",
149
+ provider=GatewayProviderInfo(name="openai"),
150
+ )
151
+ monkeypatch.setattr("hud.agents.list_gateway_models", lambda: [model])
152
+
153
+ def _build_client(_provider: str) -> object:
154
+ return object()
155
+
156
+ monkeypatch.setattr("hud.agents.build_gateway_client", _build_client)
157
+
158
+ agent = create_agent(alias)
159
+
160
+ assert isinstance(agent, OpenAIChatAgent)
161
+ assert agent.config.model == canonical
@@ -102,7 +102,7 @@ def _commands(tool: Any) -> list[str]:
102
102
 
103
103
 
104
104
  async def test_openai_shell_wraps_command_with_timeout() -> None:
105
- tool = OpenAIShellTool(spec=OpenAIShellTool.default_spec("gpt-5.4"), client=_ssh())
105
+ tool = OpenAIShellTool(spec=OpenAIShellTool.default_spec("gpt-5.5"), client=_ssh())
106
106
 
107
107
  result = await tool.execute({"commands": ["pwd"], "timeout_ms": 2500})
108
108
 
@@ -114,7 +114,7 @@ async def test_openai_shell_wraps_command_with_timeout() -> None:
114
114
 
115
115
 
116
116
  async def test_openai_shell_runs_each_command_without_timeout() -> None:
117
- tool = OpenAIShellTool(spec=OpenAIShellTool.default_spec("gpt-5.4"), client=_ssh())
117
+ tool = OpenAIShellTool(spec=OpenAIShellTool.default_spec("gpt-5.5"), client=_ssh())
118
118
 
119
119
  await tool.execute({"commands": ["echo a", "echo b"]})
120
120
 
@@ -122,7 +122,7 @@ async def test_openai_shell_runs_each_command_without_timeout() -> None:
122
122
 
123
123
 
124
124
  async def test_openai_shell_rejects_non_list_commands_without_running() -> None:
125
- tool = OpenAIShellTool(spec=OpenAIShellTool.default_spec("gpt-5.4"), client=_ssh())
125
+ tool = OpenAIShellTool(spec=OpenAIShellTool.default_spec("gpt-5.5"), client=_ssh())
126
126
 
127
127
  result = await tool.execute({"commands": 123})
128
128
 
@@ -131,7 +131,7 @@ async def test_openai_shell_rejects_non_list_commands_without_running() -> None:
131
131
 
132
132
 
133
133
  def test_openai_shell_to_params_is_shell_type() -> None:
134
- tool = OpenAIShellTool(spec=OpenAIShellTool.default_spec("gpt-5.4"), client=_ssh())
134
+ tool = OpenAIShellTool(spec=OpenAIShellTool.default_spec("gpt-5.5"), client=_ssh())
135
135
  assert tool.to_params()["type"] == "shell"
136
136
 
137
137
 
@@ -99,7 +99,7 @@ class OpenAIConfig(AgentConfig):
99
99
  """Configuration for OpenAIAgent."""
100
100
 
101
101
  model_name: str = "OpenAI"
102
- model: str = Field(default="gpt-5.4", validation_alias=_model_alias)
102
+ model: str = Field(default="gpt-5.5", validation_alias=_model_alias)
103
103
  max_output_tokens: int | None = None
104
104
  temperature: float | None = None
105
105
  reasoning: Any = None # openai Reasoning
@@ -113,7 +113,7 @@ class OpenAIChatConfig(AgentConfig):
113
113
  """Configuration for OpenAIChatAgent."""
114
114
 
115
115
  model_name: str = "OpenAI Chat"
116
- model: str = Field(default="gpt-5-mini", validation_alias=_model_alias)
116
+ model: str = Field(default="gpt-5.4-mini", validation_alias=_model_alias)
117
117
  checkpoint: str | None = Field(
118
118
  default=None,
119
119
  description="Specific checkpoint name for inference routing. "
@@ -139,7 +139,7 @@ class ClaudeSDKConfig(AgentConfig):
139
139
  """
140
140
 
141
141
  model_name: str = "Claude Code"
142
- model: str = Field(default="claude-sonnet-4-5", validation_alias=_model_alias)
142
+ model: str = Field(default="claude-sonnet-4-6", validation_alias=_model_alias)
143
143
  permission_mode: str = "bypassPermissions"
144
144
  max_steps: int = -1
145
145
  allowed_tools: list[str] = Field(
@@ -222,6 +222,10 @@ class Sample(BaseModel):
222
222
  """
223
223
 
224
224
  prompt_token_ids: list[int] = Field(default_factory=list[int])
225
+ # Multimodal prompt as serialized ``ModelInput`` chunks (text + image), set by
226
+ # vision rollouts where the prompt is not a flat token list. When present it is
227
+ # the authoritative prompt for training; ``prompt_token_ids`` stays empty.
228
+ prompt_chunks: list[dict[str, Any]] | None = None
225
229
  output_token_ids: list[int] = Field(default_factory=list[int])
226
230
  output_logprobs: list[float] = Field(default_factory=list[float])
227
231
 
@@ -35,11 +35,13 @@ from .client import client_app # noqa: E402
35
35
  from .deploy import deploy_command # noqa: E402
36
36
  from .eval import eval_command # noqa: E402
37
37
  from .init import init_command # noqa: E402
38
+ from .jobs import jobs_app # noqa: E402
38
39
  from .login import login_command # noqa: E402
39
40
  from .models import models_app # noqa: E402
40
41
  from .serve import serve_command # noqa: E402
41
42
  from .sync import sync_app # noqa: E402
42
43
  from .task import task_app # noqa: E402
44
+ from .trace import trace_app # noqa: E402
43
45
 
44
46
  app.command(name="serve")(serve_command)
45
47
  app.command(name="dev", deprecated=True, hidden=True)(serve_command) # alias for now
@@ -49,6 +51,8 @@ app.command(name="eval")(eval_command)
49
51
  app.command(name="init")(init_command)
50
52
  app.command(name="cancel")(cancel_command)
51
53
  app.add_typer(models_app, name="models")
54
+ app.add_typer(jobs_app, name="jobs")
55
+ app.add_typer(trace_app, name="trace")
52
56
 
53
57
 
54
58
  @app.command(name="set")
@@ -5,6 +5,7 @@ Config Override Order: CLI arguments > .hud_eval.toml > defaults
5
5
 
6
6
  from __future__ import annotations
7
7
 
8
+ import ast
8
9
  import asyncio
9
10
  import logging
10
11
  import os
@@ -42,8 +43,9 @@ def _resolve_model_from_catalog(model_id: str) -> tuple[AgentType, str] | None:
42
43
  Returns None if the model isn't found or the catalog is unreachable.
43
44
  """
44
45
  try:
45
- from hud.utils.gateway import list_gateway_models
46
+ from hud.utils.gateway import list_gateway_models, normalize_gateway_model_id
46
47
 
48
+ model_id = normalize_gateway_model_id(model_id)
47
49
  models = list_gateway_models()
48
50
  except Exception:
49
51
  return None
@@ -116,8 +118,9 @@ class AgentPreset:
116
118
 
117
119
  _AGENT_PRESETS: list[AgentPreset] = [
118
120
  AgentPreset("Claude Sonnet 4.6", AgentType.CLAUDE, "claude-sonnet-4-6"),
119
- AgentPreset("GPT-5.4", AgentType.OPENAI, "gpt-5.4"),
120
- AgentPreset("Gemini 3.1 Pro (Preview)", AgentType.GEMINI, "gemini-3-1-pro"),
121
+ AgentPreset("Claude Opus 4.8", AgentType.CLAUDE, "claude-opus-4-8"),
122
+ AgentPreset("GPT-5.5", AgentType.OPENAI, "gpt-5.5"),
123
+ AgentPreset("Gemini 3.1 Pro (Preview)", AgentType.GEMINI, "gemini-3.1-pro-preview"),
121
124
  AgentPreset(
122
125
  "Grok 4-1 Fast (xAI)",
123
126
  AgentType.OPENAI_COMPATIBLE,
@@ -130,10 +133,22 @@ _AGENT_PRESETS: list[AgentPreset] = [
130
133
  },
131
134
  ),
132
135
  AgentPreset(
133
- "GLM-4.6V (Z-AI)",
136
+ "GLM 5.2 (Z.ai)",
134
137
  AgentType.OPENAI_COMPATIBLE,
135
- "z-ai/glm-4.6v",
136
- {"openai_compatible": {"base_url": settings.hud_gateway_url, "model_name": "GLM-4.6V"}},
138
+ "z-ai/glm-5.2",
139
+ {"openai_compatible": {"base_url": settings.hud_gateway_url, "model_name": "GLM 5.2"}},
140
+ ),
141
+ AgentPreset(
142
+ "Kimi K2.6 (Moonshot)",
143
+ AgentType.OPENAI_COMPATIBLE,
144
+ "moonshotai/kimi-k2.6",
145
+ {"openai_compatible": {"base_url": settings.hud_gateway_url, "model_name": "Kimi K2.6"}},
146
+ ),
147
+ AgentPreset(
148
+ "MiniMax M3",
149
+ AgentType.OPENAI_COMPATIBLE,
150
+ "MiniMax-M3",
151
+ {"openai_compatible": {"base_url": settings.hud_gateway_url, "model_name": "MiniMax M3"}},
137
152
  ),
138
153
  ]
139
154
 
@@ -161,7 +176,7 @@ _DEFAULT_CONFIG_TEMPLATE = """# HUD Eval Configuration
161
176
  # use_computer_beta = true
162
177
 
163
178
  [openai]
164
- # model = "gpt-4o"
179
+ # model = "gpt-5.5"
165
180
  # temperature = 0.7
166
181
  # max_output_tokens = 4096
167
182
 
@@ -401,6 +416,11 @@ class EvalConfig(BaseModel):
401
416
  if self.model:
402
417
  kwargs["model"] = self.model
403
418
 
419
+ if isinstance(kwargs.get("model"), str):
420
+ from hud.utils.gateway import normalize_gateway_model_id
421
+
422
+ kwargs["model"] = normalize_gateway_model_id(kwargs["model"])
423
+
404
424
  if self.agent_type == AgentType.OPENAI_COMPATIBLE and "api_key" not in kwargs:
405
425
  base_url = kwargs.get("base_url", "")
406
426
  if settings.hud_gateway_url in base_url and settings.api_key:
@@ -665,13 +685,46 @@ def _build_agent(cfg: EvalConfig) -> Any:
665
685
  return cast("Any", cfg.agent_type.cls)(config=config)
666
686
 
667
687
 
688
+ def _python_defines_environment(path: Path) -> bool:
689
+ """Return True when ``path`` constructs a v6 :class:`~hud.environment.Environment`."""
690
+ try:
691
+ tree = ast.parse(path.read_text(encoding="utf-8"))
692
+ except (OSError, SyntaxError):
693
+ return False
694
+ for node in ast.walk(tree):
695
+ if not isinstance(node, ast.Call):
696
+ continue
697
+ callee = node.func
698
+ callee_name = (
699
+ callee.id
700
+ if isinstance(callee, ast.Name)
701
+ else callee.attr
702
+ if isinstance(callee, ast.Attribute)
703
+ else None
704
+ )
705
+ if callee_name == "Environment":
706
+ return True
707
+ return False
708
+
709
+
668
710
  def _spawn_target(source: Path) -> Path:
669
- """The path the ``LocalRuntime`` provider serves: the source itself for ``.py``
670
- files and directories, the surrounding directory for JSON/JSONL data files
671
- (the env's ``.py`` source lives next to the tasks file)."""
711
+ """The path the ``LocalRuntime`` provider serves.
712
+
713
+ Directories and env-defining ``.py`` files are served as-is. Task-only
714
+ sources (``tasks.py`` importing from ``env.py``) resolve to a sibling
715
+ ``env.py`` or the containing directory. JSON/JSONL data files use the
716
+ surrounding directory (the env source lives next to the tasks file).
717
+ """
672
718
  resolved = source.resolve()
673
- if resolved.is_dir() or resolved.suffix == ".py":
719
+ if resolved.is_dir():
720
+ return resolved
721
+ if resolved.suffix != ".py":
722
+ return resolved.parent
723
+ if _python_defines_environment(resolved):
674
724
  return resolved
725
+ env_py = resolved.parent / "env.py"
726
+ if env_py.is_file():
727
+ return env_py
675
728
  return resolved.parent
676
729
 
677
730
 
@@ -76,8 +76,8 @@ def init_command(
76
76
  None,
77
77
  "--preset",
78
78
  "-p",
79
- help="Starter preset to download from GitHub (e.g. blank, coding, browser, "
80
- "deepresearch, rubrics, remote-browser). Omit for an interactive picker; in a "
79
+ help="Starter preset to download from GitHub (e.g. blank, browser, "
80
+ "deepresearch, cua, autonomous-businesses, verilog). Omit for an interactive picker; in a "
81
81
  "non-interactive shell, omitting it writes the minimal local scaffold.",
82
82
  ),
83
83
  ) -> None:
@@ -89,7 +89,7 @@ def init_command(
89
89
 
90
90
  Examples:
91
91
  hud init my-env # interactive picker (or local scaffold)
92
- hud init my-env --preset coding # download the coding starter
92
+ hud init my-env --preset browser # download the browser starter
93
93
  hud init my-env --dir envs # create ./envs/my-env[/not dim]
94
94
  """
95
95
  hud_console = HUDConsole()