hud-python 0.6.6__tar.gz → 0.6.8.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (244)
  1. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/PKG-INFO +27 -24
  2. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/README.md +26 -23
  3. hud_python-0.6.8.dev0/cookbooks/fireworks-rl-training/README.md +129 -0
  4. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/openai_compatible/agent.py +7 -3
  5. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/openai_compatible/tools/__init__.py +4 -2
  6. hud_python-0.6.8.dev0/hud/agents/openai_compatible/tools/filesystem.py +332 -0
  7. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/tests/test_provider_native_tools.py +135 -6
  8. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/deploy.py +41 -1
  9. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/tests/test_deploy.py +86 -0
  10. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/eval/runtime.py +4 -1
  11. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/eval/sync.py +2 -2
  12. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/eval/tests/test_hosted.py +19 -0
  13. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/eval/tests/test_rollout.py +90 -0
  14. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/eval/tests/test_sync.py +12 -0
  15. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/version.py +1 -1
  16. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/pyproject.toml +1 -1
  17. hud_python-0.6.6/cookbooks/fireworks-rl-training/README.md +0 -114
  18. hud_python-0.6.6/hud/agents/openai_compatible/tools/filesystem.py +0 -138
  19. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/.gitignore +0 -0
  20. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/LICENSE +0 -0
  21. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/cookbooks/a2a-chat/README.md +0 -0
  22. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/cookbooks/a2a-chat/pyproject.toml +0 -0
  23. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/cookbooks/codex-coding/README.md +0 -0
  24. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/cookbooks/codex-coding/pyproject.toml +0 -0
  25. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/cookbooks/connect4-selfplay/README.md +0 -0
  26. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/cookbooks/fireworks-rl-training/pyproject.toml +0 -0
  27. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/cookbooks/rl-training/README.md +0 -0
  28. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/cookbooks/rl-training/pyproject.toml +0 -0
  29. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/__init__.py +0 -0
  30. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/__main__.py +0 -0
  31. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/_legacy.py +0 -0
  32. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/__init__.py +0 -0
  33. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/base.py +0 -0
  34. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/browser_use/__init__.py +0 -0
  35. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/browser_use/agent.py +0 -0
  36. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/claude/__init__.py +0 -0
  37. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/claude/agent.py +0 -0
  38. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/claude/sdk/__init__.py +0 -0
  39. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/claude/sdk/agent.py +0 -0
  40. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/claude/sdk/computer_mcp.py +0 -0
  41. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/claude/tools/__init__.py +0 -0
  42. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/claude/tools/base.py +0 -0
  43. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/claude/tools/coding.py +0 -0
  44. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/claude/tools/computer.py +0 -0
  45. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/claude/tools/hosted.py +0 -0
  46. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/claude/tools/mcp_proxy.py +0 -0
  47. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/claude/tools/settings.py +0 -0
  48. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/claude/tools/tests/__init__.py +0 -0
  49. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/claude/tools/tests/test_computer.py +0 -0
  50. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/gemini/__init__.py +0 -0
  51. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/gemini/agent.py +0 -0
  52. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/gemini/settings.py +0 -0
  53. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/gemini/tools/__init__.py +0 -0
  54. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/gemini/tools/base.py +0 -0
  55. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/gemini/tools/coding.py +0 -0
  56. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/gemini/tools/computer.py +0 -0
  57. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/gemini/tools/filesystem.py +0 -0
  58. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/gemini/tools/hosted.py +0 -0
  59. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/gemini/tools/mcp_proxy.py +0 -0
  60. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/gemini/tools/tests/__init__.py +0 -0
  61. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/gemini/tools/tests/test_computer.py +0 -0
  62. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/misc/__init__.py +0 -0
  63. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/misc/response_automation.py +0 -0
  64. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/openai/__init__.py +0 -0
  65. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/openai/agent.py +0 -0
  66. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/openai/tools/__init__.py +0 -0
  67. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/openai/tools/apply_patch.py +0 -0
  68. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/openai/tools/base.py +0 -0
  69. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/openai/tools/coding.py +0 -0
  70. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/openai/tools/computer.py +0 -0
  71. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/openai/tools/hosted.py +0 -0
  72. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/openai/tools/mcp_proxy.py +0 -0
  73. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/openai/tools/strict_schema.py +0 -0
  74. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/openai/tools/tests/__init__.py +0 -0
  75. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/openai/tools/tests/test_computer.py +0 -0
  76. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/openai/tools/tests/test_strict_schema.py +0 -0
  77. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/openai_compatible/__init__.py +0 -0
  78. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/openai_compatible/tools/base.py +0 -0
  79. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/openai_compatible/tools/mcp_proxy.py +0 -0
  80. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/robot/__init__.py +0 -0
  81. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/robot/_types.py +0 -0
  82. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/robot/adapter.py +0 -0
  83. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/robot/agent.py +0 -0
  84. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/robot/batching.py +0 -0
  85. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/robot/model.py +0 -0
  86. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/robot/record.py +0 -0
  87. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/robot/video.py +0 -0
  88. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/tests/__init__.py +0 -0
  89. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/tests/test_apply_patch.py +0 -0
  90. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/tests/test_base.py +0 -0
  91. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/tests/test_claude_agent.py +0 -0
  92. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/tests/test_claude_sdk_agent.py +0 -0
  93. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/tests/test_gemini_agent.py +0 -0
  94. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/tests/test_openai_agent.py +0 -0
  95. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/tests/test_openai_compatible_agent.py +0 -0
  96. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/tests/test_tool_agent.py +0 -0
  97. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/tests/test_trace.py +0 -0
  98. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/tool_agent.py +0 -0
  99. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/tools/__init__.py +0 -0
  100. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/tools/base.py +0 -0
  101. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/tools/hosted.py +0 -0
  102. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/tools/mcp.py +0 -0
  103. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/tools/rfb.py +0 -0
  104. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/tools/ssh.py +0 -0
  105. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/agents/types.py +0 -0
  106. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/capabilities/__init__.py +0 -0
  107. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/capabilities/base.py +0 -0
  108. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/capabilities/cdp.py +0 -0
  109. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/capabilities/filetracking.py +0 -0
  110. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/capabilities/mcp.py +0 -0
  111. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/capabilities/rfb.py +0 -0
  112. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/capabilities/robot.py +0 -0
  113. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/capabilities/ssh.py +0 -0
  114. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/__init__.py +0 -0
  115. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/__main__.py +0 -0
  116. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/cancel.py +0 -0
  117. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/client.py +0 -0
  118. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/eval.py +0 -0
  119. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/init.py +0 -0
  120. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/jobs.py +0 -0
  121. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/login.py +0 -0
  122. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/models.py +0 -0
  123. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/presets.py +0 -0
  124. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/serve.py +0 -0
  125. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/sync.py +0 -0
  126. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/task.py +0 -0
  127. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/templates.py +0 -0
  128. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/tests/__init__.py +0 -0
  129. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/tests/test_cli_init.py +0 -0
  130. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/tests/test_cli_main.py +0 -0
  131. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/tests/test_cli_more_wrappers.py +0 -0
  132. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/tests/test_eval_bedrock.py +0 -0
  133. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/tests/test_eval_config.py +0 -0
  134. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/tests/test_init.py +0 -0
  135. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/tests/test_main_module.py +0 -0
  136. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/tests/test_sync_export.py +0 -0
  137. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/trace.py +0 -0
  138. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/utils/__init__.py +0 -0
  139. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/utils/api.py +0 -0
  140. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/utils/build_display.py +0 -0
  141. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/utils/build_logs.py +0 -0
  142. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/utils/config.py +0 -0
  143. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/utils/context.py +0 -0
  144. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/utils/display.py +0 -0
  145. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/utils/jobs.py +0 -0
  146. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/utils/registry.py +0 -0
  147. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/utils/source.py +0 -0
  148. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/utils/tasks.py +0 -0
  149. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/utils/tests/__init__.py +0 -0
  150. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/utils/tests/test_build_display.py +0 -0
  151. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/utils/tests/test_config.py +0 -0
  152. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/utils/tests/test_context.py +0 -0
  153. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/utils/tests/test_registry.py +0 -0
  154. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/utils/tests/test_source.py +0 -0
  155. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/utils/tests/test_tasks.py +0 -0
  156. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/utils/tests/test_version_check.py +0 -0
  157. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/cli/utils/version_check.py +0 -0
  158. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/clients/__init__.py +0 -0
  159. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/clients/client.py +0 -0
  160. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/clients/tests/__init__.py +0 -0
  161. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/clients/tests/test_connect.py +0 -0
  162. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/conftest.py +0 -0
  163. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/environment/__init__.py +0 -0
  164. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/environment/env.py +0 -0
  165. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/environment/file_tracker.py +0 -0
  166. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/environment/file_tracking.py +0 -0
  167. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/environment/legacy.py +0 -0
  168. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/environment/robot/__init__.py +0 -0
  169. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/environment/robot/bridge.py +0 -0
  170. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/environment/robot/endpoint.py +0 -0
  171. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/environment/robot/sim_runner.py +0 -0
  172. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/environment/server.py +0 -0
  173. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/environment/tests/__init__.py +0 -0
  174. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/environment/tests/conftest.py +0 -0
  175. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/environment/tests/test_capability_backing.py +0 -0
  176. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/environment/tests/test_file_tracker.py +0 -0
  177. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/environment/tests/test_file_tracking.py +0 -0
  178. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/environment/tests/test_legacy.py +0 -0
  179. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/environment/tests/test_loader.py +0 -0
  180. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/environment/tests/test_manifest.py +0 -0
  181. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/environment/tests/test_server.py +0 -0
  182. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/environment/tests/test_tunnel.py +0 -0
  183. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/environment/utils.py +0 -0
  184. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/environment/workspace.py +0 -0
  185. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/eval/__init__.py +0 -0
  186. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/eval/chat.py +0 -0
  187. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/eval/file_tracking.py +0 -0
  188. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/eval/job.py +0 -0
  189. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/eval/run.py +0 -0
  190. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/eval/task.py +0 -0
  191. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/eval/taskset.py +0 -0
  192. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/eval/tests/__init__.py +0 -0
  193. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/eval/tests/test_chat.py +0 -0
  194. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/eval/tests/test_docker_provider.py +0 -0
  195. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/eval/tests/test_file_tracking_observer.py +0 -0
  196. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/eval/tests/test_job.py +0 -0
  197. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/eval/tests/test_task.py +0 -0
  198. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/graders/__init__.py +0 -0
  199. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/graders/base.py +0 -0
  200. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/graders/bash.py +0 -0
  201. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/graders/combine.py +0 -0
  202. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/graders/judge.py +0 -0
  203. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/graders/results.py +0 -0
  204. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/graders/text.py +0 -0
  205. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/patches/__init__.py +0 -0
  206. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/patches/mcp_patches.py +0 -0
  207. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/patches/tests/__init__.py +0 -0
  208. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/patches/tests/test_warnings.py +0 -0
  209. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/patches/warnings.py +0 -0
  210. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/py.typed +0 -0
  211. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/server.py +0 -0
  212. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/settings.py +0 -0
  213. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/telemetry/__init__.py +0 -0
  214. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/telemetry/context.py +0 -0
  215. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/telemetry/exporter.py +0 -0
  216. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/telemetry/filetracking.py +0 -0
  217. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/telemetry/instrument.py +0 -0
  218. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/telemetry/span.py +0 -0
  219. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/telemetry/tests/__init__.py +0 -0
  220. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/telemetry/tests/test_exporter.py +0 -0
  221. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/telemetry/tests/test_filetracking.py +0 -0
  222. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/telemetry/tests/test_instrument.py +0 -0
  223. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/train/__init__.py +0 -0
  224. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/train/base.py +0 -0
  225. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/train/client.py +0 -0
  226. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/train/types.py +0 -0
  227. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/types.py +0 -0
  228. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/utils/__init__.py +0 -0
  229. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/utils/exceptions.py +0 -0
  230. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/utils/gateway.py +0 -0
  231. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/utils/hints.py +0 -0
  232. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/utils/hud_console.py +0 -0
  233. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/utils/modules.py +0 -0
  234. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/utils/platform.py +0 -0
  235. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/utils/requests.py +0 -0
  236. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/utils/serialization.py +0 -0
  237. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/utils/tests/__init__.py +0 -0
  238. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/utils/tests/test_exceptions.py +0 -0
  239. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/utils/tests/test_hints.py +0 -0
  240. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/utils/tests/test_hud_console.py +0 -0
  241. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/utils/tests/test_platform.py +0 -0
  242. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/utils/tests/test_requests.py +0 -0
  243. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/utils/tests/test_serialization.py +0 -0
  244. {hud_python-0.6.6 → hud_python-0.6.8.dev0}/hud/utils/time.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.6.6
3
+ Version: 0.6.8.dev0
4
4
  Summary: SDK for the HUD platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-python
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -87,7 +87,7 @@ Description-Content-Type: text/markdown
87
87
 
88
88
  HUD is a platform for building RL environments for AI agents, across coding, browser, computer-use, and robotics. Define an environment, write tasks, and run them as evals and training across any model, at any scale.
89
89
 
90
- To learn more, see the [documentation](https://docs.hud.ai) and [API reference](https://docs.hud.ai/reference/environment).
90
+ To learn more, see the [documentation](https://docs.hud.ai) and [environment reference](https://docs.hud.ai/v6/core/environment).
91
91
 
92
92
  [![PyPI](https://img.shields.io/pypi/v/hud-python?style=flat-square)](https://pypi.org/project/hud-python/)
93
93
  [![License](https://img.shields.io/badge/license-MIT-green?style=flat-square)](LICENSE)
@@ -120,7 +120,7 @@ Then scaffold your first environment:
120
120
  hud init my-env
121
121
  ```
122
122
 
123
- ![Agent running on SheetBench](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/trace_sheet.gif)
123
+ ![Agent running on SheetBench](docs/src/images/trace_sheet.gif)
124
124
 
125
125
  ## The protocol
126
126
 
@@ -159,14 +159,14 @@ hud eval my-taskset --remote
159
159
  For local iteration, the same protocol works against a container on your laptop:
160
160
 
161
161
  ```bash
162
- hud build .
163
- docker run -d --name run1 my-env
164
- docker exec run1 hud task start fix_bug
165
- docker exec run1 hud task grade fix_bug --answer ""
162
+ docker build -f Dockerfile.hud -t my-env .
163
+ docker run -d --name run1 -p 8765:8765 my-env
164
+ hud task start fix_bug --url tcp://127.0.0.1:8765
165
+ hud task grade fix_bug --url tcp://127.0.0.1:8765 --answer "..."
166
166
  docker rm -f run1
167
167
  ```
168
168
 
169
- → [Package & deploy](https://docs.hud.ai/run/deploy)
169
+ → [Run & deploy](https://docs.hud.ai/v6/core/runtime)
170
170
 
171
171
  ## Environments & templates
172
172
 
@@ -193,7 +193,7 @@ hud eval tasks.py claude --group 3
193
193
 
194
194
  Each graded evaluation is a **trace** (the SDK's live handle is a `Run`). With `HUD_API_KEY` set, every rollout is recorded on [hud.ai](https://hud.ai). Tasks that need a shell, browser, GUI, or robot declare **capabilities** (below); everything else — variants, grading, batching — stays identical.
195
195
 
196
- → [Quickstart](https://docs.hud.ai/quickstart) · [Tasks & tasksets](https://docs.hud.ai/reference/tasks)
196
+ → [Quickstart](https://docs.hud.ai/v6/start/quickstart) · [Tasks & tasksets](https://docs.hud.ai/v6/core/tasks)
197
197
 
198
198
  ## Capabilities & harnesses
199
199
 
@@ -211,39 +211,42 @@ A **capability** is a connection the environment exposes; a **harness** attaches
211
211
 
212
212
  **Bring your own:** a harness attaches to a capability and defines a tool spec — wrap `browser-use` on `cdp`, a VLA policy on `robot`, or your own agent on `ssh` / `mcp`. No protocol work required.
213
213
 
214
- → [Capabilities](https://docs.hud.ai/reference/capabilities) · [Models](https://docs.hud.ai/run/models) · [Robots](https://docs.hud.ai/reference/robots)
214
+ → [Capabilities](https://docs.hud.ai/v6/core/capabilities) · [Models](https://docs.hud.ai/v6/core/agents) · [Robots](https://docs.hud.ai/v6/advanced/robots)
215
215
 
216
216
  ## Deploy on the platform
217
217
 
218
218
  From the [platform UI](https://hud.ai) you can run batches, compare models on the same taskset, and inspect every trace.
219
219
 
220
- → [Deploy](https://docs.hud.ai/run/deploy) · [Leaderboards](https://hud.ai/leaderboards)
220
+ → [Run & deploy](https://docs.hud.ai/v6/core/runtime)
221
221
 
222
222
  ## Train on rewards
223
223
 
224
- Every rollout returns a `Run` carrying a `trace_id` and a `reward`, so the tasks you evaluate are already training data. Run a **group** per task and turn the rewards into GRPO advantages with `group_relative()`:
224
+ Every rollout returns a `Run` carrying a `trace_id` and a `reward`, so the tasks you evaluate are already training data. Run a **group** per task and pass the graded runs to `TrainingClient.step()`:
225
225
 
226
226
  ```python
227
+ from hud import TrainingClient
227
228
  from hud.agents import create_agent
228
- from hud.eval import Taskset, group_relative
229
+ from hud.eval import Job
229
230
 
230
- agent = create_agent("claude-sonnet-4-5")
231
- job = await Taskset(count_letter(word=w) for w in words).run(agent, group=16)
232
- for runs in job.results.values():
233
- advantages = group_relative([r.reward for r in runs], normalize_std=True)
234
- ... # feed (run.trace_id, adv) into your optimizer
231
+ agent = create_agent("arith-rl", completion_kwargs={"extra_body": {"return_token_ids": True}})
232
+ trainer = TrainingClient("arith-rl")
233
+ taskset, runtime = ... # your Taskset and where rollouts run
234
+
235
+ session = await Job.start("arith-rl", group=8)
236
+ start = len(session.runs)
237
+ await taskset.run(agent, runtime=runtime, group=8, job=session)
238
+ await trainer.step(session.runs[start:], learning_rate=1e-5, group_size=8)
235
239
  ```
236
240
 
237
241
  HUD is the environment-and-reward source for your own GRPO/PPO loop — the same environment trains any model, text or multimodal, unchanged.
238
242
 
239
- → [Training](https://docs.hud.ai/run/training) · [Designing tasks for signal](https://docs.hud.ai/run/signal)
243
+ → [Training](https://docs.hud.ai/v6/core/training) · [Designing tasks for signal](https://docs.hud.ai/v6/core/advice)
240
244
 
241
245
  ## Links
242
246
 
243
247
  - [Documentation](https://docs.hud.ai)
244
- - [Quickstart](https://docs.hud.ai/quickstart)
245
- - [CLI reference](https://docs.hud.ai/reference/cli)
246
- - [Leaderboards](https://hud.ai/leaderboards)
248
+ - [Quickstart](https://docs.hud.ai/v6/start/quickstart)
249
+ - [CLI reference](https://docs.hud.ai/v6/core/cli)
247
250
  - [Environment templates](https://hud.ai/environments)
248
251
  - [Supported models](https://hud.ai/models)
249
252
  - [Discord](https://discord.gg/wkjtmHYYjm)
@@ -268,8 +271,8 @@ Key areas: [Agents](hud/agents/) · [Environments](hud/environment/) · [Capabil
268
271
 
269
272
  ```bibtex
270
273
  @software{hud2025agentevalplatform,
271
- author = {HUD and Jay Ram and Lorenss Martinsons and Parth Patel and Govind Pimpale and Dylan Bowman and Jaideep and Nguyen Nhat Minh},
272
- title = {HUD: An Evaluation and RL Envrionments Platform for Agents},
274
+ author = {HUD and Jay Ram and Lorenss Martinsons and Parth Patel and Govind Pimpale and Dylan Bowman and Jaideep Chawla and Nguyen Nhat Minh},
275
+ title = {HUD: An Evaluation and RL Environments Platform for Agents},
273
276
  date = {2025-04},
274
277
  url = {https://github.com/hud-evals/hud-python},
275
278
  langid = {en}
@@ -8,7 +8,7 @@
8
8
 
9
9
  HUD is a platform for building RL environments for AI agents, across coding, browser, computer-use, and robotics. Define an environment, write tasks, and run them as evals and training across any model, at any scale.
10
10
 
11
- To learn more, see the [documentation](https://docs.hud.ai) and [API reference](https://docs.hud.ai/reference/environment).
11
+ To learn more, see the [documentation](https://docs.hud.ai) and [environment reference](https://docs.hud.ai/v6/core/environment).
12
12
 
13
13
  [![PyPI](https://img.shields.io/pypi/v/hud-python?style=flat-square)](https://pypi.org/project/hud-python/)
14
14
  [![License](https://img.shields.io/badge/license-MIT-green?style=flat-square)](LICENSE)
@@ -41,7 +41,7 @@ Then scaffold your first environment:
41
41
  hud init my-env
42
42
  ```
43
43
 
44
- ![Agent running on SheetBench](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/trace_sheet.gif)
44
+ ![Agent running on SheetBench](docs/src/images/trace_sheet.gif)
45
45
 
46
46
  ## The protocol
47
47
 
@@ -80,14 +80,14 @@ hud eval my-taskset --remote
80
80
  For local iteration, the same protocol works against a container on your laptop:
81
81
 
82
82
  ```bash
83
- hud build .
84
- docker run -d --name run1 my-env
85
- docker exec run1 hud task start fix_bug
86
- docker exec run1 hud task grade fix_bug --answer ""
83
+ docker build -f Dockerfile.hud -t my-env .
84
+ docker run -d --name run1 -p 8765:8765 my-env
85
+ hud task start fix_bug --url tcp://127.0.0.1:8765
86
+ hud task grade fix_bug --url tcp://127.0.0.1:8765 --answer "..."
87
87
  docker rm -f run1
88
88
  ```
89
89
 
90
- → [Package & deploy](https://docs.hud.ai/run/deploy)
90
+ → [Run & deploy](https://docs.hud.ai/v6/core/runtime)
91
91
 
92
92
  ## Environments & templates
93
93
 
@@ -114,7 +114,7 @@ hud eval tasks.py claude --group 3
114
114
 
115
115
  Each graded evaluation is a **trace** (the SDK's live handle is a `Run`). With `HUD_API_KEY` set, every rollout is recorded on [hud.ai](https://hud.ai). Tasks that need a shell, browser, GUI, or robot declare **capabilities** (below); everything else — variants, grading, batching — stays identical.
116
116
 
117
- → [Quickstart](https://docs.hud.ai/quickstart) · [Tasks & tasksets](https://docs.hud.ai/reference/tasks)
117
+ → [Quickstart](https://docs.hud.ai/v6/start/quickstart) · [Tasks & tasksets](https://docs.hud.ai/v6/core/tasks)
118
118
 
119
119
  ## Capabilities & harnesses
120
120
 
@@ -132,39 +132,42 @@ A **capability** is a connection the environment exposes; a **harness** attaches
132
132
 
133
133
  **Bring your own:** a harness attaches to a capability and defines a tool spec — wrap `browser-use` on `cdp`, a VLA policy on `robot`, or your own agent on `ssh` / `mcp`. No protocol work required.
134
134
 
135
- → [Capabilities](https://docs.hud.ai/reference/capabilities) · [Models](https://docs.hud.ai/run/models) · [Robots](https://docs.hud.ai/reference/robots)
135
+ → [Capabilities](https://docs.hud.ai/v6/core/capabilities) · [Models](https://docs.hud.ai/v6/core/agents) · [Robots](https://docs.hud.ai/v6/advanced/robots)
136
136
 
137
137
  ## Deploy on the platform
138
138
 
139
139
  From the [platform UI](https://hud.ai) you can run batches, compare models on the same taskset, and inspect every trace.
140
140
 
141
- → [Deploy](https://docs.hud.ai/run/deploy) · [Leaderboards](https://hud.ai/leaderboards)
141
+ → [Run & deploy](https://docs.hud.ai/v6/core/runtime)
142
142
 
143
143
  ## Train on rewards
144
144
 
145
- Every rollout returns a `Run` carrying a `trace_id` and a `reward`, so the tasks you evaluate are already training data. Run a **group** per task and turn the rewards into GRPO advantages with `group_relative()`:
145
+ Every rollout returns a `Run` carrying a `trace_id` and a `reward`, so the tasks you evaluate are already training data. Run a **group** per task and pass the graded runs to `TrainingClient.step()`:
146
146
 
147
147
  ```python
148
+ from hud import TrainingClient
148
149
  from hud.agents import create_agent
149
- from hud.eval import Taskset, group_relative
150
+ from hud.eval import Job
150
151
 
151
- agent = create_agent("claude-sonnet-4-5")
152
- job = await Taskset(count_letter(word=w) for w in words).run(agent, group=16)
153
- for runs in job.results.values():
154
- advantages = group_relative([r.reward for r in runs], normalize_std=True)
155
- ... # feed (run.trace_id, adv) into your optimizer
152
+ agent = create_agent("arith-rl", completion_kwargs={"extra_body": {"return_token_ids": True}})
153
+ trainer = TrainingClient("arith-rl")
154
+ taskset, runtime = ... # your Taskset and where rollouts run
155
+
156
+ session = await Job.start("arith-rl", group=8)
157
+ start = len(session.runs)
158
+ await taskset.run(agent, runtime=runtime, group=8, job=session)
159
+ await trainer.step(session.runs[start:], learning_rate=1e-5, group_size=8)
156
160
  ```
157
161
 
158
162
  HUD is the environment-and-reward source for your own GRPO/PPO loop — the same environment trains any model, text or multimodal, unchanged.
159
163
 
160
- → [Training](https://docs.hud.ai/run/training) · [Designing tasks for signal](https://docs.hud.ai/run/signal)
164
+ → [Training](https://docs.hud.ai/v6/core/training) · [Designing tasks for signal](https://docs.hud.ai/v6/core/advice)
161
165
 
162
166
  ## Links
163
167
 
164
168
  - [Documentation](https://docs.hud.ai)
165
- - [Quickstart](https://docs.hud.ai/quickstart)
166
- - [CLI reference](https://docs.hud.ai/reference/cli)
167
- - [Leaderboards](https://hud.ai/leaderboards)
169
+ - [Quickstart](https://docs.hud.ai/v6/start/quickstart)
170
+ - [CLI reference](https://docs.hud.ai/v6/core/cli)
168
171
  - [Environment templates](https://hud.ai/environments)
169
172
  - [Supported models](https://hud.ai/models)
170
173
  - [Discord](https://discord.gg/wkjtmHYYjm)
@@ -189,8 +192,8 @@ Key areas: [Agents](hud/agents/) · [Environments](hud/environment/) · [Capabil
189
192
 
190
193
  ```bibtex
191
194
  @software{hud2025agentevalplatform,
192
- author = {HUD and Jay Ram and Lorenss Martinsons and Parth Patel and Govind Pimpale and Dylan Bowman and Jaideep and Nguyen Nhat Minh},
193
- title = {HUD: An Evaluation and RL Envrionments Platform for Agents},
195
+ author = {HUD and Jay Ram and Lorenss Martinsons and Parth Patel and Govind Pimpale and Dylan Bowman and Jaideep Chawla and Nguyen Nhat Minh},
196
+ title = {HUD: An Evaluation and RL Environments Platform for Agents},
194
197
  date = {2025-04},
195
198
  url = {https://github.com/hud-evals/hud-python},
196
199
  langid = {en}
@@ -0,0 +1,129 @@
1
+ # Fireworks RL Training
2
+
3
+ Direct Fireworks Training API loop over the same arithmetic preview task used by
4
+ `cookbooks/rl-training`.
5
+
6
+ This does **not** use Fireworks native datasets or RFT jobs. It follows the
7
+ Training API service path from the Fireworks docs:
8
+
9
+ 1. `FiretitanServiceClient.from_firetitan_config(...)`
10
+ 2. `create_deployment_sampler(...)` for high-parallel rollouts
11
+ 3. local grading of HUD-style multiplication tasks
12
+ 4. `forward_backward_custom(...)` + `optim_step(...)`
13
+ 5. `save_weights_for_sampler(...)` + sampler refresh
14
+
15
+ References:
16
+
17
+ - Fireworks Training API introduction: https://docs.fireworks.ai/fine-tuning/training-api/introduction
18
+ - Training and sampling lifecycle: https://docs.fireworks.ai/fine-tuning/training-api/training-and-sampling
19
+ - Loss functions / GRPO reference: https://docs.fireworks.ai/fine-tuning/training-api/loss-functions
20
+
21
+ ## Setup
22
+
23
+ The repo-level `.env` is loaded automatically. It must contain:
24
+
25
+ ```bash
26
+ FIREWORKS_API_KEY=...
27
+ FIREWORKS_ACCOUNT_ID=...
28
+ ```
29
+
30
+ Install the isolated cookbook environment:
31
+
32
+ ```bash
33
+ uv sync --pre
34
+ ```
35
+
36
+ ## Calibrate task difficulty first
37
+
38
+ What matters for GRPO is **within-group** reward spread: advantages are computed
39
+ within each prompt group, so a group whose rollouts all score the same (all 0 or
40
+ all 1) produces zero advantage and no gradient — even if the *overall* mean looks
41
+ healthy. Calibration reports `within_group_reward_std` for exactly this; treat
42
+ it, not `reward_mean`, as the signal that training has something to learn.
43
+
44
+ Two backends:
45
+
46
+ - `--calibration-backend inference` (default): Fireworks' OpenAI-compatible API.
47
+ Cheap, but samples `gpt-oss-120b` (`--inference-model`), not the training base —
48
+ the small serverless catalog on the `lorenss` key has no Qwen3 8B. Use it only
49
+ for a rough task sanity check.
50
+ - `--calibration-backend managed`: provisions the same deployment sampler that
51
+ training uses and samples the **actual base model** (Qwen3 8B). This is the
52
+ calibration that counts. It still skips the trainer and `optim_step`.
53
+
54
+ ```bash
55
+ uv run train.py --calibrate-only --calibration-backend managed \
56
+ --groups-per-step 6 --rollouts-per-prompt 6 --parallelism 18 --debug-samples 4
57
+ ```
58
+
59
+ `--debug-samples N` prints the first N rollouts (reward, output-token count,
60
+ text) so you can see *why* a group scored the way it did. Tune the multiplication
61
+ range until `within_group_reward_std` is clearly above zero:
62
+
63
+ - Groups all-correct (`within_group_reward_std ~= 0`) → make it harder
64
+ (`--min-a/--max-a/--min-b/--max-b`).
65
+ - Groups all-wrong → make it easier, or raise `--max-tokens` so the model can
66
+ finish its working before the budget cuts it off.
67
+
68
+ The shipped defaults (3-digit × 3-digit, `--max-tokens 512`, thinking disabled)
69
+ calibrate to `reward_mean ~= 0.47`, `within_group_reward_std ~= 0.20` on Qwen3 8B:
70
+ a regime where the same problem is sometimes solved (when the model shows its
71
+ work) and sometimes missed (when it answers directly) — so RL has a gradient to
72
+ follow.
73
+
74
+ ### Reasoning models and the token budget
75
+
76
+ Qwen3 is a hybrid reasoning model: by default it opens a `<think>` block and, on
77
+ a tight `--max-tokens`, spends the whole budget reasoning and never emits the
78
+ answer (reward collapses to zero). This cookbook disables thinking by default
79
+ through the chat template so direct rollouts reach the integer. Pass
80
+ `--enable-thinking` to keep the reasoning block — and raise `--max-tokens`
81
+ accordingly so the answer still fits.
82
+
83
+ ## Train
84
+
85
+ Once calibration reports a `within_group_reward_std` clearly above zero:
86
+
87
+ ```bash
88
+ uv run train.py --steps 5 --groups-per-step 8 --rollouts-per-prompt 8 --parallelism 32
89
+ ```
90
+
91
+ This uses the direct Training API managed service path. If you want calibration
92
+ to go through the managed deployment sampler too, pass
93
+ `--calibration-backend managed`; this provisions the same resources as training.
94
+
95
+ ### Preview account constraints
96
+
97
+ On the `lorenss` preview account today:
98
+
99
+ - **Trainer creation works** end to end with a provisioned key: rollouts,
100
+ `forward_backward_custom`, `optim_step`, checkpoint save, and sampler hotload
101
+ all run, and multi-step training completes. (An earlier `unkey inference api id
102
+ is not configured` 500 on trainer creation was an account-side provisioning gap,
103
+ now resolved.)
104
+ - **LoRA is unavailable**: the validated `qwen3-8b-128k` shape only accepts
105
+ full-parameter training, so `--lora-rank > 0` fails at trainer creation with
106
+ `no validated training shape exists for ... trainer_mode=LORA_TRAINER`.
107
+ - **Hotloads sync full 8B weights** between steps and occasionally exceed the
108
+ SDK's 600s hotload budget (`RuntimeError: Hotload failed for sampler snapshot
109
+ ...`). This is transient preview-infra latency, not a loop bug — re-running the
110
+ same command generally proceeds. There is no clean knob to extend the timeout
111
+ on the managed sampler path.
112
+
113
+ Metrics are written to:
114
+
115
+ - `runs/fireworks-rl-preview/metrics.jsonl`
116
+ - `runs/fireworks-rl-preview/reward_loss.png` if `matplotlib` is installed
117
+
118
+ ## Notes
119
+
120
+ - Defaults use Qwen 3 8B full-parameter training:
121
+ - `accounts/fireworks/models/qwen3-8b`
122
+ - `Qwen/Qwen3-8B`
123
+ - `accounts/fireworks/trainingShapes/qwen3-8b-128k`
124
+ - LoRA can be tested with `--lora-rank N`, but the validated Qwen3 8B training
125
+ shape currently rejects LoRA mode on the `lorenss` preview account.
126
+ - The first checkpoint sync happens after step 0, and subsequent rollouts sample
127
+ the updated weights through the same deployment.
128
+ - `--keep-trainer` and `--keep-deployment` are available for debugging. By
129
+ default the trainer is cleaned up and the deployment scales to zero on exit.
@@ -17,11 +17,13 @@ from hud.types import MCPToolCall, MCPToolResult
17
17
  from hud.utils import gateway
18
18
 
19
19
  from .tools import (
20
+ BashTool,
21
+ EditTool,
20
22
  GlobTool,
21
23
  GrepTool,
22
- ListTool,
23
24
  OpenAICompatibleMCPProxyTool,
24
25
  ReadTool,
26
+ WriteTool,
25
27
  )
26
28
  from .tools.base import format_chat_result
27
29
 
@@ -41,10 +43,12 @@ class OpenAIChatAgent(ToolAgent[ChatCompletionMessageParam, OpenAIChatConfig]):
41
43
  """OpenAI-compatible agent using the chat.completions protocol."""
42
44
 
43
45
  tool_catalog = (
46
+ BashTool,
44
47
  ReadTool,
45
- GrepTool,
46
48
  GlobTool,
47
- ListTool,
49
+ GrepTool,
50
+ EditTool,
51
+ WriteTool,
48
52
  OpenAICompatibleMCPProxyTool,
49
53
  )
50
54
 
@@ -2,13 +2,15 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- from .filesystem import GlobTool, GrepTool, ListTool, ReadTool
5
+ from .filesystem import BashTool, EditTool, GlobTool, GrepTool, ReadTool, WriteTool
6
6
  from .mcp_proxy import OpenAICompatibleMCPProxyTool
7
7
 
8
8
  __all__ = [
9
+ "BashTool",
10
+ "EditTool",
9
11
  "GlobTool",
10
12
  "GrepTool",
11
- "ListTool",
12
13
  "OpenAICompatibleMCPProxyTool",
13
14
  "ReadTool",
15
+ "WriteTool",
14
16
  ]