hud-python 0.4.16__tar.gz → 0.4.18__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. More details are available from the link on the original diff page.

Files changed (183)
  1. {hud_python-0.4.16 → hud_python-0.4.18}/PKG-INFO +1 -1
  2. {hud_python-0.4.16 → hud_python-0.4.18}/hud/agents/claude.py +8 -2
  3. {hud_python-0.4.16 → hud_python-0.4.18}/hud/agents/misc/response_agent.py +1 -1
  4. {hud_python-0.4.16 → hud_python-0.4.18}/hud/agents/openai.py +8 -2
  5. hud_python-0.4.18/hud/agents/openai_chat_generic.py +288 -0
  6. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/rl/__init__.py +11 -2
  7. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/rl/pod.py +4 -0
  8. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/rl/ssh.py +34 -2
  9. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/rl/train.py +190 -51
  10. {hud_python-0.4.16 → hud_python-0.4.18}/hud/datasets/execution/parallel.py +113 -37
  11. {hud_python-0.4.16 → hud_python-0.4.18}/hud/otel/exporters.py +3 -0
  12. {hud_python-0.4.16 → hud_python-0.4.18}/hud/otel/processors.py +3 -0
  13. {hud_python-0.4.16 → hud_python-0.4.18}/hud/utils/tests/test_version.py +1 -1
  14. {hud_python-0.4.16 → hud_python-0.4.18}/hud/version.py +1 -1
  15. {hud_python-0.4.16 → hud_python-0.4.18}/pyproject.toml +1 -1
  16. hud_python-0.4.16/hud/agents/openai_chat_generic.py +0 -154
  17. {hud_python-0.4.16 → hud_python-0.4.18}/.gitignore +0 -0
  18. {hud_python-0.4.16 → hud_python-0.4.18}/LICENSE +0 -0
  19. {hud_python-0.4.16 → hud_python-0.4.18}/README.md +0 -0
  20. {hud_python-0.4.16 → hud_python-0.4.18}/environments/README.md +0 -0
  21. {hud_python-0.4.16 → hud_python-0.4.18}/environments/browser/README.md +0 -0
  22. {hud_python-0.4.16 → hud_python-0.4.18}/environments/browser/apps/2048/README.md +0 -0
  23. {hud_python-0.4.16 → hud_python-0.4.18}/environments/browser/apps/2048/backend/pyproject.toml +0 -0
  24. {hud_python-0.4.16 → hud_python-0.4.18}/environments/browser/apps/README.md +0 -0
  25. {hud_python-0.4.16 → hud_python-0.4.18}/environments/browser/apps/todo/README.md +0 -0
  26. {hud_python-0.4.16 → hud_python-0.4.18}/environments/browser/apps/todo/backend/pyproject.toml +0 -0
  27. {hud_python-0.4.16 → hud_python-0.4.18}/environments/browser/pyproject.toml +0 -0
  28. {hud_python-0.4.16 → hud_python-0.4.18}/environments/remote_browser/README.md +0 -0
  29. {hud_python-0.4.16 → hud_python-0.4.18}/environments/remote_browser/pyproject.toml +0 -0
  30. {hud_python-0.4.16 → hud_python-0.4.18}/environments/remote_browser/src/hud_controller/providers/README.md +0 -0
  31. {hud_python-0.4.16 → hud_python-0.4.18}/environments/text_2048/README.md +0 -0
  32. {hud_python-0.4.16 → hud_python-0.4.18}/environments/text_2048/pyproject.toml +0 -0
  33. {hud_python-0.4.16 → hud_python-0.4.18}/examples/README.md +0 -0
  34. {hud_python-0.4.16 → hud_python-0.4.18}/hud/__init__.py +0 -0
  35. {hud_python-0.4.16 → hud_python-0.4.18}/hud/__main__.py +0 -0
  36. {hud_python-0.4.16 → hud_python-0.4.18}/hud/agents/__init__.py +0 -0
  37. {hud_python-0.4.16 → hud_python-0.4.18}/hud/agents/base.py +0 -0
  38. {hud_python-0.4.16 → hud_python-0.4.18}/hud/agents/langchain.py +0 -0
  39. {hud_python-0.4.16 → hud_python-0.4.18}/hud/agents/misc/__init__.py +0 -0
  40. {hud_python-0.4.16 → hud_python-0.4.18}/hud/agents/tests/__init__.py +0 -0
  41. {hud_python-0.4.16 → hud_python-0.4.18}/hud/agents/tests/test_base.py +0 -0
  42. {hud_python-0.4.16 → hud_python-0.4.18}/hud/agents/tests/test_claude.py +0 -0
  43. {hud_python-0.4.16 → hud_python-0.4.18}/hud/agents/tests/test_client.py +0 -0
  44. {hud_python-0.4.16 → hud_python-0.4.18}/hud/agents/tests/test_openai.py +0 -0
  45. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/__init__.py +0 -0
  46. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/__main__.py +0 -0
  47. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/analyze.py +0 -0
  48. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/build.py +0 -0
  49. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/clone.py +0 -0
  50. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/debug.py +0 -0
  51. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/dev.py +0 -0
  52. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/eval.py +0 -0
  53. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/hf.py +0 -0
  54. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/init.py +0 -0
  55. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/list_func.py +0 -0
  56. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/pull.py +0 -0
  57. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/push.py +0 -0
  58. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/remove.py +0 -0
  59. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/rl/README.md +0 -0
  60. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/rl/init.py +0 -0
  61. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/rl/utils.py +0 -0
  62. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/tests/__init__.py +0 -0
  63. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/tests/test_analyze.py +0 -0
  64. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/tests/test_analyze_metadata.py +0 -0
  65. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/tests/test_build.py +0 -0
  66. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/tests/test_cli_init.py +0 -0
  67. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/tests/test_cli_main.py +0 -0
  68. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/tests/test_clone.py +0 -0
  69. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/tests/test_cursor.py +0 -0
  70. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/tests/test_debug.py +0 -0
  71. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/tests/test_list_func.py +0 -0
  72. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/tests/test_main_module.py +0 -0
  73. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/tests/test_mcp_server.py +0 -0
  74. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/tests/test_pull.py +0 -0
  75. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/tests/test_push.py +0 -0
  76. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/tests/test_registry.py +0 -0
  77. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/tests/test_utils.py +0 -0
  78. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/utils/__init__.py +0 -0
  79. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/utils/cursor.py +0 -0
  80. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/utils/docker.py +0 -0
  81. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/utils/environment.py +0 -0
  82. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/utils/interactive.py +0 -0
  83. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/utils/logging.py +0 -0
  84. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/utils/metadata.py +0 -0
  85. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/utils/registry.py +0 -0
  86. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/utils/remote_runner.py +0 -0
  87. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/utils/runner.py +0 -0
  88. {hud_python-0.4.16 → hud_python-0.4.18}/hud/cli/utils/server.py +0 -0
  89. {hud_python-0.4.16 → hud_python-0.4.18}/hud/clients/README.md +0 -0
  90. {hud_python-0.4.16 → hud_python-0.4.18}/hud/clients/__init__.py +0 -0
  91. {hud_python-0.4.16 → hud_python-0.4.18}/hud/clients/base.py +0 -0
  92. {hud_python-0.4.16 → hud_python-0.4.18}/hud/clients/fastmcp.py +0 -0
  93. {hud_python-0.4.16 → hud_python-0.4.18}/hud/clients/mcp_use.py +0 -0
  94. {hud_python-0.4.16 → hud_python-0.4.18}/hud/clients/tests/__init__.py +0 -0
  95. {hud_python-0.4.16 → hud_python-0.4.18}/hud/clients/tests/test_client_integration.py +0 -0
  96. {hud_python-0.4.16 → hud_python-0.4.18}/hud/clients/tests/test_fastmcp.py +0 -0
  97. {hud_python-0.4.16 → hud_python-0.4.18}/hud/clients/tests/test_protocol.py +0 -0
  98. {hud_python-0.4.16 → hud_python-0.4.18}/hud/clients/utils/__init__.py +0 -0
  99. {hud_python-0.4.16 → hud_python-0.4.18}/hud/clients/utils/retry_transport.py +0 -0
  100. {hud_python-0.4.16 → hud_python-0.4.18}/hud/datasets/__init__.py +0 -0
  101. {hud_python-0.4.16 → hud_python-0.4.18}/hud/datasets/execution/__init__.py +0 -0
  102. {hud_python-0.4.16 → hud_python-0.4.18}/hud/datasets/execution/runner.py +0 -0
  103. {hud_python-0.4.16 → hud_python-0.4.18}/hud/datasets/task.py +0 -0
  104. {hud_python-0.4.16 → hud_python-0.4.18}/hud/datasets/utils.py +0 -0
  105. {hud_python-0.4.16 → hud_python-0.4.18}/hud/misc/__init__.py +0 -0
  106. {hud_python-0.4.16 → hud_python-0.4.18}/hud/misc/claude_plays_pokemon.py +0 -0
  107. {hud_python-0.4.16 → hud_python-0.4.18}/hud/otel/__init__.py +0 -0
  108. {hud_python-0.4.16 → hud_python-0.4.18}/hud/otel/collector.py +0 -0
  109. {hud_python-0.4.16 → hud_python-0.4.18}/hud/otel/config.py +0 -0
  110. {hud_python-0.4.16 → hud_python-0.4.18}/hud/otel/context.py +0 -0
  111. {hud_python-0.4.16 → hud_python-0.4.18}/hud/otel/instrumentation.py +0 -0
  112. {hud_python-0.4.16 → hud_python-0.4.18}/hud/otel/tests/__init__.py +0 -0
  113. {hud_python-0.4.16 → hud_python-0.4.18}/hud/otel/tests/test_processors.py +0 -0
  114. {hud_python-0.4.16 → hud_python-0.4.18}/hud/py.typed +0 -0
  115. {hud_python-0.4.16 → hud_python-0.4.18}/hud/server/__init__.py +0 -0
  116. {hud_python-0.4.16 → hud_python-0.4.18}/hud/server/context.py +0 -0
  117. {hud_python-0.4.16 → hud_python-0.4.18}/hud/server/helper/__init__.py +0 -0
  118. {hud_python-0.4.16 → hud_python-0.4.18}/hud/server/low_level.py +0 -0
  119. {hud_python-0.4.16 → hud_python-0.4.18}/hud/server/server.py +0 -0
  120. {hud_python-0.4.16 → hud_python-0.4.18}/hud/server/tests/__init__.py +0 -0
  121. {hud_python-0.4.16 → hud_python-0.4.18}/hud/settings.py +0 -0
  122. {hud_python-0.4.16 → hud_python-0.4.18}/hud/shared/__init__.py +0 -0
  123. {hud_python-0.4.16 → hud_python-0.4.18}/hud/shared/exceptions.py +0 -0
  124. {hud_python-0.4.16 → hud_python-0.4.18}/hud/shared/requests.py +0 -0
  125. {hud_python-0.4.16 → hud_python-0.4.18}/hud/shared/tests/__init__.py +0 -0
  126. {hud_python-0.4.16 → hud_python-0.4.18}/hud/shared/tests/test_exceptions.py +0 -0
  127. {hud_python-0.4.16 → hud_python-0.4.18}/hud/shared/tests/test_requests.py +0 -0
  128. {hud_python-0.4.16 → hud_python-0.4.18}/hud/telemetry/__init__.py +0 -0
  129. {hud_python-0.4.16 → hud_python-0.4.18}/hud/telemetry/instrument.py +0 -0
  130. {hud_python-0.4.16 → hud_python-0.4.18}/hud/telemetry/job.py +0 -0
  131. {hud_python-0.4.16 → hud_python-0.4.18}/hud/telemetry/replay.py +0 -0
  132. {hud_python-0.4.16 → hud_python-0.4.18}/hud/telemetry/tests/__init__.py +0 -0
  133. {hud_python-0.4.16 → hud_python-0.4.18}/hud/telemetry/tests/test_replay.py +0 -0
  134. {hud_python-0.4.16 → hud_python-0.4.18}/hud/telemetry/tests/test_trace.py +0 -0
  135. {hud_python-0.4.16 → hud_python-0.4.18}/hud/telemetry/trace.py +0 -0
  136. {hud_python-0.4.16 → hud_python-0.4.18}/hud/tools/__init__.py +0 -0
  137. {hud_python-0.4.16 → hud_python-0.4.18}/hud/tools/base.py +0 -0
  138. {hud_python-0.4.16 → hud_python-0.4.18}/hud/tools/bash.py +0 -0
  139. {hud_python-0.4.16 → hud_python-0.4.18}/hud/tools/computer/__init__.py +0 -0
  140. {hud_python-0.4.16 → hud_python-0.4.18}/hud/tools/computer/anthropic.py +0 -0
  141. {hud_python-0.4.16 → hud_python-0.4.18}/hud/tools/computer/hud.py +0 -0
  142. {hud_python-0.4.16 → hud_python-0.4.18}/hud/tools/computer/openai.py +0 -0
  143. {hud_python-0.4.16 → hud_python-0.4.18}/hud/tools/computer/settings.py +0 -0
  144. {hud_python-0.4.16 → hud_python-0.4.18}/hud/tools/edit.py +0 -0
  145. {hud_python-0.4.16 → hud_python-0.4.18}/hud/tools/executors/__init__.py +0 -0
  146. {hud_python-0.4.16 → hud_python-0.4.18}/hud/tools/executors/base.py +0 -0
  147. {hud_python-0.4.16 → hud_python-0.4.18}/hud/tools/executors/pyautogui.py +0 -0
  148. {hud_python-0.4.16 → hud_python-0.4.18}/hud/tools/executors/tests/__init__.py +0 -0
  149. {hud_python-0.4.16 → hud_python-0.4.18}/hud/tools/executors/tests/test_base_executor.py +0 -0
  150. {hud_python-0.4.16 → hud_python-0.4.18}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
  151. {hud_python-0.4.16 → hud_python-0.4.18}/hud/tools/executors/xdo.py +0 -0
  152. {hud_python-0.4.16 → hud_python-0.4.18}/hud/tools/playwright.py +0 -0
  153. {hud_python-0.4.16 → hud_python-0.4.18}/hud/tools/response.py +0 -0
  154. {hud_python-0.4.16 → hud_python-0.4.18}/hud/tools/tests/__init__.py +0 -0
  155. {hud_python-0.4.16 → hud_python-0.4.18}/hud/tools/tests/test_base.py +0 -0
  156. {hud_python-0.4.16 → hud_python-0.4.18}/hud/tools/tests/test_bash.py +0 -0
  157. {hud_python-0.4.16 → hud_python-0.4.18}/hud/tools/tests/test_bash_extended.py +0 -0
  158. {hud_python-0.4.16 → hud_python-0.4.18}/hud/tools/tests/test_computer.py +0 -0
  159. {hud_python-0.4.16 → hud_python-0.4.18}/hud/tools/tests/test_computer_actions.py +0 -0
  160. {hud_python-0.4.16 → hud_python-0.4.18}/hud/tools/tests/test_edit.py +0 -0
  161. {hud_python-0.4.16 → hud_python-0.4.18}/hud/tools/tests/test_init.py +0 -0
  162. {hud_python-0.4.16 → hud_python-0.4.18}/hud/tools/tests/test_playwright_tool.py +0 -0
  163. {hud_python-0.4.16 → hud_python-0.4.18}/hud/tools/tests/test_response.py +0 -0
  164. {hud_python-0.4.16 → hud_python-0.4.18}/hud/tools/tests/test_tools.py +0 -0
  165. {hud_python-0.4.16 → hud_python-0.4.18}/hud/tools/tests/test_tools_init.py +0 -0
  166. {hud_python-0.4.16 → hud_python-0.4.18}/hud/tools/tests/test_utils.py +0 -0
  167. {hud_python-0.4.16 → hud_python-0.4.18}/hud/tools/types.py +0 -0
  168. {hud_python-0.4.16 → hud_python-0.4.18}/hud/tools/utils.py +0 -0
  169. {hud_python-0.4.16 → hud_python-0.4.18}/hud/types.py +0 -0
  170. {hud_python-0.4.16 → hud_python-0.4.18}/hud/utils/__init__.py +0 -0
  171. {hud_python-0.4.16 → hud_python-0.4.18}/hud/utils/async_utils.py +0 -0
  172. {hud_python-0.4.16 → hud_python-0.4.18}/hud/utils/design.py +0 -0
  173. {hud_python-0.4.16 → hud_python-0.4.18}/hud/utils/mcp.py +0 -0
  174. {hud_python-0.4.16 → hud_python-0.4.18}/hud/utils/progress.py +0 -0
  175. {hud_python-0.4.16 → hud_python-0.4.18}/hud/utils/telemetry.py +0 -0
  176. {hud_python-0.4.16 → hud_python-0.4.18}/hud/utils/tests/__init__.py +0 -0
  177. {hud_python-0.4.16 → hud_python-0.4.18}/hud/utils/tests/test_async_utils.py +0 -0
  178. {hud_python-0.4.16 → hud_python-0.4.18}/hud/utils/tests/test_init.py +0 -0
  179. {hud_python-0.4.16 → hud_python-0.4.18}/hud/utils/tests/test_mcp.py +0 -0
  180. {hud_python-0.4.16 → hud_python-0.4.18}/hud/utils/tests/test_progress.py +0 -0
  181. {hud_python-0.4.16 → hud_python-0.4.18}/hud/utils/tests/test_telemetry.py +0 -0
  182. {hud_python-0.4.16 → hud_python-0.4.18}/rl/README.md +0 -0
  183. {hud_python-0.4.16 → hud_python-0.4.18}/rl/pyproject.toml +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.4.16
3
+ Version: 0.4.18
4
4
  Summary: SDK for the HUD platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-python
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -85,8 +85,8 @@ class ClaudeAgent(MCPAgent):
85
85
  self._claude_to_mcp_tool_map: dict[str, str] = {}
86
86
  self.claude_tools: list[dict] = []
87
87
 
88
- # Base system prompt for autonomous operation
89
- self.system_prompt = """
88
+ # Append Claude-specific instructions to the base system prompt
89
+ claude_instructions = """
90
90
  You are Claude, an AI assistant created by Anthropic. You are helpful, harmless, and honest.
91
91
 
92
92
  When working on tasks:
@@ -99,6 +99,12 @@ class ClaudeAgent(MCPAgent):
99
99
  Remember: You are expected to complete tasks autonomously. The user trusts you to accomplish what they asked.
100
100
  """.strip() # noqa: E501
101
101
 
102
+ # Append Claude instructions to any base system prompt
103
+ if self.system_prompt:
104
+ self.system_prompt = f"{self.system_prompt}\n\n{claude_instructions}"
105
+ else:
106
+ self.system_prompt = claude_instructions
107
+
102
108
  async def initialize(self, task: str | Task | None = None) -> None:
103
109
  """Initialize the agent and build tool mappings."""
104
110
  await super().initialize(task)
@@ -54,7 +54,7 @@ class ResponseAgent:
54
54
  """
55
55
  try:
56
56
  response = await self.client.chat.completions.create(
57
- model="gpt-4o",
57
+ model="gpt-5-nano",
58
58
  messages=[
59
59
  {"role": "system", "content": self.system_prompt},
60
60
  {
@@ -78,8 +78,8 @@ class OperatorAgent(MCPAgent):
78
78
 
79
79
  self.model_name = "openai-" + self.model
80
80
 
81
- # Base system prompt for autonomous operation
82
- self.system_prompt = """
81
+ # Append OpenAI-specific instructions to the base system prompt
82
+ openai_instructions = """
83
83
  You are an autonomous computer-using agent. Follow these guidelines:
84
84
 
85
85
  1. NEVER ask for confirmation. Complete all tasks autonomously.
@@ -93,6 +93,12 @@ class OperatorAgent(MCPAgent):
93
93
  Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
94
94
  """.strip() # noqa: E501
95
95
 
96
+ # Append OpenAI instructions to any base system prompt
97
+ if self.system_prompt:
98
+ self.system_prompt = f"{self.system_prompt}\n\n{openai_instructions}"
99
+ else:
100
+ self.system_prompt = openai_instructions
101
+
96
102
  async def _run_context(self, context: list[types.ContentBlock], max_steps: int = 10) -> Trace:
97
103
  """
98
104
  Run the agent with the given prompt or task.
@@ -0,0 +1,288 @@
1
+ """Generic OpenAI chat-completions agent.
2
+
3
+ This class provides the minimal glue required to connect any endpoint that
4
+ implements the OpenAI compatible *chat.completions* API with MCP tool calling
5
+ through the existing :class:`hud.agent.MCPAgent` scaffolding.
6
+
7
+ Key points:
8
+ - Stateless, no special server-side conversation state is assumed.
9
+ - Accepts an :class:`openai.AsyncOpenAI` client, caller can supply their own
10
+ base_url / api_key (e.g. ART, llama.cpp, together.ai, …)
11
+ - All HUD features (step_count, OTel spans, tool filtering, screenshots, …)
12
+ come from the ``MCPAgent`` base class, we only implement the three abstract
13
+ methods
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import json
19
+ import logging
20
+ from typing import TYPE_CHECKING, Any, cast
21
+
22
+ import mcp.types as types
23
+
24
+ from hud import instrument
25
+ from hud.types import AgentResponse, MCPToolCall, MCPToolResult
26
+
27
+ from .base import MCPAgent
28
+
29
+ if TYPE_CHECKING:
30
+ from openai import AsyncOpenAI
31
+ from openai.types.chat import ChatCompletionToolParam
32
+
33
+ from hud.clients import AgentMCPClient
34
+
35
+ logger = logging.getLogger(__name__)
36
+
37
+
38
+ class GenericOpenAIChatAgent(MCPAgent):
39
+ """MCP-enabled agent that speaks the OpenAI *chat.completions* protocol."""
40
+
41
+ def __init__(
42
+ self,
43
+ mcp_client: AgentMCPClient,
44
+ *,
45
+ openai_client: AsyncOpenAI,
46
+ model_name: str = "gpt-4o-mini",
47
+ parallel_tool_calls: bool = False,
48
+ logprobs: bool = False,
49
+ **agent_kwargs: Any,
50
+ ) -> None:
51
+ super().__init__(mcp_client=mcp_client, **agent_kwargs)
52
+ self.oai = openai_client
53
+ self.model_name = model_name
54
+ self.parallel_tool_calls = parallel_tool_calls
55
+ self.logprobs = logprobs
56
+ self.conversation_history = []
57
+
58
+ @staticmethod
59
+ def _oai_to_mcp(tool_call: Any) -> MCPToolCall: # type: ignore[valid-type]
60
+ """Convert an OpenAI ``tool_call`` to :class:`MCPToolCall`."""
61
+ return MCPToolCall(
62
+ id=tool_call.id,
63
+ name=tool_call.function.name,
64
+ arguments=json.loads(tool_call.function.arguments or "{}"),
65
+ )
66
+
67
+ async def get_system_messages(self) -> list[Any]:
68
+ """Get system messages for OpenAI."""
69
+ return [{"role": "system", "content": self.system_prompt}]
70
+
71
+ async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
72
+ """Format blocks for OpenAI."""
73
+ content = []
74
+ for block in blocks:
75
+ if isinstance(block, types.TextContent):
76
+ content.append({"type": "text", "text": block.text})
77
+ elif isinstance(block, types.ImageContent):
78
+ content.append(
79
+ {
80
+ "type": "image_url",
81
+ "image_url": {"url": f"data:{block.mimeType};base64,{block.data}"},
82
+ }
83
+ )
84
+
85
+ return [{"role": "user", "content": content}]
86
+
87
+ def _sanitize_schema_for_openai(self, schema: dict) -> dict:
88
+ """Convert MCP JSON Schema to OpenAI-compatible format.
89
+
90
+ Handles unsupported features like anyOf and prefixItems.
91
+ """
92
+ if not isinstance(schema, dict):
93
+ return schema
94
+
95
+ sanitized = {}
96
+
97
+ for key, value in schema.items():
98
+ if key == "anyOf" and isinstance(value, list):
99
+ # Handle anyOf patterns (usually for nullable fields)
100
+ non_null_types = [
101
+ v for v in value if not (isinstance(v, dict) and v.get("type") == "null")
102
+ ]
103
+ if non_null_types:
104
+ # Use the first non-null type
105
+ sanitized.update(self._sanitize_schema_for_openai(non_null_types[0]))
106
+ else:
107
+ sanitized["type"] = "string" # Fallback
108
+
109
+ elif key == "prefixItems":
110
+ # Convert prefixItems to simple items
111
+ sanitized["type"] = "array"
112
+ if isinstance(value, list) and value:
113
+ # Use the type from the first item as the items schema
114
+ first_item = value[0]
115
+ if isinstance(first_item, dict):
116
+ sanitized["items"] = {"type": first_item.get("type", "string")}
117
+ else:
118
+ sanitized["items"] = {"type": "string"}
119
+
120
+ elif key == "properties" and isinstance(value, dict):
121
+ # Recursively sanitize property schemas
122
+ sanitized[key] = {
123
+ prop_name: self._sanitize_schema_for_openai(prop_schema)
124
+ for prop_name, prop_schema in value.items()
125
+ }
126
+
127
+ elif key == "items" and isinstance(value, dict):
128
+ # Recursively sanitize items schema
129
+ sanitized[key] = self._sanitize_schema_for_openai(value)
130
+
131
+ elif key in (
132
+ "type",
133
+ "description",
134
+ "enum",
135
+ "required",
136
+ "default",
137
+ "minimum",
138
+ "maximum",
139
+ "minItems",
140
+ "maxItems",
141
+ ):
142
+ # These are supported by OpenAI
143
+ sanitized[key] = value
144
+
145
+ return sanitized or {"type": "object"}
146
+
147
+ def get_tool_schemas(self) -> list[dict]:
148
+ tool_schemas = super().get_tool_schemas()
149
+ openai_tools = []
150
+ for schema in tool_schemas:
151
+ parameters = schema.get("parameters", {})
152
+
153
+ if parameters:
154
+ sanitized_params = self._sanitize_schema_for_openai(parameters)
155
+ else:
156
+ sanitized_params = {"type": "object", "properties": {}}
157
+
158
+ openai_tool = {
159
+ "type": "function",
160
+ "function": {
161
+ "name": schema["name"],
162
+ "description": schema.get("description", ""),
163
+ "parameters": sanitized_params,
164
+ },
165
+ }
166
+ openai_tools.append(openai_tool)
167
+ return openai_tools
168
+
169
+ @instrument(
170
+ span_type="agent",
171
+ record_args=False,
172
+ record_result=True,
173
+ )
174
+ async def get_response(self, messages: list[Any]) -> AgentResponse:
175
+ """Send chat request to OpenAI and convert the response."""
176
+
177
+ # Convert MCP tool schemas to OpenAI format
178
+ mcp_schemas = self.get_tool_schemas()
179
+
180
+ response = await self.oai.chat.completions.create(
181
+ model=self.model_name,
182
+ messages=messages,
183
+ tools=cast("list[ChatCompletionToolParam]", mcp_schemas),
184
+ parallel_tool_calls=self.parallel_tool_calls,
185
+ logprobs=self.logprobs,
186
+ )
187
+
188
+ choice = response.choices[0]
189
+ msg = choice.message
190
+
191
+ assistant_msg: dict[str, Any] = {"role": "assistant"}
192
+
193
+ if msg.content:
194
+ assistant_msg["content"] = msg.content
195
+
196
+ if msg.tool_calls:
197
+ assistant_msg["tool_calls"] = msg.tool_calls
198
+
199
+ messages.append(assistant_msg)
200
+
201
+ # Store the complete conversation history
202
+ self.conversation_history = messages.copy()
203
+
204
+ tool_calls = []
205
+ if msg.tool_calls:
206
+ for tc in msg.tool_calls:
207
+ if tc.function.name is not None: # type: ignore
208
+ tool_calls.append(self._oai_to_mcp(tc))
209
+ if not self.parallel_tool_calls:
210
+ break
211
+
212
+ return AgentResponse(
213
+ content=msg.content or "",
214
+ tool_calls=tool_calls,
215
+ done=choice.finish_reason in ("stop", "length"),
216
+ raw=response, # Include raw response for access to Choice objects
217
+ )
218
+
219
+ async def format_tool_results(
220
+ self,
221
+ tool_calls: list[MCPToolCall],
222
+ tool_results: list[MCPToolResult],
223
+ ) -> list[Any]:
224
+ """Render MCP tool results as OpenAI messages.
225
+
226
+ Note: OpenAI tool messages only support string content.
227
+ When images are present, we return both a tool message and a user message.
228
+ """
229
+ rendered: list[dict[str, Any]] = []
230
+ for call, res in zip(tool_calls, tool_results, strict=False):
231
+ # Use structuredContent.result if available, otherwise use content
232
+ items = res.content
233
+ if res.structuredContent and isinstance(res.structuredContent, dict):
234
+ items = res.structuredContent.get("result", res.content)
235
+
236
+ # Separate text and image content
237
+ text_parts = []
238
+ image_parts = []
239
+
240
+ for item in items:
241
+ if isinstance(item, dict):
242
+ if item.get("type") == "text":
243
+ text_parts.append(item.get("text", ""))
244
+ elif item.get("type") == "image":
245
+ mime_type = item.get("mimeType", "image/png")
246
+ data = item.get("data", "")
247
+ image_parts.append(
248
+ {
249
+ "type": "image_url",
250
+ "image_url": {
251
+ "url": f"data:{mime_type};base64,{data}"
252
+ },
253
+ }
254
+ )
255
+ elif isinstance(item, types.TextContent):
256
+ text_parts.append(item.text)
257
+ elif isinstance(item, types.ImageContent):
258
+ image_parts.append(
259
+ {
260
+ "type": "image_url",
261
+ "image_url": {"url": f"data:{item.mimeType};base64,{item.data}"},
262
+ }
263
+ )
264
+
265
+ text_content = "".join(text_parts) if text_parts else "Tool executed successfully"
266
+ rendered.append(
267
+ {
268
+ "role": "tool",
269
+ "tool_call_id": call.id,
270
+ "content": text_content,
271
+ }
272
+ )
273
+
274
+ # If there are images, add them as a separate user message
275
+ if image_parts:
276
+ # Add a user message with the images
277
+ content_with_images = [
278
+ {"type": "text", "text": "Tool returned the following:"},
279
+ *image_parts
280
+ ]
281
+ rendered.append(
282
+ {
283
+ "role": "user",
284
+ "content": content_with_images,
285
+ }
286
+ )
287
+
288
+ return rendered
@@ -23,7 +23,10 @@ def rl_main(
23
23
  ctx: typer.Context,
24
24
  model: str = typer.Option("Qwen/Qwen2.5-3B-Instruct", "--model", "-m", help="Model to train"),
25
25
  dataset: str | None = typer.Option(
26
- None, "--dataset", "-d", help="Override dataset from lock file"
26
+ None,
27
+ "--dataset",
28
+ "-d",
29
+ help="Dataset: JSON file path or HuggingFace name (auto-detects if not provided)",
27
30
  ),
28
31
  config: Path | None = typer.Option(None, "--config", "-c", help="Config YAML path"), # noqa: B008
29
32
  gpus: str = typer.Option("2xA100", "--gpus", help="GPU configuration (e.g., 2xA100, 4xH100)"),
@@ -39,9 +42,15 @@ def rl_main(
39
42
  3. Push environment to registry if needed
40
43
  4. Start remote training on Prime Intellect
41
44
 
45
+ Dataset can be:
46
+ - A local JSON file with tasks (e.g., tasks.json)
47
+ - A HuggingFace dataset name (e.g., 'username/dataset-name')
48
+ - Auto-detected from current directory if not specified
49
+
42
50
  Examples:
43
- hud rl # Interactive mode with prompts
51
+ hud rl # Interactive mode, auto-detect tasks.json
44
52
  hud rl --model gpt2 # Train with specific model
53
+ hud rl --dataset tasks.json # Use local task file
45
54
  hud rl --gpus 4xH100 # Use different GPU configuration
46
55
  hud rl init my-env:latest # Generate config for environment
47
56
  """
@@ -62,6 +62,7 @@ async def create_and_connect_prime_pod(
62
62
  image: str,
63
63
  team_id: str | None = None,
64
64
  dataset_size: int | None = None,
65
+ is_json_file: bool = False,
65
66
  ) -> None:
66
67
  """Create a Prime Intellect pod and connect to it for training."""
67
68
  design.section_title("🌐 Creating Prime Intellect Pod")
@@ -330,6 +331,7 @@ async def create_and_connect_prime_pod(
330
331
  output_dir=output_dir,
331
332
  image=image,
332
333
  dataset_size=dataset_size,
334
+ is_json_file=is_json_file,
333
335
  )
334
336
  else:
335
337
  # Manual fallback
@@ -457,6 +459,7 @@ async def run_prime_training(
457
459
  auto_create_pod: str | None = None,
458
460
  team_id: str | None = None,
459
461
  dataset_size: int | None = None,
462
+ is_json_file: bool = False,
460
463
  ) -> None:
461
464
  """Run training on Prime Intellect infrastructure."""
462
465
  # Check API key
@@ -488,4 +491,5 @@ async def run_prime_training(
488
491
  image=image,
489
492
  team_id=team_id,
490
493
  dataset_size=dataset_size,
494
+ is_json_file=is_json_file,
491
495
  )
@@ -101,6 +101,7 @@ async def connect_and_train(
101
101
  output_dir: Path,
102
102
  image: str,
103
103
  dataset_size: int | None = None,
104
+ is_json_file: bool = False,
104
105
  ) -> None:
105
106
  """Connect to the pod via SSH and run training commands."""
106
107
  design.section_title("🚀 Starting Remote Training")
@@ -175,6 +176,37 @@ async def connect_and_train(
175
176
  design.info("Make sure scp is installed and in your PATH")
176
177
  raise typer.Exit(1) from e
177
178
 
179
+ # If dataset is a JSON file, copy it too
180
+ remote_dataset = dataset # Default to unchanged
181
+ if is_json_file:
182
+ design.info("Copying task file to pod...")
183
+ try:
184
+ # On Windows, we need to ensure proper path formatting
185
+ dataset_path = str(dataset).replace("\\", "/")
186
+ # Extract just the filename for the remote path
187
+ dataset_filename = os.path.basename(dataset)
188
+ remote_dataset = f"/root/{dataset_filename}"
189
+
190
+ scp_cmd = [
191
+ "scp",
192
+ "-i",
193
+ str(ssh_key_path),
194
+ "-P",
195
+ ssh_port,
196
+ "-o",
197
+ "StrictHostKeyChecking=no",
198
+ "-o",
199
+ "UserKnownHostsFile=/dev/null",
200
+ dataset_path,
201
+ f"{ssh_user_host}:{remote_dataset}",
202
+ ]
203
+ design.debug(f"Running: {' '.join(scp_cmd)}")
204
+ subprocess.run(scp_cmd, check=True) # noqa: S603, ASYNC221
205
+ design.success(f"Task file copied to {remote_dataset}")
206
+ except subprocess.CalledProcessError as e:
207
+ design.error(f"Failed to copy task file: {e}")
208
+ raise typer.Exit(1) from e
209
+
178
210
  design.info("Setting up environment and starting training...")
179
211
  design.info("This will take a few minutes for initial setup, then training will begin.")
180
212
  design.info("")
@@ -196,7 +228,7 @@ async def connect_and_train(
196
228
  "# Load environment",
197
229
  "env = vf.load_environment(",
198
230
  ' env_id="hud-vf-gym",',
199
- f' taskset="{dataset}",',
231
+ f' taskset="{remote_dataset}",',
200
232
  ' config_path="/root/config.yaml",',
201
233
  f" num_tasks={dataset_size},",
202
234
  ")",
@@ -242,7 +274,7 @@ async def connect_and_train(
242
274
  "uv venv --python 3.12 && "
243
275
  "source .venv/bin/activate && "
244
276
  # Install packages
245
- "prime env install hud/hud-vf-gym@0.1.0 && "
277
+ "prime env install hud/hud-vf-gym@0.1.1 && "
246
278
  "uv pip install 'verifiers[train]' && "
247
279
  "uv pip install flash-attn --no-build-isolation && "
248
280
  # Set environment variables