hud-python: hud_python-0.4.17.tar.gz → hud_python-0.4.18.tar.gz

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (183)
  1. {hud_python-0.4.17 → hud_python-0.4.18}/PKG-INFO +1 -1
  2. {hud_python-0.4.17 → hud_python-0.4.18}/hud/agents/misc/response_agent.py +1 -1
  3. hud_python-0.4.18/hud/agents/openai_chat_generic.py +288 -0
  4. {hud_python-0.4.17 → hud_python-0.4.18}/hud/datasets/execution/parallel.py +113 -37
  5. {hud_python-0.4.17 → hud_python-0.4.18}/hud/otel/exporters.py +3 -0
  6. {hud_python-0.4.17 → hud_python-0.4.18}/hud/otel/processors.py +3 -0
  7. {hud_python-0.4.17 → hud_python-0.4.18}/hud/utils/tests/test_version.py +1 -1
  8. {hud_python-0.4.17 → hud_python-0.4.18}/hud/version.py +1 -1
  9. {hud_python-0.4.17 → hud_python-0.4.18}/pyproject.toml +1 -1
  10. hud_python-0.4.17/hud/agents/openai_chat_generic.py +0 -154
  11. {hud_python-0.4.17 → hud_python-0.4.18}/.gitignore +0 -0
  12. {hud_python-0.4.17 → hud_python-0.4.18}/LICENSE +0 -0
  13. {hud_python-0.4.17 → hud_python-0.4.18}/README.md +0 -0
  14. {hud_python-0.4.17 → hud_python-0.4.18}/environments/README.md +0 -0
  15. {hud_python-0.4.17 → hud_python-0.4.18}/environments/browser/README.md +0 -0
  16. {hud_python-0.4.17 → hud_python-0.4.18}/environments/browser/apps/2048/README.md +0 -0
  17. {hud_python-0.4.17 → hud_python-0.4.18}/environments/browser/apps/2048/backend/pyproject.toml +0 -0
  18. {hud_python-0.4.17 → hud_python-0.4.18}/environments/browser/apps/README.md +0 -0
  19. {hud_python-0.4.17 → hud_python-0.4.18}/environments/browser/apps/todo/README.md +0 -0
  20. {hud_python-0.4.17 → hud_python-0.4.18}/environments/browser/apps/todo/backend/pyproject.toml +0 -0
  21. {hud_python-0.4.17 → hud_python-0.4.18}/environments/browser/pyproject.toml +0 -0
  22. {hud_python-0.4.17 → hud_python-0.4.18}/environments/remote_browser/README.md +0 -0
  23. {hud_python-0.4.17 → hud_python-0.4.18}/environments/remote_browser/pyproject.toml +0 -0
  24. {hud_python-0.4.17 → hud_python-0.4.18}/environments/remote_browser/src/hud_controller/providers/README.md +0 -0
  25. {hud_python-0.4.17 → hud_python-0.4.18}/environments/text_2048/README.md +0 -0
  26. {hud_python-0.4.17 → hud_python-0.4.18}/environments/text_2048/pyproject.toml +0 -0
  27. {hud_python-0.4.17 → hud_python-0.4.18}/examples/README.md +0 -0
  28. {hud_python-0.4.17 → hud_python-0.4.18}/hud/__init__.py +0 -0
  29. {hud_python-0.4.17 → hud_python-0.4.18}/hud/__main__.py +0 -0
  30. {hud_python-0.4.17 → hud_python-0.4.18}/hud/agents/__init__.py +0 -0
  31. {hud_python-0.4.17 → hud_python-0.4.18}/hud/agents/base.py +0 -0
  32. {hud_python-0.4.17 → hud_python-0.4.18}/hud/agents/claude.py +0 -0
  33. {hud_python-0.4.17 → hud_python-0.4.18}/hud/agents/langchain.py +0 -0
  34. {hud_python-0.4.17 → hud_python-0.4.18}/hud/agents/misc/__init__.py +0 -0
  35. {hud_python-0.4.17 → hud_python-0.4.18}/hud/agents/openai.py +0 -0
  36. {hud_python-0.4.17 → hud_python-0.4.18}/hud/agents/tests/__init__.py +0 -0
  37. {hud_python-0.4.17 → hud_python-0.4.18}/hud/agents/tests/test_base.py +0 -0
  38. {hud_python-0.4.17 → hud_python-0.4.18}/hud/agents/tests/test_claude.py +0 -0
  39. {hud_python-0.4.17 → hud_python-0.4.18}/hud/agents/tests/test_client.py +0 -0
  40. {hud_python-0.4.17 → hud_python-0.4.18}/hud/agents/tests/test_openai.py +0 -0
  41. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/__init__.py +0 -0
  42. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/__main__.py +0 -0
  43. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/analyze.py +0 -0
  44. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/build.py +0 -0
  45. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/clone.py +0 -0
  46. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/debug.py +0 -0
  47. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/dev.py +0 -0
  48. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/eval.py +0 -0
  49. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/hf.py +0 -0
  50. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/init.py +0 -0
  51. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/list_func.py +0 -0
  52. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/pull.py +0 -0
  53. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/push.py +0 -0
  54. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/remove.py +0 -0
  55. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/rl/README.md +0 -0
  56. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/rl/__init__.py +0 -0
  57. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/rl/init.py +0 -0
  58. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/rl/pod.py +0 -0
  59. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/rl/ssh.py +0 -0
  60. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/rl/train.py +0 -0
  61. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/rl/utils.py +0 -0
  62. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/tests/__init__.py +0 -0
  63. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/tests/test_analyze.py +0 -0
  64. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/tests/test_analyze_metadata.py +0 -0
  65. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/tests/test_build.py +0 -0
  66. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/tests/test_cli_init.py +0 -0
  67. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/tests/test_cli_main.py +0 -0
  68. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/tests/test_clone.py +0 -0
  69. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/tests/test_cursor.py +0 -0
  70. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/tests/test_debug.py +0 -0
  71. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/tests/test_list_func.py +0 -0
  72. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/tests/test_main_module.py +0 -0
  73. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/tests/test_mcp_server.py +0 -0
  74. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/tests/test_pull.py +0 -0
  75. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/tests/test_push.py +0 -0
  76. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/tests/test_registry.py +0 -0
  77. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/tests/test_utils.py +0 -0
  78. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/utils/__init__.py +0 -0
  79. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/utils/cursor.py +0 -0
  80. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/utils/docker.py +0 -0
  81. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/utils/environment.py +0 -0
  82. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/utils/interactive.py +0 -0
  83. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/utils/logging.py +0 -0
  84. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/utils/metadata.py +0 -0
  85. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/utils/registry.py +0 -0
  86. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/utils/remote_runner.py +0 -0
  87. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/utils/runner.py +0 -0
  88. {hud_python-0.4.17 → hud_python-0.4.18}/hud/cli/utils/server.py +0 -0
  89. {hud_python-0.4.17 → hud_python-0.4.18}/hud/clients/README.md +0 -0
  90. {hud_python-0.4.17 → hud_python-0.4.18}/hud/clients/__init__.py +0 -0
  91. {hud_python-0.4.17 → hud_python-0.4.18}/hud/clients/base.py +0 -0
  92. {hud_python-0.4.17 → hud_python-0.4.18}/hud/clients/fastmcp.py +0 -0
  93. {hud_python-0.4.17 → hud_python-0.4.18}/hud/clients/mcp_use.py +0 -0
  94. {hud_python-0.4.17 → hud_python-0.4.18}/hud/clients/tests/__init__.py +0 -0
  95. {hud_python-0.4.17 → hud_python-0.4.18}/hud/clients/tests/test_client_integration.py +0 -0
  96. {hud_python-0.4.17 → hud_python-0.4.18}/hud/clients/tests/test_fastmcp.py +0 -0
  97. {hud_python-0.4.17 → hud_python-0.4.18}/hud/clients/tests/test_protocol.py +0 -0
  98. {hud_python-0.4.17 → hud_python-0.4.18}/hud/clients/utils/__init__.py +0 -0
  99. {hud_python-0.4.17 → hud_python-0.4.18}/hud/clients/utils/retry_transport.py +0 -0
  100. {hud_python-0.4.17 → hud_python-0.4.18}/hud/datasets/__init__.py +0 -0
  101. {hud_python-0.4.17 → hud_python-0.4.18}/hud/datasets/execution/__init__.py +0 -0
  102. {hud_python-0.4.17 → hud_python-0.4.18}/hud/datasets/execution/runner.py +0 -0
  103. {hud_python-0.4.17 → hud_python-0.4.18}/hud/datasets/task.py +0 -0
  104. {hud_python-0.4.17 → hud_python-0.4.18}/hud/datasets/utils.py +0 -0
  105. {hud_python-0.4.17 → hud_python-0.4.18}/hud/misc/__init__.py +0 -0
  106. {hud_python-0.4.17 → hud_python-0.4.18}/hud/misc/claude_plays_pokemon.py +0 -0
  107. {hud_python-0.4.17 → hud_python-0.4.18}/hud/otel/__init__.py +0 -0
  108. {hud_python-0.4.17 → hud_python-0.4.18}/hud/otel/collector.py +0 -0
  109. {hud_python-0.4.17 → hud_python-0.4.18}/hud/otel/config.py +0 -0
  110. {hud_python-0.4.17 → hud_python-0.4.18}/hud/otel/context.py +0 -0
  111. {hud_python-0.4.17 → hud_python-0.4.18}/hud/otel/instrumentation.py +0 -0
  112. {hud_python-0.4.17 → hud_python-0.4.18}/hud/otel/tests/__init__.py +0 -0
  113. {hud_python-0.4.17 → hud_python-0.4.18}/hud/otel/tests/test_processors.py +0 -0
  114. {hud_python-0.4.17 → hud_python-0.4.18}/hud/py.typed +0 -0
  115. {hud_python-0.4.17 → hud_python-0.4.18}/hud/server/__init__.py +0 -0
  116. {hud_python-0.4.17 → hud_python-0.4.18}/hud/server/context.py +0 -0
  117. {hud_python-0.4.17 → hud_python-0.4.18}/hud/server/helper/__init__.py +0 -0
  118. {hud_python-0.4.17 → hud_python-0.4.18}/hud/server/low_level.py +0 -0
  119. {hud_python-0.4.17 → hud_python-0.4.18}/hud/server/server.py +0 -0
  120. {hud_python-0.4.17 → hud_python-0.4.18}/hud/server/tests/__init__.py +0 -0
  121. {hud_python-0.4.17 → hud_python-0.4.18}/hud/settings.py +0 -0
  122. {hud_python-0.4.17 → hud_python-0.4.18}/hud/shared/__init__.py +0 -0
  123. {hud_python-0.4.17 → hud_python-0.4.18}/hud/shared/exceptions.py +0 -0
  124. {hud_python-0.4.17 → hud_python-0.4.18}/hud/shared/requests.py +0 -0
  125. {hud_python-0.4.17 → hud_python-0.4.18}/hud/shared/tests/__init__.py +0 -0
  126. {hud_python-0.4.17 → hud_python-0.4.18}/hud/shared/tests/test_exceptions.py +0 -0
  127. {hud_python-0.4.17 → hud_python-0.4.18}/hud/shared/tests/test_requests.py +0 -0
  128. {hud_python-0.4.17 → hud_python-0.4.18}/hud/telemetry/__init__.py +0 -0
  129. {hud_python-0.4.17 → hud_python-0.4.18}/hud/telemetry/instrument.py +0 -0
  130. {hud_python-0.4.17 → hud_python-0.4.18}/hud/telemetry/job.py +0 -0
  131. {hud_python-0.4.17 → hud_python-0.4.18}/hud/telemetry/replay.py +0 -0
  132. {hud_python-0.4.17 → hud_python-0.4.18}/hud/telemetry/tests/__init__.py +0 -0
  133. {hud_python-0.4.17 → hud_python-0.4.18}/hud/telemetry/tests/test_replay.py +0 -0
  134. {hud_python-0.4.17 → hud_python-0.4.18}/hud/telemetry/tests/test_trace.py +0 -0
  135. {hud_python-0.4.17 → hud_python-0.4.18}/hud/telemetry/trace.py +0 -0
  136. {hud_python-0.4.17 → hud_python-0.4.18}/hud/tools/__init__.py +0 -0
  137. {hud_python-0.4.17 → hud_python-0.4.18}/hud/tools/base.py +0 -0
  138. {hud_python-0.4.17 → hud_python-0.4.18}/hud/tools/bash.py +0 -0
  139. {hud_python-0.4.17 → hud_python-0.4.18}/hud/tools/computer/__init__.py +0 -0
  140. {hud_python-0.4.17 → hud_python-0.4.18}/hud/tools/computer/anthropic.py +0 -0
  141. {hud_python-0.4.17 → hud_python-0.4.18}/hud/tools/computer/hud.py +0 -0
  142. {hud_python-0.4.17 → hud_python-0.4.18}/hud/tools/computer/openai.py +0 -0
  143. {hud_python-0.4.17 → hud_python-0.4.18}/hud/tools/computer/settings.py +0 -0
  144. {hud_python-0.4.17 → hud_python-0.4.18}/hud/tools/edit.py +0 -0
  145. {hud_python-0.4.17 → hud_python-0.4.18}/hud/tools/executors/__init__.py +0 -0
  146. {hud_python-0.4.17 → hud_python-0.4.18}/hud/tools/executors/base.py +0 -0
  147. {hud_python-0.4.17 → hud_python-0.4.18}/hud/tools/executors/pyautogui.py +0 -0
  148. {hud_python-0.4.17 → hud_python-0.4.18}/hud/tools/executors/tests/__init__.py +0 -0
  149. {hud_python-0.4.17 → hud_python-0.4.18}/hud/tools/executors/tests/test_base_executor.py +0 -0
  150. {hud_python-0.4.17 → hud_python-0.4.18}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
  151. {hud_python-0.4.17 → hud_python-0.4.18}/hud/tools/executors/xdo.py +0 -0
  152. {hud_python-0.4.17 → hud_python-0.4.18}/hud/tools/playwright.py +0 -0
  153. {hud_python-0.4.17 → hud_python-0.4.18}/hud/tools/response.py +0 -0
  154. {hud_python-0.4.17 → hud_python-0.4.18}/hud/tools/tests/__init__.py +0 -0
  155. {hud_python-0.4.17 → hud_python-0.4.18}/hud/tools/tests/test_base.py +0 -0
  156. {hud_python-0.4.17 → hud_python-0.4.18}/hud/tools/tests/test_bash.py +0 -0
  157. {hud_python-0.4.17 → hud_python-0.4.18}/hud/tools/tests/test_bash_extended.py +0 -0
  158. {hud_python-0.4.17 → hud_python-0.4.18}/hud/tools/tests/test_computer.py +0 -0
  159. {hud_python-0.4.17 → hud_python-0.4.18}/hud/tools/tests/test_computer_actions.py +0 -0
  160. {hud_python-0.4.17 → hud_python-0.4.18}/hud/tools/tests/test_edit.py +0 -0
  161. {hud_python-0.4.17 → hud_python-0.4.18}/hud/tools/tests/test_init.py +0 -0
  162. {hud_python-0.4.17 → hud_python-0.4.18}/hud/tools/tests/test_playwright_tool.py +0 -0
  163. {hud_python-0.4.17 → hud_python-0.4.18}/hud/tools/tests/test_response.py +0 -0
  164. {hud_python-0.4.17 → hud_python-0.4.18}/hud/tools/tests/test_tools.py +0 -0
  165. {hud_python-0.4.17 → hud_python-0.4.18}/hud/tools/tests/test_tools_init.py +0 -0
  166. {hud_python-0.4.17 → hud_python-0.4.18}/hud/tools/tests/test_utils.py +0 -0
  167. {hud_python-0.4.17 → hud_python-0.4.18}/hud/tools/types.py +0 -0
  168. {hud_python-0.4.17 → hud_python-0.4.18}/hud/tools/utils.py +0 -0
  169. {hud_python-0.4.17 → hud_python-0.4.18}/hud/types.py +0 -0
  170. {hud_python-0.4.17 → hud_python-0.4.18}/hud/utils/__init__.py +0 -0
  171. {hud_python-0.4.17 → hud_python-0.4.18}/hud/utils/async_utils.py +0 -0
  172. {hud_python-0.4.17 → hud_python-0.4.18}/hud/utils/design.py +0 -0
  173. {hud_python-0.4.17 → hud_python-0.4.18}/hud/utils/mcp.py +0 -0
  174. {hud_python-0.4.17 → hud_python-0.4.18}/hud/utils/progress.py +0 -0
  175. {hud_python-0.4.17 → hud_python-0.4.18}/hud/utils/telemetry.py +0 -0
  176. {hud_python-0.4.17 → hud_python-0.4.18}/hud/utils/tests/__init__.py +0 -0
  177. {hud_python-0.4.17 → hud_python-0.4.18}/hud/utils/tests/test_async_utils.py +0 -0
  178. {hud_python-0.4.17 → hud_python-0.4.18}/hud/utils/tests/test_init.py +0 -0
  179. {hud_python-0.4.17 → hud_python-0.4.18}/hud/utils/tests/test_mcp.py +0 -0
  180. {hud_python-0.4.17 → hud_python-0.4.18}/hud/utils/tests/test_progress.py +0 -0
  181. {hud_python-0.4.17 → hud_python-0.4.18}/hud/utils/tests/test_telemetry.py +0 -0
  182. {hud_python-0.4.17 → hud_python-0.4.18}/rl/README.md +0 -0
  183. {hud_python-0.4.17 → hud_python-0.4.18}/rl/pyproject.toml +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.4.17
3
+ Version: 0.4.18
4
4
  Summary: SDK for the HUD platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-python
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -54,7 +54,7 @@ class ResponseAgent:
54
54
  """
55
55
  try:
56
56
  response = await self.client.chat.completions.create(
57
- model="gpt-4o",
57
+ model="gpt-5-nano",
58
58
  messages=[
59
59
  {"role": "system", "content": self.system_prompt},
60
60
  {
@@ -0,0 +1,288 @@
1
+ """Generic OpenAI chat-completions agent.
2
+
3
+ This class provides the minimal glue required to connect any endpoint that
4
+ implements the OpenAI compatible *chat.completions* API with MCP tool calling
5
+ through the existing :class:`hud.agent.MCPAgent` scaffolding.
6
+
7
+ Key points:
8
+ - Stateless, no special server-side conversation state is assumed.
9
+ - Accepts an :class:`openai.AsyncOpenAI` client, caller can supply their own
10
+ base_url / api_key (e.g. ART, llama.cpp, together.ai, …)
11
+ - All HUD features (step_count, OTel spans, tool filtering, screenshots, …)
12
+ come from the ``MCPAgent`` base class, we only implement the three abstract
13
+ methods
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import json
19
+ import logging
20
+ from typing import TYPE_CHECKING, Any, cast
21
+
22
+ import mcp.types as types
23
+
24
+ from hud import instrument
25
+ from hud.types import AgentResponse, MCPToolCall, MCPToolResult
26
+
27
+ from .base import MCPAgent
28
+
29
+ if TYPE_CHECKING:
30
+ from openai import AsyncOpenAI
31
+ from openai.types.chat import ChatCompletionToolParam
32
+
33
+ from hud.clients import AgentMCPClient
34
+
35
+ logger = logging.getLogger(__name__)
36
+
37
+
38
+ class GenericOpenAIChatAgent(MCPAgent):
39
+ """MCP-enabled agent that speaks the OpenAI *chat.completions* protocol."""
40
+
41
+ def __init__(
42
+ self,
43
+ mcp_client: AgentMCPClient,
44
+ *,
45
+ openai_client: AsyncOpenAI,
46
+ model_name: str = "gpt-4o-mini",
47
+ parallel_tool_calls: bool = False,
48
+ logprobs: bool = False,
49
+ **agent_kwargs: Any,
50
+ ) -> None:
51
+ super().__init__(mcp_client=mcp_client, **agent_kwargs)
52
+ self.oai = openai_client
53
+ self.model_name = model_name
54
+ self.parallel_tool_calls = parallel_tool_calls
55
+ self.logprobs = logprobs
56
+ self.conversation_history = []
57
+
58
+ @staticmethod
59
+ def _oai_to_mcp(tool_call: Any) -> MCPToolCall: # type: ignore[valid-type]
60
+ """Convert an OpenAI ``tool_call`` to :class:`MCPToolCall`."""
61
+ return MCPToolCall(
62
+ id=tool_call.id,
63
+ name=tool_call.function.name,
64
+ arguments=json.loads(tool_call.function.arguments or "{}"),
65
+ )
66
+
67
+ async def get_system_messages(self) -> list[Any]:
68
+ """Get system messages for OpenAI."""
69
+ return [{"role": "system", "content": self.system_prompt}]
70
+
71
+ async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
72
+ """Format blocks for OpenAI."""
73
+ content = []
74
+ for block in blocks:
75
+ if isinstance(block, types.TextContent):
76
+ content.append({"type": "text", "text": block.text})
77
+ elif isinstance(block, types.ImageContent):
78
+ content.append(
79
+ {
80
+ "type": "image_url",
81
+ "image_url": {"url": f"data:{block.mimeType};base64,{block.data}"},
82
+ }
83
+ )
84
+
85
+ return [{"role": "user", "content": content}]
86
+
87
+ def _sanitize_schema_for_openai(self, schema: dict) -> dict:
88
+ """Convert MCP JSON Schema to OpenAI-compatible format.
89
+
90
+ Handles unsupported features like anyOf and prefixItems.
91
+ """
92
+ if not isinstance(schema, dict):
93
+ return schema
94
+
95
+ sanitized = {}
96
+
97
+ for key, value in schema.items():
98
+ if key == "anyOf" and isinstance(value, list):
99
+ # Handle anyOf patterns (usually for nullable fields)
100
+ non_null_types = [
101
+ v for v in value if not (isinstance(v, dict) and v.get("type") == "null")
102
+ ]
103
+ if non_null_types:
104
+ # Use the first non-null type
105
+ sanitized.update(self._sanitize_schema_for_openai(non_null_types[0]))
106
+ else:
107
+ sanitized["type"] = "string" # Fallback
108
+
109
+ elif key == "prefixItems":
110
+ # Convert prefixItems to simple items
111
+ sanitized["type"] = "array"
112
+ if isinstance(value, list) and value:
113
+ # Use the type from the first item as the items schema
114
+ first_item = value[0]
115
+ if isinstance(first_item, dict):
116
+ sanitized["items"] = {"type": first_item.get("type", "string")}
117
+ else:
118
+ sanitized["items"] = {"type": "string"}
119
+
120
+ elif key == "properties" and isinstance(value, dict):
121
+ # Recursively sanitize property schemas
122
+ sanitized[key] = {
123
+ prop_name: self._sanitize_schema_for_openai(prop_schema)
124
+ for prop_name, prop_schema in value.items()
125
+ }
126
+
127
+ elif key == "items" and isinstance(value, dict):
128
+ # Recursively sanitize items schema
129
+ sanitized[key] = self._sanitize_schema_for_openai(value)
130
+
131
+ elif key in (
132
+ "type",
133
+ "description",
134
+ "enum",
135
+ "required",
136
+ "default",
137
+ "minimum",
138
+ "maximum",
139
+ "minItems",
140
+ "maxItems",
141
+ ):
142
+ # These are supported by OpenAI
143
+ sanitized[key] = value
144
+
145
+ return sanitized or {"type": "object"}
146
+
147
+ def get_tool_schemas(self) -> list[dict]:
148
+ tool_schemas = super().get_tool_schemas()
149
+ openai_tools = []
150
+ for schema in tool_schemas:
151
+ parameters = schema.get("parameters", {})
152
+
153
+ if parameters:
154
+ sanitized_params = self._sanitize_schema_for_openai(parameters)
155
+ else:
156
+ sanitized_params = {"type": "object", "properties": {}}
157
+
158
+ openai_tool = {
159
+ "type": "function",
160
+ "function": {
161
+ "name": schema["name"],
162
+ "description": schema.get("description", ""),
163
+ "parameters": sanitized_params,
164
+ },
165
+ }
166
+ openai_tools.append(openai_tool)
167
+ return openai_tools
168
+
169
+ @instrument(
170
+ span_type="agent",
171
+ record_args=False,
172
+ record_result=True,
173
+ )
174
+ async def get_response(self, messages: list[Any]) -> AgentResponse:
175
+ """Send chat request to OpenAI and convert the response."""
176
+
177
+ # Convert MCP tool schemas to OpenAI format
178
+ mcp_schemas = self.get_tool_schemas()
179
+
180
+ response = await self.oai.chat.completions.create(
181
+ model=self.model_name,
182
+ messages=messages,
183
+ tools=cast("list[ChatCompletionToolParam]", mcp_schemas),
184
+ parallel_tool_calls=self.parallel_tool_calls,
185
+ logprobs=self.logprobs,
186
+ )
187
+
188
+ choice = response.choices[0]
189
+ msg = choice.message
190
+
191
+ assistant_msg: dict[str, Any] = {"role": "assistant"}
192
+
193
+ if msg.content:
194
+ assistant_msg["content"] = msg.content
195
+
196
+ if msg.tool_calls:
197
+ assistant_msg["tool_calls"] = msg.tool_calls
198
+
199
+ messages.append(assistant_msg)
200
+
201
+ # Store the complete conversation history
202
+ self.conversation_history = messages.copy()
203
+
204
+ tool_calls = []
205
+ if msg.tool_calls:
206
+ for tc in msg.tool_calls:
207
+ if tc.function.name is not None: # type: ignore
208
+ tool_calls.append(self._oai_to_mcp(tc))
209
+ if not self.parallel_tool_calls:
210
+ break
211
+
212
+ return AgentResponse(
213
+ content=msg.content or "",
214
+ tool_calls=tool_calls,
215
+ done=choice.finish_reason in ("stop", "length"),
216
+ raw=response, # Include raw response for access to Choice objects
217
+ )
218
+
219
+ async def format_tool_results(
220
+ self,
221
+ tool_calls: list[MCPToolCall],
222
+ tool_results: list[MCPToolResult],
223
+ ) -> list[Any]:
224
+ """Render MCP tool results as OpenAI messages.
225
+
226
+ Note: OpenAI tool messages only support string content.
227
+ When images are present, we return both a tool message and a user message.
228
+ """
229
+ rendered: list[dict[str, Any]] = []
230
+ for call, res in zip(tool_calls, tool_results, strict=False):
231
+ # Use structuredContent.result if available, otherwise use content
232
+ items = res.content
233
+ if res.structuredContent and isinstance(res.structuredContent, dict):
234
+ items = res.structuredContent.get("result", res.content)
235
+
236
+ # Separate text and image content
237
+ text_parts = []
238
+ image_parts = []
239
+
240
+ for item in items:
241
+ if isinstance(item, dict):
242
+ if item.get("type") == "text":
243
+ text_parts.append(item.get("text", ""))
244
+ elif item.get("type") == "image":
245
+ mime_type = item.get("mimeType", "image/png")
246
+ data = item.get("data", "")
247
+ image_parts.append(
248
+ {
249
+ "type": "image_url",
250
+ "image_url": {
251
+ "url": f"data:{mime_type};base64,{data}"
252
+ },
253
+ }
254
+ )
255
+ elif isinstance(item, types.TextContent):
256
+ text_parts.append(item.text)
257
+ elif isinstance(item, types.ImageContent):
258
+ image_parts.append(
259
+ {
260
+ "type": "image_url",
261
+ "image_url": {"url": f"data:{item.mimeType};base64,{item.data}"},
262
+ }
263
+ )
264
+
265
+ text_content = "".join(text_parts) if text_parts else "Tool executed successfully"
266
+ rendered.append(
267
+ {
268
+ "role": "tool",
269
+ "tool_call_id": call.id,
270
+ "content": text_content,
271
+ }
272
+ )
273
+
274
+ # If there are images, add them as a separate user message
275
+ if image_parts:
276
+ # Add a user message with the images
277
+ content_with_images = [
278
+ {"type": "text", "text": "Tool returned the following:"},
279
+ *image_parts
280
+ ]
281
+ rendered.append(
282
+ {
283
+ "role": "user",
284
+ "content": content_with_images,
285
+ }
286
+ )
287
+
288
+ return rendered
@@ -40,6 +40,7 @@ def _process_worker(
40
40
  2. Creates its own event loop
41
41
  3. Processes a batch of tasks asynchronously
42
42
  4. Returns results with their original indices
43
+ 5. Handles interruption signals gracefully
43
44
 
44
45
  Args:
45
46
  task_batch: List of (index, task_dict) tuples
@@ -58,6 +59,7 @@ def _process_worker(
58
59
  List of (index, result) tuples
59
60
  """
60
61
  # Import inside worker to avoid pickling issues
62
+ import signal
61
63
  import sys
62
64
 
63
65
  import hud
@@ -72,6 +74,14 @@ def _process_worker(
72
74
  except AttributeError:
73
75
  pass
74
76
 
77
+ # Set up signal handler for clean interruption
78
+ def signal_handler(signum: int, frame: Any) -> None:
79
+ logger.warning("Worker %s: Received interrupt signal", worker_id)
80
+ # Raise KeyboardInterrupt to actually interrupt the worker
81
+ raise KeyboardInterrupt(f"Worker {worker_id} interrupted by user")
82
+
83
+ signal.signal(signal.SIGINT, signal_handler)
84
+
75
85
  # Reinitialize telemetry in this process
76
86
  configure_telemetry()
77
87
 
@@ -157,8 +167,25 @@ def _process_worker(
157
167
  # Process all tasks in parallel within this process
158
168
  tasks = [process_single_task(idx, task_dict) for idx, task_dict in task_batch]
159
169
 
160
- results = await asyncio.gather(*tasks, return_exceptions=False)
161
- return results
170
+ try:
171
+ results = await asyncio.gather(*tasks, return_exceptions=False)
172
+ return results
173
+ except asyncio.CancelledError:
174
+ logger.info("Worker %s: Tasks cancelled due to interruption", worker_id)
175
+ # Return error results for all tasks
176
+ return [
177
+ (
178
+ idx,
179
+ {
180
+ "error": "Task cancelled (Ctrl+C)",
181
+ "isError": True,
182
+ "reward": 0.0,
183
+ "done": False,
184
+ "content": "Task cancelled",
185
+ },
186
+ )
187
+ for idx, _ in task_batch
188
+ ]
162
189
 
163
190
  try:
164
191
  # Run the async batch processing
@@ -180,6 +207,24 @@ def _process_worker(
180
207
  logger.warning("Worker %s: Telemetry flush timed out", worker_id)
181
208
 
182
209
  return results
210
+ except KeyboardInterrupt:
211
+ logger.info("Worker %s: Interrupted by user, stopping gracefully", worker_id)
212
+ # Return partial results for tasks that completed
213
+ partial_results = []
214
+ for idx, _ in task_batch:
215
+ partial_results.append(
216
+ (
217
+ idx,
218
+ {
219
+ "error": "Worker interrupted by user (Ctrl+C)",
220
+ "isError": True,
221
+ "reward": 0.0,
222
+ "done": False,
223
+ "content": "Task interrupted",
224
+ },
225
+ )
226
+ )
227
+ return partial_results
183
228
  except Exception as e:
184
229
  logger.error("[Worker %s] Batch processing failed: %s", worker_id, e)
185
230
  logger.error("Worker %s batch processing failed: %s", worker_id, e)
@@ -365,7 +410,8 @@ async def run_dataset_parallel_manual(
365
410
  )
366
411
 
367
412
  # Process batches in parallel using ProcessPoolExecutor
368
- with ProcessPoolExecutor(max_workers=max_workers) as executor:
413
+ executor = ProcessPoolExecutor(max_workers=max_workers)
414
+ try:
369
415
  # Submit all batches to workers
370
416
  future_to_batch = {
371
417
  executor.submit(worker_func, batch, worker_id=i): batch
@@ -377,48 +423,78 @@ async def run_dataset_parallel_manual(
377
423
  total = len(task_dicts)
378
424
 
379
425
  # Process results as they complete
380
- for future in as_completed(future_to_batch):
381
- batch = future_to_batch[future]
382
-
383
- try:
384
- # Get results from this worker
385
- batch_results = future.result()
386
-
387
- # Place results in correct positions
388
- for index, result in batch_results:
389
- results[index] = result
390
- completed += 1
391
-
392
- # Calculate success rate so far
393
- successful_so_far = sum(
394
- 1
395
- for r in results[:completed]
396
- if r is not None and getattr(r, "reward", 0) > 0
397
- )
426
+ try:
427
+ for future in as_completed(future_to_batch):
428
+ batch = future_to_batch[future]
429
+
430
+ try:
431
+ # Get results from this worker
432
+ batch_results = future.result()
433
+
434
+ # Place results in correct positions
435
+ for index, result in batch_results:
436
+ results[index] = result
437
+ completed += 1
438
+
439
+ # Calculate success rate so far
440
+ successful_so_far = sum(
441
+ 1
442
+ for r in results[:completed]
443
+ if r is not None and getattr(r, "reward", 0) > 0
444
+ )
398
445
 
399
- progress_msg = (
400
- f"Progress: {completed}/{total} tasks completed "
401
- f"({100 * completed / total:.1f}%) | "
402
- f"Success rate: {successful_so_far}/{completed} "
403
- f"({100 * successful_so_far / completed:.1f}%)"
404
- )
446
+ progress_msg = (
447
+ f"Progress: {completed}/{total} tasks completed "
448
+ f"({100 * completed / total:.1f}%) | "
449
+ f"Success rate: {successful_so_far}/{completed} "
450
+ f"({100 * successful_so_far / completed:.1f}%)"
451
+ )
405
452
 
406
- logger.info(progress_msg)
453
+ logger.info(progress_msg)
407
454
 
408
- except Exception as e:
409
- # Handle worker failure
410
- logger.error("Worker failed with exception: %s\n%s", e, traceback.format_exc())
455
+ except Exception as e:
456
+ # Handle worker failure
457
+ logger.error(
458
+ "Worker failed with exception: %s\n%s", e, traceback.format_exc()
459
+ )
411
460
 
412
- # Mark all tasks in this batch as failed
413
- for index, _ in batch:
414
- results[index] = {
415
- "error": f"Worker process failed: {e}",
461
+ # Mark all tasks in this batch as failed
462
+ for index, _ in batch:
463
+ results[index] = {
464
+ "error": f"Worker process failed: {e}",
465
+ "isError": True,
466
+ "reward": 0.0,
467
+ "done": False,
468
+ "content": f"Worker process failed: {e}",
469
+ }
470
+ completed += 1
471
+
472
+ except KeyboardInterrupt:
473
+ logger.warning("\n⚠️ Parallel evaluation interrupted by user (Ctrl+C)")
474
+ logger.info("Cancelling pending tasks...")
475
+
476
+ # Cancel all pending futures
477
+ for future in future_to_batch:
478
+ if not future.done():
479
+ future.cancel()
480
+
481
+ # Mark uncompleted tasks as interrupted
482
+ for i, r in enumerate(results):
483
+ if r is None:
484
+ results[i] = {
485
+ "error": "Evaluation interrupted by user",
416
486
  "isError": True,
417
487
  "reward": 0.0,
418
488
  "done": False,
419
- "content": f"Worker process failed: {e}",
489
+ "content": "Task interrupted (Ctrl+C)",
420
490
  }
421
- completed += 1
491
+
492
+ logger.info("Interrupted after %s/%s tasks", completed, total)
493
+ raise # Re-raise to propagate the interrupt
494
+
495
+ finally:
496
+ # Always shutdown the executor properly
497
+ executor.shutdown(wait=False, cancel_futures=True)
422
498
 
423
499
  # Verify all results are populated
424
500
  missing = [i for i, r in enumerate(results) if r is None]
@@ -14,6 +14,7 @@ from __future__ import annotations
14
14
  import contextlib
15
15
  import json
16
16
  import logging
17
+ import time
17
18
  from collections import defaultdict
18
19
  from datetime import UTC, datetime
19
20
  from typing import TYPE_CHECKING, Any
@@ -362,5 +363,7 @@ class HudSpanExporter(SpanExporter):
362
363
  pass
363
364
 
364
365
  def force_flush(self, timeout_millis: int | None = None) -> bool: # type: ignore[override]
366
+ if timeout_millis:
367
+ time.sleep(timeout_millis / 1000)
365
368
  # Synchronous export, nothing buffered here
366
369
  return True
@@ -1,6 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import logging
4
+ import time
4
5
  from typing import Any
5
6
 
6
7
  from opentelemetry import baggage
@@ -115,4 +116,6 @@ class HudEnrichmentProcessor(SpanProcessor):
115
116
  pass
116
117
 
117
118
  def force_flush(self, timeout_millis: int | None = None) -> bool: # type: ignore[override]
119
+ if timeout_millis:
120
+ time.sleep(timeout_millis / 1000)
118
121
  return True
@@ -5,4 +5,4 @@ def test_import():
5
5
  """Test that the package can be imported."""
6
6
  import hud
7
7
 
8
- assert hud.__version__ == "0.4.17"
8
+ assert hud.__version__ == "0.4.18"
@@ -4,4 +4,4 @@ Version information for the HUD SDK.
4
4
 
5
5
  from __future__ import annotations
6
6
 
7
- __version__ = "0.4.17"
7
+ __version__ = "0.4.18"
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "hud-python"
3
- version = "0.4.17"
3
+ version = "0.4.18"
4
4
  description = "SDK for the HUD platform."
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.11, <3.14"