hud-python 0.4.21__tar.gz → 0.4.22__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (199) hide show
  1. {hud_python-0.4.21 → hud_python-0.4.22}/PKG-INFO +1 -1
  2. {hud_python-0.4.21 → hud_python-0.4.22}/hud/agents/base.py +2 -0
  3. {hud_python-0.4.21 → hud_python-0.4.22}/hud/agents/claude.py +11 -6
  4. hud_python-0.4.22/hud/agents/grounded_openai.py +280 -0
  5. {hud_python-0.4.21 → hud_python-0.4.22}/hud/agents/tests/test_client.py +6 -1
  6. hud_python-0.4.22/hud/agents/tests/test_grounded_openai_agent.py +155 -0
  7. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/eval.py +2 -2
  8. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/utils/interactive.py +1 -1
  9. {hud_python-0.4.21 → hud_python-0.4.22}/hud/settings.py +6 -0
  10. {hud_python-0.4.21 → hud_python-0.4.22}/hud/tools/executors/tests/test_base_executor.py +1 -1
  11. {hud_python-0.4.21 → hud_python-0.4.22}/hud/tools/executors/xdo.py +1 -1
  12. hud_python-0.4.22/hud/tools/grounding/__init__.py +13 -0
  13. hud_python-0.4.22/hud/tools/grounding/config.py +54 -0
  14. hud_python-0.4.22/hud/tools/grounding/grounded_tool.py +314 -0
  15. hud_python-0.4.22/hud/tools/grounding/grounder.py +301 -0
  16. hud_python-0.4.22/hud/tools/grounding/tests/__init__.py +1 -0
  17. hud_python-0.4.22/hud/tools/grounding/tests/test_grounded_tool.py +196 -0
  18. {hud_python-0.4.21 → hud_python-0.4.22}/hud/tools/tests/test_playwright_tool.py +1 -1
  19. {hud_python-0.4.21 → hud_python-0.4.22}/hud/tools/tests/test_tools_init.py +1 -1
  20. {hud_python-0.4.21 → hud_python-0.4.22}/hud/tools/tests/test_utils.py +2 -2
  21. hud_python-0.4.22/hud/utils/agent_factories.py +86 -0
  22. {hud_python-0.4.21 → hud_python-0.4.22}/hud/utils/tests/test_version.py +1 -1
  23. {hud_python-0.4.21 → hud_python-0.4.22}/hud/version.py +1 -1
  24. {hud_python-0.4.21 → hud_python-0.4.22}/pyproject.toml +1 -1
  25. {hud_python-0.4.21 → hud_python-0.4.22}/.gitignore +0 -0
  26. {hud_python-0.4.21 → hud_python-0.4.22}/LICENSE +0 -0
  27. {hud_python-0.4.21 → hud_python-0.4.22}/README.md +0 -0
  28. {hud_python-0.4.21 → hud_python-0.4.22}/environments/README.md +0 -0
  29. {hud_python-0.4.21 → hud_python-0.4.22}/environments/browser/README.md +0 -0
  30. {hud_python-0.4.21 → hud_python-0.4.22}/environments/browser/apps/2048/README.md +0 -0
  31. {hud_python-0.4.21 → hud_python-0.4.22}/environments/browser/apps/2048/backend/pyproject.toml +0 -0
  32. {hud_python-0.4.21 → hud_python-0.4.22}/environments/browser/apps/README.md +0 -0
  33. {hud_python-0.4.21 → hud_python-0.4.22}/environments/browser/apps/todo/README.md +0 -0
  34. {hud_python-0.4.21 → hud_python-0.4.22}/environments/browser/apps/todo/backend/pyproject.toml +0 -0
  35. {hud_python-0.4.21 → hud_python-0.4.22}/environments/browser/pyproject.toml +0 -0
  36. {hud_python-0.4.21 → hud_python-0.4.22}/environments/remote_browser/README.md +0 -0
  37. {hud_python-0.4.21 → hud_python-0.4.22}/environments/remote_browser/pyproject.toml +0 -0
  38. {hud_python-0.4.21 → hud_python-0.4.22}/environments/remote_browser/src/hud_controller/providers/README.md +0 -0
  39. {hud_python-0.4.21 → hud_python-0.4.22}/environments/text_2048/README.md +0 -0
  40. {hud_python-0.4.21 → hud_python-0.4.22}/environments/text_2048/pyproject.toml +0 -0
  41. {hud_python-0.4.21 → hud_python-0.4.22}/examples/README.md +0 -0
  42. {hud_python-0.4.21 → hud_python-0.4.22}/hud/__init__.py +0 -0
  43. {hud_python-0.4.21 → hud_python-0.4.22}/hud/__main__.py +0 -0
  44. {hud_python-0.4.21 → hud_python-0.4.22}/hud/agents/__init__.py +0 -0
  45. {hud_python-0.4.21 → hud_python-0.4.22}/hud/agents/langchain.py +0 -0
  46. {hud_python-0.4.21 → hud_python-0.4.22}/hud/agents/misc/__init__.py +0 -0
  47. {hud_python-0.4.21 → hud_python-0.4.22}/hud/agents/misc/response_agent.py +0 -0
  48. {hud_python-0.4.21 → hud_python-0.4.22}/hud/agents/openai.py +0 -0
  49. {hud_python-0.4.21 → hud_python-0.4.22}/hud/agents/openai_chat_generic.py +0 -0
  50. {hud_python-0.4.21 → hud_python-0.4.22}/hud/agents/tests/__init__.py +0 -0
  51. {hud_python-0.4.21 → hud_python-0.4.22}/hud/agents/tests/test_base.py +0 -0
  52. {hud_python-0.4.21 → hud_python-0.4.22}/hud/agents/tests/test_claude.py +0 -0
  53. {hud_python-0.4.21 → hud_python-0.4.22}/hud/agents/tests/test_openai.py +0 -0
  54. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/__init__.py +0 -0
  55. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/__main__.py +0 -0
  56. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/analyze.py +0 -0
  57. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/build.py +0 -0
  58. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/clone.py +0 -0
  59. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/debug.py +0 -0
  60. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/dev.py +0 -0
  61. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/hf.py +0 -0
  62. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/init.py +0 -0
  63. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/list_func.py +0 -0
  64. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/pull.py +0 -0
  65. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/push.py +0 -0
  66. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/remove.py +0 -0
  67. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/rl/README.md +0 -0
  68. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/rl/__init__.py +0 -0
  69. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/rl/init.py +0 -0
  70. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/rl/pod.py +0 -0
  71. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/rl/ssh.py +0 -0
  72. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/rl/train.py +0 -0
  73. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/rl/utils.py +0 -0
  74. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/tests/__init__.py +0 -0
  75. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/tests/test_analyze.py +0 -0
  76. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/tests/test_analyze_metadata.py +0 -0
  77. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/tests/test_build.py +0 -0
  78. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/tests/test_cli_init.py +0 -0
  79. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/tests/test_cli_main.py +0 -0
  80. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/tests/test_clone.py +0 -0
  81. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/tests/test_cursor.py +0 -0
  82. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/tests/test_debug.py +0 -0
  83. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/tests/test_list_func.py +0 -0
  84. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/tests/test_main_module.py +0 -0
  85. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/tests/test_mcp_server.py +0 -0
  86. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/tests/test_pull.py +0 -0
  87. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/tests/test_push.py +0 -0
  88. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/tests/test_registry.py +0 -0
  89. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/tests/test_utils.py +0 -0
  90. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/utils/__init__.py +0 -0
  91. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/utils/cursor.py +0 -0
  92. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/utils/docker.py +0 -0
  93. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/utils/environment.py +0 -0
  94. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/utils/logging.py +0 -0
  95. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/utils/metadata.py +0 -0
  96. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/utils/registry.py +0 -0
  97. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/utils/remote_runner.py +0 -0
  98. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/utils/runner.py +0 -0
  99. {hud_python-0.4.21 → hud_python-0.4.22}/hud/cli/utils/server.py +0 -0
  100. {hud_python-0.4.21 → hud_python-0.4.22}/hud/clients/README.md +0 -0
  101. {hud_python-0.4.21 → hud_python-0.4.22}/hud/clients/__init__.py +0 -0
  102. {hud_python-0.4.21 → hud_python-0.4.22}/hud/clients/base.py +0 -0
  103. {hud_python-0.4.21 → hud_python-0.4.22}/hud/clients/fastmcp.py +0 -0
  104. {hud_python-0.4.21 → hud_python-0.4.22}/hud/clients/mcp_use.py +0 -0
  105. {hud_python-0.4.21 → hud_python-0.4.22}/hud/clients/tests/__init__.py +0 -0
  106. {hud_python-0.4.21 → hud_python-0.4.22}/hud/clients/tests/test_client_integration.py +0 -0
  107. {hud_python-0.4.21 → hud_python-0.4.22}/hud/clients/tests/test_fastmcp.py +0 -0
  108. {hud_python-0.4.21 → hud_python-0.4.22}/hud/clients/tests/test_protocol.py +0 -0
  109. {hud_python-0.4.21 → hud_python-0.4.22}/hud/clients/utils/__init__.py +0 -0
  110. {hud_python-0.4.21 → hud_python-0.4.22}/hud/clients/utils/retry_transport.py +0 -0
  111. {hud_python-0.4.21 → hud_python-0.4.22}/hud/datasets/__init__.py +0 -0
  112. {hud_python-0.4.21 → hud_python-0.4.22}/hud/datasets/execution/__init__.py +0 -0
  113. {hud_python-0.4.21 → hud_python-0.4.22}/hud/datasets/execution/parallel.py +0 -0
  114. {hud_python-0.4.21 → hud_python-0.4.22}/hud/datasets/execution/runner.py +0 -0
  115. {hud_python-0.4.21 → hud_python-0.4.22}/hud/datasets/task.py +0 -0
  116. {hud_python-0.4.21 → hud_python-0.4.22}/hud/datasets/utils.py +0 -0
  117. {hud_python-0.4.21 → hud_python-0.4.22}/hud/misc/__init__.py +0 -0
  118. {hud_python-0.4.21 → hud_python-0.4.22}/hud/misc/claude_plays_pokemon.py +0 -0
  119. {hud_python-0.4.21 → hud_python-0.4.22}/hud/native/__init__.py +0 -0
  120. {hud_python-0.4.21 → hud_python-0.4.22}/hud/native/comparator.py +0 -0
  121. {hud_python-0.4.21 → hud_python-0.4.22}/hud/native/tests/__init__.py +0 -0
  122. {hud_python-0.4.21 → hud_python-0.4.22}/hud/native/tests/test_comparator.py +0 -0
  123. {hud_python-0.4.21 → hud_python-0.4.22}/hud/native/tests/test_native_init.py +0 -0
  124. {hud_python-0.4.21 → hud_python-0.4.22}/hud/otel/__init__.py +0 -0
  125. {hud_python-0.4.21 → hud_python-0.4.22}/hud/otel/collector.py +0 -0
  126. {hud_python-0.4.21 → hud_python-0.4.22}/hud/otel/config.py +0 -0
  127. {hud_python-0.4.21 → hud_python-0.4.22}/hud/otel/context.py +0 -0
  128. {hud_python-0.4.21 → hud_python-0.4.22}/hud/otel/exporters.py +0 -0
  129. {hud_python-0.4.21 → hud_python-0.4.22}/hud/otel/instrumentation.py +0 -0
  130. {hud_python-0.4.21 → hud_python-0.4.22}/hud/otel/processors.py +0 -0
  131. {hud_python-0.4.21 → hud_python-0.4.22}/hud/otel/tests/__init__.py +0 -0
  132. {hud_python-0.4.21 → hud_python-0.4.22}/hud/otel/tests/test_processors.py +0 -0
  133. {hud_python-0.4.21 → hud_python-0.4.22}/hud/py.typed +0 -0
  134. {hud_python-0.4.21 → hud_python-0.4.22}/hud/server/__init__.py +0 -0
  135. {hud_python-0.4.21 → hud_python-0.4.22}/hud/server/context.py +0 -0
  136. {hud_python-0.4.21 → hud_python-0.4.22}/hud/server/helper/__init__.py +0 -0
  137. {hud_python-0.4.21 → hud_python-0.4.22}/hud/server/low_level.py +0 -0
  138. {hud_python-0.4.21 → hud_python-0.4.22}/hud/server/server.py +0 -0
  139. {hud_python-0.4.21 → hud_python-0.4.22}/hud/server/tests/__init__.py +0 -0
  140. {hud_python-0.4.21 → hud_python-0.4.22}/hud/shared/__init__.py +0 -0
  141. {hud_python-0.4.21 → hud_python-0.4.22}/hud/shared/exceptions.py +0 -0
  142. {hud_python-0.4.21 → hud_python-0.4.22}/hud/shared/hints.py +0 -0
  143. {hud_python-0.4.21 → hud_python-0.4.22}/hud/shared/requests.py +0 -0
  144. {hud_python-0.4.21 → hud_python-0.4.22}/hud/shared/tests/__init__.py +0 -0
  145. {hud_python-0.4.21 → hud_python-0.4.22}/hud/shared/tests/test_exceptions.py +0 -0
  146. {hud_python-0.4.21 → hud_python-0.4.22}/hud/shared/tests/test_requests.py +0 -0
  147. {hud_python-0.4.21 → hud_python-0.4.22}/hud/telemetry/__init__.py +0 -0
  148. {hud_python-0.4.21 → hud_python-0.4.22}/hud/telemetry/instrument.py +0 -0
  149. {hud_python-0.4.21 → hud_python-0.4.22}/hud/telemetry/job.py +0 -0
  150. {hud_python-0.4.21 → hud_python-0.4.22}/hud/telemetry/replay.py +0 -0
  151. {hud_python-0.4.21 → hud_python-0.4.22}/hud/telemetry/tests/__init__.py +0 -0
  152. {hud_python-0.4.21 → hud_python-0.4.22}/hud/telemetry/tests/test_replay.py +0 -0
  153. {hud_python-0.4.21 → hud_python-0.4.22}/hud/telemetry/tests/test_trace.py +0 -0
  154. {hud_python-0.4.21 → hud_python-0.4.22}/hud/telemetry/trace.py +0 -0
  155. {hud_python-0.4.21 → hud_python-0.4.22}/hud/tools/__init__.py +0 -0
  156. {hud_python-0.4.21 → hud_python-0.4.22}/hud/tools/base.py +0 -0
  157. {hud_python-0.4.21 → hud_python-0.4.22}/hud/tools/bash.py +0 -0
  158. {hud_python-0.4.21 → hud_python-0.4.22}/hud/tools/computer/__init__.py +0 -0
  159. {hud_python-0.4.21 → hud_python-0.4.22}/hud/tools/computer/anthropic.py +0 -0
  160. {hud_python-0.4.21 → hud_python-0.4.22}/hud/tools/computer/hud.py +0 -0
  161. {hud_python-0.4.21 → hud_python-0.4.22}/hud/tools/computer/openai.py +0 -0
  162. {hud_python-0.4.21 → hud_python-0.4.22}/hud/tools/computer/settings.py +0 -0
  163. {hud_python-0.4.21 → hud_python-0.4.22}/hud/tools/edit.py +0 -0
  164. {hud_python-0.4.21 → hud_python-0.4.22}/hud/tools/executors/__init__.py +0 -0
  165. {hud_python-0.4.21 → hud_python-0.4.22}/hud/tools/executors/base.py +0 -0
  166. {hud_python-0.4.21 → hud_python-0.4.22}/hud/tools/executors/pyautogui.py +0 -0
  167. {hud_python-0.4.21 → hud_python-0.4.22}/hud/tools/executors/tests/__init__.py +0 -0
  168. {hud_python-0.4.21 → hud_python-0.4.22}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
  169. {hud_python-0.4.21 → hud_python-0.4.22}/hud/tools/playwright.py +0 -0
  170. {hud_python-0.4.21 → hud_python-0.4.22}/hud/tools/response.py +0 -0
  171. {hud_python-0.4.21 → hud_python-0.4.22}/hud/tools/submit.py +0 -0
  172. {hud_python-0.4.21 → hud_python-0.4.22}/hud/tools/tests/__init__.py +0 -0
  173. {hud_python-0.4.21 → hud_python-0.4.22}/hud/tools/tests/test_base.py +0 -0
  174. {hud_python-0.4.21 → hud_python-0.4.22}/hud/tools/tests/test_bash.py +0 -0
  175. {hud_python-0.4.21 → hud_python-0.4.22}/hud/tools/tests/test_bash_extended.py +0 -0
  176. {hud_python-0.4.21 → hud_python-0.4.22}/hud/tools/tests/test_computer.py +0 -0
  177. {hud_python-0.4.21 → hud_python-0.4.22}/hud/tools/tests/test_computer_actions.py +0 -0
  178. {hud_python-0.4.21 → hud_python-0.4.22}/hud/tools/tests/test_edit.py +0 -0
  179. {hud_python-0.4.21 → hud_python-0.4.22}/hud/tools/tests/test_init.py +0 -0
  180. {hud_python-0.4.21 → hud_python-0.4.22}/hud/tools/tests/test_response.py +0 -0
  181. {hud_python-0.4.21 → hud_python-0.4.22}/hud/tools/tests/test_tools.py +0 -0
  182. {hud_python-0.4.21 → hud_python-0.4.22}/hud/tools/types.py +0 -0
  183. {hud_python-0.4.21 → hud_python-0.4.22}/hud/tools/utils.py +0 -0
  184. {hud_python-0.4.21 → hud_python-0.4.22}/hud/types.py +0 -0
  185. {hud_python-0.4.21 → hud_python-0.4.22}/hud/utils/__init__.py +0 -0
  186. {hud_python-0.4.21 → hud_python-0.4.22}/hud/utils/async_utils.py +0 -0
  187. {hud_python-0.4.21 → hud_python-0.4.22}/hud/utils/design.py +0 -0
  188. {hud_python-0.4.21 → hud_python-0.4.22}/hud/utils/mcp.py +0 -0
  189. {hud_python-0.4.21 → hud_python-0.4.22}/hud/utils/pretty_errors.py +0 -0
  190. {hud_python-0.4.21 → hud_python-0.4.22}/hud/utils/progress.py +0 -0
  191. {hud_python-0.4.21 → hud_python-0.4.22}/hud/utils/telemetry.py +0 -0
  192. {hud_python-0.4.21 → hud_python-0.4.22}/hud/utils/tests/__init__.py +0 -0
  193. {hud_python-0.4.21 → hud_python-0.4.22}/hud/utils/tests/test_async_utils.py +0 -0
  194. {hud_python-0.4.21 → hud_python-0.4.22}/hud/utils/tests/test_init.py +0 -0
  195. {hud_python-0.4.21 → hud_python-0.4.22}/hud/utils/tests/test_mcp.py +0 -0
  196. {hud_python-0.4.21 → hud_python-0.4.22}/hud/utils/tests/test_progress.py +0 -0
  197. {hud_python-0.4.21 → hud_python-0.4.22}/hud/utils/tests/test_telemetry.py +0 -0
  198. {hud_python-0.4.21 → hud_python-0.4.22}/rl/README.md +0 -0
  199. {hud_python-0.4.21 → hud_python-0.4.22}/rl/pyproject.toml +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.4.21
3
+ Version: 0.4.22
4
4
  Summary: SDK for the HUD platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-python
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -94,6 +94,8 @@ class MCPAgent(ABC):
94
94
  self.model_name = model_name
95
95
  self.design = HUDDesign(logger=logger)
96
96
 
97
+ self.metadata = {}
98
+
97
99
  # Set verbose mode if requested
98
100
  if verbose:
99
101
  self.design.set_verbose(True)
@@ -364,16 +364,21 @@ class ClaudeAgent(MCPAgent):
364
364
  messages_cached = copy.deepcopy(messages)
365
365
 
366
366
  # Mark last user message with cache control
367
- if messages_cached and messages_cached[-1].get("role") == "user":
367
+ if (
368
+ messages_cached
369
+ and isinstance(messages_cached[-1], dict)
370
+ and messages_cached[-1].get("role") == "user"
371
+ ):
368
372
  last_content = messages_cached[-1]["content"]
369
373
  # Content is formatted to be list of ContentBlock in format_blocks and format_message
370
374
  if isinstance(last_content, list):
371
375
  for block in last_content:
372
- # Only add cache control to block types that support it
373
- block_type = block.get("type")
374
- if block_type in ["text", "image", "tool_use", "tool_result"]:
375
- cache_control: BetaCacheControlEphemeralParam = {"type": "ephemeral"}
376
- block["cache_control"] = cache_control # type: ignore[reportGeneralTypeIssues]
376
+ # Only add cache control to dict-like block types that support it
377
+ if isinstance(block, dict):
378
+ block_type = block.get("type")
379
+ if block_type in ["text", "image", "tool_use", "tool_result"]:
380
+ cache_control: BetaCacheControlEphemeralParam = {"type": "ephemeral"}
381
+ block["cache_control"] = cache_control # type: ignore[reportGeneralTypeIssues]
377
382
 
378
383
  return messages_cached
379
384
 
@@ -0,0 +1,280 @@
1
+ """Grounded OpenAI agent that separates visual grounding from reasoning."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from typing import Any
7
+
8
+ from hud import instrument
9
+ from hud.tools.grounding import GroundedComputerTool, Grounder, GrounderConfig
10
+ from hud.types import AgentResponse, MCPToolCall, MCPToolResult
11
+
12
+ from .openai_chat_generic import GenericOpenAIChatAgent
13
+
14
+
15
+ class GroundedOpenAIChatAgent(GenericOpenAIChatAgent):
16
+ """OpenAI agent that uses a separate grounding model for element detection.
17
+
18
+ This agent:
19
+ - Exposes only a synthetic "computer" tool to the planning model
20
+ - Intercepts tool calls to ground element descriptions to coordinates
21
+ - Converts grounded results to real computer tool calls
22
+ - Maintains screenshot state for grounding operations
23
+
24
+ The architecture separates concerns:
25
+ - Planning model (GPT-4o etc) focuses on high-level reasoning
26
+ - Grounding model (Qwen2-VL etc) handles visual element detection
27
+ """
28
+
29
+ def __init__(
30
+ self,
31
+ *,
32
+ grounder_config: GrounderConfig,
33
+ model_name: str = "gpt-4o-mini",
34
+ allowed_tools: list[str] | None = None,
35
+ append_setup_output: bool = False,
36
+ system_prompt: str | None = None,
37
+ **kwargs: Any,
38
+ ) -> None:
39
+ """Initialize the grounded OpenAI agent.
40
+
41
+ Args:
42
+ grounder_config: Configuration for the grounding model
43
+ openai_client: OpenAI client for the planning model
44
+ model: Name of the OpenAI model to use for planning (e.g., "gpt-4o", "gpt-4o-mini")
45
+ real_computer_tool_name: Name of the actual computer tool to execute
46
+ **kwargs: Additional arguments passed to GenericOpenAIChatAgent
47
+ """
48
+ # Set defaults for grounded agent
49
+ if allowed_tools is None:
50
+ allowed_tools = ["computer"]
51
+
52
+ if system_prompt is None:
53
+ system_prompt = (
54
+ "You are a helpful AI assistant that can control the computer "
55
+ "through visual interaction.\n\n"
56
+ "IMPORTANT: Always explain your reasoning and observations before taking actions:\n"
57
+ "1. First, describe what you see on the screen\n"
58
+ "2. Explain what you plan to do and why\n"
59
+ "3. Then use the computer tool with natural language descriptions\n\n"
60
+ "For example:\n"
61
+ "- 'I can see a login form with username and password fields. "
62
+ "I need to click on the username field first.'\n"
63
+ "- 'There's a blue submit button at the bottom. "
64
+ "I'll click on it to submit the form.'\n"
65
+ "- 'I notice a red close button in the top right corner. "
66
+ "I'll click it to close this dialog.'\n\n"
67
+ "Use descriptive element descriptions like:\n"
68
+ "- Colors: 'red button', 'blue link', 'green checkmark'\n"
69
+ "- Position: 'top right corner', 'bottom of the page', 'left sidebar'\n"
70
+ "- Text content: 'Submit button', 'Login link', 'Cancel option'\n"
71
+ "- Element type: 'text field', 'dropdown menu', 'checkbox'"
72
+ )
73
+
74
+ super().__init__(
75
+ model_name=model_name,
76
+ allowed_tools=allowed_tools,
77
+ append_setup_output=append_setup_output,
78
+ system_prompt=system_prompt,
79
+ **kwargs,
80
+ )
81
+
82
+ self.grounder = Grounder(grounder_config)
83
+ self.grounded_tool = None
84
+
85
+ async def initialize(self, task: Any = None) -> None:
86
+ """Initialize the agent and create the grounded tool with mcp_client."""
87
+ # Call parent initialization first
88
+ await super().initialize(task)
89
+
90
+ if self.mcp_client is None:
91
+ raise ValueError("mcp_client must be initialized before creating grounded tool")
92
+ self.grounded_tool = GroundedComputerTool(
93
+ grounder=self.grounder, mcp_client=self.mcp_client, computer_tool_name="computer"
94
+ )
95
+
96
+ def get_tool_schemas(self) -> list[Any]:
97
+ """Override to expose only the synthetic grounded tool.
98
+
99
+ The planning model only sees the synthetic "computer" tool,
100
+ which is provided by the grounded tool itself.
101
+
102
+ Returns:
103
+ List containing only the grounded computer tool schema
104
+ """
105
+ if self.grounded_tool is None:
106
+ return []
107
+ return [self.grounded_tool.get_openai_tool_schema()]
108
+
109
+ @instrument(
110
+ span_type="agent",
111
+ record_args=False,
112
+ record_result=True,
113
+ )
114
+ async def get_response(self, messages: Any) -> AgentResponse:
115
+ """Get response from the planning model and handle grounded tool calls.
116
+
117
+ This method:
118
+ 1. Calls the planning model with the grounded tool schema
119
+ 2. Executes any tool calls directly through the grounded tool
120
+ 3. Returns the response
121
+
122
+ Args:
123
+ messages: Conversation messages
124
+
125
+ Returns:
126
+ AgentResponse with either content or tool calls for MCP execution
127
+ """
128
+ tool_schemas = self.get_tool_schemas()
129
+
130
+ # Take initial screenshot and add to messages if this is the first turn
131
+ has_image = any(
132
+ isinstance(m.get("content"), list)
133
+ and any(
134
+ block.get("type") == "image_url"
135
+ for block in m["content"]
136
+ if isinstance(block, dict)
137
+ )
138
+ for m in messages
139
+ if isinstance(m.get("content"), list)
140
+ )
141
+
142
+ if not has_image:
143
+ if self.mcp_client is None:
144
+ raise ValueError("mcp_client is not initialized")
145
+ screenshot_result = await self.mcp_client.call_tool(
146
+ MCPToolCall(name="computer", arguments={"action": "screenshot"})
147
+ )
148
+
149
+ for block in screenshot_result.content:
150
+ # Check for ImageContent type from MCP
151
+ if hasattr(block, "data") and hasattr(block, "mimeType"):
152
+ mime_type = getattr(block, "mimeType", "image/png")
153
+ data = getattr(block, "data", "")
154
+ messages.append(
155
+ {
156
+ "role": "user",
157
+ "content": [
158
+ {
159
+ "type": "image_url",
160
+ "image_url": {"url": f"data:{mime_type};base64,{data}"},
161
+ }
162
+ ],
163
+ }
164
+ )
165
+ break
166
+
167
+ protected_keys = {"model", "messages", "tools", "parallel_tool_calls"}
168
+ extra = {k: v for k, v in (self.completion_kwargs or {}).items() if k not in protected_keys}
169
+
170
+ response = await self.oai.chat.completions.create(
171
+ model=self.model_name,
172
+ messages=messages,
173
+ tools=tool_schemas,
174
+ parallel_tool_calls=False,
175
+ **extra,
176
+ )
177
+
178
+ choice = response.choices[0]
179
+ msg = choice.message
180
+
181
+ assistant_msg: dict[str, Any] = {"role": "assistant"}
182
+ if msg.content:
183
+ assistant_msg["content"] = msg.content
184
+ if msg.tool_calls:
185
+ assistant_msg["tool_calls"] = msg.tool_calls
186
+
187
+ messages.append(assistant_msg)
188
+
189
+ self.conversation_history = messages.copy()
190
+
191
+ if not msg.tool_calls:
192
+ return AgentResponse(
193
+ content=msg.content or "",
194
+ tool_calls=[],
195
+ done=choice.finish_reason in ("stop", "length"),
196
+ raw=response,
197
+ )
198
+
199
+ tc = msg.tool_calls[0]
200
+
201
+ if tc.function.name != "computer":
202
+ return AgentResponse(
203
+ content=f"Error: Model called unexpected tool '{tc.function.name}'",
204
+ tool_calls=[],
205
+ done=True,
206
+ raw=response,
207
+ )
208
+
209
+ # Parse the arguments
210
+ try:
211
+ args = json.loads(tc.function.arguments or "{}")
212
+ except json.JSONDecodeError:
213
+ return AgentResponse(
214
+ content="Error: Invalid tool arguments", tool_calls=[], done=True, raw=response
215
+ )
216
+
217
+ tool_call = MCPToolCall(name="computer", arguments=args, id=tc.id)
218
+
219
+ return AgentResponse(
220
+ content=msg.content or "", tool_calls=[tool_call], done=False, raw=response
221
+ )
222
+
223
+ async def call_tools(
224
+ self, tool_call: MCPToolCall | list[MCPToolCall] | None = None
225
+ ) -> list[MCPToolResult]:
226
+ """Override call_tools to intercept computer tool calls.
227
+
228
+ Execute them through grounded tool.
229
+ """
230
+ if tool_call is None:
231
+ return []
232
+
233
+ if isinstance(tool_call, MCPToolCall):
234
+ tool_call = [tool_call]
235
+
236
+ results: list[MCPToolResult] = []
237
+ for tc in tool_call:
238
+ if tc.name == "computer":
239
+ # Execute through grounded tool instead of MCP
240
+ try:
241
+ # Extract latest screenshot from conversation history
242
+ screenshot_b64 = None
243
+ for m in reversed(self.conversation_history):
244
+ if m.get("role") == "user" and isinstance(m.get("content"), list):
245
+ for block in m["content"]:
246
+ if (
247
+ isinstance(block, dict)
248
+ and block.get("type") == "image_url"
249
+ and isinstance(block.get("image_url"), dict)
250
+ ):
251
+ url = block["image_url"].get("url", "")
252
+ if url.startswith("data:"):
253
+ screenshot_b64 = (
254
+ url.split(",", 1)[1] if "," in url else None
255
+ )
256
+ break
257
+ if screenshot_b64:
258
+ break
259
+
260
+ # Pass screenshot to grounded tool
261
+ args_with_screenshot = dict(tc.arguments) if tc.arguments else {}
262
+ if screenshot_b64:
263
+ args_with_screenshot["screenshot_b64"] = screenshot_b64
264
+
265
+ if self.grounded_tool is None:
266
+ raise ValueError("Grounded tool is not initialized")
267
+ content_blocks = await self.grounded_tool(**args_with_screenshot)
268
+ results.append(MCPToolResult(content=content_blocks, isError=False))
269
+ except Exception as e:
270
+ # Create error result
271
+ from mcp.types import TextContent
272
+
273
+ error_content = TextContent(text=str(e), type="text")
274
+ results.append(MCPToolResult(content=[error_content], isError=True))
275
+ else:
276
+ # For non-computer tools, use parent implementation
277
+ parent_results = await super().call_tools(tc)
278
+ results.extend(parent_results)
279
+
280
+ return results
@@ -200,7 +200,12 @@ class TestMCPClient:
200
200
  # Calling a non-existent tool should return an error result
201
201
  result = await client.call_tool(name="nonexistent", arguments={})
202
202
  assert result.isError is True
203
- assert "Tool 'nonexistent' not found" in result.content[0].text
203
+ # Check that the error message is in the text content
204
+ text_content = ""
205
+ for content in result.content:
206
+ if isinstance(content, types.TextContent):
207
+ text_content += content.text
208
+ assert "Tool 'nonexistent' not found" in text_content
204
209
 
205
210
  @pytest.mark.asyncio
206
211
  async def test_get_telemetry_data(self, mock_telemetry, mock_mcp_use_client):
@@ -0,0 +1,155 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from typing import Any
5
+
6
+ import mcp.types as types
7
+ import pytest
8
+
9
+ from hud.agents.grounded_openai import GroundedOpenAIChatAgent
10
+ from hud.tools.grounding import GrounderConfig
11
+ from hud.types import MCPToolCall, MCPToolResult
12
+
13
+
14
+ class DummyOpenAI:
15
+ class chat: # type: ignore[no-redef]
16
+ class completions:
17
+ @staticmethod
18
+ async def create(**kwargs: Any) -> Any:
19
+ # Return a minimal object mimicking OpenAI response
20
+ class Msg:
21
+ def __init__(self) -> None:
22
+ self.content = "Thinking..."
23
+ self.tool_calls = [
24
+ type(
25
+ "ToolCall",
26
+ (),
27
+ {
28
+ "id": "call_1",
29
+ "function": type(
30
+ "Fn",
31
+ (),
32
+ {
33
+ "name": "computer",
34
+ "arguments": json.dumps(
35
+ {
36
+ "action": "click",
37
+ "element_description": "blue button",
38
+ }
39
+ ),
40
+ },
41
+ ),
42
+ },
43
+ )()
44
+ ]
45
+
46
+ class Choice:
47
+ def __init__(self) -> None:
48
+ self.message = Msg()
49
+ self.finish_reason = "tool_calls"
50
+
51
+ class Resp:
52
+ def __init__(self) -> None:
53
+ self.choices = [Choice()]
54
+
55
+ return Resp()
56
+
57
+
58
+ class FakeMCPClient:
59
+ def __init__(self) -> None:
60
+ self.tools: list[types.Tool] = [
61
+ types.Tool(name="computer", description="", inputSchema={}),
62
+ types.Tool(name="setup", description="internal functions", inputSchema={}),
63
+ ]
64
+ self.called: list[MCPToolCall] = []
65
+
66
+ async def initialize(self, mcp_config: dict[str, dict[str, Any]] | None = None) -> None:
67
+ return None
68
+
69
+ async def list_tools(self) -> list[types.Tool]:
70
+ return self.tools
71
+
72
+ async def call_tool(self, tool_call: MCPToolCall) -> MCPToolResult:
73
+ self.called.append(tool_call)
74
+ return MCPToolResult(content=[types.TextContent(text="ok", type="text")], isError=False)
75
+
76
+ @property
77
+ def mcp_config(self) -> dict[str, dict[str, Any]]:
78
+ return {"local": {"command": "echo", "args": ["ok"]}}
79
+
80
+ async def shutdown(self) -> None:
81
+ return None
82
+
83
+ async def list_resources(self) -> list[types.Resource]: # not used here
84
+ return []
85
+
86
+ async def read_resource(self, uri: str) -> types.ReadResourceResult | None:
87
+ return None
88
+
89
+
90
+ class DummyGrounder:
91
+ async def predict_click(self, *, image_b64: str, instruction: str, max_retries: int = 3):
92
+ return (7, 9)
93
+
94
+
95
+ class DummyGroundedTool:
96
+ def __init__(self) -> None:
97
+ self.last_args: dict[str, Any] | None = None
98
+
99
+ async def __call__(self, **kwargs: Any):
100
+ self.last_args = kwargs
101
+ return [types.TextContent(text="ok", type="text")]
102
+
103
+ def get_openai_tool_schema(self) -> dict:
104
+ return {
105
+ "type": "function",
106
+ "function": {"name": "computer", "parameters": {"type": "object"}},
107
+ }
108
+
109
+
110
+ @pytest.mark.asyncio
111
+ async def test_call_tools_injects_screenshot_and_delegates(monkeypatch: pytest.MonkeyPatch) -> None:
112
+ # Agent with fake OpenAI client and fake MCP client
113
+ grounder_cfg = GrounderConfig(api_base="http://example", model="qwen")
114
+ agent = GroundedOpenAIChatAgent(
115
+ grounder_config=grounder_cfg,
116
+ openai_client=DummyOpenAI(),
117
+ model_name="gpt-4o-mini",
118
+ mcp_client=FakeMCPClient(),
119
+ initial_screenshot=False,
120
+ )
121
+
122
+ # Inject a dummy grounded tool to observe args without full initialization
123
+ dummy_tool = DummyGroundedTool()
124
+ agent.grounded_tool = dummy_tool # type: ignore
125
+
126
+ # Seed conversation history with a user image
127
+ png_b64 = (
128
+ "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR4nGMAAQAABQAB"
129
+ "J2n0mQAAAABJRU5ErkJggg=="
130
+ )
131
+ agent.conversation_history = [
132
+ {
133
+ "role": "user",
134
+ "content": [
135
+ {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{png_b64}"}},
136
+ ],
137
+ }
138
+ ]
139
+
140
+ # Build a tool call as GroundedOpenAIChatAgent.get_response would produce
141
+ tool_call = MCPToolCall(
142
+ name="computer", arguments={"action": "click", "element_description": "blue button"}
143
+ )
144
+
145
+ results = await agent.call_tools(tool_call)
146
+
147
+ # One result returned
148
+ assert len(results) == 1 and not results[0].isError
149
+
150
+ # Grounded tool received screenshot_b64 injected
151
+ assert dummy_tool.last_args is not None
152
+ assert dummy_tool.last_args["action"] == "click"
153
+ assert dummy_tool.last_args["element_description"] == "blue button"
154
+ assert "screenshot_b64" in dummy_tool.last_args
155
+ assert isinstance(dummy_tool.last_args["screenshot_b64"], str)
@@ -87,7 +87,7 @@ async def run_single_task(
87
87
  except ImportError as e:
88
88
  design.error(
89
89
  "Dataset dependencies are not installed. "
90
- "Please install with: pip install 'hud-python\u27E6agent\u27E7'"
90
+ "Please install with: pip install 'hud-python\u27e6agent\u27e7'"
91
91
  )
92
92
  raise typer.Exit(1) from e
93
93
 
@@ -111,7 +111,7 @@ async def run_single_task(
111
111
  except ImportError as e:
112
112
  design.error(
113
113
  "OpenAI agent dependencies are not installed. "
114
- "Please install with: pip install 'hud-python\u27E6agent\u27E7'"
114
+ "Please install with: pip install 'hud-python\u27e6agent\u27e7'"
115
115
  )
116
116
  raise typer.Exit(1) from e
117
117
 
@@ -74,7 +74,7 @@ class InteractiveMCPTester:
74
74
 
75
75
  for tool in self.tools:
76
76
  if "/" in tool.name:
77
- hub, name = tool.name.split("/", 1)
77
+ hub, _ = tool.name.split("/", 1)
78
78
  if hub not in hub_tools:
79
79
  hub_tools[hub] = []
80
80
  hub_tools[hub].append(tool)
@@ -44,6 +44,12 @@ class Settings(BaseSettings):
44
44
  validation_alias="OPENAI_API_KEY",
45
45
  )
46
46
 
47
+ openrouter_api_key: str | None = Field(
48
+ default=None,
49
+ description="API key for OpenRouter models",
50
+ validation_alias="OPENROUTER_API_KEY",
51
+ )
52
+
47
53
  wandb_api_key: str | None = Field(
48
54
  default=None,
49
55
  description="API key for Weights & Biases",
@@ -361,5 +361,5 @@ class TestLazyImports:
361
361
  """Test lazy import with invalid attribute name."""
362
362
  import hud.tools.executors as executors_module
363
363
 
364
- with pytest.raises(AttributeError, match="module '.*' has no attribute 'InvalidExecutor'"):
364
+ with pytest.raises(AttributeError, match=r"module '.*' has no attribute 'InvalidExecutor'"):
365
365
  _ = executors_module.InvalidExecutor
@@ -175,7 +175,7 @@ class XDOExecutor(BaseExecutor):
175
175
 
176
176
  screenshot_cmd = f"{self._display_prefix}scrot -p {screenshot_path}"
177
177
 
178
- returncode, _, stderr = await run(screenshot_cmd)
178
+ returncode, _, _stderr = await run(screenshot_cmd)
179
179
 
180
180
  if returncode == 0 and screenshot_path.exists():
181
181
  try:
@@ -0,0 +1,13 @@
1
+ """Grounding module for visual element detection and coordinate resolution."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .config import GrounderConfig
6
+ from .grounded_tool import GroundedComputerTool
7
+ from .grounder import Grounder
8
+
9
+ __all__ = [
10
+ "GroundedComputerTool",
11
+ "Grounder",
12
+ "GrounderConfig",
13
+ ]
@@ -0,0 +1,54 @@
1
+ """Configuration for grounding models."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import Any
7
+
8
# Default system prompt for grounding models: asks for exactly one "(x, y)"
# center point and explicitly rules out bounding boxes or multiple points.
SYSTEM_PROMPT = " ".join(
    (
        "You are a visual grounding model.",
        "Given an image and a description, return ONLY the center pixel "
        "coordinates of the described element as a single point in "
        "parentheses format: (x, y).",
        "Do not return bounding boxes or multiple coordinates.",
    )
)
14
+
15
+
16
@dataclass
class GrounderConfig:
    """Configuration for grounding model clients.

    Attributes:
        api_base: Base URL for the grounding model API endpoint (required,
            must be non-empty).
        model: Model identifier to use for grounding (required, must be
            non-empty).
        api_key: API key for authentication (default: "EMPTY" for local models).
        system_prompt: System prompt to guide the grounding model.
        output_format: Format for coordinate output; one of "pixels",
            "norm_0_1" or "norm_0_999".
        parser_regex: Regular expression to parse coordinates from model
            output. The default matches an integer pair such as ``(12, 34)``.
        resize: Image resizing configuration dictionary. The default holds the
            keys ``enabled``, ``min_pixels``, ``max_pixels`` and ``factor``.

    Raises:
        ValueError: If ``output_format`` is not one of the supported values,
            if ``api_base`` or ``model`` is empty, or if ``parser_regex`` is
            not a valid regular expression.
    """

    api_base: str
    model: str
    api_key: str = "EMPTY"
    system_prompt: str = SYSTEM_PROMPT
    output_format: str = "pixels"  # "pixels" | "norm_0_1" | "norm_0_999"
    parser_regex: str = r"\((\d+),\s*(\d+)\)"
    # A factory is used so every instance gets its own mutable dict.
    resize: dict[str, Any] = field(
        default_factory=lambda: {
            "enabled": True,
            "min_pixels": 3136,
            "max_pixels": 4096 * 2160,
            "factor": 28,
        }
    )

    def __post_init__(self) -> None:
        """Validate configuration after initialization.

        Raises:
            ValueError: On an unsupported ``output_format``, an empty
                ``api_base`` or ``model``, or an uncompilable ``parser_regex``.
        """
        # Imported locally so this block does not depend on a file-level import.
        import re

        # The pre-existing checks keep their original order so the error
        # reported for a config with several problems does not change.
        if self.output_format not in ("pixels", "norm_0_1", "norm_0_999"):
            raise ValueError(f"Invalid output_format: {self.output_format}")

        if not self.api_base:
            raise ValueError("api_base is required")

        if not self.model:
            raise ValueError("model is required")

        # Fail fast on a malformed pattern instead of deferring the error to
        # the first time the pattern is compiled. Only str values are checked,
        # so non-string values are passed through exactly as before.
        if isinstance(self.parser_regex, str):
            try:
                re.compile(self.parser_regex)
            except re.error as exc:
                raise ValueError(f"Invalid parser_regex {self.parser_regex!r}: {exc}") from exc