hud-python 0.4.20__tar.gz → 0.4.22__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (201)
  1. {hud_python-0.4.20 → hud_python-0.4.22}/PKG-INFO +2 -4
  2. {hud_python-0.4.20 → hud_python-0.4.22}/hud/__init__.py +7 -0
  3. {hud_python-0.4.20 → hud_python-0.4.22}/hud/agents/base.py +42 -10
  4. {hud_python-0.4.20 → hud_python-0.4.22}/hud/agents/claude.py +24 -14
  5. hud_python-0.4.22/hud/agents/grounded_openai.py +280 -0
  6. {hud_python-0.4.20 → hud_python-0.4.22}/hud/agents/tests/test_client.py +11 -27
  7. hud_python-0.4.22/hud/agents/tests/test_grounded_openai_agent.py +155 -0
  8. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/__init__.py +50 -20
  9. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/build.py +3 -44
  10. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/eval.py +25 -6
  11. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/init.py +4 -4
  12. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/push.py +3 -1
  13. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/tests/test_push.py +6 -6
  14. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/utils/interactive.py +1 -1
  15. {hud_python-0.4.20 → hud_python-0.4.22}/hud/clients/__init__.py +3 -2
  16. {hud_python-0.4.20 → hud_python-0.4.22}/hud/clients/base.py +20 -9
  17. {hud_python-0.4.20 → hud_python-0.4.22}/hud/clients/mcp_use.py +44 -22
  18. {hud_python-0.4.20 → hud_python-0.4.22}/hud/datasets/task.py +6 -2
  19. hud_python-0.4.22/hud/native/__init__.py +6 -0
  20. hud_python-0.4.22/hud/native/comparator.py +546 -0
  21. hud_python-0.4.22/hud/native/tests/__init__.py +1 -0
  22. hud_python-0.4.22/hud/native/tests/test_comparator.py +539 -0
  23. hud_python-0.4.22/hud/native/tests/test_native_init.py +79 -0
  24. {hud_python-0.4.20 → hud_python-0.4.22}/hud/otel/instrumentation.py +0 -2
  25. {hud_python-0.4.20 → hud_python-0.4.22}/hud/server/server.py +9 -2
  26. {hud_python-0.4.20 → hud_python-0.4.22}/hud/settings.py +6 -0
  27. hud_python-0.4.22/hud/shared/exceptions.py +364 -0
  28. hud_python-0.4.22/hud/shared/hints.py +177 -0
  29. {hud_python-0.4.20 → hud_python-0.4.22}/hud/shared/requests.py +15 -3
  30. hud_python-0.4.22/hud/shared/tests/test_exceptions.py +420 -0
  31. {hud_python-0.4.20 → hud_python-0.4.22}/hud/tools/__init__.py +2 -0
  32. {hud_python-0.4.20 → hud_python-0.4.22}/hud/tools/executors/tests/test_base_executor.py +1 -1
  33. {hud_python-0.4.20 → hud_python-0.4.22}/hud/tools/executors/xdo.py +1 -1
  34. hud_python-0.4.22/hud/tools/grounding/__init__.py +13 -0
  35. hud_python-0.4.22/hud/tools/grounding/config.py +54 -0
  36. hud_python-0.4.22/hud/tools/grounding/grounded_tool.py +314 -0
  37. hud_python-0.4.22/hud/tools/grounding/grounder.py +301 -0
  38. hud_python-0.4.22/hud/tools/grounding/tests/__init__.py +1 -0
  39. hud_python-0.4.22/hud/tools/grounding/tests/test_grounded_tool.py +196 -0
  40. hud_python-0.4.22/hud/tools/submit.py +66 -0
  41. {hud_python-0.4.20 → hud_python-0.4.22}/hud/tools/tests/test_playwright_tool.py +1 -1
  42. {hud_python-0.4.20 → hud_python-0.4.22}/hud/tools/tests/test_tools_init.py +1 -1
  43. {hud_python-0.4.20 → hud_python-0.4.22}/hud/tools/tests/test_utils.py +2 -2
  44. {hud_python-0.4.20 → hud_python-0.4.22}/hud/types.py +33 -5
  45. hud_python-0.4.22/hud/utils/agent_factories.py +86 -0
  46. {hud_python-0.4.20 → hud_python-0.4.22}/hud/utils/design.py +57 -0
  47. {hud_python-0.4.20 → hud_python-0.4.22}/hud/utils/mcp.py +6 -0
  48. hud_python-0.4.22/hud/utils/pretty_errors.py +68 -0
  49. {hud_python-0.4.20 → hud_python-0.4.22}/hud/utils/tests/test_version.py +1 -1
  50. {hud_python-0.4.20 → hud_python-0.4.22}/hud/version.py +1 -1
  51. {hud_python-0.4.20 → hud_python-0.4.22}/pyproject.toml +2 -3
  52. hud_python-0.4.20/hud/shared/exceptions.py +0 -191
  53. hud_python-0.4.20/hud/shared/tests/test_exceptions.py +0 -179
  54. {hud_python-0.4.20 → hud_python-0.4.22}/.gitignore +0 -0
  55. {hud_python-0.4.20 → hud_python-0.4.22}/LICENSE +0 -0
  56. {hud_python-0.4.20 → hud_python-0.4.22}/README.md +0 -0
  57. {hud_python-0.4.20 → hud_python-0.4.22}/environments/README.md +0 -0
  58. {hud_python-0.4.20 → hud_python-0.4.22}/environments/browser/README.md +0 -0
  59. {hud_python-0.4.20 → hud_python-0.4.22}/environments/browser/apps/2048/README.md +0 -0
  60. {hud_python-0.4.20 → hud_python-0.4.22}/environments/browser/apps/2048/backend/pyproject.toml +0 -0
  61. {hud_python-0.4.20 → hud_python-0.4.22}/environments/browser/apps/README.md +0 -0
  62. {hud_python-0.4.20 → hud_python-0.4.22}/environments/browser/apps/todo/README.md +0 -0
  63. {hud_python-0.4.20 → hud_python-0.4.22}/environments/browser/apps/todo/backend/pyproject.toml +0 -0
  64. {hud_python-0.4.20 → hud_python-0.4.22}/environments/browser/pyproject.toml +0 -0
  65. {hud_python-0.4.20 → hud_python-0.4.22}/environments/remote_browser/README.md +0 -0
  66. {hud_python-0.4.20 → hud_python-0.4.22}/environments/remote_browser/pyproject.toml +0 -0
  67. {hud_python-0.4.20 → hud_python-0.4.22}/environments/remote_browser/src/hud_controller/providers/README.md +0 -0
  68. {hud_python-0.4.20 → hud_python-0.4.22}/environments/text_2048/README.md +0 -0
  69. {hud_python-0.4.20 → hud_python-0.4.22}/environments/text_2048/pyproject.toml +0 -0
  70. {hud_python-0.4.20 → hud_python-0.4.22}/examples/README.md +0 -0
  71. {hud_python-0.4.20 → hud_python-0.4.22}/hud/__main__.py +0 -0
  72. {hud_python-0.4.20 → hud_python-0.4.22}/hud/agents/__init__.py +0 -0
  73. {hud_python-0.4.20 → hud_python-0.4.22}/hud/agents/langchain.py +0 -0
  74. {hud_python-0.4.20 → hud_python-0.4.22}/hud/agents/misc/__init__.py +0 -0
  75. {hud_python-0.4.20 → hud_python-0.4.22}/hud/agents/misc/response_agent.py +0 -0
  76. {hud_python-0.4.20 → hud_python-0.4.22}/hud/agents/openai.py +0 -0
  77. {hud_python-0.4.20 → hud_python-0.4.22}/hud/agents/openai_chat_generic.py +0 -0
  78. {hud_python-0.4.20 → hud_python-0.4.22}/hud/agents/tests/__init__.py +0 -0
  79. {hud_python-0.4.20 → hud_python-0.4.22}/hud/agents/tests/test_base.py +0 -0
  80. {hud_python-0.4.20 → hud_python-0.4.22}/hud/agents/tests/test_claude.py +0 -0
  81. {hud_python-0.4.20 → hud_python-0.4.22}/hud/agents/tests/test_openai.py +0 -0
  82. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/__main__.py +0 -0
  83. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/analyze.py +0 -0
  84. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/clone.py +0 -0
  85. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/debug.py +0 -0
  86. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/dev.py +0 -0
  87. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/hf.py +0 -0
  88. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/list_func.py +0 -0
  89. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/pull.py +0 -0
  90. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/remove.py +0 -0
  91. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/rl/README.md +0 -0
  92. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/rl/__init__.py +0 -0
  93. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/rl/init.py +0 -0
  94. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/rl/pod.py +0 -0
  95. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/rl/ssh.py +0 -0
  96. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/rl/train.py +0 -0
  97. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/rl/utils.py +0 -0
  98. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/tests/__init__.py +0 -0
  99. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/tests/test_analyze.py +0 -0
  100. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/tests/test_analyze_metadata.py +0 -0
  101. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/tests/test_build.py +0 -0
  102. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/tests/test_cli_init.py +0 -0
  103. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/tests/test_cli_main.py +0 -0
  104. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/tests/test_clone.py +0 -0
  105. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/tests/test_cursor.py +0 -0
  106. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/tests/test_debug.py +0 -0
  107. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/tests/test_list_func.py +0 -0
  108. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/tests/test_main_module.py +0 -0
  109. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/tests/test_mcp_server.py +0 -0
  110. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/tests/test_pull.py +0 -0
  111. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/tests/test_registry.py +0 -0
  112. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/tests/test_utils.py +0 -0
  113. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/utils/__init__.py +0 -0
  114. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/utils/cursor.py +0 -0
  115. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/utils/docker.py +0 -0
  116. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/utils/environment.py +0 -0
  117. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/utils/logging.py +0 -0
  118. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/utils/metadata.py +0 -0
  119. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/utils/registry.py +0 -0
  120. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/utils/remote_runner.py +0 -0
  121. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/utils/runner.py +0 -0
  122. {hud_python-0.4.20 → hud_python-0.4.22}/hud/cli/utils/server.py +0 -0
  123. {hud_python-0.4.20 → hud_python-0.4.22}/hud/clients/README.md +0 -0
  124. {hud_python-0.4.20 → hud_python-0.4.22}/hud/clients/fastmcp.py +0 -0
  125. {hud_python-0.4.20 → hud_python-0.4.22}/hud/clients/tests/__init__.py +0 -0
  126. {hud_python-0.4.20 → hud_python-0.4.22}/hud/clients/tests/test_client_integration.py +0 -0
  127. {hud_python-0.4.20 → hud_python-0.4.22}/hud/clients/tests/test_fastmcp.py +0 -0
  128. {hud_python-0.4.20 → hud_python-0.4.22}/hud/clients/tests/test_protocol.py +0 -0
  129. {hud_python-0.4.20 → hud_python-0.4.22}/hud/clients/utils/__init__.py +0 -0
  130. {hud_python-0.4.20 → hud_python-0.4.22}/hud/clients/utils/retry_transport.py +0 -0
  131. {hud_python-0.4.20 → hud_python-0.4.22}/hud/datasets/__init__.py +0 -0
  132. {hud_python-0.4.20 → hud_python-0.4.22}/hud/datasets/execution/__init__.py +0 -0
  133. {hud_python-0.4.20 → hud_python-0.4.22}/hud/datasets/execution/parallel.py +0 -0
  134. {hud_python-0.4.20 → hud_python-0.4.22}/hud/datasets/execution/runner.py +0 -0
  135. {hud_python-0.4.20 → hud_python-0.4.22}/hud/datasets/utils.py +0 -0
  136. {hud_python-0.4.20 → hud_python-0.4.22}/hud/misc/__init__.py +0 -0
  137. {hud_python-0.4.20 → hud_python-0.4.22}/hud/misc/claude_plays_pokemon.py +0 -0
  138. {hud_python-0.4.20 → hud_python-0.4.22}/hud/otel/__init__.py +0 -0
  139. {hud_python-0.4.20 → hud_python-0.4.22}/hud/otel/collector.py +0 -0
  140. {hud_python-0.4.20 → hud_python-0.4.22}/hud/otel/config.py +0 -0
  141. {hud_python-0.4.20 → hud_python-0.4.22}/hud/otel/context.py +0 -0
  142. {hud_python-0.4.20 → hud_python-0.4.22}/hud/otel/exporters.py +0 -0
  143. {hud_python-0.4.20 → hud_python-0.4.22}/hud/otel/processors.py +0 -0
  144. {hud_python-0.4.20 → hud_python-0.4.22}/hud/otel/tests/__init__.py +0 -0
  145. {hud_python-0.4.20 → hud_python-0.4.22}/hud/otel/tests/test_processors.py +0 -0
  146. {hud_python-0.4.20 → hud_python-0.4.22}/hud/py.typed +0 -0
  147. {hud_python-0.4.20 → hud_python-0.4.22}/hud/server/__init__.py +0 -0
  148. {hud_python-0.4.20 → hud_python-0.4.22}/hud/server/context.py +0 -0
  149. {hud_python-0.4.20 → hud_python-0.4.22}/hud/server/helper/__init__.py +0 -0
  150. {hud_python-0.4.20 → hud_python-0.4.22}/hud/server/low_level.py +0 -0
  151. {hud_python-0.4.20 → hud_python-0.4.22}/hud/server/tests/__init__.py +0 -0
  152. {hud_python-0.4.20 → hud_python-0.4.22}/hud/shared/__init__.py +0 -0
  153. {hud_python-0.4.20 → hud_python-0.4.22}/hud/shared/tests/__init__.py +0 -0
  154. {hud_python-0.4.20 → hud_python-0.4.22}/hud/shared/tests/test_requests.py +0 -0
  155. {hud_python-0.4.20 → hud_python-0.4.22}/hud/telemetry/__init__.py +0 -0
  156. {hud_python-0.4.20 → hud_python-0.4.22}/hud/telemetry/instrument.py +0 -0
  157. {hud_python-0.4.20 → hud_python-0.4.22}/hud/telemetry/job.py +0 -0
  158. {hud_python-0.4.20 → hud_python-0.4.22}/hud/telemetry/replay.py +0 -0
  159. {hud_python-0.4.20 → hud_python-0.4.22}/hud/telemetry/tests/__init__.py +0 -0
  160. {hud_python-0.4.20 → hud_python-0.4.22}/hud/telemetry/tests/test_replay.py +0 -0
  161. {hud_python-0.4.20 → hud_python-0.4.22}/hud/telemetry/tests/test_trace.py +0 -0
  162. {hud_python-0.4.20 → hud_python-0.4.22}/hud/telemetry/trace.py +0 -0
  163. {hud_python-0.4.20 → hud_python-0.4.22}/hud/tools/base.py +0 -0
  164. {hud_python-0.4.20 → hud_python-0.4.22}/hud/tools/bash.py +0 -0
  165. {hud_python-0.4.20 → hud_python-0.4.22}/hud/tools/computer/__init__.py +0 -0
  166. {hud_python-0.4.20 → hud_python-0.4.22}/hud/tools/computer/anthropic.py +0 -0
  167. {hud_python-0.4.20 → hud_python-0.4.22}/hud/tools/computer/hud.py +0 -0
  168. {hud_python-0.4.20 → hud_python-0.4.22}/hud/tools/computer/openai.py +0 -0
  169. {hud_python-0.4.20 → hud_python-0.4.22}/hud/tools/computer/settings.py +0 -0
  170. {hud_python-0.4.20 → hud_python-0.4.22}/hud/tools/edit.py +0 -0
  171. {hud_python-0.4.20 → hud_python-0.4.22}/hud/tools/executors/__init__.py +0 -0
  172. {hud_python-0.4.20 → hud_python-0.4.22}/hud/tools/executors/base.py +0 -0
  173. {hud_python-0.4.20 → hud_python-0.4.22}/hud/tools/executors/pyautogui.py +0 -0
  174. {hud_python-0.4.20 → hud_python-0.4.22}/hud/tools/executors/tests/__init__.py +0 -0
  175. {hud_python-0.4.20 → hud_python-0.4.22}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
  176. {hud_python-0.4.20 → hud_python-0.4.22}/hud/tools/playwright.py +0 -0
  177. {hud_python-0.4.20 → hud_python-0.4.22}/hud/tools/response.py +0 -0
  178. {hud_python-0.4.20 → hud_python-0.4.22}/hud/tools/tests/__init__.py +0 -0
  179. {hud_python-0.4.20 → hud_python-0.4.22}/hud/tools/tests/test_base.py +0 -0
  180. {hud_python-0.4.20 → hud_python-0.4.22}/hud/tools/tests/test_bash.py +0 -0
  181. {hud_python-0.4.20 → hud_python-0.4.22}/hud/tools/tests/test_bash_extended.py +0 -0
  182. {hud_python-0.4.20 → hud_python-0.4.22}/hud/tools/tests/test_computer.py +0 -0
  183. {hud_python-0.4.20 → hud_python-0.4.22}/hud/tools/tests/test_computer_actions.py +0 -0
  184. {hud_python-0.4.20 → hud_python-0.4.22}/hud/tools/tests/test_edit.py +0 -0
  185. {hud_python-0.4.20 → hud_python-0.4.22}/hud/tools/tests/test_init.py +0 -0
  186. {hud_python-0.4.20 → hud_python-0.4.22}/hud/tools/tests/test_response.py +0 -0
  187. {hud_python-0.4.20 → hud_python-0.4.22}/hud/tools/tests/test_tools.py +0 -0
  188. {hud_python-0.4.20 → hud_python-0.4.22}/hud/tools/types.py +0 -0
  189. {hud_python-0.4.20 → hud_python-0.4.22}/hud/tools/utils.py +0 -0
  190. {hud_python-0.4.20 → hud_python-0.4.22}/hud/utils/__init__.py +0 -0
  191. {hud_python-0.4.20 → hud_python-0.4.22}/hud/utils/async_utils.py +0 -0
  192. {hud_python-0.4.20 → hud_python-0.4.22}/hud/utils/progress.py +0 -0
  193. {hud_python-0.4.20 → hud_python-0.4.22}/hud/utils/telemetry.py +0 -0
  194. {hud_python-0.4.20 → hud_python-0.4.22}/hud/utils/tests/__init__.py +0 -0
  195. {hud_python-0.4.20 → hud_python-0.4.22}/hud/utils/tests/test_async_utils.py +0 -0
  196. {hud_python-0.4.20 → hud_python-0.4.22}/hud/utils/tests/test_init.py +0 -0
  197. {hud_python-0.4.20 → hud_python-0.4.22}/hud/utils/tests/test_mcp.py +0 -0
  198. {hud_python-0.4.20 → hud_python-0.4.22}/hud/utils/tests/test_progress.py +0 -0
  199. {hud_python-0.4.20 → hud_python-0.4.22}/hud/utils/tests/test_telemetry.py +0 -0
  200. {hud_python-0.4.20 → hud_python-0.4.22}/rl/README.md +0 -0
  201. {hud_python-0.4.20 → hud_python-0.4.22}/rl/pyproject.toml +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.4.20
3
+ Version: 0.4.22
4
4
  Summary: SDK for the HUD platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-python
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -38,6 +38,7 @@ Requires-Python: <3.14,>=3.11
38
38
  Requires-Dist: httpx<1,>=0.23.0
39
39
  Requires-Dist: hud-fastmcp-python-sdk>=0.1.2
40
40
  Requires-Dist: hud-mcp-python-sdk>=3.13.2
41
+ Requires-Dist: hud-mcp-use-python-sdk>=2.3.16
41
42
  Requires-Dist: opentelemetry-api>=1.34.1
42
43
  Requires-Dist: opentelemetry-exporter-otlp-proto-http>=1.34.1
43
44
  Requires-Dist: opentelemetry-instrumentation-mcp>=0.44.1
@@ -56,7 +57,6 @@ Provides-Extra: agent
56
57
  Requires-Dist: anthropic; extra == 'agent'
57
58
  Requires-Dist: datasets>=2.14.0; extra == 'agent'
58
59
  Requires-Dist: dotenv>=0.9.9; extra == 'agent'
59
- Requires-Dist: hud-mcp-use-python-sdk>=2.3.13; extra == 'agent'
60
60
  Requires-Dist: ipykernel; extra == 'agent'
61
61
  Requires-Dist: ipython<9; extra == 'agent'
62
62
  Requires-Dist: jupyter-client; extra == 'agent'
@@ -70,7 +70,6 @@ Provides-Extra: agents
70
70
  Requires-Dist: anthropic; extra == 'agents'
71
71
  Requires-Dist: datasets>=2.14.0; extra == 'agents'
72
72
  Requires-Dist: dotenv>=0.9.9; extra == 'agents'
73
- Requires-Dist: hud-mcp-use-python-sdk>=2.3.13; extra == 'agents'
74
73
  Requires-Dist: ipykernel; extra == 'agents'
75
74
  Requires-Dist: ipython<9; extra == 'agents'
76
75
  Requires-Dist: jupyter-client; extra == 'agents'
@@ -85,7 +84,6 @@ Requires-Dist: aiodocker>=0.24.0; extra == 'dev'
85
84
  Requires-Dist: anthropic; extra == 'dev'
86
85
  Requires-Dist: datasets>=2.14.0; extra == 'dev'
87
86
  Requires-Dist: dotenv>=0.9.9; extra == 'dev'
88
- Requires-Dist: hud-mcp-use-python-sdk>=2.3.13; extra == 'dev'
89
87
  Requires-Dist: inspect-ai>=0.3.80; extra == 'dev'
90
88
  Requires-Dist: ipykernel; extra == 'dev'
91
89
  Requires-Dist: ipython<9; extra == 'dev'
@@ -20,3 +20,10 @@ try:
20
20
  from .version import __version__
21
21
  except ImportError:
22
22
  __version__ = "unknown"
23
+
24
+ try:
25
+ from .utils.pretty_errors import install_pretty_errors
26
+
27
+ install_pretty_errors()
28
+ except Exception: # noqa: S110
29
+ pass
@@ -94,6 +94,8 @@ class MCPAgent(ABC):
94
94
  self.model_name = model_name
95
95
  self.design = HUDDesign(logger=logger)
96
96
 
97
+ self.metadata = {}
98
+
97
99
  # Set verbose mode if requested
98
100
  if verbose:
99
101
  self.design.set_verbose(True)
@@ -111,10 +113,12 @@ class MCPAgent(ABC):
111
113
  # Initialize these here so methods can be called before initialize()
112
114
  self._available_tools: list[types.Tool] = []
113
115
  self._tool_map: dict[str, types.Tool] = {} # Simplified: just name to tool
114
- self.screenshot_history: list[str] = []
116
+ self.response_tool_name = None
117
+ self.initialization_complete = False
118
+
119
+ # Trace
115
120
  self._auto_trace = auto_trace
116
121
  self._auto_trace_cm: Any | None = None # Store auto-created trace context manager
117
- self.initialization_complete = False
118
122
 
119
123
  # Response agent to automatically interact with the model
120
124
  self.response_agent = response_agent
@@ -530,6 +534,9 @@ class MCPAgent(ABC):
530
534
  self._available_tools = []
531
535
  self._tool_map = {}
532
536
 
537
+ # Track response tools by server
538
+ response_tools_by_server: dict[str, str] = {} # server_name -> tool_name
539
+
533
540
  for tool in all_tools:
534
541
  # Check if tool should be included
535
542
  if self.allowed_tools and tool.name not in self.allowed_tools:
@@ -541,10 +548,36 @@ class MCPAgent(ABC):
541
548
  # Simplified mapping - just tool name to tool
542
549
  self._tool_map[tool.name] = tool
543
550
 
544
- # Auto-detect response tool as a lifecycle tool
545
- if tool.name == "response" and "response" not in self.lifecycle_tools:
546
- self.design.debug("Auto-detected 'response' tool as a lifecycle tool")
547
- self.lifecycle_tools.append("response")
551
+ # Track response tools
552
+ if "response" in tool.name or tool.name == "response":
553
+ # Extract server name from tool name (e.g., "grader_response" -> "grader")
554
+ if "_" in tool.name:
555
+ server_name = tool.name.split("_", 1)[0]
556
+ response_tools_by_server[server_name] = tool.name
557
+ else:
558
+ response_tools_by_server["_default"] = tool.name
559
+
560
+ # Find the response tool to use (prioritize last server in config)
561
+ if response_tools_by_server and hasattr(self.mcp_client, "mcp_config"):
562
+ # Get server names in order from mcp_config
563
+ server_names = list(self.mcp_client.mcp_config.keys())
564
+
565
+ # Try to find response tool from last server first
566
+ response_tool_name = None
567
+ for server_name in reversed(server_names):
568
+ if server_name in response_tools_by_server:
569
+ response_tool_name = response_tools_by_server[server_name]
570
+ break
571
+
572
+ # Fallback to any response tool
573
+ if not response_tool_name and response_tools_by_server:
574
+ response_tool_name = next(iter(response_tools_by_server.values()))
575
+
576
+ # Add to lifecycle tools if found
577
+ if response_tool_name and response_tool_name not in self.lifecycle_tools:
578
+ self.design.debug(f"Auto-detected '{response_tool_name}' tool as a lifecycle tool")
579
+ self.response_tool_name = response_tool_name
580
+ self.lifecycle_tools.append(response_tool_name)
548
581
 
549
582
  # Check if all required tools are available
550
583
  if self.required_tools:
@@ -565,13 +598,12 @@ class MCPAgent(ABC):
565
598
  response: The agent's response
566
599
  messages: The current message history (will be modified in-place)
567
600
  """
568
- # Check if we have a response lifecycle tool
569
- if "response" in self.lifecycle_tools and "response" in self._tool_map:
570
- self.design.debug("Calling response lifecycle tool")
601
+ if self.response_tool_name:
602
+ self.design.debug(f"Calling response lifecycle tool: {self.response_tool_name}")
571
603
  try:
572
604
  # Call the response tool with the agent's response
573
605
  response_tool_call = MCPToolCall(
574
- name="response", arguments={"response": response.content, "messages": messages}
606
+ name=self.response_tool_name, arguments={"response": response.content}
575
607
  )
576
608
  response_results = await self.call_tools(response_tool_call)
577
609
 
@@ -306,19 +306,20 @@ class ClaudeAgent(MCPAgent):
306
306
  """Convert MCP tools to Claude tool format."""
307
307
  claude_tools = []
308
308
  self._claude_to_mcp_tool_map = {} # Reset mapping
309
-
309
+
310
310
  # Find computer tool by priority
311
311
  computer_tool_priority = ["anthropic_computer", "computer_anthropic", "computer"]
312
312
  selected_computer_tool = None
313
-
313
+
314
314
  for priority_name in computer_tool_priority:
315
315
  for tool in self._available_tools:
316
- if tool.name == priority_name:
316
+ # Check both exact match and suffix match (for prefixed tools)
317
+ if tool.name == priority_name or tool.name.endswith(f"_{priority_name}"):
317
318
  selected_computer_tool = tool
318
319
  break
319
320
  if selected_computer_tool:
320
321
  break
321
-
322
+
322
323
  # Add the selected computer tool if found
323
324
  if selected_computer_tool:
324
325
  claude_tool = {
@@ -330,14 +331,18 @@ class ClaudeAgent(MCPAgent):
330
331
  # Map Claude's "computer" back to the actual MCP tool name
331
332
  self._claude_to_mcp_tool_map["computer"] = selected_computer_tool.name
332
333
  claude_tools.append(claude_tool)
333
- logger.debug(f"Using {selected_computer_tool.name} as computer tool for Claude")
334
-
334
+ logger.debug("Using %s as computer tool for Claude", selected_computer_tool.name)
335
+
335
336
  # Add other non-computer tools
336
337
  for tool in self._available_tools:
337
338
  # Skip computer tools (already handled) and lifecycle tools
338
- if tool.name in computer_tool_priority or tool.name in self.lifecycle_tools:
339
+ is_computer_tool = any(
340
+ tool.name == priority_name or tool.name.endswith(f"_{priority_name}")
341
+ for priority_name in computer_tool_priority
342
+ )
343
+ if is_computer_tool or tool.name in self.lifecycle_tools:
339
344
  continue
340
-
345
+
341
346
  claude_tool = {
342
347
  "name": tool.name,
343
348
  "description": tool.description or f"Execute {tool.name}",
@@ -359,16 +364,21 @@ class ClaudeAgent(MCPAgent):
359
364
  messages_cached = copy.deepcopy(messages)
360
365
 
361
366
  # Mark last user message with cache control
362
- if messages_cached and messages_cached[-1].get("role") == "user":
367
+ if (
368
+ messages_cached
369
+ and isinstance(messages_cached[-1], dict)
370
+ and messages_cached[-1].get("role") == "user"
371
+ ):
363
372
  last_content = messages_cached[-1]["content"]
364
373
  # Content is formatted to be list of ContentBlock in format_blocks and format_message
365
374
  if isinstance(last_content, list):
366
375
  for block in last_content:
367
- # Only add cache control to block types that support it
368
- block_type = block.get("type")
369
- if block_type in ["text", "image", "tool_use", "tool_result"]:
370
- cache_control: BetaCacheControlEphemeralParam = {"type": "ephemeral"}
371
- block["cache_control"] = cache_control # type: ignore[reportGeneralTypeIssues]
376
+ # Only add cache control to dict-like block types that support it
377
+ if isinstance(block, dict):
378
+ block_type = block.get("type")
379
+ if block_type in ["text", "image", "tool_use", "tool_result"]:
380
+ cache_control: BetaCacheControlEphemeralParam = {"type": "ephemeral"}
381
+ block["cache_control"] = cache_control # type: ignore[reportGeneralTypeIssues]
372
382
 
373
383
  return messages_cached
374
384
 
@@ -0,0 +1,280 @@
1
+ """Grounded OpenAI agent that separates visual grounding from reasoning."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from typing import Any
7
+
8
+ from hud import instrument
9
+ from hud.tools.grounding import GroundedComputerTool, Grounder, GrounderConfig
10
+ from hud.types import AgentResponse, MCPToolCall, MCPToolResult
11
+
12
+ from .openai_chat_generic import GenericOpenAIChatAgent
13
+
14
+
15
+ class GroundedOpenAIChatAgent(GenericOpenAIChatAgent):
16
+ """OpenAI agent that uses a separate grounding model for element detection.
17
+
18
+ This agent:
19
+ - Exposes only a synthetic "computer" tool to the planning model
20
+ - Intercepts tool calls to ground element descriptions to coordinates
21
+ - Converts grounded results to real computer tool calls
22
+ - Maintains screenshot state for grounding operations
23
+
24
+ The architecture separates concerns:
25
+ - Planning model (GPT-4o etc) focuses on high-level reasoning
26
+ - Grounding model (Qwen2-VL etc) handles visual element detection
27
+ """
28
+
29
+ def __init__(
30
+ self,
31
+ *,
32
+ grounder_config: GrounderConfig,
33
+ model_name: str = "gpt-4o-mini",
34
+ allowed_tools: list[str] | None = None,
35
+ append_setup_output: bool = False,
36
+ system_prompt: str | None = None,
37
+ **kwargs: Any,
38
+ ) -> None:
39
+ """Initialize the grounded OpenAI agent.
40
+
41
+ Args:
42
+ grounder_config: Configuration for the grounding model
43
+ openai_client: OpenAI client for the planning model
44
+ model: Name of the OpenAI model to use for planning (e.g., "gpt-4o", "gpt-4o-mini")
45
+ real_computer_tool_name: Name of the actual computer tool to execute
46
+ **kwargs: Additional arguments passed to GenericOpenAIChatAgent
47
+ """
48
+ # Set defaults for grounded agent
49
+ if allowed_tools is None:
50
+ allowed_tools = ["computer"]
51
+
52
+ if system_prompt is None:
53
+ system_prompt = (
54
+ "You are a helpful AI assistant that can control the computer "
55
+ "through visual interaction.\n\n"
56
+ "IMPORTANT: Always explain your reasoning and observations before taking actions:\n"
57
+ "1. First, describe what you see on the screen\n"
58
+ "2. Explain what you plan to do and why\n"
59
+ "3. Then use the computer tool with natural language descriptions\n\n"
60
+ "For example:\n"
61
+ "- 'I can see a login form with username and password fields. "
62
+ "I need to click on the username field first.'\n"
63
+ "- 'There's a blue submit button at the bottom. "
64
+ "I'll click on it to submit the form.'\n"
65
+ "- 'I notice a red close button in the top right corner. "
66
+ "I'll click it to close this dialog.'\n\n"
67
+ "Use descriptive element descriptions like:\n"
68
+ "- Colors: 'red button', 'blue link', 'green checkmark'\n"
69
+ "- Position: 'top right corner', 'bottom of the page', 'left sidebar'\n"
70
+ "- Text content: 'Submit button', 'Login link', 'Cancel option'\n"
71
+ "- Element type: 'text field', 'dropdown menu', 'checkbox'"
72
+ )
73
+
74
+ super().__init__(
75
+ model_name=model_name,
76
+ allowed_tools=allowed_tools,
77
+ append_setup_output=append_setup_output,
78
+ system_prompt=system_prompt,
79
+ **kwargs,
80
+ )
81
+
82
+ self.grounder = Grounder(grounder_config)
83
+ self.grounded_tool = None
84
+
85
+ async def initialize(self, task: Any = None) -> None:
86
+ """Initialize the agent and create the grounded tool with mcp_client."""
87
+ # Call parent initialization first
88
+ await super().initialize(task)
89
+
90
+ if self.mcp_client is None:
91
+ raise ValueError("mcp_client must be initialized before creating grounded tool")
92
+ self.grounded_tool = GroundedComputerTool(
93
+ grounder=self.grounder, mcp_client=self.mcp_client, computer_tool_name="computer"
94
+ )
95
+
96
+ def get_tool_schemas(self) -> list[Any]:
97
+ """Override to expose only the synthetic grounded tool.
98
+
99
+ The planning model only sees the synthetic "computer" tool,
100
+ which is provided by the grounded tool itself.
101
+
102
+ Returns:
103
+ List containing only the grounded computer tool schema
104
+ """
105
+ if self.grounded_tool is None:
106
+ return []
107
+ return [self.grounded_tool.get_openai_tool_schema()]
108
+
109
+ @instrument(
110
+ span_type="agent",
111
+ record_args=False,
112
+ record_result=True,
113
+ )
114
+ async def get_response(self, messages: Any) -> AgentResponse:
115
+ """Get response from the planning model and handle grounded tool calls.
116
+
117
+ This method:
118
+ 1. Calls the planning model with the grounded tool schema
119
+ 2. Executes any tool calls directly through the grounded tool
120
+ 3. Returns the response
121
+
122
+ Args:
123
+ messages: Conversation messages
124
+
125
+ Returns:
126
+ AgentResponse with either content or tool calls for MCP execution
127
+ """
128
+ tool_schemas = self.get_tool_schemas()
129
+
130
+ # Take initial screenshot and add to messages if this is the first turn
131
+ has_image = any(
132
+ isinstance(m.get("content"), list)
133
+ and any(
134
+ block.get("type") == "image_url"
135
+ for block in m["content"]
136
+ if isinstance(block, dict)
137
+ )
138
+ for m in messages
139
+ if isinstance(m.get("content"), list)
140
+ )
141
+
142
+ if not has_image:
143
+ if self.mcp_client is None:
144
+ raise ValueError("mcp_client is not initialized")
145
+ screenshot_result = await self.mcp_client.call_tool(
146
+ MCPToolCall(name="computer", arguments={"action": "screenshot"})
147
+ )
148
+
149
+ for block in screenshot_result.content:
150
+ # Check for ImageContent type from MCP
151
+ if hasattr(block, "data") and hasattr(block, "mimeType"):
152
+ mime_type = getattr(block, "mimeType", "image/png")
153
+ data = getattr(block, "data", "")
154
+ messages.append(
155
+ {
156
+ "role": "user",
157
+ "content": [
158
+ {
159
+ "type": "image_url",
160
+ "image_url": {"url": f"data:{mime_type};base64,{data}"},
161
+ }
162
+ ],
163
+ }
164
+ )
165
+ break
166
+
167
+ protected_keys = {"model", "messages", "tools", "parallel_tool_calls"}
168
+ extra = {k: v for k, v in (self.completion_kwargs or {}).items() if k not in protected_keys}
169
+
170
+ response = await self.oai.chat.completions.create(
171
+ model=self.model_name,
172
+ messages=messages,
173
+ tools=tool_schemas,
174
+ parallel_tool_calls=False,
175
+ **extra,
176
+ )
177
+
178
+ choice = response.choices[0]
179
+ msg = choice.message
180
+
181
+ assistant_msg: dict[str, Any] = {"role": "assistant"}
182
+ if msg.content:
183
+ assistant_msg["content"] = msg.content
184
+ if msg.tool_calls:
185
+ assistant_msg["tool_calls"] = msg.tool_calls
186
+
187
+ messages.append(assistant_msg)
188
+
189
+ self.conversation_history = messages.copy()
190
+
191
+ if not msg.tool_calls:
192
+ return AgentResponse(
193
+ content=msg.content or "",
194
+ tool_calls=[],
195
+ done=choice.finish_reason in ("stop", "length"),
196
+ raw=response,
197
+ )
198
+
199
+ tc = msg.tool_calls[0]
200
+
201
+ if tc.function.name != "computer":
202
+ return AgentResponse(
203
+ content=f"Error: Model called unexpected tool '{tc.function.name}'",
204
+ tool_calls=[],
205
+ done=True,
206
+ raw=response,
207
+ )
208
+
209
+ # Parse the arguments
210
+ try:
211
+ args = json.loads(tc.function.arguments or "{}")
212
+ except json.JSONDecodeError:
213
+ return AgentResponse(
214
+ content="Error: Invalid tool arguments", tool_calls=[], done=True, raw=response
215
+ )
216
+
217
+ tool_call = MCPToolCall(name="computer", arguments=args, id=tc.id)
218
+
219
+ return AgentResponse(
220
+ content=msg.content or "", tool_calls=[tool_call], done=False, raw=response
221
+ )
222
+
223
+ async def call_tools(
224
+ self, tool_call: MCPToolCall | list[MCPToolCall] | None = None
225
+ ) -> list[MCPToolResult]:
226
+ """Override call_tools to intercept computer tool calls.
227
+
228
+ Execute them through grounded tool.
229
+ """
230
+ if tool_call is None:
231
+ return []
232
+
233
+ if isinstance(tool_call, MCPToolCall):
234
+ tool_call = [tool_call]
235
+
236
+ results: list[MCPToolResult] = []
237
+ for tc in tool_call:
238
+ if tc.name == "computer":
239
+ # Execute through grounded tool instead of MCP
240
+ try:
241
+ # Extract latest screenshot from conversation history
242
+ screenshot_b64 = None
243
+ for m in reversed(self.conversation_history):
244
+ if m.get("role") == "user" and isinstance(m.get("content"), list):
245
+ for block in m["content"]:
246
+ if (
247
+ isinstance(block, dict)
248
+ and block.get("type") == "image_url"
249
+ and isinstance(block.get("image_url"), dict)
250
+ ):
251
+ url = block["image_url"].get("url", "")
252
+ if url.startswith("data:"):
253
+ screenshot_b64 = (
254
+ url.split(",", 1)[1] if "," in url else None
255
+ )
256
+ break
257
+ if screenshot_b64:
258
+ break
259
+
260
+ # Pass screenshot to grounded tool
261
+ args_with_screenshot = dict(tc.arguments) if tc.arguments else {}
262
+ if screenshot_b64:
263
+ args_with_screenshot["screenshot_b64"] = screenshot_b64
264
+
265
+ if self.grounded_tool is None:
266
+ raise ValueError("Grounded tool is not initialized")
267
+ content_blocks = await self.grounded_tool(**args_with_screenshot)
268
+ results.append(MCPToolResult(content=content_blocks, isError=False))
269
+ except Exception as e:
270
+ # Create error result
271
+ from mcp.types import TextContent
272
+
273
+ error_content = TextContent(text=str(e), type="text")
274
+ results.append(MCPToolResult(content=[error_content], isError=True))
275
+ else:
276
+ # For non-computer tools, use parent implementation
277
+ parent_results = await super().call_tools(tc)
278
+ results.extend(parent_results)
279
+
280
+ return results
@@ -33,29 +33,6 @@ class TestMCPClient:
33
33
  with patch("mcp_use.client.MCPClient.from_dict", return_value=mock_instance):
34
34
  yield mock_instance
35
35
 
36
- @pytest.mark.asyncio
37
- async def test_init_with_config(self, mock_telemetry):
38
- """Test client initialization with config dictionary."""
39
- mcp_config = {
40
- "test_server": {
41
- "command": "python",
42
- "args": ["-m", "test_server"],
43
- "env": {"TEST": "true"},
44
- }
45
- }
46
-
47
- with patch("mcp_use.client.MCPClient.from_dict") as mock_from_dict:
48
- mock_instance = MagicMock()
49
- mock_instance.create_all_sessions = AsyncMock(return_value={})
50
- mock_from_dict.return_value = mock_instance
51
- client = MCPClient(mcp_config=mcp_config, verbose=True)
52
- # Initialize to trigger connection
53
- await client.initialize()
54
-
55
- assert client.verbose is True
56
- # Verify MCPUseClient.from_dict was called with proper config
57
- mock_from_dict.assert_called_once_with({"mcpServers": mcp_config})
58
-
59
36
  @pytest.mark.asyncio
60
37
  async def test_connect_single_server(self, mock_telemetry, mock_mcp_use_client):
61
38
  """Test connecting to a single server."""
@@ -146,10 +123,10 @@ class TestMCPClient:
146
123
  # Verify sessions were created
147
124
  mock_mcp_use_client.create_all_sessions.assert_called_once()
148
125
 
149
- # Check tools from both servers
126
+ # Check tools from both servers - should be prefixed with server names
150
127
  tools = await client.list_tools()
151
128
  names = {t.name for t in tools}
152
- assert names == {"tool1", "tool2"}
129
+ assert names == {"server1_tool1", "server2_tool2"}
153
130
 
154
131
  @pytest.mark.asyncio
155
132
  async def test_call_tool(self, mock_telemetry, mock_mcp_use_client):
@@ -220,8 +197,15 @@ class TestMCPClient:
220
197
 
221
198
  await client.initialize()
222
199
 
223
- with pytest.raises(ValueError, match="Tool 'nonexistent' not found"):
224
- await client.call_tool(name="nonexistent", arguments={})
200
+ # Calling a non-existent tool should return an error result
201
+ result = await client.call_tool(name="nonexistent", arguments={})
202
+ assert result.isError is True
203
+ # Check that the error message is in the text content
204
+ text_content = ""
205
+ for content in result.content:
206
+ if isinstance(content, types.TextContent):
207
+ text_content += content.text
208
+ assert "Tool 'nonexistent' not found" in text_content
225
209
 
226
210
  @pytest.mark.asyncio
227
211
  async def test_get_telemetry_data(self, mock_telemetry, mock_mcp_use_client):