hud-python 0.4.34__tar.gz → 0.4.36__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (235) hide show
  1. {hud_python-0.4.34 → hud_python-0.4.36}/.gitignore +3 -2
  2. {hud_python-0.4.34 → hud_python-0.4.36}/PKG-INFO +30 -12
  3. hud_python-0.4.36/environments/blank/README.md +92 -0
  4. hud_python-0.4.36/environments/blank/controller/README.md +16 -0
  5. hud_python-0.4.36/environments/blank/environment/README.md +16 -0
  6. hud_python-0.4.36/environments/blank/pyproject.toml +19 -0
  7. {hud_python-0.4.34 → hud_python-0.4.36}/environments/browser/README.md +67 -88
  8. hud_python-0.4.36/environments/browser/environment/pyproject.toml +20 -0
  9. hud_python-0.4.36/environments/browser/pyproject.toml +22 -0
  10. hud_python-0.4.36/environments/deepresearch/pyproject.toml +19 -0
  11. {hud_python-0.4.34 → hud_python-0.4.36}/hud/agents/claude.py +9 -1
  12. {hud_python-0.4.34 → hud_python-0.4.36}/hud/agents/openai.py +9 -1
  13. {hud_python-0.4.34 → hud_python-0.4.36}/hud/agents/tests/test_claude.py +32 -7
  14. {hud_python-0.4.34 → hud_python-0.4.36}/hud/agents/tests/test_openai.py +29 -6
  15. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/__init__.py +209 -75
  16. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/build.py +10 -5
  17. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/dev.py +20 -39
  18. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/eval.py +4 -3
  19. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/flows/tasks.py +1 -0
  20. hud_python-0.4.36/hud/cli/init.py +270 -0
  21. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/pull.py +6 -0
  22. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/push.py +2 -1
  23. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/rl/remote_runner.py +3 -1
  24. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/tests/test_build.py +3 -27
  25. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/tests/test_mcp_server.py +1 -12
  26. hud_python-0.4.36/hud/cli/utils/config.py +85 -0
  27. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/utils/docker.py +21 -39
  28. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/utils/environment.py +4 -3
  29. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/utils/interactive.py +2 -1
  30. hud_python-0.4.36/hud/cli/utils/local_runner.py +204 -0
  31. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/utils/metadata.py +3 -1
  32. hud_python-0.4.36/hud/cli/utils/package_runner.py +292 -0
  33. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/utils/remote_runner.py +4 -1
  34. {hud_python-0.4.34 → hud_python-0.4.36}/hud/clients/mcp_use.py +30 -7
  35. {hud_python-0.4.34 → hud_python-0.4.36}/hud/datasets/parallel.py +3 -1
  36. {hud_python-0.4.34 → hud_python-0.4.36}/hud/datasets/runner.py +5 -2
  37. {hud_python-0.4.34 → hud_python-0.4.36}/hud/otel/context.py +38 -4
  38. {hud_python-0.4.34 → hud_python-0.4.36}/hud/rl/buffer.py +3 -0
  39. {hud_python-0.4.34 → hud_python-0.4.36}/hud/rl/tests/test_learner.py +1 -1
  40. {hud_python-0.4.34 → hud_python-0.4.36}/hud/server/server.py +157 -1
  41. {hud_python-0.4.34 → hud_python-0.4.36}/hud/settings.py +38 -0
  42. {hud_python-0.4.34 → hud_python-0.4.36}/hud/shared/hints.py +1 -1
  43. {hud_python-0.4.34 → hud_python-0.4.36}/hud/utils/tests/test_version.py +1 -1
  44. {hud_python-0.4.34 → hud_python-0.4.36}/hud/utils/tool_shorthand.py +7 -4
  45. {hud_python-0.4.34 → hud_python-0.4.36}/hud/version.py +1 -1
  46. {hud_python-0.4.34 → hud_python-0.4.36}/pyproject.toml +19 -22
  47. hud_python-0.4.34/environments/browser/pyproject.toml +0 -22
  48. hud_python-0.4.34/hud/cli/init.py +0 -677
  49. {hud_python-0.4.34 → hud_python-0.4.36}/LICENSE +0 -0
  50. {hud_python-0.4.34 → hud_python-0.4.36}/README.md +0 -0
  51. {hud_python-0.4.34 → hud_python-0.4.36}/environments/README.md +0 -0
  52. {hud_python-0.4.34/environments/browser/apps → hud_python-0.4.36/environments/browser/environment}/2048/README.md +0 -0
  53. {hud_python-0.4.34/environments/browser/apps → hud_python-0.4.36/environments/browser/environment}/2048/backend/pyproject.toml +0 -0
  54. {hud_python-0.4.34/environments/browser/apps → hud_python-0.4.36/environments/browser/environment}/README.md +0 -0
  55. {hud_python-0.4.34/environments/browser/apps → hud_python-0.4.36/environments/browser/environment}/todo/README.md +0 -0
  56. {hud_python-0.4.34/environments/browser/apps → hud_python-0.4.36/environments/browser/environment}/todo/backend/pyproject.toml +0 -0
  57. {hud_python-0.4.34 → hud_python-0.4.36}/environments/remote_browser/README.md +0 -0
  58. {hud_python-0.4.34 → hud_python-0.4.36}/environments/remote_browser/pyproject.toml +0 -0
  59. {hud_python-0.4.34 → hud_python-0.4.36}/environments/remote_browser/src/hud_controller/providers/README.md +0 -0
  60. {hud_python-0.4.34 → hud_python-0.4.36}/environments/text_2048/README.md +0 -0
  61. {hud_python-0.4.34 → hud_python-0.4.36}/environments/text_2048/pyproject.toml +0 -0
  62. {hud_python-0.4.34 → hud_python-0.4.36}/examples/README.md +0 -0
  63. {hud_python-0.4.34 → hud_python-0.4.36}/hud/__init__.py +0 -0
  64. {hud_python-0.4.34 → hud_python-0.4.36}/hud/__main__.py +0 -0
  65. {hud_python-0.4.34 → hud_python-0.4.36}/hud/agents/__init__.py +0 -0
  66. {hud_python-0.4.34 → hud_python-0.4.36}/hud/agents/base.py +0 -0
  67. {hud_python-0.4.34 → hud_python-0.4.36}/hud/agents/grounded_openai.py +0 -0
  68. {hud_python-0.4.34 → hud_python-0.4.36}/hud/agents/langchain.py +0 -0
  69. {hud_python-0.4.34 → hud_python-0.4.36}/hud/agents/misc/__init__.py +0 -0
  70. {hud_python-0.4.34 → hud_python-0.4.36}/hud/agents/misc/response_agent.py +0 -0
  71. {hud_python-0.4.34 → hud_python-0.4.36}/hud/agents/openai_chat_generic.py +0 -0
  72. {hud_python-0.4.34 → hud_python-0.4.36}/hud/agents/tests/__init__.py +0 -0
  73. {hud_python-0.4.34 → hud_python-0.4.36}/hud/agents/tests/test_base.py +0 -0
  74. {hud_python-0.4.34 → hud_python-0.4.36}/hud/agents/tests/test_client.py +0 -0
  75. {hud_python-0.4.34 → hud_python-0.4.36}/hud/agents/tests/test_grounded_openai_agent.py +0 -0
  76. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/__main__.py +0 -0
  77. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/analyze.py +0 -0
  78. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/clone.py +0 -0
  79. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/debug.py +0 -0
  80. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/flows/__init__.py +0 -0
  81. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/get.py +0 -0
  82. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/list_func.py +0 -0
  83. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/remove.py +0 -0
  84. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/rl/__init__.py +0 -0
  85. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/rl/config.py +0 -0
  86. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/rl/display.py +0 -0
  87. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/rl/gpu.py +0 -0
  88. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/rl/gpu_utils.py +0 -0
  89. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/rl/local_runner.py +0 -0
  90. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/rl/presets.py +0 -0
  91. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/rl/rl_api.py +0 -0
  92. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/rl/vllm.py +0 -0
  93. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/tests/__init__.py +0 -0
  94. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/tests/test_analyze.py +0 -0
  95. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/tests/test_analyze_metadata.py +0 -0
  96. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/tests/test_cli_init.py +0 -0
  97. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/tests/test_cli_main.py +0 -0
  98. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/tests/test_clone.py +0 -0
  99. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/tests/test_cursor.py +0 -0
  100. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/tests/test_debug.py +0 -0
  101. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/tests/test_list_func.py +0 -0
  102. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/tests/test_main_module.py +0 -0
  103. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/tests/test_pull.py +0 -0
  104. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/tests/test_push.py +0 -0
  105. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/tests/test_registry.py +0 -0
  106. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/tests/test_utils.py +0 -0
  107. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/utils/__init__.py +0 -0
  108. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/utils/cursor.py +0 -0
  109. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/utils/logging.py +0 -0
  110. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/utils/registry.py +0 -0
  111. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/utils/runner.py +0 -0
  112. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/utils/server.py +0 -0
  113. {hud_python-0.4.34 → hud_python-0.4.36}/hud/cli/utils/tasks.py +0 -0
  114. {hud_python-0.4.34 → hud_python-0.4.36}/hud/clients/README.md +0 -0
  115. {hud_python-0.4.34 → hud_python-0.4.36}/hud/clients/__init__.py +0 -0
  116. {hud_python-0.4.34 → hud_python-0.4.36}/hud/clients/base.py +0 -0
  117. {hud_python-0.4.34 → hud_python-0.4.36}/hud/clients/fastmcp.py +0 -0
  118. {hud_python-0.4.34 → hud_python-0.4.36}/hud/clients/tests/__init__.py +0 -0
  119. {hud_python-0.4.34 → hud_python-0.4.36}/hud/clients/tests/test_client_integration.py +0 -0
  120. {hud_python-0.4.34 → hud_python-0.4.36}/hud/clients/tests/test_fastmcp.py +0 -0
  121. {hud_python-0.4.34 → hud_python-0.4.36}/hud/clients/tests/test_mcp_use_retry.py +0 -0
  122. {hud_python-0.4.34 → hud_python-0.4.36}/hud/clients/tests/test_protocol.py +0 -0
  123. {hud_python-0.4.34 → hud_python-0.4.36}/hud/clients/utils/__init__.py +0 -0
  124. {hud_python-0.4.34 → hud_python-0.4.36}/hud/clients/utils/mcp_use_retry.py +0 -0
  125. {hud_python-0.4.34 → hud_python-0.4.36}/hud/clients/utils/retry.py +0 -0
  126. {hud_python-0.4.34 → hud_python-0.4.36}/hud/clients/utils/retry_transport.py +0 -0
  127. {hud_python-0.4.34 → hud_python-0.4.36}/hud/datasets/__init__.py +0 -0
  128. {hud_python-0.4.34 → hud_python-0.4.36}/hud/datasets/utils.py +0 -0
  129. {hud_python-0.4.34 → hud_python-0.4.36}/hud/misc/__init__.py +0 -0
  130. {hud_python-0.4.34 → hud_python-0.4.36}/hud/misc/claude_plays_pokemon.py +0 -0
  131. {hud_python-0.4.34 → hud_python-0.4.36}/hud/native/__init__.py +0 -0
  132. {hud_python-0.4.34 → hud_python-0.4.36}/hud/native/comparator.py +0 -0
  133. {hud_python-0.4.34 → hud_python-0.4.36}/hud/native/tests/__init__.py +0 -0
  134. {hud_python-0.4.34 → hud_python-0.4.36}/hud/native/tests/test_comparator.py +0 -0
  135. {hud_python-0.4.34 → hud_python-0.4.36}/hud/native/tests/test_native_init.py +0 -0
  136. {hud_python-0.4.34 → hud_python-0.4.36}/hud/otel/__init__.py +0 -0
  137. {hud_python-0.4.34 → hud_python-0.4.36}/hud/otel/collector.py +0 -0
  138. {hud_python-0.4.34 → hud_python-0.4.36}/hud/otel/config.py +0 -0
  139. {hud_python-0.4.34 → hud_python-0.4.36}/hud/otel/exporters.py +0 -0
  140. {hud_python-0.4.34 → hud_python-0.4.36}/hud/otel/instrumentation.py +0 -0
  141. {hud_python-0.4.34 → hud_python-0.4.36}/hud/otel/processors.py +0 -0
  142. {hud_python-0.4.34 → hud_python-0.4.36}/hud/otel/tests/__init__.py +0 -0
  143. {hud_python-0.4.34 → hud_python-0.4.36}/hud/otel/tests/test_processors.py +0 -0
  144. {hud_python-0.4.34 → hud_python-0.4.36}/hud/py.typed +0 -0
  145. {hud_python-0.4.34 → hud_python-0.4.36}/hud/rl/README.md +0 -0
  146. {hud_python-0.4.34 → hud_python-0.4.36}/hud/rl/__init__.py +0 -0
  147. {hud_python-0.4.34 → hud_python-0.4.36}/hud/rl/actor.py +0 -0
  148. {hud_python-0.4.34 → hud_python-0.4.36}/hud/rl/chat_template.jinja +0 -0
  149. {hud_python-0.4.34 → hud_python-0.4.36}/hud/rl/config.py +0 -0
  150. {hud_python-0.4.34 → hud_python-0.4.36}/hud/rl/distributed.py +0 -0
  151. {hud_python-0.4.34 → hud_python-0.4.36}/hud/rl/learner.py +0 -0
  152. {hud_python-0.4.34 → hud_python-0.4.36}/hud/rl/tests/__init__.py +0 -0
  153. {hud_python-0.4.34 → hud_python-0.4.36}/hud/rl/train.py +0 -0
  154. {hud_python-0.4.34 → hud_python-0.4.36}/hud/rl/types.py +0 -0
  155. {hud_python-0.4.34 → hud_python-0.4.36}/hud/rl/utils/start_vllm_server.sh +0 -0
  156. {hud_python-0.4.34 → hud_python-0.4.36}/hud/rl/utils.py +0 -0
  157. {hud_python-0.4.34 → hud_python-0.4.36}/hud/rl/vllm_adapter.py +0 -0
  158. {hud_python-0.4.34 → hud_python-0.4.36}/hud/samples/__init__.py +0 -0
  159. {hud_python-0.4.34 → hud_python-0.4.36}/hud/samples/browser.py +0 -0
  160. {hud_python-0.4.34 → hud_python-0.4.36}/hud/server/__init__.py +0 -0
  161. {hud_python-0.4.34 → hud_python-0.4.36}/hud/server/context.py +0 -0
  162. {hud_python-0.4.34 → hud_python-0.4.36}/hud/server/helper/__init__.py +0 -0
  163. {hud_python-0.4.34 → hud_python-0.4.36}/hud/server/low_level.py +0 -0
  164. {hud_python-0.4.34 → hud_python-0.4.36}/hud/server/tests/__init__.py +0 -0
  165. {hud_python-0.4.34 → hud_python-0.4.36}/hud/shared/__init__.py +0 -0
  166. {hud_python-0.4.34 → hud_python-0.4.36}/hud/shared/exceptions.py +0 -0
  167. {hud_python-0.4.34 → hud_python-0.4.36}/hud/shared/requests.py +0 -0
  168. {hud_python-0.4.34 → hud_python-0.4.36}/hud/shared/tests/__init__.py +0 -0
  169. {hud_python-0.4.34 → hud_python-0.4.36}/hud/shared/tests/test_exceptions.py +0 -0
  170. {hud_python-0.4.34 → hud_python-0.4.36}/hud/shared/tests/test_requests.py +0 -0
  171. {hud_python-0.4.34 → hud_python-0.4.36}/hud/telemetry/__init__.py +0 -0
  172. {hud_python-0.4.34 → hud_python-0.4.36}/hud/telemetry/instrument.py +0 -0
  173. {hud_python-0.4.34 → hud_python-0.4.36}/hud/telemetry/job.py +0 -0
  174. {hud_python-0.4.34 → hud_python-0.4.36}/hud/telemetry/replay.py +0 -0
  175. {hud_python-0.4.34 → hud_python-0.4.36}/hud/telemetry/tests/__init__.py +0 -0
  176. {hud_python-0.4.34 → hud_python-0.4.36}/hud/telemetry/tests/test_replay.py +0 -0
  177. {hud_python-0.4.34 → hud_python-0.4.36}/hud/telemetry/tests/test_trace.py +0 -0
  178. {hud_python-0.4.34 → hud_python-0.4.36}/hud/telemetry/trace.py +0 -0
  179. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/__init__.py +0 -0
  180. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/base.py +0 -0
  181. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/bash.py +0 -0
  182. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/computer/__init__.py +0 -0
  183. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/computer/anthropic.py +0 -0
  184. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/computer/hud.py +0 -0
  185. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/computer/openai.py +0 -0
  186. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/computer/settings.py +0 -0
  187. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/edit.py +0 -0
  188. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/executors/__init__.py +0 -0
  189. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/executors/base.py +0 -0
  190. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/executors/pyautogui.py +0 -0
  191. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/executors/tests/__init__.py +0 -0
  192. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/executors/tests/test_base_executor.py +0 -0
  193. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
  194. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/executors/xdo.py +0 -0
  195. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/grounding/__init__.py +0 -0
  196. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/grounding/config.py +0 -0
  197. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/grounding/grounded_tool.py +0 -0
  198. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/grounding/grounder.py +0 -0
  199. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/grounding/tests/__init__.py +0 -0
  200. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/grounding/tests/test_grounded_tool.py +0 -0
  201. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/playwright.py +0 -0
  202. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/response.py +0 -0
  203. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/submit.py +0 -0
  204. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/tests/__init__.py +0 -0
  205. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/tests/test_base.py +0 -0
  206. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/tests/test_bash.py +0 -0
  207. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/tests/test_bash_extended.py +0 -0
  208. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/tests/test_computer.py +0 -0
  209. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/tests/test_computer_actions.py +0 -0
  210. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/tests/test_edit.py +0 -0
  211. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/tests/test_init.py +0 -0
  212. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/tests/test_playwright_tool.py +0 -0
  213. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/tests/test_response.py +0 -0
  214. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/tests/test_tools.py +0 -0
  215. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/tests/test_tools_init.py +0 -0
  216. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/tests/test_utils.py +0 -0
  217. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/types.py +0 -0
  218. {hud_python-0.4.34 → hud_python-0.4.36}/hud/tools/utils.py +0 -0
  219. {hud_python-0.4.34 → hud_python-0.4.36}/hud/types.py +0 -0
  220. {hud_python-0.4.34 → hud_python-0.4.36}/hud/utils/__init__.py +0 -0
  221. {hud_python-0.4.34 → hud_python-0.4.36}/hud/utils/agent_factories.py +0 -0
  222. {hud_python-0.4.34 → hud_python-0.4.36}/hud/utils/async_utils.py +0 -0
  223. {hud_python-0.4.34 → hud_python-0.4.36}/hud/utils/group_eval.py +0 -0
  224. {hud_python-0.4.34 → hud_python-0.4.36}/hud/utils/hud_console.py +0 -0
  225. {hud_python-0.4.34 → hud_python-0.4.36}/hud/utils/mcp.py +0 -0
  226. {hud_python-0.4.34 → hud_python-0.4.36}/hud/utils/pretty_errors.py +0 -0
  227. {hud_python-0.4.34 → hud_python-0.4.36}/hud/utils/progress.py +0 -0
  228. {hud_python-0.4.34 → hud_python-0.4.36}/hud/utils/tasks.py +0 -0
  229. {hud_python-0.4.34 → hud_python-0.4.36}/hud/utils/telemetry.py +0 -0
  230. {hud_python-0.4.34 → hud_python-0.4.36}/hud/utils/tests/__init__.py +0 -0
  231. {hud_python-0.4.34 → hud_python-0.4.36}/hud/utils/tests/test_async_utils.py +0 -0
  232. {hud_python-0.4.34 → hud_python-0.4.36}/hud/utils/tests/test_init.py +0 -0
  233. {hud_python-0.4.34 → hud_python-0.4.36}/hud/utils/tests/test_mcp.py +0 -0
  234. {hud_python-0.4.34 → hud_python-0.4.36}/hud/utils/tests/test_progress.py +0 -0
  235. {hud_python-0.4.34 → hud_python-0.4.36}/hud/utils/tests/test_telemetry.py +0 -0
@@ -22,7 +22,6 @@ uv.lock
22
22
 
23
23
  # Test files
24
24
  /*.ipynb
25
- test.json
26
25
  TODO.md
27
26
 
28
27
  .coverage
@@ -50,4 +49,6 @@ test/
50
49
  /checkpoints/
51
50
  /checkpoints_test/
52
51
  hud/rl/checkpoints/
53
- hud/rl/checkpoints_test/
52
+ hud/rl/checkpoints_test/
53
+
54
+ .ck/
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.4.34
3
+ Version: 0.4.36
4
4
  Summary: SDK for the HUD platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-python
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -40,7 +40,7 @@ Requires-Dist: datasets>=2.14.0
40
40
  Requires-Dist: httpx<1,>=0.23.0
41
41
  Requires-Dist: hud-fastmcp-python-sdk>=0.1.2
42
42
  Requires-Dist: hud-mcp-python-sdk>=3.13.2
43
- Requires-Dist: hud-mcp-use-python-sdk>=2.3.16
43
+ Requires-Dist: hud-mcp-use-python-sdk==2.3.19
44
44
  Requires-Dist: numpy>=1.24.0
45
45
  Requires-Dist: openai
46
46
  Requires-Dist: opentelemetry-api>=1.34.1
@@ -50,8 +50,8 @@ Requires-Dist: opentelemetry-sdk>=1.34.1
50
50
  Requires-Dist: pathspec>=0.12.1
51
51
  Requires-Dist: pillow>=11.1.0
52
52
  Requires-Dist: prompt-toolkit==3.0.51
53
- Requires-Dist: pydantic-settings<3,>=2
54
- Requires-Dist: pydantic<3,>=2
53
+ Requires-Dist: pydantic-settings<3,>=2.2
54
+ Requires-Dist: pydantic<3,>=2.6
55
55
  Requires-Dist: questionary==2.1.0
56
56
  Requires-Dist: rich>=13.0.0
57
57
  Requires-Dist: toml>=0.10.2
@@ -59,7 +59,9 @@ Requires-Dist: typer>=0.9.0
59
59
  Requires-Dist: watchfiles>=0.21.0
60
60
  Requires-Dist: wrapt>=1.14.0
61
61
  Provides-Extra: agent
62
+ Requires-Dist: aiodocker>=0.24.0; extra == 'agent'
62
63
  Requires-Dist: dotenv>=0.9.9; extra == 'agent'
64
+ Requires-Dist: inspect-ai>=0.3.80; extra == 'agent'
63
65
  Requires-Dist: ipykernel; extra == 'agent'
64
66
  Requires-Dist: ipython<9; extra == 'agent'
65
67
  Requires-Dist: jupyter-client; extra == 'agent'
@@ -67,8 +69,21 @@ Requires-Dist: jupyter-core; extra == 'agent'
67
69
  Requires-Dist: langchain; extra == 'agent'
68
70
  Requires-Dist: langchain-anthropic; extra == 'agent'
69
71
  Requires-Dist: langchain-openai; extra == 'agent'
72
+ Requires-Dist: pillow>=11.1.0; extra == 'agent'
73
+ Requires-Dist: playwright; extra == 'agent'
74
+ Requires-Dist: pyautogui>=0.9.54; extra == 'agent'
75
+ Requires-Dist: pyright==1.1.401; extra == 'agent'
76
+ Requires-Dist: pytest-asyncio; extra == 'agent'
77
+ Requires-Dist: pytest-cov; extra == 'agent'
78
+ Requires-Dist: pytest-mock; extra == 'agent'
79
+ Requires-Dist: pytest<9,>=8.1.1; extra == 'agent'
80
+ Requires-Dist: ruff>=0.11.8; extra == 'agent'
81
+ Requires-Dist: setuptools; extra == 'agent'
82
+ Requires-Dist: textdistance<5,>=4.5.0; extra == 'agent'
70
83
  Provides-Extra: agents
84
+ Requires-Dist: aiodocker>=0.24.0; extra == 'agents'
71
85
  Requires-Dist: dotenv>=0.9.9; extra == 'agents'
86
+ Requires-Dist: inspect-ai>=0.3.80; extra == 'agents'
72
87
  Requires-Dist: ipykernel; extra == 'agents'
73
88
  Requires-Dist: ipython<9; extra == 'agents'
74
89
  Requires-Dist: jupyter-client; extra == 'agents'
@@ -76,6 +91,17 @@ Requires-Dist: jupyter-core; extra == 'agents'
76
91
  Requires-Dist: langchain; extra == 'agents'
77
92
  Requires-Dist: langchain-anthropic; extra == 'agents'
78
93
  Requires-Dist: langchain-openai; extra == 'agents'
94
+ Requires-Dist: pillow>=11.1.0; extra == 'agents'
95
+ Requires-Dist: playwright; extra == 'agents'
96
+ Requires-Dist: pyautogui>=0.9.54; extra == 'agents'
97
+ Requires-Dist: pyright==1.1.401; extra == 'agents'
98
+ Requires-Dist: pytest-asyncio; extra == 'agents'
99
+ Requires-Dist: pytest-cov; extra == 'agents'
100
+ Requires-Dist: pytest-mock; extra == 'agents'
101
+ Requires-Dist: pytest<9,>=8.1.1; extra == 'agents'
102
+ Requires-Dist: ruff>=0.11.8; extra == 'agents'
103
+ Requires-Dist: setuptools; extra == 'agents'
104
+ Requires-Dist: textdistance<5,>=4.5.0; extra == 'agents'
79
105
  Provides-Extra: dev
80
106
  Requires-Dist: aiodocker>=0.24.0; extra == 'dev'
81
107
  Requires-Dist: dotenv>=0.9.9; extra == 'dev'
@@ -100,14 +126,6 @@ Requires-Dist: setuptools; extra == 'dev'
100
126
  Requires-Dist: textdistance<5,>=4.5.0; extra == 'dev'
101
127
  Provides-Extra: rl
102
128
  Requires-Dist: bitsandbytes>=0.41.0; (sys_platform == 'linux') and extra == 'rl'
103
- Requires-Dist: dotenv>=0.9.9; extra == 'rl'
104
- Requires-Dist: ipykernel; extra == 'rl'
105
- Requires-Dist: ipython<9; extra == 'rl'
106
- Requires-Dist: jupyter-client; extra == 'rl'
107
- Requires-Dist: jupyter-core; extra == 'rl'
108
- Requires-Dist: langchain; extra == 'rl'
109
- Requires-Dist: langchain-anthropic; extra == 'rl'
110
- Requires-Dist: langchain-openai; extra == 'rl'
111
129
  Requires-Dist: liger-kernel>=0.5.0; (sys_platform == 'linux') and extra == 'rl'
112
130
  Requires-Dist: peft>=0.17.1; extra == 'rl'
113
131
  Requires-Dist: vllm==0.10.1.1; extra == 'rl'
@@ -0,0 +1,92 @@
1
+ # test-test
2
+
3
+ ## Environment design pattern
4
+ - Controller (Think of this as a frontend in web development)
5
+ - Creates the UX and manages the lifecycle of an app (in this case for an agent)
6
+ - Define `mcp = MCPServer()` and register `@mcp.tool` as tools the agent can interact with
7
+ - Environment (Think of this as a backend in web development)
8
+ - Owns all long‑lived state of the environment and exposes the environment data structure
9
+ - Expose simple HTTP endpoints (`/health`, `/act`, `/reset`, `/state`)
10
+
11
+ IMPORTANT: Make sure all logs go to stderr instead of stdout, which is reserved for MCP communication
12
+
13
+ ### Interactive Development
14
+ ```bash
15
+ # 1. Configure your API keys (optional - only needed for evaluation)
16
+ # Edit .env file to add your HUD_API_KEY and ANTHROPIC_API_KEY
17
+
18
+ # 2. Start the environment (optional: with --inspector or --interactive)
19
+ hud dev --build --interactive
20
+
21
+ # 3. Choose your preferred way to test:
22
+
23
+ # Option A: Run the task with Claude (requires ANTHROPIC_API_KEY)
24
+ hud eval tasks.json --agent claude
25
+
26
+ # Option B: Interactive notebook test_env.ipynb (great for learning!)
27
+ # Requires installation:
28
+ pip install hud-python[agents]
29
+
30
+ # Option C: Simple Python script (runs all tasks from tasks.json)
31
+ python test_task.py
32
+ ```
33
+
34
+ ## Layout
35
+ ```
36
+ controller/
37
+ __init__.py # mcp + shared HTTP client
38
+ __main__.py # python -m controller → mcp.run()
39
+ hooks.py # @mcp.initialize / @mcp.shutdown
40
+ tools.py # @mcp.tool act / setup / evaluate
41
+
42
+ ./environment
43
+ ├── __init__.py
44
+ └── server.py # FastAPI app: /health, /act, /reset, /state
45
+ ```
46
+
47
+ ## Publishing Your Environment
48
+
49
+ Once your environment is ready, you can share it with the community:
50
+
51
+ ### 1. Push to Registry
52
+ ```bash
53
+ # Build and push your environment (requires docker hub login and hud api key)
54
+ hud build
55
+ hud push
56
+ ```
57
+
58
+ ### 2. Create a Dataset
59
+
60
+ Create a dataset on HuggingFace with your tasks:
61
+
62
+ **Option A: Upload manually**
63
+ 1. Upload your `tasks.json` to HuggingFace
64
+ 2. Make sure it's **public** to appear on leaderboards
65
+
66
+ **Option B: Use the SDK**
67
+ ```python
68
+ from hud.datasets import save_tasks
69
+ import json
70
+
71
+ # Load your tasks
72
+ with open("tasks.json") as f:
73
+ tasks = json.load(f)
74
+
75
+ # Push to HuggingFace
76
+ save_tasks(tasks, repo_id="your-org/your-dataset")
77
+ ```
78
+
79
+ ### 3. Run and Track Performance
80
+
81
+ ```bash
82
+ # Run Claude on your benchmark
83
+ hud eval "your-org/your-dataset" --agent claude
84
+
85
+ # View results at:
86
+ # app.hud.so/leaderboards/your-org/your-dataset
87
+ ```
88
+
89
+ **Note**: Only public HuggingFace datasets appear as leaderboards!
90
+
91
+ 📚 Learn more: [Creating Benchmarks](https://docs.hud.so/evaluate-agents/create-benchmarks) | [Leaderboards](https://docs.hud.so/evaluate-agents/leaderboards)
92
+
@@ -0,0 +1,16 @@
1
+ # Controller
2
+
3
+ Frontend for the agent: defines tools, minimal state, calls the environment over HTTP.
4
+
5
+ What to implement
6
+ - Shared client in `__init__.py` (one `httpx.AsyncClient`)
7
+ - Lifecycle in `hooks.py` (`@mcp.initialize`/`@mcp.shutdown`)
8
+ - Tools in `tools.py` (`@mcp.tool`) — keep logic thin; docstrings = descriptions
9
+
10
+ Run
11
+ ```bash
12
+ hud run controller --transport http --reload
13
+ # Helper endpoints: http://localhost:8765/hud and /hud/tools
14
+ ```
15
+
16
+ Principle: the controller is UX, not state. Keep long‑lived state in the environment.
@@ -0,0 +1,16 @@
1
+ # Environment
2
+
3
+ Backend service: owns state and exposes HTTP APIs the controller calls.
4
+
5
+ Endpoints (FastAPI)
6
+ - `GET /health` → {status: ok}
7
+ - `POST /act` → increments counter and returns {count}
8
+ - `POST /reset` → resets counter
9
+ - `GET /state` → returns {count}
10
+
11
+ Run (dev)
12
+ ```bash
13
+ uv run uvicorn environment.server:app --reload --port 8005
14
+ ```
15
+
16
+ Principle: treat like a backend. Keep long‑lived state here; add endpoints as tools need them.
@@ -0,0 +1,19 @@
1
+ [project]
2
+ name = "test_test"
3
+ version = "0.1.0"
4
+ description = "A minimal HUD environment"
5
+ requires-python = ">=3.11"
6
+ dependencies = [ "hud-python==0.4.36", "fastapi", "uvicorn[standard]", "httpx>=0.28.1",]
7
+
8
+ [build-system]
9
+ requires = [ "hatchling",]
10
+ build-backend = "hatchling.build"
11
+
12
+ [tool.hud]
13
+ image = "test_test:dev"
14
+
15
+ [tool.hatch.metadata]
16
+ allow-direct-references = true
17
+
18
+ [tool.hatch.build.targets.wheel]
19
+ packages = [ "controller", "environment",]
@@ -2,100 +2,99 @@
2
2
 
3
3
  A browser automation environment for HUD that provides GUI access and web app interaction capabilities. This environment supports hot-reloading during development while maintaining persistent state.
4
4
 
5
- ## Architecture Overview
5
+ ## Quick Start
6
6
 
7
- The browser environment uses a two-process architecture:
7
+ ### Interactive Development
8
+ ```bash
9
+ # 1. Configure your API keys (optional - only needed for evaluation)
10
+ # Edit .env file to add your HUD_API_KEY and ANTHROPIC_API_KEY
8
11
 
9
- 1. **Context Server** (`context.py`): Long-running process that maintains persistent state
10
- 2. **MCP Server** (`server.py`): Hot-reloadable process that handles tool requests
12
+ # 2. Start the environment (optional: with inspector)
13
+ hud dev --build --inspector
11
14
 
12
- ### Key Components
15
+ # 3. Choose your preferred way to test:
13
16
 
14
- - **BrowserContext**: Stores persistent state (running apps, ports, playwright instance)
15
- - **ServiceManager**: Manages X11, VNC, and app processes
16
- - **BaseHub Tools**: Setup and evaluate tools organized by app (2048, todo)
17
- - **Multiprocessing Proxy**: Enables state sharing between processes
17
+ # Option A: Run the task with Claude (requires ANTHROPIC_API_KEY)
18
+ hud eval tasks.json --agent claude
18
19
 
19
- ## Context Management and Common Pitfalls
20
+ # Option B: Interactive notebook test_env.ipynb (great for learning!)
21
+ # Requires installation:
22
+ pip install "hud-python[agents]"
20
23
 
21
- ### Understanding the Proxy System
24
+ # Option C: Simple Python script (runs all tasks from tasks.json)
25
+ python test_task.py
26
+ ```
22
27
 
23
- The browser environment uses Python's `multiprocessing.Manager` to share state between the context server and MCP server. This introduces important constraints:
28
+ ## How HUD Environments Work
24
29
 
25
- #### Common Pitfall: Unpicklable Objects
30
+ The environment is split into two components:
26
31
 
27
- ```python
28
- # BAD: This will fail with "cannot pickle 'coroutine' object"
29
- @setup.tool("my_tool")
30
- async def my_tool():
31
- env = setup.env
32
- result = await env.call_app_api("app", "/api/endpoint") # Returns coroutine
33
- # The coroutine can't be serialized through the proxy!
34
- ```
32
+ - **`env.py`** - Stateful logic that persists across reloads
33
+ - **`server.py`** - MCP server with tools (reloads on file changes)
35
34
 
36
- #### Solution: Direct HTTP Calls
35
+ This separation is crucial for `hud dev` - it allows you to modify the MCP tools and see changes immediately without losing the environment state. The environment runs as a separate process and communicates via socket, while the server can be restarted freely.
37
36
 
38
- ```python
39
- # GOOD: Make HTTP calls directly
40
- @setup.tool("my_tool")
41
- async def my_tool():
42
- import httpx
43
-
44
- # Get the backend port from persistent context
45
- persistent_ctx = setup.env
46
- backend_port = persistent_ctx.get_app_backend_port("app")
47
-
48
- # Make API call directly
49
- url = f"http://localhost:{backend_port}/api/endpoint"
50
- async with httpx.AsyncClient() as client:
51
- response = await client.get(url)
52
- response.raise_for_status()
53
- result = response.json()
54
- ```
37
+ If you run into issues with the environment itself, run `hud dev --full-reload` to reload both the environment and the server.
55
38
 
56
- ### State Synchronization Issues
39
+ ## Publishing Your Environment
57
40
 
58
- #### Common Pitfall: Direct List/Dict Manipulation
41
+ Once your environment is ready, you can share it with the community:
59
42
 
60
- ```python
61
- # BAD: Regular Python lists don't sync through proxy
62
- class ServiceManager:
63
- def __init__(self):
64
- self._launched_apps = [] # Won't sync!
43
+ ### 1. Push to Registry
44
+ ```bash
45
+ # Build and push your environment (requires a Docker Hub login and a HUD API key)
46
+ hud build
47
+ hud push
65
48
  ```
66
49
 
67
- #### Solution: Store State in Persistent Context
50
+ ### 2. Create a Dataset
68
51
 
52
+ Create a dataset on HuggingFace with your tasks:
53
+
54
+ **Option A: Upload manually**
55
+ 1. Upload your `tasks.json` to HuggingFace
56
+ 2. Make sure it's **public** to appear on leaderboards
57
+
58
+ **Option B: Use the SDK**
69
59
  ```python
70
- # GOOD: Use the persistent context for shared state
71
- class BrowserContext:
72
- def __init__(self):
73
- self._running_apps: List[str] = []
74
- self._app_ports: Dict[str, Dict[str, int]] = {}
75
-
76
- def add_running_app(self, app_name: str) -> None:
77
- """Add app to running list."""
78
- if app_name not in self._running_apps:
79
- self._running_apps.append(app_name)
60
+ from hud.datasets import save_tasks
61
+ import json
62
+
63
+ # Load your tasks
64
+ with open("tasks.json") as f:
65
+ tasks = json.load(f)
66
+
67
+ # Push to HuggingFace
68
+ save_tasks(tasks, repo_id="your-org/your-dataset")
80
69
  ```
81
70
 
82
- ### Accessing Shared Resources
71
+ ### 3. Run and Track Performance
83
72
 
84
- #### ❌ Common Pitfall: Direct Attribute Access
73
+ ```bash
74
+ # Run Claude on your benchmark
75
+ hud eval "your-org/your-dataset" --agent claude
85
76
 
86
- ```python
87
- # BAD: Direct attribute access on proxy objects
88
- playwright_tool = env.playwright # May not work with proxy
77
+ # View results at:
78
+ # app.hud.so/leaderboards/your-org/your-dataset
89
79
  ```
90
80
 
91
- #### Solution: Use Getter Methods
81
+ **Note**: Only public HuggingFace datasets appear as leaderboards!
92
82
 
93
- ```python
94
- # GOOD: Use proxy-friendly getter methods
95
- playwright_tool = persistent_ctx.get_playwright_tool()
96
- ```
83
+ 📚 Learn more: [Creating Benchmarks](https://docs.hud.so/evaluate-agents/create-benchmarks) | [Leaderboards](https://docs.hud.so/evaluate-agents/leaderboards)
97
84
 
98
- ## Best Practices
85
+ ## Architecture Overview
86
+
87
+ The browser environment uses a two-process architecture:
88
+
89
+ 1. **Context Server** (`context.py`): Long-running process that maintains persistent state
90
+ 2. **MCP Server** (`server.py`): Hot-reloadable process that handles tool requests
91
+
92
+ ### Key Components
93
+
94
+ - **BrowserContext**: Stores persistent state (running apps, ports, playwright instance)
95
+ - **ServiceManager**: Manages X11, VNC, and app processes
96
+ - **BaseHub Tools**: Setup and evaluate tools organized by app (2048, todo)
97
+ - **Multiprocessing Proxy**: Enables state sharing between processes
99
98
 
100
99
  ### 1. Tool Implementation Pattern
101
100
 
@@ -166,26 +165,6 @@ from . import setup
166
165
  # Not inside functions
167
166
  ```
168
167
 
169
- ## Troubleshooting
170
-
171
- ### "Cannot pickle 'coroutine' object"
172
-
173
- **Cause**: Trying to return an async function result through the proxy.
174
-
175
- **Fix**: Don't use async methods on proxied objects. Make direct HTTP calls instead.
176
-
177
- ### "App not launched" errors
178
-
179
- **Cause**: State synchronization issue between ServiceManager and persistent context.
180
-
181
- **Fix**: Ensure `launch_app` stores app info in the persistent context, and setup/evaluate tools check the persistent context's app list.
182
-
183
- ### "Object has no attribute" on proxy objects
184
-
185
- **Cause**: Direct attribute access on multiprocessing proxy objects.
186
-
187
- **Fix**: Use getter/setter methods instead of direct attribute access.
188
-
189
168
  ## Development Workflow
190
169
 
191
170
  1. **Start the environment**: `hud dev`
@@ -0,0 +1,20 @@
1
+ [project]
2
+ name = "browser-environment"
3
+ version = "0.1.0"
4
+ description = "Browser environment server for managing X11, VNC, and applications"
5
+ requires-python = ">=3.11"
6
+ dependencies = [
7
+ "fastapi>=0.104.1",
8
+ "uvicorn[standard]>=0.24.0",
9
+ "httpx>=0.25.2",
10
+ "pydantic>=2.6,<3",
11
+ "pydantic-settings>=2.2,<3",
12
+ "python-multipart>=0.0.6",
13
+ ]
14
+
15
+ [build-system]
16
+ requires = ["hatchling"]
17
+ build-backend = "hatchling.build"
18
+
19
+ [tool.hatch.build.targets.wheel]
20
+ packages = ["controller", "environment"]
@@ -0,0 +1,22 @@
1
+ [project]
2
+ name = "hud-browser-controller"
3
+ version = "0.1.0"
4
+ description = "HUD Browser Controller - MCP interface for browser environments"
5
+ requires-python = ">=3.11,<3.14"
6
+ dependencies = [ "pydantic>=2.6,<3", "pydantic-settings>=2.2,<3", "hud-python@git+https://github.com/hud-evals/hud-python@env-cli-improvements", "playwright", "pyautogui", "httpx", "typer", "fastapi", "uvicorn",]
7
+
8
+ [build-system]
9
+ requires = [ "hatchling",]
10
+ build-backend = "hatchling.build"
11
+
12
+ [project.scripts]
13
+ hud-browser-controller = "controller.__main__:main"
14
+
15
+ [tool.hud]
16
+ image = "hud-browser:dev"
17
+
18
+ [tool.hatch.metadata]
19
+ allow-direct-references = true
20
+
21
+ [tool.hatch.build.targets.wheel]
22
+ packages = [ "controller", "problems",]
@@ -0,0 +1,19 @@
1
+ [project]
2
+ name = "deepresearch"
3
+ version = "0.1.0"
4
+ description = "DeepResearch HUD environment with HTTP backend (EXA on server)"
5
+ requires-python = ">=3.11"
6
+ dependencies = [ "hud-python==0.4.36", "fastapi>=0.104.1", "uvicorn[standard]>=0.24.0", "httpx>=0.24.0",]
7
+
8
+ [build-system]
9
+ requires = [ "hatchling",]
10
+ build-backend = "hatchling.build"
11
+
12
+ [tool.hud]
13
+ image = "deepresearch:dev"
14
+
15
+ [tool.hatch.metadata]
16
+ allow-direct-references = true
17
+
18
+ [tool.hatch.build.targets.wheel]
19
+ packages = [ "controller", "environment",]
@@ -6,7 +6,7 @@ import copy
6
6
  import logging
7
7
  from typing import TYPE_CHECKING, Any, ClassVar, cast
8
8
 
9
- from anthropic import AsyncAnthropic, BadRequestError
9
+ from anthropic import Anthropic, AsyncAnthropic, BadRequestError
10
10
  from anthropic.types.beta import BetaContentBlockParam, BetaImageBlockParam, BetaTextBlockParam
11
11
 
12
12
  import hud
@@ -54,6 +54,7 @@ class ClaudeAgent(MCPAgent):
54
54
  model: str = "claude-sonnet-4-20250514",
55
55
  max_tokens: int = 4096,
56
56
  use_computer_beta: bool = True,
57
+ validate_api_key: bool = True,
57
58
  **kwargs: Any,
58
59
  ) -> None:
59
60
  """
@@ -75,6 +76,13 @@ class ClaudeAgent(MCPAgent):
75
76
  raise ValueError("Anthropic API key not found. Set ANTHROPIC_API_KEY.")
76
77
  model_client = AsyncAnthropic(api_key=api_key)
77
78
 
79
+ # validate api key if requested
80
+ if validate_api_key:
81
+ try:
82
+ Anthropic(api_key=model_client.api_key).models.list()
83
+ except Exception as e:
84
+ raise ValueError(f"Anthropic API key is invalid: {e}") from e
85
+
78
86
  self.anthropic_client = model_client
79
87
  self.model = model
80
88
  self.max_tokens = max_tokens
@@ -6,7 +6,7 @@ import logging
6
6
  from typing import Any, ClassVar, Literal
7
7
 
8
8
  import mcp.types as types
9
- from openai import AsyncOpenAI
9
+ from openai import AsyncOpenAI, OpenAI
10
10
  from openai.types.responses import (
11
11
  ResponseComputerToolCall,
12
12
  ResponseInputMessageContentListParam,
@@ -45,6 +45,7 @@ class OperatorAgent(MCPAgent):
45
45
  model_client: AsyncOpenAI | None = None,
46
46
  model: str = "computer-use-preview",
47
47
  environment: Literal["windows", "mac", "linux", "browser"] = "linux",
48
+ validate_api_key: bool = True,
48
49
  **kwargs: Any,
49
50
  ) -> None:
50
51
  """
@@ -76,6 +77,13 @@ class OperatorAgent(MCPAgent):
76
77
  self.pending_call_id: str | None = None
77
78
  self.pending_safety_checks: list[Any] = []
78
79
 
80
+ # validate api key if requested
81
+ if validate_api_key:
82
+ try:
83
+ OpenAI(api_key=self.openai_client.api_key).models.list()
84
+ except Exception as e:
85
+ raise ValueError(f"OpenAI API key is invalid: {e}") from e
86
+
79
87
  self.model_name = "openai-" + self.model
80
88
 
81
89
  # Append OpenAI-specific instructions to the base system prompt
@@ -86,6 +86,7 @@ class TestClaudeAgent:
86
86
  model_client=mock_model_client,
87
87
  model="claude-3-opus-20240229",
88
88
  max_tokens=1000,
89
+ validate_api_key=False, # Skip validation in tests
89
90
  )
90
91
 
91
92
  assert agent.model_name == "claude-3-opus-20240229"
@@ -93,10 +94,14 @@ class TestClaudeAgent:
93
94
  assert agent.anthropic_client == mock_model_client
94
95
 
95
96
  @pytest.mark.asyncio
96
- async def test_init_without_model_client(self, mock_mcp_client):
97
+ async def test_init_without_model_client(self, mock_mcp_client, mock_anthropic):
97
98
  """Test agent initialization without model client."""
98
99
  with patch("hud.settings.settings.anthropic_api_key", "test_key"):
99
- agent = ClaudeAgent(mcp_client=mock_mcp_client, model="claude-3-opus-20240229")
100
+ agent = ClaudeAgent(
101
+ mcp_client=mock_mcp_client,
102
+ model="claude-3-opus-20240229",
103
+ validate_api_key=False, # Skip validation in tests
104
+ )
100
105
 
101
106
  assert agent.model_name == "claude-3-opus-20240229"
102
107
  assert agent.anthropic_client is not None
@@ -105,7 +110,11 @@ class TestClaudeAgent:
105
110
  async def test_format_blocks(self, mock_mcp_client):
106
111
  """Test formatting content blocks into Claude messages."""
107
112
  mock_model_client = MagicMock()
108
- agent = ClaudeAgent(mcp_client=mock_mcp_client, model_client=mock_model_client)
113
+ agent = ClaudeAgent(
114
+ mcp_client=mock_mcp_client,
115
+ model_client=mock_model_client,
116
+ validate_api_key=False, # Skip validation in tests
117
+ )
109
118
 
110
119
  # Test with text only
111
120
  text_blocks: list[types.ContentBlock] = [
@@ -141,7 +150,11 @@ class TestClaudeAgent:
141
150
  async def test_format_tool_results_method(self, mock_mcp_client):
142
151
  """Test the agent's format_tool_results method."""
143
152
  mock_model_client = MagicMock()
144
- agent = ClaudeAgent(mcp_client=mock_mcp_client, model_client=mock_model_client)
153
+ agent = ClaudeAgent(
154
+ mcp_client=mock_mcp_client,
155
+ model_client=mock_model_client,
156
+ validate_api_key=False, # Skip validation in tests
157
+ )
145
158
 
146
159
  tool_calls = [
147
160
  MCPToolCall(name="test_tool", arguments={}, id="id1"),
@@ -171,7 +184,11 @@ class TestClaudeAgent:
171
184
  """Test getting model response from Claude API."""
172
185
  # Disable telemetry for this test to avoid backend configuration issues
173
186
  with patch("hud.settings.settings.telemetry_enabled", False):
174
- agent = ClaudeAgent(mcp_client=mock_mcp_client, model_client=mock_anthropic)
187
+ agent = ClaudeAgent(
188
+ mcp_client=mock_mcp_client,
189
+ model_client=mock_anthropic,
190
+ validate_api_key=False, # Skip validation in tests
191
+ )
175
192
 
176
193
  # Mock the API response
177
194
  mock_response = MagicMock()
@@ -215,7 +232,11 @@ class TestClaudeAgent:
215
232
  """Test getting text-only response."""
216
233
  # Disable telemetry for this test to avoid backend configuration issues
217
234
  with patch("hud.settings.settings.telemetry_enabled", False):
218
- agent = ClaudeAgent(mcp_client=mock_mcp_client, model_client=mock_anthropic)
235
+ agent = ClaudeAgent(
236
+ mcp_client=mock_mcp_client,
237
+ model_client=mock_anthropic,
238
+ validate_api_key=False, # Skip validation in tests
239
+ )
219
240
 
220
241
  mock_response = MagicMock()
221
242
  # Create text block
@@ -242,7 +263,11 @@ class TestClaudeAgent:
242
263
  """Test handling API errors."""
243
264
  # Disable telemetry for this test to avoid backend configuration issues
244
265
  with patch("hud.settings.settings.telemetry_enabled", False):
245
- agent = ClaudeAgent(mcp_client=mock_mcp_client, model_client=mock_anthropic)
266
+ agent = ClaudeAgent(
267
+ mcp_client=mock_mcp_client,
268
+ model_client=mock_anthropic,
269
+ validate_api_key=False, # Skip validation in tests
270
+ )
246
271
 
247
272
  # Mock API error
248
273
  mock_anthropic.beta.messages.create = AsyncMock(