hud-python 0.4.52__tar.gz → 0.4.54__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (299)
  1. {hud_python-0.4.52 → hud_python-0.4.54}/PKG-INFO +49 -49
  2. {hud_python-0.4.52 → hud_python-0.4.54}/README.md +47 -48
  3. {hud_python-0.4.52 → hud_python-0.4.54}/environments/README.md +2 -2
  4. {hud_python-0.4.52 → hud_python-0.4.54}/environments/blank/README.md +9 -2
  5. {hud_python-0.4.52 → hud_python-0.4.54}/environments/blank/server/pyproject.toml +1 -1
  6. {hud_python-0.4.52 → hud_python-0.4.54}/environments/browser/environment/todo/README.md +2 -2
  7. {hud_python-0.4.52 → hud_python-0.4.54}/environments/browser/server/pyproject.toml +1 -1
  8. {hud_python-0.4.52 → hud_python-0.4.54}/environments/deepresearch/server/pyproject.toml +1 -1
  9. {hud_python-0.4.52 → hud_python-0.4.54}/hud/agents/base.py +9 -2
  10. {hud_python-0.4.52 → hud_python-0.4.54}/hud/agents/openai_chat_generic.py +15 -3
  11. {hud_python-0.4.52 → hud_python-0.4.54}/hud/agents/tests/test_base.py +15 -0
  12. hud_python-0.4.54/hud/agents/tests/test_base_runtime.py +164 -0
  13. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/__init__.py +20 -12
  14. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/build.py +35 -27
  15. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/dev.py +13 -31
  16. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/eval.py +85 -84
  17. hud_python-0.4.54/hud/cli/tests/test_analyze_module.py +120 -0
  18. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/tests/test_build.py +24 -2
  19. hud_python-0.4.54/hud/cli/tests/test_build_failure.py +41 -0
  20. hud_python-0.4.54/hud/cli/tests/test_build_module.py +50 -0
  21. hud_python-0.4.54/hud/cli/tests/test_cli_more_wrappers.py +30 -0
  22. hud_python-0.4.54/hud/cli/tests/test_cli_root.py +134 -0
  23. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/tests/test_eval.py +6 -6
  24. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/tests/test_mcp_server.py +8 -7
  25. hud_python-0.4.54/hud/cli/tests/test_push_happy.py +74 -0
  26. hud_python-0.4.54/hud/cli/tests/test_push_wrapper.py +23 -0
  27. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/utils/docker.py +120 -1
  28. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/utils/runner.py +1 -1
  29. hud_python-0.4.54/hud/cli/utils/tests/test_config.py +58 -0
  30. hud_python-0.4.54/hud/cli/utils/tests/test_docker.py +93 -0
  31. hud_python-0.4.54/hud/cli/utils/tests/test_docker_hints.py +71 -0
  32. hud_python-0.4.54/hud/cli/utils/tests/test_env_check.py +74 -0
  33. hud_python-0.4.54/hud/cli/utils/tests/test_environment.py +42 -0
  34. hud_python-0.4.54/hud/cli/utils/tests/test_interactive_module.py +60 -0
  35. hud_python-0.4.54/hud/cli/utils/tests/test_local_runner.py +50 -0
  36. hud_python-0.4.54/hud/cli/utils/tests/test_logging_utils.py +23 -0
  37. hud_python-0.4.54/hud/cli/utils/tests/test_metadata.py +49 -0
  38. hud_python-0.4.54/hud/cli/utils/tests/test_package_runner.py +35 -0
  39. hud_python-0.4.54/hud/cli/utils/tests/test_registry_utils.py +49 -0
  40. hud_python-0.4.54/hud/cli/utils/tests/test_remote_runner.py +25 -0
  41. hud_python-0.4.54/hud/cli/utils/tests/test_runner_modules.py +52 -0
  42. hud_python-0.4.54/hud/cli/utils/tests/test_source_hash.py +36 -0
  43. hud_python-0.4.54/hud/cli/utils/tests/test_tasks.py +80 -0
  44. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/utils/version_check.py +2 -2
  45. hud_python-0.4.54/hud/datasets/tests/test_runner.py +106 -0
  46. hud_python-0.4.54/hud/datasets/tests/test_utils.py +228 -0
  47. hud_python-0.4.54/hud/otel/tests/test_instrumentation.py +207 -0
  48. {hud_python-0.4.52 → hud_python-0.4.54}/hud/server/tests/test_server_extra.py +2 -0
  49. {hud_python-0.4.52 → hud_python-0.4.54}/hud/shared/exceptions.py +35 -4
  50. {hud_python-0.4.52 → hud_python-0.4.54}/hud/shared/hints.py +25 -0
  51. {hud_python-0.4.52 → hud_python-0.4.54}/hud/shared/requests.py +15 -3
  52. hud_python-0.4.54/hud/shared/tests/__init__.py +0 -0
  53. {hud_python-0.4.52 → hud_python-0.4.54}/hud/shared/tests/test_exceptions.py +31 -23
  54. hud_python-0.4.54/hud/shared/tests/test_hints.py +167 -0
  55. hud_python-0.4.54/hud/telemetry/tests/__init__.py +0 -0
  56. hud_python-0.4.54/hud/telemetry/tests/test_async_context.py +242 -0
  57. hud_python-0.4.54/hud/telemetry/tests/test_instrument.py +414 -0
  58. hud_python-0.4.54/hud/telemetry/tests/test_job.py +609 -0
  59. hud_python-0.4.54/hud/telemetry/tests/test_trace.py +241 -0
  60. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/computer/settings.py +2 -2
  61. hud_python-0.4.54/hud/tools/tests/test_submit.py +85 -0
  62. hud_python-0.4.54/hud/tools/tests/test_types.py +193 -0
  63. {hud_python-0.4.52 → hud_python-0.4.54}/hud/types.py +17 -1
  64. {hud_python-0.4.52 → hud_python-0.4.54}/hud/utils/agent_factories.py +1 -3
  65. {hud_python-0.4.52 → hud_python-0.4.54}/hud/utils/mcp.py +1 -1
  66. hud_python-0.4.54/hud/utils/tests/__init__.py +0 -0
  67. hud_python-0.4.54/hud/utils/tests/test_agent_factories.py +60 -0
  68. {hud_python-0.4.52 → hud_python-0.4.54}/hud/utils/tests/test_mcp.py +4 -6
  69. hud_python-0.4.54/hud/utils/tests/test_pretty_errors.py +186 -0
  70. hud_python-0.4.54/hud/utils/tests/test_tasks.py +187 -0
  71. hud_python-0.4.54/hud/utils/tests/test_tool_shorthand.py +154 -0
  72. {hud_python-0.4.52 → hud_python-0.4.54}/hud/utils/tests/test_version.py +1 -1
  73. {hud_python-0.4.52 → hud_python-0.4.54}/hud/version.py +1 -1
  74. {hud_python-0.4.52 → hud_python-0.4.54}/pyproject.toml +17 -3
  75. hud_python-0.4.52/hud/otel/tests/__init__.py +0 -1
  76. hud_python-0.4.52/hud/telemetry/tests/test_trace.py +0 -63
  77. {hud_python-0.4.52 → hud_python-0.4.54}/.gitignore +0 -0
  78. {hud_python-0.4.52 → hud_python-0.4.54}/LICENSE +0 -0
  79. {hud_python-0.4.52 → hud_python-0.4.54}/environments/blank/environment/README.md +0 -0
  80. {hud_python-0.4.52 → hud_python-0.4.54}/environments/blank/environment/pyproject.toml +0 -0
  81. {hud_python-0.4.52 → hud_python-0.4.54}/environments/blank/server/README.md +0 -0
  82. {hud_python-0.4.52 → hud_python-0.4.54}/environments/browser/README.md +0 -0
  83. {hud_python-0.4.52 → hud_python-0.4.54}/environments/browser/environment/2048/README.md +0 -0
  84. {hud_python-0.4.52 → hud_python-0.4.54}/environments/browser/environment/2048/backend/pyproject.toml +0 -0
  85. {hud_python-0.4.52 → hud_python-0.4.54}/environments/browser/environment/README.md +0 -0
  86. {hud_python-0.4.52 → hud_python-0.4.54}/environments/browser/environment/pyproject.toml +0 -0
  87. {hud_python-0.4.52 → hud_python-0.4.54}/environments/browser/environment/todo/backend/pyproject.toml +0 -0
  88. {hud_python-0.4.52 → hud_python-0.4.54}/environments/browser/pyproject.toml +0 -0
  89. {hud_python-0.4.52 → hud_python-0.4.54}/environments/deepresearch/README.md +0 -0
  90. {hud_python-0.4.52 → hud_python-0.4.54}/environments/deepresearch/environment/pyproject.toml +0 -0
  91. {hud_python-0.4.52 → hud_python-0.4.54}/environments/deepresearch/pyproject.toml +0 -0
  92. {hud_python-0.4.52 → hud_python-0.4.54}/environments/remote_browser/README.md +0 -0
  93. {hud_python-0.4.52 → hud_python-0.4.54}/environments/remote_browser/pyproject.toml +0 -0
  94. {hud_python-0.4.52 → hud_python-0.4.54}/environments/remote_browser/src/hud_controller/providers/README.md +0 -0
  95. {hud_python-0.4.52 → hud_python-0.4.54}/environments/text_2048/README.md +0 -0
  96. {hud_python-0.4.52 → hud_python-0.4.54}/environments/text_2048/pyproject.toml +0 -0
  97. {hud_python-0.4.52 → hud_python-0.4.54}/examples/README.md +0 -0
  98. {hud_python-0.4.52 → hud_python-0.4.54}/hud/__init__.py +0 -0
  99. {hud_python-0.4.52 → hud_python-0.4.54}/hud/__main__.py +0 -0
  100. {hud_python-0.4.52 → hud_python-0.4.54}/hud/agents/__init__.py +0 -0
  101. {hud_python-0.4.52 → hud_python-0.4.54}/hud/agents/claude.py +0 -0
  102. {hud_python-0.4.52 → hud_python-0.4.54}/hud/agents/grounded_openai.py +0 -0
  103. {hud_python-0.4.52 → hud_python-0.4.54}/hud/agents/langchain.py +0 -0
  104. {hud_python-0.4.52 → hud_python-0.4.54}/hud/agents/lite_llm.py +0 -0
  105. {hud_python-0.4.52 → hud_python-0.4.54}/hud/agents/misc/__init__.py +0 -0
  106. {hud_python-0.4.52 → hud_python-0.4.54}/hud/agents/misc/integration_test_agent.py +0 -0
  107. {hud_python-0.4.52 → hud_python-0.4.54}/hud/agents/misc/response_agent.py +0 -0
  108. {hud_python-0.4.52 → hud_python-0.4.54}/hud/agents/openai.py +0 -0
  109. {hud_python-0.4.52 → hud_python-0.4.54}/hud/agents/tests/__init__.py +0 -0
  110. {hud_python-0.4.52 → hud_python-0.4.54}/hud/agents/tests/test_claude.py +0 -0
  111. {hud_python-0.4.52 → hud_python-0.4.54}/hud/agents/tests/test_client.py +0 -0
  112. {hud_python-0.4.52 → hud_python-0.4.54}/hud/agents/tests/test_grounded_openai_agent.py +0 -0
  113. {hud_python-0.4.52 → hud_python-0.4.54}/hud/agents/tests/test_openai.py +0 -0
  114. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/__main__.py +0 -0
  115. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/analyze.py +0 -0
  116. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/clone.py +0 -0
  117. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/debug.py +0 -0
  118. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/flows/__init__.py +0 -0
  119. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/flows/tasks.py +0 -0
  120. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/get.py +0 -0
  121. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/init.py +0 -0
  122. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/list_func.py +0 -0
  123. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/pull.py +0 -0
  124. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/push.py +0 -0
  125. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/remove.py +0 -0
  126. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/rl/__init__.py +0 -0
  127. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/rl/celebrate.py +0 -0
  128. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/rl/config.py +0 -0
  129. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/rl/display.py +0 -0
  130. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/rl/gpu.py +0 -0
  131. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/rl/gpu_utils.py +0 -0
  132. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/rl/local_runner.py +0 -0
  133. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/rl/presets.py +0 -0
  134. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/rl/remote_runner.py +0 -0
  135. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/rl/rl_api.py +0 -0
  136. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/rl/viewer.py +0 -0
  137. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/rl/vllm.py +0 -0
  138. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/rl/wait_utils.py +0 -0
  139. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/tests/__init__.py +0 -0
  140. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/tests/test_analyze.py +0 -0
  141. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/tests/test_analyze_metadata.py +0 -0
  142. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/tests/test_cli_init.py +0 -0
  143. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/tests/test_cli_main.py +0 -0
  144. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/tests/test_clone.py +0 -0
  145. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/tests/test_cursor.py +0 -0
  146. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/tests/test_debug.py +0 -0
  147. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/tests/test_list_func.py +0 -0
  148. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/tests/test_main_module.py +0 -0
  149. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/tests/test_pull.py +0 -0
  150. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/tests/test_push.py +0 -0
  151. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/tests/test_registry.py +0 -0
  152. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/tests/test_utils.py +0 -0
  153. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/utils/__init__.py +0 -0
  154. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/utils/config.py +0 -0
  155. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/utils/cursor.py +0 -0
  156. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/utils/env_check.py +0 -0
  157. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/utils/environment.py +0 -0
  158. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/utils/interactive.py +0 -0
  159. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/utils/local_runner.py +0 -0
  160. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/utils/logging.py +0 -0
  161. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/utils/metadata.py +0 -0
  162. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/utils/package_runner.py +0 -0
  163. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/utils/registry.py +0 -0
  164. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/utils/remote_runner.py +0 -0
  165. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/utils/server.py +0 -0
  166. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/utils/source_hash.py +0 -0
  167. {hud_python-0.4.52 → hud_python-0.4.54}/hud/cli/utils/tasks.py +0 -0
  168. {hud_python-0.4.52/hud/shared → hud_python-0.4.54/hud/cli/utils}/tests/__init__.py +0 -0
  169. {hud_python-0.4.52 → hud_python-0.4.54}/hud/clients/README.md +0 -0
  170. {hud_python-0.4.52 → hud_python-0.4.54}/hud/clients/__init__.py +0 -0
  171. {hud_python-0.4.52 → hud_python-0.4.54}/hud/clients/base.py +0 -0
  172. {hud_python-0.4.52 → hud_python-0.4.54}/hud/clients/fastmcp.py +0 -0
  173. {hud_python-0.4.52 → hud_python-0.4.54}/hud/clients/mcp_use.py +0 -0
  174. {hud_python-0.4.52 → hud_python-0.4.54}/hud/clients/tests/__init__.py +0 -0
  175. {hud_python-0.4.52 → hud_python-0.4.54}/hud/clients/tests/test_client_integration.py +0 -0
  176. {hud_python-0.4.52 → hud_python-0.4.54}/hud/clients/tests/test_fastmcp.py +0 -0
  177. {hud_python-0.4.52 → hud_python-0.4.54}/hud/clients/tests/test_mcp_use_retry.py +0 -0
  178. {hud_python-0.4.52 → hud_python-0.4.54}/hud/clients/tests/test_protocol.py +0 -0
  179. {hud_python-0.4.52 → hud_python-0.4.54}/hud/clients/utils/__init__.py +0 -0
  180. {hud_python-0.4.52 → hud_python-0.4.54}/hud/clients/utils/mcp_use_retry.py +0 -0
  181. {hud_python-0.4.52 → hud_python-0.4.54}/hud/clients/utils/retry.py +0 -0
  182. {hud_python-0.4.52 → hud_python-0.4.54}/hud/clients/utils/retry_transport.py +0 -0
  183. {hud_python-0.4.52 → hud_python-0.4.54}/hud/datasets/__init__.py +0 -0
  184. {hud_python-0.4.52 → hud_python-0.4.54}/hud/datasets/parallel.py +0 -0
  185. {hud_python-0.4.52 → hud_python-0.4.54}/hud/datasets/runner.py +0 -0
  186. {hud_python-0.4.52/hud/telemetry → hud_python-0.4.54/hud/datasets}/tests/__init__.py +0 -0
  187. {hud_python-0.4.52 → hud_python-0.4.54}/hud/datasets/utils.py +0 -0
  188. {hud_python-0.4.52 → hud_python-0.4.54}/hud/misc/__init__.py +0 -0
  189. {hud_python-0.4.52 → hud_python-0.4.54}/hud/misc/claude_plays_pokemon.py +0 -0
  190. {hud_python-0.4.52 → hud_python-0.4.54}/hud/native/__init__.py +0 -0
  191. {hud_python-0.4.52 → hud_python-0.4.54}/hud/native/comparator.py +0 -0
  192. {hud_python-0.4.52 → hud_python-0.4.54}/hud/native/tests/__init__.py +0 -0
  193. {hud_python-0.4.52 → hud_python-0.4.54}/hud/native/tests/test_comparator.py +0 -0
  194. {hud_python-0.4.52 → hud_python-0.4.54}/hud/native/tests/test_native_init.py +0 -0
  195. {hud_python-0.4.52 → hud_python-0.4.54}/hud/otel/__init__.py +0 -0
  196. {hud_python-0.4.52 → hud_python-0.4.54}/hud/otel/collector.py +0 -0
  197. {hud_python-0.4.52 → hud_python-0.4.54}/hud/otel/config.py +0 -0
  198. {hud_python-0.4.52 → hud_python-0.4.54}/hud/otel/context.py +0 -0
  199. {hud_python-0.4.52 → hud_python-0.4.54}/hud/otel/exporters.py +0 -0
  200. {hud_python-0.4.52 → hud_python-0.4.54}/hud/otel/instrumentation.py +0 -0
  201. {hud_python-0.4.52 → hud_python-0.4.54}/hud/otel/processors.py +0 -0
  202. {hud_python-0.4.52/hud/utils → hud_python-0.4.54/hud/otel}/tests/__init__.py +0 -0
  203. {hud_python-0.4.52 → hud_python-0.4.54}/hud/otel/tests/test_processors.py +0 -0
  204. {hud_python-0.4.52 → hud_python-0.4.54}/hud/py.typed +0 -0
  205. {hud_python-0.4.52 → hud_python-0.4.54}/hud/rl/README.md +0 -0
  206. {hud_python-0.4.52 → hud_python-0.4.54}/hud/rl/__init__.py +0 -0
  207. {hud_python-0.4.52 → hud_python-0.4.54}/hud/rl/actor.py +0 -0
  208. {hud_python-0.4.52 → hud_python-0.4.54}/hud/rl/buffer.py +0 -0
  209. {hud_python-0.4.52 → hud_python-0.4.54}/hud/rl/chat_template.jinja +0 -0
  210. {hud_python-0.4.52 → hud_python-0.4.54}/hud/rl/config.py +0 -0
  211. {hud_python-0.4.52 → hud_python-0.4.54}/hud/rl/distributed.py +0 -0
  212. {hud_python-0.4.52 → hud_python-0.4.54}/hud/rl/learner.py +0 -0
  213. {hud_python-0.4.52 → hud_python-0.4.54}/hud/rl/tests/__init__.py +0 -0
  214. {hud_python-0.4.52 → hud_python-0.4.54}/hud/rl/tests/test_learner.py +0 -0
  215. {hud_python-0.4.52 → hud_python-0.4.54}/hud/rl/train.py +0 -0
  216. {hud_python-0.4.52 → hud_python-0.4.54}/hud/rl/types.py +0 -0
  217. {hud_python-0.4.52 → hud_python-0.4.54}/hud/rl/utils/start_vllm_server.sh +0 -0
  218. {hud_python-0.4.52 → hud_python-0.4.54}/hud/rl/utils.py +0 -0
  219. {hud_python-0.4.52 → hud_python-0.4.54}/hud/rl/vllm_adapter.py +0 -0
  220. {hud_python-0.4.52 → hud_python-0.4.54}/hud/samples/__init__.py +0 -0
  221. {hud_python-0.4.52 → hud_python-0.4.54}/hud/samples/browser.py +0 -0
  222. {hud_python-0.4.52 → hud_python-0.4.54}/hud/server/__init__.py +0 -0
  223. {hud_python-0.4.52 → hud_python-0.4.54}/hud/server/context.py +0 -0
  224. {hud_python-0.4.52 → hud_python-0.4.54}/hud/server/helper/__init__.py +0 -0
  225. {hud_python-0.4.52 → hud_python-0.4.54}/hud/server/low_level.py +0 -0
  226. {hud_python-0.4.52 → hud_python-0.4.54}/hud/server/router.py +0 -0
  227. {hud_python-0.4.52 → hud_python-0.4.54}/hud/server/server.py +0 -0
  228. {hud_python-0.4.52 → hud_python-0.4.54}/hud/server/tests/__init__.py +0 -0
  229. {hud_python-0.4.52 → hud_python-0.4.54}/hud/server/tests/test_add_tool.py +0 -0
  230. {hud_python-0.4.52 → hud_python-0.4.54}/hud/server/tests/test_context.py +0 -0
  231. {hud_python-0.4.52 → hud_python-0.4.54}/hud/server/tests/test_mcp_server_handlers.py +0 -0
  232. {hud_python-0.4.52 → hud_python-0.4.54}/hud/server/tests/test_mcp_server_integration.py +0 -0
  233. {hud_python-0.4.52 → hud_python-0.4.54}/hud/server/tests/test_mcp_server_more.py +0 -0
  234. {hud_python-0.4.52 → hud_python-0.4.54}/hud/server/tests/test_run_wrapper.py +0 -0
  235. {hud_python-0.4.52 → hud_python-0.4.54}/hud/server/tests/test_sigterm_runner.py +0 -0
  236. {hud_python-0.4.52 → hud_python-0.4.54}/hud/settings.py +0 -0
  237. {hud_python-0.4.52 → hud_python-0.4.54}/hud/shared/__init__.py +0 -0
  238. {hud_python-0.4.52 → hud_python-0.4.54}/hud/shared/tests/test_requests.py +0 -0
  239. {hud_python-0.4.52 → hud_python-0.4.54}/hud/telemetry/__init__.py +0 -0
  240. {hud_python-0.4.52 → hud_python-0.4.54}/hud/telemetry/async_context.py +0 -0
  241. {hud_python-0.4.52 → hud_python-0.4.54}/hud/telemetry/instrument.py +0 -0
  242. {hud_python-0.4.52 → hud_python-0.4.54}/hud/telemetry/job.py +0 -0
  243. {hud_python-0.4.52 → hud_python-0.4.54}/hud/telemetry/replay.py +0 -0
  244. {hud_python-0.4.52 → hud_python-0.4.54}/hud/telemetry/tests/test_replay.py +0 -0
  245. {hud_python-0.4.52 → hud_python-0.4.54}/hud/telemetry/trace.py +0 -0
  246. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/__init__.py +0 -0
  247. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/base.py +0 -0
  248. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/bash.py +0 -0
  249. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/computer/__init__.py +0 -0
  250. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/computer/anthropic.py +0 -0
  251. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/computer/hud.py +0 -0
  252. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/computer/openai.py +0 -0
  253. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/computer/qwen.py +0 -0
  254. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/edit.py +0 -0
  255. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/executors/__init__.py +0 -0
  256. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/executors/base.py +0 -0
  257. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/executors/pyautogui.py +0 -0
  258. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/executors/tests/__init__.py +0 -0
  259. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/executors/tests/test_base_executor.py +0 -0
  260. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
  261. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/executors/xdo.py +0 -0
  262. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/grounding/__init__.py +0 -0
  263. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/grounding/config.py +0 -0
  264. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/grounding/grounded_tool.py +0 -0
  265. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/grounding/grounder.py +0 -0
  266. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/grounding/tests/__init__.py +0 -0
  267. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/grounding/tests/test_grounded_tool.py +0 -0
  268. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/playwright.py +0 -0
  269. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/response.py +0 -0
  270. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/submit.py +0 -0
  271. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/tests/__init__.py +0 -0
  272. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/tests/test_base.py +0 -0
  273. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/tests/test_bash.py +0 -0
  274. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/tests/test_bash_extended.py +0 -0
  275. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/tests/test_computer.py +0 -0
  276. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/tests/test_computer_actions.py +0 -0
  277. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/tests/test_edit.py +0 -0
  278. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/tests/test_init.py +0 -0
  279. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/tests/test_playwright_tool.py +0 -0
  280. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/tests/test_response.py +0 -0
  281. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/tests/test_tools.py +0 -0
  282. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/tests/test_tools_init.py +0 -0
  283. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/tests/test_utils.py +0 -0
  284. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/types.py +0 -0
  285. {hud_python-0.4.52 → hud_python-0.4.54}/hud/tools/utils.py +0 -0
  286. {hud_python-0.4.52 → hud_python-0.4.54}/hud/utils/__init__.py +0 -0
  287. {hud_python-0.4.52 → hud_python-0.4.54}/hud/utils/async_utils.py +0 -0
  288. {hud_python-0.4.52 → hud_python-0.4.54}/hud/utils/group_eval.py +0 -0
  289. {hud_python-0.4.52 → hud_python-0.4.54}/hud/utils/hud_console.py +0 -0
  290. {hud_python-0.4.52 → hud_python-0.4.54}/hud/utils/pretty_errors.py +0 -0
  291. {hud_python-0.4.52 → hud_python-0.4.54}/hud/utils/progress.py +0 -0
  292. {hud_python-0.4.52 → hud_python-0.4.54}/hud/utils/task_tracking.py +0 -0
  293. {hud_python-0.4.52 → hud_python-0.4.54}/hud/utils/tasks.py +0 -0
  294. {hud_python-0.4.52 → hud_python-0.4.54}/hud/utils/telemetry.py +0 -0
  295. {hud_python-0.4.52 → hud_python-0.4.54}/hud/utils/tests/test_async_utils.py +0 -0
  296. {hud_python-0.4.52 → hud_python-0.4.54}/hud/utils/tests/test_init.py +0 -0
  297. {hud_python-0.4.52 → hud_python-0.4.54}/hud/utils/tests/test_progress.py +0 -0
  298. {hud_python-0.4.52 → hud_python-0.4.54}/hud/utils/tests/test_telemetry.py +0 -0
  299. {hud_python-0.4.52 → hud_python-0.4.54}/hud/utils/tool_shorthand.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.4.52
3
+ Version: 0.4.54
4
4
  Summary: SDK for the HUD platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-python
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -42,6 +42,7 @@ Requires-Dist: httpx<1,>=0.23.0
42
42
  Requires-Dist: hud-fastmcp-python-sdk>=0.1.2
43
43
  Requires-Dist: hud-mcp-python-sdk>=3.13.2
44
44
  Requires-Dist: hud-mcp-use-python-sdk==2.3.20
45
+ Requires-Dist: langchain==0.3.27
45
46
  Requires-Dist: numpy>=1.24.0
46
47
  Requires-Dist: openai
47
48
  Requires-Dist: opentelemetry-api>=1.34.1
@@ -160,12 +161,12 @@ OSS RL environment + evals toolkit. Wrap software as environments, run benchmark
160
161
 
161
162
  ## Highlights
162
163
 
163
- - 🎓 **[One-click RL](https://hud.so/models)** – Run `hud rl` to get a trained model on any environment.
164
164
  - 🚀 **[MCP environment skeleton](https://docs.hud.so/core-concepts/mcp-protocol)** – any agent can call any environment.
165
165
  - ⚡️ **[Live telemetry](https://hud.so)** – inspect every tool call, observation, and reward in real time.
166
166
  - 🗂️ **[Public benchmarks](https://hud.so/leaderboards)** – OSWorld-Verified, SheetBench-50, and more.
167
167
  - 🌐 **[Cloud browsers](environments/remote_browser/)** – AnchorBrowser, Steel, BrowserBase integrations for browser automation.
168
168
  - 🛠️ **[Hot-reload dev loop](environments/README.md#phase-5-hot-reload-development-with-cursor-agent)** – `hud dev` for iterating on environments without rebuilds.
169
+ - 🎓 **[One-click RL](https://hud.so/models)** – Run `hud rl` to get a trained model on any environment.
169
170
 
170
171
  > We welcome contributors and feature requests – open an issue or hop on a call to discuss improvements!
171
172
 
@@ -186,29 +187,6 @@ uv tool install hud-python
186
187
  Before starting, get your HUD_API_KEY at [hud.so](https://hud.so).
187
188
 
188
189
 
189
- ## Quickstart: Training
190
-
191
- RL using GRPO a Qwen2.5-VL model on any hud dataset:
192
-
193
- ```bash
194
- hud get hud-evals/basic-2048 # from HF
195
- hud rl basic-2048.json
196
- ```
197
-
198
- > See [agent training docs](https://docs.hud.so/train-agents/quickstart)
199
-
200
- Or make your own environment and dataset:
201
-
202
- ```bash
203
- hud init my-env && cd my-env
204
- hud dev --interactive
205
- # When ready to run:
206
- hud rl
207
- ```
208
-
209
- > See [environment design docs](https://docs.hud.so/build-environments)
210
-
211
-
212
190
  ## Quickstart: Evals
213
191
 
214
192
  For a tutorial that explains the agent and evaluation design, run:
@@ -265,38 +243,27 @@ The above example lets the agent play 2048 ([See replay](https://hud.so/trace/6
265
243
 
266
244
  ![Agent playing 2048](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/2048_1.gif)
267
245
 
268
- ## Reinforcement Learning with GRPO
269
-
270
- This is a Qwen‑2.5‑VL‑3B agent training a policy on the 2048-basic browser environment:
271
-
272
- ![RL curve](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/rl_2.png)
246
+ ## Quickstart: Training
273
247
 
274
- Train with the new interactive `hud rl` flow:
248
+ Train a Qwen2.5-VL model with RL (GRPO) on any hud dataset:
275
249
 
276
250
  ```bash
277
- # Install CLI
278
- uv tool install hud-python
279
-
280
- # Option A: Run directly from a HuggingFace dataset
281
- hud rl hud-evals/basic-2048
282
-
283
- # Option B: Download first, modify, then train
284
- hud get hud-evals/basic-2048
285
- hud rl basic-2048.json
286
-
287
- # Optional: baseline evaluation
288
- hud eval basic-2048.json
251
+ hud get hud-evals/2048-basic # from HF
252
+ hud rl 2048-basic.json
289
253
  ```
290
254
 
291
- Supports multi‑turn RL for both:
292
- - Language‑only models (e.g., `Qwen/Qwen2.5-7B-Instruct`)
293
- - Vision‑Language models (e.g., `Qwen/Qwen2.5-VL-3B-Instruct`)
255
+ > See [agent training docs](https://docs.hud.so/train-agents/quickstart)
294
256
 
295
- By default, `hud rl` provisions a persistent server and trainer in the cloud, streams telemetry to `hud.so`, and lets you monitor/manage models at `hud.so/models`. Use `--local` to run entirely on your machines (typically 2+ GPUs: one for vLLM, the rest for training).
257
+ Or make your own environment and dataset:
296
258
 
297
- Any HUD MCP environment and evaluation works with our RL pipeline (including remote configurations). See the guided docs: `https://docs.hud.so/train-agents/quickstart`.
259
+ ```bash
260
+ hud init my-env && cd my-env
261
+ hud dev --interactive
262
+ # When ready to run:
263
+ hud rl
264
+ ```
298
265
 
299
- Pricing: Hosted vLLM and training GPU rates are listed in the [Training Quickstart → Pricing](https://docs.hud.so/train-agents/quickstart#pricing). Manage billing at the [HUD billing dashboard](https://hud.so/project/billing).
266
+ > See [environment design docs](https://docs.hud.so/build-environments)
300
267
 
301
268
  ## Benchmarking Agents
302
269
 
@@ -460,6 +427,39 @@ We highly suggest running 3-5 evaluations per dataset for the most consistent re
460
427
 
461
428
  Using the [`run_dataset`](https://docs.hud.so/reference/tasks#run_dataset) function with a HuggingFace dataset automatically assigns your job to that leaderboard page, and allows you to create a scorecard out of it:
462
429
 
430
+ ## Reinforcement Learning with GRPO
431
+
432
+ This is a Qwen‑2.5‑VL‑3B agent training a policy on the 2048-basic browser environment:
433
+
434
+ ![RL curve](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/rl_2.png)
435
+
436
+ Train with the new interactive `hud rl` flow:
437
+
438
+ ```bash
439
+ # Install CLI
440
+ uv tool install hud-python
441
+
442
+ # Option A: Run directly from a HuggingFace dataset
443
+ hud rl hud-evals/2048-basic
444
+
445
+ # Option B: Download first, modify, then train
446
+ hud get hud-evals/2048-basic
447
+ hud rl 2048-basic.json
448
+
449
+ # Optional: baseline evaluation
450
+ hud eval 2048-basic.json
451
+ ```
452
+
453
+ Supports multi‑turn RL for both:
454
+ - Language‑only models (e.g., `Qwen/Qwen2.5-7B-Instruct`)
455
+ - Vision‑Language models (e.g., `Qwen/Qwen2.5-VL-3B-Instruct`)
456
+
457
+ By default, `hud rl` provisions a persistent server and trainer in the cloud, streams telemetry to `hud.so`, and lets you monitor/manage models at `hud.so/models`. Use `--local` to run entirely on your machines (typically 2+ GPUs: one for vLLM, the rest for training).
458
+
459
+ Any HUD MCP environment and evaluation works with our RL pipeline (including remote configurations). See the guided docs: `https://docs.hud.so/train-agents/quickstart`.
460
+
461
+ Pricing: Hosted vLLM and training GPU rates are listed in the [Training Quickstart → Pricing](https://docs.hud.so/train-agents/quickstart#pricing). Manage billing at the [HUD billing dashboard](https://hud.so/project/billing).
462
+
463
463
  ## Architecture
464
464
 
465
465
  ```mermaid
@@ -22,12 +22,12 @@ OSS RL environment + evals toolkit. Wrap software as environments, run benchmark
22
22
 
23
23
  ## Highlights
24
24
 
25
- - 🎓 **[One-click RL](https://hud.so/models)** – Run `hud rl` to get a trained model on any environment.
26
25
  - 🚀 **[MCP environment skeleton](https://docs.hud.so/core-concepts/mcp-protocol)** – any agent can call any environment.
27
26
  - ⚡️ **[Live telemetry](https://hud.so)** – inspect every tool call, observation, and reward in real time.
28
27
  - 🗂️ **[Public benchmarks](https://hud.so/leaderboards)** – OSWorld-Verified, SheetBench-50, and more.
29
28
  - 🌐 **[Cloud browsers](environments/remote_browser/)** – AnchorBrowser, Steel, BrowserBase integrations for browser automation.
30
29
  - 🛠️ **[Hot-reload dev loop](environments/README.md#phase-5-hot-reload-development-with-cursor-agent)** – `hud dev` for iterating on environments without rebuilds.
30
+ - 🎓 **[One-click RL](https://hud.so/models)** – Run `hud rl` to get a trained model on any environment.
31
31
 
32
32
  > We welcome contributors and feature requests – open an issue or hop on a call to discuss improvements!
33
33
 
@@ -48,29 +48,6 @@ uv tool install hud-python
48
48
  Before starting, get your HUD_API_KEY at [hud.so](https://hud.so).
49
49
 
50
50
 
51
- ## Quickstart: Training
52
-
53
- Train a Qwen2.5-VL model with RL (GRPO) on any hud dataset:
54
-
55
- ```bash
56
- hud get hud-evals/basic-2048 # from HF
57
- hud rl basic-2048.json
58
- ```
59
-
60
- > See [agent training docs](https://docs.hud.so/train-agents/quickstart)
61
-
62
- Or make your own environment and dataset:
63
-
64
- ```bash
65
- hud init my-env && cd my-env
66
- hud dev --interactive
67
- # When ready to run:
68
- hud rl
69
- ```
70
-
71
- > See [environment design docs](https://docs.hud.so/build-environments)
72
-
73
-
74
51
  ## Quickstart: Evals
75
52
 
76
53
  For a tutorial that explains the agent and evaluation design, run:
@@ -127,38 +104,27 @@ The above example lets the agent play 2048 ([See replay](https://hud.so/trace/6
127
104
 
128
105
  ![Agent playing 2048](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/2048_1.gif)
129
106
 
130
- ## Reinforcement Learning with GRPO
131
-
132
- This is a Qwen‑2.5‑VL‑3B agent training a policy on the 2048-basic browser environment:
133
-
134
- ![RL curve](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/rl_2.png)
107
+ ## Quickstart: Training
135
108
 
136
- Train with the new interactive `hud rl` flow:
109
+ Train a Qwen2.5-VL model with RL (GRPO) on any hud dataset:
137
110
 
138
111
  ```bash
139
- # Install CLI
140
- uv tool install hud-python
141
-
142
- # Option A: Run directly from a HuggingFace dataset
143
- hud rl hud-evals/basic-2048
144
-
145
- # Option B: Download first, modify, then train
146
- hud get hud-evals/basic-2048
147
- hud rl basic-2048.json
148
-
149
- # Optional: baseline evaluation
150
- hud eval basic-2048.json
112
+ hud get hud-evals/2048-basic # from HF
113
+ hud rl 2048-basic.json
151
114
  ```
152
115
 
153
- Supports multi‑turn RL for both:
154
- - Language‑only models (e.g., `Qwen/Qwen2.5-7B-Instruct`)
155
- - Vision‑Language models (e.g., `Qwen/Qwen2.5-VL-3B-Instruct`)
116
+ > See [agent training docs](https://docs.hud.so/train-agents/quickstart)
156
117
 
157
- By default, `hud rl` provisions a persistent server and trainer in the cloud, streams telemetry to `hud.so`, and lets you monitor/manage models at `hud.so/models`. Use `--local` to run entirely on your machines (typically 2+ GPUs: one for vLLM, the rest for training).
118
+ Or make your own environment and dataset:
158
119
 
159
- Any HUD MCP environment and evaluation works with our RL pipeline (including remote configurations). See the guided docs: `https://docs.hud.so/train-agents/quickstart`.
120
+ ```bash
121
+ hud init my-env && cd my-env
122
+ hud dev --interactive
123
+ # When ready to run:
124
+ hud rl
125
+ ```
160
126
 
161
- Pricing: Hosted vLLM and training GPU rates are listed in the [Training Quickstart → Pricing](https://docs.hud.so/train-agents/quickstart#pricing). Manage billing at the [HUD billing dashboard](https://hud.so/project/billing).
127
+ > See [environment design docs](https://docs.hud.so/build-environments)
162
128
 
163
129
  ## Benchmarking Agents
164
130
 
@@ -322,6 +288,39 @@ We highly suggest running 3-5 evaluations per dataset for the most consistent re
322
288
 
323
289
  Using the [`run_dataset`](https://docs.hud.so/reference/tasks#run_dataset) function with a HuggingFace dataset automatically assigns your job to that leaderboard page, and allows you to create a scorecard out of it:
324
290
 
291
+ ## Reinforcement Learning with GRPO
292
+
293
+ This is a Qwen‑2.5‑VL‑3B agent training a policy on the 2048-basic browser environment:
294
+
295
+ ![RL curve](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/rl_2.png)
296
+
297
+ Train with the new interactive `hud rl` flow:
298
+
299
+ ```bash
300
+ # Install CLI
301
+ uv tool install hud-python
302
+
303
+ # Option A: Run directly from a HuggingFace dataset
304
+ hud rl hud-evals/2048-basic
305
+
306
+ # Option B: Download first, modify, then train
307
+ hud get hud-evals/2048-basic
308
+ hud rl 2048-basic.json
309
+
310
+ # Optional: baseline evaluation
311
+ hud eval 2048-basic.json
312
+ ```
313
+
314
+ Supports multi‑turn RL for both:
315
+ - Language‑only models (e.g., `Qwen/Qwen2.5-7B-Instruct`)
316
+ - Vision‑Language models (e.g., `Qwen/Qwen2.5-VL-3B-Instruct`)
317
+
318
+ By default, `hud rl` provisions a persistent server and trainer in the cloud, streams telemetry to `hud.so`, and lets you monitor/manage models at `hud.so/models`. Use `--local` to run entirely on your machines (typically 2+ GPUs: one for vLLM, the rest for training).
319
+
320
+ Any HUD MCP environment and evaluation works with our RL pipeline (including remote configurations). See the guided docs: `https://docs.hud.so/train-agents/quickstart`.
321
+
322
+ Pricing: Hosted vLLM and training GPU rates are listed in the [Training Quickstart → Pricing](https://docs.hud.so/train-agents/quickstart#pricing). Manage billing at the [HUD billing dashboard](https://hud.so/project/billing).
323
+
325
324
  ## Architecture
326
325
 
327
326
  ```mermaid
@@ -804,9 +804,9 @@ class TodoCompleted:
804
804
  @problem("todo_basic", description="Complete two todo items", difficulty="easy")
805
805
  class TodoBasic:
806
806
  def get_setup(self):
807
- return {"function": "todo_seed", "args": {"num_items": 5}}
807
+ return {"name": "todo_seed", "arguments": {"num_items": 5}}
808
808
  def get_evaluation(self):
809
- return {"function": "todo_completed", "args": {"expected_count": 2}}
809
+ return {"name": "todo_completed", "arguments": {"expected_count": 2}}
810
810
  ```
811
811
 
812
812
  Decorators keep registration *next to the implementation* and avoid manual bookkeeping. The server simply exposes the combined metadata through an MCP **resource**. Follow `environments/browser/src/hud_controller/problems/registry.py` as a template and expose the JSON with `@mcp.resource("problems://registry")`.
@@ -6,10 +6,12 @@ See [docs](https://docs.hud.so/build-environments) for the complete environment
6
6
  ## Architecture
7
7
 
8
8
  **`environment/`** - Produces structured data
9
+
9
10
  - Owns all state (game logic, browser sessions, databases, etc.)
10
11
  - Exposes HTTP endpoints `/health`, `/act`, `/reset`, `/state` that return structured information about the environment state
11
12
 
12
13
  **`server/`** - Wraps data in MCP tools
14
+
13
15
  - Calls environment endpoints to get structured data for the agent, and environment setup/evaluation
14
16
  - Agents and tasks interact only with these tools!
15
17
 
@@ -33,12 +35,14 @@ Visit http://localhost:8765/docs to see the new tool appear instantly.
33
35
  In general, we recommend starting work on the environment backend first, then developing the MCP server to expose the right things to the agent.
34
36
 
35
37
  For complex environments that require many dependencies, we recommend running `hud dev` in the environment root:
38
+
36
39
  ```bash
37
40
  cd ..
38
41
  hud dev
39
42
  ```
40
43
 
41
44
  ## Tasks & Evaluation
45
+
42
46
  ```bash
43
47
  # Build first in the global folder with the Dockerfile (creates blank:0.1.0)
44
48
  hud build
@@ -59,6 +63,7 @@ Your `tasks.json` uses `docker run` to launch the environment:
59
63
  ```
60
64
 
61
65
  **Commands:**
66
+
62
67
  ```bash
63
68
  # Build first
64
69
  hud build
@@ -78,6 +83,7 @@ hud rl tasks.json # Auto-converts docker→remote, builds & pushes if needed
78
83
  Once your environment is ready, you can share it with the community:
79
84
 
80
85
  ### 1. Push to Registry
86
+
81
87
  ```bash
82
88
  # Build and push your environment (requires docker hub login and hud api key)
83
89
  hud build
@@ -89,10 +95,12 @@ hud push
89
95
  Create a dataset on HuggingFace with your tasks:
90
96
 
91
97
  **Option A: Upload manually**
98
+
92
99
  1. Upload your `tasks.json` to HuggingFace
93
100
  2. Make sure it's **public** to appear on leaderboards
94
101
 
95
102
  **Option B: Use the SDK**
103
+
96
104
  ```python
97
105
  from hud.datasets import save_tasks
98
106
  import json
@@ -109,7 +117,7 @@ save_tasks(tasks, repo_id="your-org/your-dataset")
109
117
 
110
118
  ```bash
111
119
  # Run Claude on your benchmark
112
- hud eval "your-org/your-dataset" --agent claude
120
+ hud eval "your-org/your-dataset" claude
113
121
 
114
122
  # View results at:
115
123
  # hud.so/leaderboards/your-org/your-dataset
@@ -118,4 +126,3 @@ hud eval "your-org/your-dataset" --agent claude
118
126
  **Note**: Only public HuggingFace datasets appear as leaderboards!
119
127
 
120
128
  📚 Learn more: [Creating Benchmarks](https://docs.hud.so/evaluate-agents/create-benchmarks) | [Leaderboards](https://docs.hud.so/evaluate-agents/leaderboards)
121
-
@@ -4,7 +4,7 @@ version = "0.1.0"
4
4
  description = "MCP server for blank environment"
5
5
  requires-python = ">=3.11"
6
6
  dependencies = [
7
- "hud-python>=0.4.52",
7
+ "hud-python>=0.4.54",
8
8
  "httpx>=0.28.1",
9
9
  ]
10
10
 
@@ -47,8 +47,8 @@ await setup({"name": "todo_basic_usage"})
47
47
  await evaluate({"name": "todo_basic_usage"})
48
48
 
49
49
  # Direct function calls
50
- await setup({"function": "todo_reset", "args": {}})
51
- await evaluate({"function": "todo_completion_rate", "args": {"min_rate": 0.5}})
50
+ await setup({"name": "todo_reset", "arguments": {}})
51
+ await evaluate({"name": "todo_completion_rate", "arguments": {"min_rate": 0.5}})
52
52
 
53
53
  # MCP resource discovery
54
54
  todo_evaluators = await client.read_resource("evaluators://todo")
@@ -4,7 +4,7 @@ version = "0.1.0"
4
4
  description = "HUD Browser MCP Server"
5
5
  requires-python = ">=3.11,<3.14"
6
6
  dependencies = [
7
- "hud-python@git+https://github.com/hud-evals/hud-python@cli-dev",
7
+ "hud-python>=0.4.54",
8
8
  "httpx",
9
9
  "playwright",
10
10
  "pyautogui",
@@ -4,7 +4,7 @@ version = "0.1.0"
4
4
  description = "MCP server for DeepResearch environment"
5
5
  requires-python = ">=3.11"
6
6
  dependencies = [
7
- "hud-python>=0.4.52",
7
+ "hud-python>=0.4.54",
8
8
  "httpx>=0.24.0",
9
9
  ]
10
10
 
@@ -137,7 +137,11 @@ class MCPAgent(ABC):
137
137
  "No MCPClient. Please provide one when initializing the agent or pass a Task with mcp_config." # noqa: E501
138
138
  )
139
139
 
140
- await self._setup_config(self.mcp_client.mcp_config)
140
+ try:
141
+ client_cfg = getattr(self.mcp_client, "mcp_config", None)
142
+ except Exception:
143
+ client_cfg = None
144
+ await self._setup_config(client_cfg)
141
145
 
142
146
  # Initialize client if needed
143
147
  try:
@@ -618,8 +622,11 @@ class MCPAgent(ABC):
618
622
  except Exception as e:
619
623
  self.console.error_log(f"Response lifecycle tool failed: {e}")
620
624
 
621
- async def _setup_config(self, mcp_config: dict[str, dict[str, Any]]) -> None:
625
+ async def _setup_config(self, mcp_config: dict[str, dict[str, Any]] | None) -> None:
622
626
  """Inject metadata into the metadata of the initialize request."""
627
+ if not isinstance(mcp_config, dict):
628
+ return
629
+
623
630
  if self.metadata:
624
631
  patch_mcp_config(
625
632
  mcp_config,
@@ -20,6 +20,7 @@ import logging
20
20
  from typing import TYPE_CHECKING, Any, ClassVar, cast
21
21
 
22
22
  import mcp.types as types
23
+ from openai import AsyncOpenAI
23
24
 
24
25
  from hud import instrument
25
26
  from hud.types import AgentResponse, MCPToolCall, MCPToolResult
@@ -28,7 +29,6 @@ from hud.utils.hud_console import HUDConsole
28
29
  from .base import MCPAgent
29
30
 
30
31
  if TYPE_CHECKING:
31
- from openai import AsyncOpenAI
32
32
  from openai.types.chat import ChatCompletionToolParam
33
33
 
34
34
  logger = logging.getLogger(__name__)
@@ -42,14 +42,26 @@ class GenericOpenAIChatAgent(MCPAgent):
42
42
  def __init__(
43
43
  self,
44
44
  *,
45
- openai_client: AsyncOpenAI | None,
45
+ openai_client: AsyncOpenAI | None = None,
46
+ api_key: str | None = None,
47
+ base_url: str | None = None,
46
48
  model_name: str = "gpt-4o-mini",
47
49
  completion_kwargs: dict[str, Any] | None = None,
48
50
  **agent_kwargs: Any,
49
51
  ) -> None:
50
52
  # Accept base-agent settings via **agent_kwargs (e.g., mcp_client, system_prompt, etc.)
51
53
  super().__init__(**agent_kwargs)
52
- self.oai = openai_client
54
+
55
+ # Handle client creation - support both patterns
56
+ if openai_client is not None:
57
+ # Use provided client (backward compatibility)
58
+ self.oai = openai_client
59
+ elif api_key is not None or base_url is not None:
60
+ # Create client from config (new pattern, consistent with other agents)
61
+ self.oai = AsyncOpenAI(api_key=api_key, base_url=base_url)
62
+ else:
63
+ raise ValueError("Either openai_client or (api_key and base_url) must be provided")
64
+
53
65
  self.model_name = model_name
54
66
  self.completion_kwargs: dict[str, Any] = completion_kwargs or {}
55
67
  self.mcp_schemas = []
@@ -329,6 +329,21 @@ class TestBaseMCPAgent:
329
329
  # call_tools doesn't validate empty names, it will return error
330
330
  await agent.call_tools(tool_call)
331
331
 
332
+ def test_get_tool_schemas(self):
333
+ """Test getting tool schemas."""
334
+ agent = MockMCPAgent()
335
+
336
+ agent._available_tools = [
337
+ types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"}),
338
+ types.Tool(name="setup", description="Setup", inputSchema={"type": "object"}),
339
+ ]
340
+
341
+ schemas = agent.get_tool_schemas()
342
+
343
+ # Should include non-lifecycle tools
344
+ assert len(schemas) == 2
345
+ assert schemas[0]["name"] == "tool1"
346
+
332
347
  def test_get_tools_by_server(self):
333
348
  """Test getting tools grouped by server."""
334
349
  agent = MockMCPAgent()
@@ -0,0 +1,164 @@
1
+ from __future__ import annotations
2
+
3
+ from unittest import mock
4
+
5
+ import mcp.types as types
6
+ import pytest
7
+
8
+ from hud.agents.base import MCPAgent, find_content, find_reward, text_to_blocks
9
+ from hud.types import AgentResponse, MCPToolCall, MCPToolResult
10
+
11
+
12
+ class DummyAgent(MCPAgent):
13
+ async def get_system_messages(self):
14
+ return [types.TextContent(text="sys", type="text")]
15
+
16
+ async def get_response(self, messages):
17
+ # Single step: no tool calls -> done
18
+ return AgentResponse(content="ok", tool_calls=[], done=True)
19
+
20
+ async def format_blocks(self, blocks):
21
+ # Return as-is
22
+ return blocks
23
+
24
+ async def format_tool_results(self, tool_calls, tool_results):
25
+ return [types.TextContent(text="tools", type="text")]
26
+
27
+
28
+ @pytest.mark.asyncio
29
+ async def test_run_with_string_prompt_auto_client(monkeypatch):
30
+ # Fake MCPClient with required methods
31
+ fake_client = mock.AsyncMock()
32
+ fake_client.initialize.return_value = None
33
+ fake_client.list_tools.return_value = []
34
+ fake_client.shutdown.return_value = None
35
+
36
+ # Patch MCPClient construction inside initialize()
37
+ with mock.patch("hud.clients.MCPClient", return_value=fake_client):
38
+ agent = DummyAgent(mcp_client=fake_client, auto_trace=False)
39
+ result = await agent.run("hello", max_steps=1)
40
+ assert result.done is True and result.isError is False
41
+
42
+
43
+ def test_find_reward_and_content_extractors():
44
+ # Structured content
45
+ r = MCPToolResult(
46
+ content=text_to_blocks("{}"), isError=False, structuredContent={"reward": 0.7}
47
+ )
48
+ assert find_reward(r) == 0.7
49
+
50
+ # Text JSON
51
+ r2 = MCPToolResult(content=text_to_blocks('{"score": 0.5, "content": "hi"}'), isError=False)
52
+ assert find_reward(r2) == 0.5
53
+ assert find_content(r2) == "hi"
54
+
55
+
56
+ @pytest.mark.asyncio
57
+ async def test_call_tools_error_paths():
58
+ fake_client = mock.AsyncMock()
59
+ # First call succeeds
60
+ ok_result = MCPToolResult(content=text_to_blocks("ok"), isError=False)
61
+ fake_client.call_tool.side_effect = [ok_result, RuntimeError("boom")]
62
+ agent = DummyAgent(mcp_client=fake_client, auto_trace=False)
63
+ results = await agent.call_tools(
64
+ [MCPToolCall(name="a", arguments={}), MCPToolCall(name="b", arguments={})]
65
+ )
66
+ assert results[0].isError is False
67
+ assert results[1].isError is True
68
+
69
+
70
+ @pytest.mark.asyncio
71
+ async def test_initialize_without_client_raises_valueerror():
72
+ agent = DummyAgent(mcp_client=None, auto_trace=False)
73
+ with pytest.raises(ValueError):
74
+ await agent.initialize(None)
75
+
76
+
77
+ def test_get_available_tools_before_initialize_raises():
78
+ agent = DummyAgent(mcp_client=mock.AsyncMock(), auto_trace=False)
79
+ with pytest.raises(RuntimeError):
80
+ agent.get_available_tools()
81
+
82
+
83
+ @pytest.mark.asyncio
84
+ async def test_format_message_invalid_type_raises():
85
+ agent = DummyAgent(mcp_client=mock.AsyncMock(), auto_trace=False)
86
+ with pytest.raises(ValueError):
87
+ await agent.format_message({"oops": 1}) # type: ignore
88
+
89
+
90
+ @pytest.mark.asyncio
91
+ async def test_call_tools_timeout_error_shutdown_called():
92
+ fake_client = mock.AsyncMock()
93
+ fake_client.call_tool.side_effect = TimeoutError("timeout")
94
+ fake_client.shutdown.return_value = None
95
+ agent = DummyAgent(mcp_client=fake_client, auto_trace=False)
96
+ with pytest.raises(TimeoutError):
97
+ await agent.call_tools(MCPToolCall(name="x", arguments={}))
98
+ fake_client.shutdown.assert_awaited_once()
99
+
100
+
101
+ def test_text_to_blocks_shapes():
102
+ blocks = text_to_blocks("x")
103
+ assert isinstance(blocks, list) and blocks and isinstance(blocks[0], types.TextContent)
104
+
105
+
106
+ @pytest.mark.asyncio
107
+ async def test_run_returns_connection_error_trace(monkeypatch):
108
+ fake_client = mock.AsyncMock()
109
+ fake_client.mcp_config = {}
110
+ fake_client.initialize.side_effect = RuntimeError("Connection refused http://localhost:1234")
111
+ fake_client.list_tools.return_value = []
112
+ fake_client.shutdown.return_value = None
113
+
114
+ class DummyCM:
115
+ def __exit__(self, *args, **kwargs):
116
+ return False
117
+
118
+ monkeypatch.setattr("hud.utils.mcp.setup_hud_telemetry", lambda *args, **kwargs: DummyCM())
119
+
120
+ agent = DummyAgent(mcp_client=fake_client, auto_trace=False)
121
+ result = await agent.run("p", max_steps=1)
122
+ assert result.isError is True
123
+ assert "Could not connect" in (result.content or "")
124
+
125
+
126
+ @pytest.mark.asyncio
127
+ async def test_run_calls_response_tool_when_configured(monkeypatch):
128
+ fake_client = mock.AsyncMock()
129
+ fake_client.mcp_config = {}
130
+ fake_client.initialize.return_value = None
131
+ fake_client.list_tools.return_value = []
132
+ fake_client.shutdown.return_value = None
133
+ ok = MCPToolResult(content=text_to_blocks("ok"), isError=False)
134
+ fake_client.call_tool.return_value = ok
135
+
136
+ class DummyCM:
137
+ def __exit__(self, *args, **kwargs):
138
+ return False
139
+
140
+ monkeypatch.setattr("hud.utils.mcp.setup_hud_telemetry", lambda *args, **kwargs: DummyCM())
141
+
142
+ agent = DummyAgent(mcp_client=fake_client, auto_trace=False, response_tool_name="submit")
143
+ result = await agent.run("hello", max_steps=1)
144
+ assert result.isError is False
145
+ fake_client.call_tool.assert_awaited()
146
+
147
+
148
+ @pytest.mark.asyncio
149
+ async def test_get_available_tools_after_initialize(monkeypatch):
150
+ fake_client = mock.AsyncMock()
151
+ fake_client.mcp_config = {}
152
+ fake_client.initialize.return_value = None
153
+ fake_client.list_tools.return_value = []
154
+ fake_client.shutdown.return_value = None
155
+
156
+ class DummyCM:
157
+ def __exit__(self, *args, **kwargs):
158
+ return False
159
+
160
+ monkeypatch.setattr("hud.utils.mcp.setup_hud_telemetry", lambda *args, **kwargs: DummyCM())
161
+
162
+ agent = DummyAgent(mcp_client=fake_client, auto_trace=False)
163
+ await agent.initialize(None)
164
+ assert agent.get_available_tools() == []