hud-python 0.4.51__tar.gz → 0.4.53__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (301) hide show
  1. {hud_python-0.4.51 → hud_python-0.4.53}/PKG-INFO +48 -48
  2. {hud_python-0.4.51 → hud_python-0.4.53}/README.md +46 -47
  3. {hud_python-0.4.51 → hud_python-0.4.53}/environments/blank/README.md +9 -2
  4. {hud_python-0.4.51 → hud_python-0.4.53}/environments/blank/server/pyproject.toml +1 -1
  5. {hud_python-0.4.51 → hud_python-0.4.53}/environments/browser/server/pyproject.toml +1 -1
  6. {hud_python-0.4.51 → hud_python-0.4.53}/environments/deepresearch/server/pyproject.toml +1 -1
  7. {hud_python-0.4.51 → hud_python-0.4.53}/hud/__init__.py +13 -1
  8. {hud_python-0.4.51 → hud_python-0.4.53}/hud/agents/base.py +14 -3
  9. {hud_python-0.4.51 → hud_python-0.4.53}/hud/agents/lite_llm.py +1 -1
  10. {hud_python-0.4.51 → hud_python-0.4.53}/hud/agents/openai_chat_generic.py +15 -3
  11. {hud_python-0.4.51 → hud_python-0.4.53}/hud/agents/tests/test_base.py +9 -2
  12. hud_python-0.4.53/hud/agents/tests/test_base_runtime.py +164 -0
  13. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/__init__.py +18 -25
  14. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/build.py +35 -27
  15. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/dev.py +11 -29
  16. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/eval.py +114 -145
  17. hud_python-0.4.53/hud/cli/tests/test_analyze_module.py +120 -0
  18. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/tests/test_build.py +26 -3
  19. hud_python-0.4.53/hud/cli/tests/test_build_failure.py +41 -0
  20. hud_python-0.4.53/hud/cli/tests/test_build_module.py +50 -0
  21. hud_python-0.4.53/hud/cli/tests/test_cli_more_wrappers.py +30 -0
  22. hud_python-0.4.53/hud/cli/tests/test_cli_root.py +134 -0
  23. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/tests/test_eval.py +4 -0
  24. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/tests/test_mcp_server.py +8 -7
  25. hud_python-0.4.53/hud/cli/tests/test_push_happy.py +74 -0
  26. hud_python-0.4.53/hud/cli/tests/test_push_wrapper.py +23 -0
  27. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/utils/docker.py +120 -1
  28. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/utils/runner.py +1 -1
  29. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/utils/tasks.py +4 -1
  30. hud_python-0.4.53/hud/cli/utils/tests/test_config.py +58 -0
  31. hud_python-0.4.53/hud/cli/utils/tests/test_docker.py +93 -0
  32. hud_python-0.4.53/hud/cli/utils/tests/test_docker_hints.py +71 -0
  33. hud_python-0.4.53/hud/cli/utils/tests/test_env_check.py +74 -0
  34. hud_python-0.4.53/hud/cli/utils/tests/test_environment.py +42 -0
  35. hud_python-0.4.53/hud/cli/utils/tests/test_interactive_module.py +60 -0
  36. hud_python-0.4.53/hud/cli/utils/tests/test_local_runner.py +50 -0
  37. hud_python-0.4.53/hud/cli/utils/tests/test_logging_utils.py +23 -0
  38. hud_python-0.4.53/hud/cli/utils/tests/test_metadata.py +49 -0
  39. hud_python-0.4.53/hud/cli/utils/tests/test_package_runner.py +35 -0
  40. hud_python-0.4.53/hud/cli/utils/tests/test_registry_utils.py +49 -0
  41. hud_python-0.4.53/hud/cli/utils/tests/test_remote_runner.py +25 -0
  42. hud_python-0.4.53/hud/cli/utils/tests/test_runner_modules.py +52 -0
  43. hud_python-0.4.53/hud/cli/utils/tests/test_source_hash.py +36 -0
  44. hud_python-0.4.53/hud/cli/utils/tests/test_tasks.py +80 -0
  45. hud_python-0.4.53/hud/cli/utils/version_check.py +257 -0
  46. {hud_python-0.4.51 → hud_python-0.4.53}/hud/clients/base.py +1 -1
  47. {hud_python-0.4.51 → hud_python-0.4.53}/hud/clients/mcp_use.py +3 -1
  48. {hud_python-0.4.51 → hud_python-0.4.53}/hud/datasets/parallel.py +2 -2
  49. hud_python-0.4.53/hud/datasets/runner.py +184 -0
  50. hud_python-0.4.53/hud/datasets/tests/test_runner.py +106 -0
  51. hud_python-0.4.53/hud/datasets/tests/test_utils.py +228 -0
  52. {hud_python-0.4.51 → hud_python-0.4.53}/hud/otel/config.py +8 -6
  53. {hud_python-0.4.51 → hud_python-0.4.53}/hud/otel/context.py +4 -4
  54. {hud_python-0.4.51 → hud_python-0.4.53}/hud/otel/exporters.py +231 -57
  55. hud_python-0.4.53/hud/otel/tests/test_instrumentation.py +207 -0
  56. {hud_python-0.4.51 → hud_python-0.4.53}/hud/rl/learner.py +1 -1
  57. {hud_python-0.4.51 → hud_python-0.4.53}/hud/server/tests/test_server_extra.py +2 -0
  58. {hud_python-0.4.51 → hud_python-0.4.53}/hud/shared/exceptions.py +35 -9
  59. {hud_python-0.4.51 → hud_python-0.4.53}/hud/shared/hints.py +25 -0
  60. {hud_python-0.4.51 → hud_python-0.4.53}/hud/shared/requests.py +15 -3
  61. hud_python-0.4.53/hud/shared/tests/__init__.py +0 -0
  62. {hud_python-0.4.51 → hud_python-0.4.53}/hud/shared/tests/test_exceptions.py +39 -30
  63. hud_python-0.4.53/hud/shared/tests/test_hints.py +167 -0
  64. hud_python-0.4.53/hud/telemetry/__init__.py +50 -0
  65. hud_python-0.4.53/hud/telemetry/async_context.py +331 -0
  66. {hud_python-0.4.51 → hud_python-0.4.53}/hud/telemetry/job.py +51 -12
  67. hud_python-0.4.53/hud/telemetry/tests/__init__.py +0 -0
  68. hud_python-0.4.53/hud/telemetry/tests/test_async_context.py +242 -0
  69. hud_python-0.4.53/hud/telemetry/tests/test_instrument.py +414 -0
  70. hud_python-0.4.53/hud/telemetry/tests/test_job.py +609 -0
  71. hud_python-0.4.53/hud/telemetry/tests/test_trace.py +241 -0
  72. {hud_python-0.4.51 → hud_python-0.4.53}/hud/telemetry/trace.py +16 -17
  73. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/computer/qwen.py +4 -1
  74. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/computer/settings.py +2 -2
  75. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/executors/base.py +4 -2
  76. hud_python-0.4.53/hud/tools/tests/test_submit.py +85 -0
  77. hud_python-0.4.53/hud/tools/tests/test_types.py +193 -0
  78. {hud_python-0.4.51 → hud_python-0.4.53}/hud/types.py +7 -1
  79. {hud_python-0.4.51 → hud_python-0.4.53}/hud/utils/agent_factories.py +1 -3
  80. {hud_python-0.4.51 → hud_python-0.4.53}/hud/utils/mcp.py +1 -1
  81. hud_python-0.4.53/hud/utils/task_tracking.py +223 -0
  82. hud_python-0.4.53/hud/utils/tests/__init__.py +0 -0
  83. hud_python-0.4.53/hud/utils/tests/test_agent_factories.py +60 -0
  84. {hud_python-0.4.51 → hud_python-0.4.53}/hud/utils/tests/test_mcp.py +4 -6
  85. hud_python-0.4.53/hud/utils/tests/test_pretty_errors.py +186 -0
  86. hud_python-0.4.53/hud/utils/tests/test_tasks.py +187 -0
  87. hud_python-0.4.53/hud/utils/tests/test_tool_shorthand.py +154 -0
  88. {hud_python-0.4.51 → hud_python-0.4.53}/hud/utils/tests/test_version.py +1 -1
  89. {hud_python-0.4.51 → hud_python-0.4.53}/hud/version.py +1 -1
  90. {hud_python-0.4.51 → hud_python-0.4.53}/pyproject.toml +17 -3
  91. hud_python-0.4.51/hud/datasets/runner.py +0 -123
  92. hud_python-0.4.51/hud/otel/tests/__init__.py +0 -1
  93. hud_python-0.4.51/hud/telemetry/__init__.py +0 -26
  94. hud_python-0.4.51/hud/telemetry/tests/test_trace.py +0 -63
  95. {hud_python-0.4.51 → hud_python-0.4.53}/.gitignore +0 -0
  96. {hud_python-0.4.51 → hud_python-0.4.53}/LICENSE +0 -0
  97. {hud_python-0.4.51 → hud_python-0.4.53}/environments/README.md +0 -0
  98. {hud_python-0.4.51 → hud_python-0.4.53}/environments/blank/environment/README.md +0 -0
  99. {hud_python-0.4.51 → hud_python-0.4.53}/environments/blank/environment/pyproject.toml +0 -0
  100. {hud_python-0.4.51 → hud_python-0.4.53}/environments/blank/server/README.md +0 -0
  101. {hud_python-0.4.51 → hud_python-0.4.53}/environments/browser/README.md +0 -0
  102. {hud_python-0.4.51 → hud_python-0.4.53}/environments/browser/environment/2048/README.md +0 -0
  103. {hud_python-0.4.51 → hud_python-0.4.53}/environments/browser/environment/2048/backend/pyproject.toml +0 -0
  104. {hud_python-0.4.51 → hud_python-0.4.53}/environments/browser/environment/README.md +0 -0
  105. {hud_python-0.4.51 → hud_python-0.4.53}/environments/browser/environment/pyproject.toml +0 -0
  106. {hud_python-0.4.51 → hud_python-0.4.53}/environments/browser/environment/todo/README.md +0 -0
  107. {hud_python-0.4.51 → hud_python-0.4.53}/environments/browser/environment/todo/backend/pyproject.toml +0 -0
  108. {hud_python-0.4.51 → hud_python-0.4.53}/environments/browser/pyproject.toml +0 -0
  109. {hud_python-0.4.51 → hud_python-0.4.53}/environments/deepresearch/README.md +0 -0
  110. {hud_python-0.4.51 → hud_python-0.4.53}/environments/deepresearch/environment/pyproject.toml +0 -0
  111. {hud_python-0.4.51 → hud_python-0.4.53}/environments/deepresearch/pyproject.toml +0 -0
  112. {hud_python-0.4.51 → hud_python-0.4.53}/environments/remote_browser/README.md +0 -0
  113. {hud_python-0.4.51 → hud_python-0.4.53}/environments/remote_browser/pyproject.toml +0 -0
  114. {hud_python-0.4.51 → hud_python-0.4.53}/environments/remote_browser/src/hud_controller/providers/README.md +0 -0
  115. {hud_python-0.4.51 → hud_python-0.4.53}/environments/text_2048/README.md +0 -0
  116. {hud_python-0.4.51 → hud_python-0.4.53}/environments/text_2048/pyproject.toml +0 -0
  117. {hud_python-0.4.51 → hud_python-0.4.53}/examples/README.md +0 -0
  118. {hud_python-0.4.51 → hud_python-0.4.53}/hud/__main__.py +0 -0
  119. {hud_python-0.4.51 → hud_python-0.4.53}/hud/agents/__init__.py +0 -0
  120. {hud_python-0.4.51 → hud_python-0.4.53}/hud/agents/claude.py +0 -0
  121. {hud_python-0.4.51 → hud_python-0.4.53}/hud/agents/grounded_openai.py +0 -0
  122. {hud_python-0.4.51 → hud_python-0.4.53}/hud/agents/langchain.py +0 -0
  123. {hud_python-0.4.51 → hud_python-0.4.53}/hud/agents/misc/__init__.py +0 -0
  124. {hud_python-0.4.51 → hud_python-0.4.53}/hud/agents/misc/integration_test_agent.py +0 -0
  125. {hud_python-0.4.51 → hud_python-0.4.53}/hud/agents/misc/response_agent.py +0 -0
  126. {hud_python-0.4.51 → hud_python-0.4.53}/hud/agents/openai.py +0 -0
  127. {hud_python-0.4.51 → hud_python-0.4.53}/hud/agents/tests/__init__.py +0 -0
  128. {hud_python-0.4.51 → hud_python-0.4.53}/hud/agents/tests/test_claude.py +0 -0
  129. {hud_python-0.4.51 → hud_python-0.4.53}/hud/agents/tests/test_client.py +0 -0
  130. {hud_python-0.4.51 → hud_python-0.4.53}/hud/agents/tests/test_grounded_openai_agent.py +0 -0
  131. {hud_python-0.4.51 → hud_python-0.4.53}/hud/agents/tests/test_openai.py +0 -0
  132. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/__main__.py +0 -0
  133. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/analyze.py +0 -0
  134. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/clone.py +0 -0
  135. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/debug.py +0 -0
  136. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/flows/__init__.py +0 -0
  137. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/flows/tasks.py +0 -0
  138. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/get.py +0 -0
  139. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/init.py +0 -0
  140. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/list_func.py +0 -0
  141. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/pull.py +0 -0
  142. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/push.py +0 -0
  143. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/remove.py +0 -0
  144. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/rl/__init__.py +0 -0
  145. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/rl/celebrate.py +0 -0
  146. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/rl/config.py +0 -0
  147. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/rl/display.py +0 -0
  148. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/rl/gpu.py +0 -0
  149. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/rl/gpu_utils.py +0 -0
  150. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/rl/local_runner.py +0 -0
  151. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/rl/presets.py +0 -0
  152. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/rl/remote_runner.py +0 -0
  153. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/rl/rl_api.py +0 -0
  154. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/rl/viewer.py +0 -0
  155. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/rl/vllm.py +0 -0
  156. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/rl/wait_utils.py +0 -0
  157. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/tests/__init__.py +0 -0
  158. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/tests/test_analyze.py +0 -0
  159. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/tests/test_analyze_metadata.py +0 -0
  160. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/tests/test_cli_init.py +0 -0
  161. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/tests/test_cli_main.py +0 -0
  162. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/tests/test_clone.py +0 -0
  163. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/tests/test_cursor.py +0 -0
  164. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/tests/test_debug.py +0 -0
  165. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/tests/test_list_func.py +0 -0
  166. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/tests/test_main_module.py +0 -0
  167. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/tests/test_pull.py +0 -0
  168. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/tests/test_push.py +0 -0
  169. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/tests/test_registry.py +0 -0
  170. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/tests/test_utils.py +0 -0
  171. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/utils/__init__.py +0 -0
  172. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/utils/config.py +0 -0
  173. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/utils/cursor.py +0 -0
  174. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/utils/env_check.py +0 -0
  175. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/utils/environment.py +0 -0
  176. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/utils/interactive.py +0 -0
  177. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/utils/local_runner.py +0 -0
  178. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/utils/logging.py +0 -0
  179. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/utils/metadata.py +0 -0
  180. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/utils/package_runner.py +0 -0
  181. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/utils/registry.py +0 -0
  182. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/utils/remote_runner.py +0 -0
  183. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/utils/server.py +0 -0
  184. {hud_python-0.4.51 → hud_python-0.4.53}/hud/cli/utils/source_hash.py +0 -0
  185. {hud_python-0.4.51/hud/shared → hud_python-0.4.53/hud/cli/utils}/tests/__init__.py +0 -0
  186. {hud_python-0.4.51 → hud_python-0.4.53}/hud/clients/README.md +0 -0
  187. {hud_python-0.4.51 → hud_python-0.4.53}/hud/clients/__init__.py +0 -0
  188. {hud_python-0.4.51 → hud_python-0.4.53}/hud/clients/fastmcp.py +0 -0
  189. {hud_python-0.4.51 → hud_python-0.4.53}/hud/clients/tests/__init__.py +0 -0
  190. {hud_python-0.4.51 → hud_python-0.4.53}/hud/clients/tests/test_client_integration.py +0 -0
  191. {hud_python-0.4.51 → hud_python-0.4.53}/hud/clients/tests/test_fastmcp.py +0 -0
  192. {hud_python-0.4.51 → hud_python-0.4.53}/hud/clients/tests/test_mcp_use_retry.py +0 -0
  193. {hud_python-0.4.51 → hud_python-0.4.53}/hud/clients/tests/test_protocol.py +0 -0
  194. {hud_python-0.4.51 → hud_python-0.4.53}/hud/clients/utils/__init__.py +0 -0
  195. {hud_python-0.4.51 → hud_python-0.4.53}/hud/clients/utils/mcp_use_retry.py +0 -0
  196. {hud_python-0.4.51 → hud_python-0.4.53}/hud/clients/utils/retry.py +0 -0
  197. {hud_python-0.4.51 → hud_python-0.4.53}/hud/clients/utils/retry_transport.py +0 -0
  198. {hud_python-0.4.51 → hud_python-0.4.53}/hud/datasets/__init__.py +0 -0
  199. {hud_python-0.4.51/hud/telemetry → hud_python-0.4.53/hud/datasets}/tests/__init__.py +0 -0
  200. {hud_python-0.4.51 → hud_python-0.4.53}/hud/datasets/utils.py +0 -0
  201. {hud_python-0.4.51 → hud_python-0.4.53}/hud/misc/__init__.py +0 -0
  202. {hud_python-0.4.51 → hud_python-0.4.53}/hud/misc/claude_plays_pokemon.py +0 -0
  203. {hud_python-0.4.51 → hud_python-0.4.53}/hud/native/__init__.py +0 -0
  204. {hud_python-0.4.51 → hud_python-0.4.53}/hud/native/comparator.py +0 -0
  205. {hud_python-0.4.51 → hud_python-0.4.53}/hud/native/tests/__init__.py +0 -0
  206. {hud_python-0.4.51 → hud_python-0.4.53}/hud/native/tests/test_comparator.py +0 -0
  207. {hud_python-0.4.51 → hud_python-0.4.53}/hud/native/tests/test_native_init.py +0 -0
  208. {hud_python-0.4.51 → hud_python-0.4.53}/hud/otel/__init__.py +0 -0
  209. {hud_python-0.4.51 → hud_python-0.4.53}/hud/otel/collector.py +0 -0
  210. {hud_python-0.4.51 → hud_python-0.4.53}/hud/otel/instrumentation.py +0 -0
  211. {hud_python-0.4.51 → hud_python-0.4.53}/hud/otel/processors.py +0 -0
  212. {hud_python-0.4.51/hud/utils → hud_python-0.4.53/hud/otel}/tests/__init__.py +0 -0
  213. {hud_python-0.4.51 → hud_python-0.4.53}/hud/otel/tests/test_processors.py +0 -0
  214. {hud_python-0.4.51 → hud_python-0.4.53}/hud/py.typed +0 -0
  215. {hud_python-0.4.51 → hud_python-0.4.53}/hud/rl/README.md +0 -0
  216. {hud_python-0.4.51 → hud_python-0.4.53}/hud/rl/__init__.py +0 -0
  217. {hud_python-0.4.51 → hud_python-0.4.53}/hud/rl/actor.py +0 -0
  218. {hud_python-0.4.51 → hud_python-0.4.53}/hud/rl/buffer.py +0 -0
  219. {hud_python-0.4.51 → hud_python-0.4.53}/hud/rl/chat_template.jinja +0 -0
  220. {hud_python-0.4.51 → hud_python-0.4.53}/hud/rl/config.py +0 -0
  221. {hud_python-0.4.51 → hud_python-0.4.53}/hud/rl/distributed.py +0 -0
  222. {hud_python-0.4.51 → hud_python-0.4.53}/hud/rl/tests/__init__.py +0 -0
  223. {hud_python-0.4.51 → hud_python-0.4.53}/hud/rl/tests/test_learner.py +0 -0
  224. {hud_python-0.4.51 → hud_python-0.4.53}/hud/rl/train.py +0 -0
  225. {hud_python-0.4.51 → hud_python-0.4.53}/hud/rl/types.py +0 -0
  226. {hud_python-0.4.51 → hud_python-0.4.53}/hud/rl/utils/start_vllm_server.sh +0 -0
  227. {hud_python-0.4.51 → hud_python-0.4.53}/hud/rl/utils.py +0 -0
  228. {hud_python-0.4.51 → hud_python-0.4.53}/hud/rl/vllm_adapter.py +0 -0
  229. {hud_python-0.4.51 → hud_python-0.4.53}/hud/samples/__init__.py +0 -0
  230. {hud_python-0.4.51 → hud_python-0.4.53}/hud/samples/browser.py +0 -0
  231. {hud_python-0.4.51 → hud_python-0.4.53}/hud/server/__init__.py +0 -0
  232. {hud_python-0.4.51 → hud_python-0.4.53}/hud/server/context.py +0 -0
  233. {hud_python-0.4.51 → hud_python-0.4.53}/hud/server/helper/__init__.py +0 -0
  234. {hud_python-0.4.51 → hud_python-0.4.53}/hud/server/low_level.py +0 -0
  235. {hud_python-0.4.51 → hud_python-0.4.53}/hud/server/router.py +0 -0
  236. {hud_python-0.4.51 → hud_python-0.4.53}/hud/server/server.py +0 -0
  237. {hud_python-0.4.51 → hud_python-0.4.53}/hud/server/tests/__init__.py +0 -0
  238. {hud_python-0.4.51 → hud_python-0.4.53}/hud/server/tests/test_add_tool.py +0 -0
  239. {hud_python-0.4.51 → hud_python-0.4.53}/hud/server/tests/test_context.py +0 -0
  240. {hud_python-0.4.51 → hud_python-0.4.53}/hud/server/tests/test_mcp_server_handlers.py +0 -0
  241. {hud_python-0.4.51 → hud_python-0.4.53}/hud/server/tests/test_mcp_server_integration.py +0 -0
  242. {hud_python-0.4.51 → hud_python-0.4.53}/hud/server/tests/test_mcp_server_more.py +0 -0
  243. {hud_python-0.4.51 → hud_python-0.4.53}/hud/server/tests/test_run_wrapper.py +0 -0
  244. {hud_python-0.4.51 → hud_python-0.4.53}/hud/server/tests/test_sigterm_runner.py +0 -0
  245. {hud_python-0.4.51 → hud_python-0.4.53}/hud/settings.py +0 -0
  246. {hud_python-0.4.51 → hud_python-0.4.53}/hud/shared/__init__.py +0 -0
  247. {hud_python-0.4.51 → hud_python-0.4.53}/hud/shared/tests/test_requests.py +0 -0
  248. {hud_python-0.4.51 → hud_python-0.4.53}/hud/telemetry/instrument.py +0 -0
  249. {hud_python-0.4.51 → hud_python-0.4.53}/hud/telemetry/replay.py +0 -0
  250. {hud_python-0.4.51 → hud_python-0.4.53}/hud/telemetry/tests/test_replay.py +0 -0
  251. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/__init__.py +0 -0
  252. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/base.py +0 -0
  253. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/bash.py +0 -0
  254. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/computer/__init__.py +0 -0
  255. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/computer/anthropic.py +0 -0
  256. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/computer/hud.py +0 -0
  257. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/computer/openai.py +0 -0
  258. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/edit.py +0 -0
  259. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/executors/__init__.py +0 -0
  260. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/executors/pyautogui.py +0 -0
  261. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/executors/tests/__init__.py +0 -0
  262. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/executors/tests/test_base_executor.py +0 -0
  263. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
  264. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/executors/xdo.py +0 -0
  265. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/grounding/__init__.py +0 -0
  266. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/grounding/config.py +0 -0
  267. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/grounding/grounded_tool.py +0 -0
  268. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/grounding/grounder.py +0 -0
  269. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/grounding/tests/__init__.py +0 -0
  270. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/grounding/tests/test_grounded_tool.py +0 -0
  271. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/playwright.py +0 -0
  272. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/response.py +0 -0
  273. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/submit.py +0 -0
  274. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/tests/__init__.py +0 -0
  275. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/tests/test_base.py +0 -0
  276. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/tests/test_bash.py +0 -0
  277. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/tests/test_bash_extended.py +0 -0
  278. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/tests/test_computer.py +0 -0
  279. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/tests/test_computer_actions.py +0 -0
  280. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/tests/test_edit.py +0 -0
  281. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/tests/test_init.py +0 -0
  282. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/tests/test_playwright_tool.py +0 -0
  283. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/tests/test_response.py +0 -0
  284. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/tests/test_tools.py +0 -0
  285. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/tests/test_tools_init.py +0 -0
  286. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/tests/test_utils.py +0 -0
  287. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/types.py +0 -0
  288. {hud_python-0.4.51 → hud_python-0.4.53}/hud/tools/utils.py +0 -0
  289. {hud_python-0.4.51 → hud_python-0.4.53}/hud/utils/__init__.py +0 -0
  290. {hud_python-0.4.51 → hud_python-0.4.53}/hud/utils/async_utils.py +0 -0
  291. {hud_python-0.4.51 → hud_python-0.4.53}/hud/utils/group_eval.py +0 -0
  292. {hud_python-0.4.51 → hud_python-0.4.53}/hud/utils/hud_console.py +0 -0
  293. {hud_python-0.4.51 → hud_python-0.4.53}/hud/utils/pretty_errors.py +0 -0
  294. {hud_python-0.4.51 → hud_python-0.4.53}/hud/utils/progress.py +0 -0
  295. {hud_python-0.4.51 → hud_python-0.4.53}/hud/utils/tasks.py +0 -0
  296. {hud_python-0.4.51 → hud_python-0.4.53}/hud/utils/telemetry.py +0 -0
  297. {hud_python-0.4.51 → hud_python-0.4.53}/hud/utils/tests/test_async_utils.py +0 -0
  298. {hud_python-0.4.51 → hud_python-0.4.53}/hud/utils/tests/test_init.py +0 -0
  299. {hud_python-0.4.51 → hud_python-0.4.53}/hud/utils/tests/test_progress.py +0 -0
  300. {hud_python-0.4.51 → hud_python-0.4.53}/hud/utils/tests/test_telemetry.py +0 -0
  301. {hud_python-0.4.51 → hud_python-0.4.53}/hud/utils/tool_shorthand.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.4.51
3
+ Version: 0.4.53
4
4
  Summary: SDK for the HUD platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-python
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -48,6 +48,7 @@ Requires-Dist: opentelemetry-api>=1.34.1
48
48
  Requires-Dist: opentelemetry-exporter-otlp-proto-http>=1.34.1
49
49
  Requires-Dist: opentelemetry-instrumentation-mcp==0.47.0
50
50
  Requires-Dist: opentelemetry-sdk>=1.34.1
51
+ Requires-Dist: packaging>=21.0
51
52
  Requires-Dist: pathspec>=0.12.1
52
53
  Requires-Dist: pillow>=11.1.0
53
54
  Requires-Dist: prompt-toolkit==3.0.51
@@ -159,12 +160,12 @@ OSS RL environment + evals toolkit. Wrap software as environments, run benchmark
159
160
 
160
161
  ## Highlights
161
162
 
162
- - 🎓 **[One-click RL](https://hud.so/models)** – Run `hud rl` to get a trained model on any environment.
163
163
  - 🚀 **[MCP environment skeleton](https://docs.hud.so/core-concepts/mcp-protocol)** – any agent can call any environment.
164
164
  - ⚡️ **[Live telemetry](https://hud.so)** – inspect every tool call, observation, and reward in real time.
165
165
  - 🗂️ **[Public benchmarks](https://hud.so/leaderboards)** – OSWorld-Verified, SheetBench-50, and more.
166
166
  - 🌐 **[Cloud browsers](environments/remote_browser/)** – AnchorBrowser, Steel, BrowserBase integrations for browser automation.
167
167
  - 🛠️ **[Hot-reload dev loop](environments/README.md#phase-5-hot-reload-development-with-cursor-agent)** – `hud dev` for iterating on environments without rebuilds.
168
+ - 🎓 **[One-click RL](https://hud.so/models)** – Run `hud rl` to get a trained model on any environment.
168
169
 
169
170
  > We welcome contributors and feature requests – open an issue or hop on a call to discuss improvements!
170
171
 
@@ -185,29 +186,6 @@ uv tool install hud-python
185
186
  Before starting, get your HUD_API_KEY at [hud.so](https://hud.so).
186
187
 
187
188
 
188
- ## Quickstart: Training
189
-
190
- Train a Qwen2.5-VL model with RL using GRPO on any hud dataset:
191
-
192
- ```bash
193
- hud get hud-evals/basic-2048 # from HF
194
- hud rl basic-2048.json
195
- ```
196
-
197
- > See [agent training docs](https://docs.hud.so/train-agents/quickstart)
198
-
199
- Or make your own environment and dataset:
200
-
201
- ```bash
202
- hud init my-env && cd my-env
203
- hud dev --interactive
204
- # When ready to run:
205
- hud rl
206
- ```
207
-
208
- > See [environment design docs](https://docs.hud.so/build-environments)
209
-
210
-
211
189
  ## Quickstart: Evals
212
190
 
213
191
  For a tutorial that explains the agent and evaluation design, run:
@@ -264,38 +242,27 @@ The above example lets the agent play 2048 ([See replay](https://hud.so/trace/6
264
242
 
265
243
  ![Agent playing 2048](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/2048_1.gif)
266
244
 
267
- ## Reinforcement Learning with GRPO
268
-
269
- This is a Qwen‑2.5‑VL‑3B agent training a policy on the 2048-basic browser environment:
270
-
271
- ![RL curve](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/rl_2.png)
245
+ ## Quickstart: Training
272
246
 
273
- Train with the new interactive `hud rl` flow:
247
+ Train a Qwen2.5-VL model with RL using GRPO on any hud dataset:
274
248
 
275
249
  ```bash
276
- # Install CLI
277
- uv tool install hud-python
278
-
279
- # Option A: Run directly from a HuggingFace dataset
280
- hud rl hud-evals/basic-2048
281
-
282
- # Option B: Download first, modify, then train
283
- hud get hud-evals/basic-2048
250
+ hud get hud-evals/basic-2048 # from HF
284
251
  hud rl basic-2048.json
285
-
286
- # Optional: baseline evaluation
287
- hud eval basic-2048.json
288
252
  ```
289
253
 
290
- Supports multi‑turn RL for both:
291
- - Language‑only models (e.g., `Qwen/Qwen2.5-7B-Instruct`)
292
- - Vision‑Language models (e.g., `Qwen/Qwen2.5-VL-3B-Instruct`)
254
+ > See [agent training docs](https://docs.hud.so/train-agents/quickstart)
293
255
 
294
- By default, `hud rl` provisions a persistent server and trainer in the cloud, streams telemetry to `hud.so`, and lets you monitor/manage models at `hud.so/models`. Use `--local` to run entirely on your machines (typically 2+ GPUs: one for vLLM, the rest for training).
256
+ Or make your own environment and dataset:
295
257
 
296
- Any HUD MCP environment and evaluation works with our RL pipeline (including remote configurations). See the guided docs: `https://docs.hud.so/train-agents/quickstart`.
258
+ ```bash
259
+ hud init my-env && cd my-env
260
+ hud dev --interactive
261
+ # When ready to run:
262
+ hud rl
263
+ ```
297
264
 
298
- Pricing: Hosted vLLM and training GPU rates are listed in the [Training Quickstart → Pricing](https://docs.hud.so/train-agents/quickstart#pricing). Manage billing at the [HUD billing dashboard](https://hud.so/project/billing).
265
+ > See [environment design docs](https://docs.hud.so/build-environments)
299
266
 
300
267
  ## Benchmarking Agents
301
268
 
@@ -459,6 +426,39 @@ We highly suggest running 3-5 evaluations per dataset for the most consistent re
459
426
 
460
427
  Using the [`run_dataset`](https://docs.hud.so/reference/tasks#run_dataset) function with a HuggingFace dataset automatically assigns your job to that leaderboard page, and allows you to create a scorecard out of it:
461
428
 
429
+ ## Reinforcement Learning with GRPO
430
+
431
+ This is a Qwen‑2.5‑VL‑3B agent training a policy on the 2048-basic browser environment:
432
+
433
+ ![RL curve](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/rl_2.png)
434
+
435
+ Train with the new interactive `hud rl` flow:
436
+
437
+ ```bash
438
+ # Install CLI
439
+ uv tool install hud-python
440
+
441
+ # Option A: Run directly from a HuggingFace dataset
442
+ hud rl hud-evals/basic-2048
443
+
444
+ # Option B: Download first, modify, then train
445
+ hud get hud-evals/basic-2048
446
+ hud rl basic-2048.json
447
+
448
+ # Optional: baseline evaluation
449
+ hud eval basic-2048.json
450
+ ```
451
+
452
+ Supports multi‑turn RL for both:
453
+ - Language‑only models (e.g., `Qwen/Qwen2.5-7B-Instruct`)
454
+ - Vision‑Language models (e.g., `Qwen/Qwen2.5-VL-3B-Instruct`)
455
+
456
+ By default, `hud rl` provisions a persistent server and trainer in the cloud, streams telemetry to `hud.so`, and lets you monitor/manage models at `hud.so/models`. Use `--local` to run entirely on your machines (typically 2+ GPUs: one for vLLM, the rest for training).
457
+
458
+ Any HUD MCP environment and evaluation works with our RL pipeline (including remote configurations). See the guided docs: `https://docs.hud.so/train-agents/quickstart`.
459
+
460
+ Pricing: Hosted vLLM and training GPU rates are listed in the [Training Quickstart → Pricing](https://docs.hud.so/train-agents/quickstart#pricing). Manage billing at the [HUD billing dashboard](https://hud.so/project/billing).
461
+
462
462
  ## Architecture
463
463
 
464
464
  ```mermaid
@@ -22,12 +22,12 @@ OSS RL environment + evals toolkit. Wrap software as environments, run benchmark
22
22
 
23
23
  ## Highlights
24
24
 
25
- - 🎓 **[One-click RL](https://hud.so/models)** – Run `hud rl` to get a trained model on any environment.
26
25
  - 🚀 **[MCP environment skeleton](https://docs.hud.so/core-concepts/mcp-protocol)** – any agent can call any environment.
27
26
  - ⚡️ **[Live telemetry](https://hud.so)** – inspect every tool call, observation, and reward in real time.
28
27
  - 🗂️ **[Public benchmarks](https://hud.so/leaderboards)** – OSWorld-Verified, SheetBench-50, and more.
29
28
  - 🌐 **[Cloud browsers](environments/remote_browser/)** – AnchorBrowser, Steel, BrowserBase integrations for browser automation.
30
29
  - 🛠️ **[Hot-reload dev loop](environments/README.md#phase-5-hot-reload-development-with-cursor-agent)** – `hud dev` for iterating on environments without rebuilds.
30
+ - 🎓 **[One-click RL](https://hud.so/models)** – Run `hud rl` to get a trained model on any environment.
31
31
 
32
32
  > We welcome contributors and feature requests – open an issue or hop on a call to discuss improvements!
33
33
 
@@ -48,29 +48,6 @@ uv tool install hud-python
48
48
  Before starting, get your HUD_API_KEY at [hud.so](https://hud.so).
49
49
 
50
50
 
51
- ## Quickstart: Training
52
-
53
- Train a Qwen2.5-VL model with RL (GRPO) on any hud dataset:
54
-
55
- ```bash
56
- hud get hud-evals/basic-2048 # from HF
57
- hud rl basic-2048.json
58
- ```
59
-
60
- > See [agent training docs](https://docs.hud.so/train-agents/quickstart)
61
-
62
- Or make your own environment and dataset:
63
-
64
- ```bash
65
- hud init my-env && cd my-env
66
- hud dev --interactive
67
- # When ready to run:
68
- hud rl
69
- ```
70
-
71
- > See [environment design docs](https://docs.hud.so/build-environments)
72
-
73
-
74
51
  ## Quickstart: Evals
75
52
 
76
53
  For a tutorial that explains the agent and evaluation design, run:
@@ -127,38 +104,27 @@ The above example lets the agent play 2048 ([See replay](https://hud.so/trace/6
127
104
 
128
105
  ![Agent playing 2048](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/2048_1.gif)
129
106
 
130
- ## Reinforcement Learning with GRPO
131
-
132
- This is a Qwen‑2.5‑VL‑3B agent training a policy on the 2048-basic browser environment:
133
-
134
- ![RL curve](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/rl_2.png)
107
+ ## Quickstart: Training
135
108
 
136
- Train with the new interactive `hud rl` flow:
109
+ Train a Qwen2.5-VL model with RL (GRPO) on any hud dataset:
137
110
 
138
111
  ```bash
139
- # Install CLI
140
- uv tool install hud-python
141
-
142
- # Option A: Run directly from a HuggingFace dataset
143
- hud rl hud-evals/basic-2048
144
-
145
- # Option B: Download first, modify, then train
146
- hud get hud-evals/basic-2048
112
+ hud get hud-evals/basic-2048 # from HF
147
113
  hud rl basic-2048.json
148
-
149
- # Optional: baseline evaluation
150
- hud eval basic-2048.json
151
114
  ```
152
115
 
153
- Supports multi‑turn RL for both:
154
- - Language‑only models (e.g., `Qwen/Qwen2.5-7B-Instruct`)
155
- - Vision‑Language models (e.g., `Qwen/Qwen2.5-VL-3B-Instruct`)
116
+ > See [agent training docs](https://docs.hud.so/train-agents/quickstart)
156
117
 
157
- By default, `hud rl` provisions a persistent server and trainer in the cloud, streams telemetry to `hud.so`, and lets you monitor/manage models at `hud.so/models`. Use `--local` to run entirely on your machines (typically 2+ GPUs: one for vLLM, the rest for training).
118
+ Or make your own environment and dataset:
158
119
 
159
- Any HUD MCP environment and evaluation works with our RL pipeline (including remote configurations). See the guided docs: `https://docs.hud.so/train-agents/quickstart`.
120
+ ```bash
121
+ hud init my-env && cd my-env
122
+ hud dev --interactive
123
+ # When ready to run:
124
+ hud rl
125
+ ```
160
126
 
161
- Pricing: Hosted vLLM and training GPU rates are listed in the [Training Quickstart → Pricing](https://docs.hud.so/train-agents/quickstart#pricing). Manage billing at the [HUD billing dashboard](https://hud.so/project/billing).
127
+ > See [environment design docs](https://docs.hud.so/build-environments)
162
128
 
163
129
  ## Benchmarking Agents
164
130
 
@@ -322,6 +288,39 @@ We highly suggest running 3-5 evaluations per dataset for the most consistent re
322
288
 
323
289
  Using the [`run_dataset`](https://docs.hud.so/reference/tasks#run_dataset) function with a HuggingFace dataset automatically assigns your job to that leaderboard page, and allows you to create a scorecard out of it:
324
290
 
291
+ ## Reinforcement Learning with GRPO
292
+
293
+ This is a Qwen‑2.5‑VL‑3B agent training a policy on the 2048-basic browser environment:
294
+
295
+ ![RL curve](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/rl_2.png)
296
+
297
+ Train with the new interactive `hud rl` flow:
298
+
299
+ ```bash
300
+ # Install CLI
301
+ uv tool install hud-python
302
+
303
+ # Option A: Run directly from a HuggingFace dataset
304
+ hud rl hud-evals/basic-2048
305
+
306
+ # Option B: Download first, modify, then train
307
+ hud get hud-evals/basic-2048
308
+ hud rl basic-2048.json
309
+
310
+ # Optional: baseline evaluation
311
+ hud eval basic-2048.json
312
+ ```
313
+
314
+ Supports multi‑turn RL for both:
315
+ - Language‑only models (e.g., `Qwen/Qwen2.5-7B-Instruct`)
316
+ - Vision‑Language models (e.g., `Qwen/Qwen2.5-VL-3B-Instruct`)
317
+
318
+ By default, `hud rl` provisions a persistent server and trainer in the cloud, streams telemetry to `hud.so`, and lets you monitor/manage models at `hud.so/models`. Use `--local` to run entirely on your machines (typically 2+ GPUs: one for vLLM, the rest for training).
319
+
320
+ Any HUD MCP environment and evaluation works with our RL pipeline (including remote configurations). See the guided docs: `https://docs.hud.so/train-agents/quickstart`.
321
+
322
+ Pricing: Hosted vLLM and training GPU rates are listed in the [Training Quickstart → Pricing](https://docs.hud.so/train-agents/quickstart#pricing). Manage billing at the [HUD billing dashboard](https://hud.so/project/billing).
323
+
325
324
  ## Architecture
326
325
 
327
326
  ```mermaid
@@ -6,10 +6,12 @@ See [docs](https://docs.hud.so/build-environments) for the complete environment
6
6
  ## Architecture
7
7
 
8
8
  **`environment/`** - Produces structured data
9
+
9
10
  - Owns all state (game logic, browser sessions, databases, etc.)
10
11
  - Exposes HTTP endpoints `/health`, `/act`, `/reset`, `/state` that return structured information about the environment state
11
12
 
12
13
  **`server/`** - Wraps data in MCP tools
14
+
13
15
  - Calls environment endpoints to get structured data for the agent, and environment setup/evaluation
14
16
  - Agents and tasks interact only with these tools!
15
17
 
@@ -33,12 +35,14 @@ Visit http://localhost:8765/docs to see the new tool appear instantly.
33
35
  In general, we recommend starting work on the environment backend first, then developing the MCP server to expose the right things to the agent.
34
36
 
35
37
  For complex environments that require many dependencies, we recommend running `hud dev` in the environment root:
38
+
36
39
  ```bash
37
40
  cd ..
38
41
  hud dev
39
42
  ```
40
43
 
41
44
  ## Tasks & Evaluation
45
+
42
46
  ```bash
43
47
  # Build first in the global folder with the Dockerfile (creates blank:0.1.0)
44
48
  hud build
@@ -59,6 +63,7 @@ Your `tasks.json` uses `docker run` to launch the environment:
59
63
  ```
60
64
 
61
65
  **Commands:**
66
+
62
67
  ```bash
63
68
  # Build first
64
69
  hud build
@@ -78,6 +83,7 @@ hud rl tasks.json # Auto-converts docker→remote, builds & pushes if needed
78
83
  Once your environment is ready, you can share it with the community:
79
84
 
80
85
  ### 1. Push to Registry
86
+
81
87
  ```bash
82
88
  # Build and push your environment (requires docker hub login and hud api key)
83
89
  hud build
@@ -89,10 +95,12 @@ hud push
89
95
  Create a dataset on HuggingFace with your tasks:
90
96
 
91
97
  **Option A: Upload manually**
98
+
92
99
  1. Upload your `tasks.json` to HuggingFace
93
100
  2. Make sure it's **public** to appear on leaderboards
94
101
 
95
102
  **Option B: Use the SDK**
103
+
96
104
  ```python
97
105
  from hud.datasets import save_tasks
98
106
  import json
@@ -109,7 +117,7 @@ save_tasks(tasks, repo_id="your-org/your-dataset")
109
117
 
110
118
  ```bash
111
119
  # Run Claude on your benchmark
112
- hud eval "your-org/your-dataset" --agent claude
120
+ hud eval "your-org/your-dataset" claude
113
121
 
114
122
  # View results at:
115
123
  # hud.so/leaderboards/your-org/your-dataset
@@ -118,4 +126,3 @@ hud eval "your-org/your-dataset" --agent claude
118
126
  **Note**: Only public HuggingFace datasets appear as leaderboards!
119
127
 
120
128
  📚 Learn more: [Creating Benchmarks](https://docs.hud.so/evaluate-agents/create-benchmarks) | [Leaderboards](https://docs.hud.so/evaluate-agents/leaderboards)
121
-
@@ -4,7 +4,7 @@ version = "0.1.0"
4
4
  description = "MCP server for blank environment"
5
5
  requires-python = ">=3.11"
6
6
  dependencies = [
7
- "hud-python>=0.4.51",
7
+ "hud-python>=0.4.53",
8
8
  "httpx>=0.28.1",
9
9
  ]
10
10
 
@@ -4,7 +4,7 @@ version = "0.1.0"
4
4
  description = "HUD Browser MCP Server"
5
5
  requires-python = ">=3.11,<3.14"
6
6
  dependencies = [
7
- "hud-python@git+https://github.com/hud-evals/hud-python@cli-dev",
7
+ "hud-python>=0.4.53",
8
8
  "httpx",
9
9
  "playwright",
10
10
  "pyautogui",
@@ -4,7 +4,7 @@ version = "0.1.0"
4
4
  description = "MCP server for DeepResearch environment"
5
5
  requires-python = ">=3.11"
6
6
  dependencies = [
7
- "hud-python>=0.4.51",
7
+ "hud-python>=0.4.53",
8
8
  "httpx>=0.24.0",
9
9
  ]
10
10
 
@@ -5,10 +5,22 @@ tools for building, evaluating, and training AI agents.
5
5
 
6
6
  from __future__ import annotations
7
7
 
8
- from .telemetry import Trace, clear_trace, create_job, get_trace, instrument, job, trace
8
+ from .telemetry import (
9
+ Trace,
10
+ async_job,
11
+ async_trace,
12
+ clear_trace,
13
+ create_job,
14
+ get_trace,
15
+ instrument,
16
+ job,
17
+ trace,
18
+ )
9
19
 
10
20
  __all__ = [
11
21
  "Trace",
22
+ "async_job",
23
+ "async_trace",
12
24
  "clear_trace",
13
25
  "create_job",
14
26
  "get_trace",
@@ -55,6 +55,7 @@ class MCPAgent(ABC):
55
55
  # Filtering
56
56
  allowed_tools: list[str] | None = None,
57
57
  disallowed_tools: list[str] | None = None,
58
+ response_tool_name: str | None = None,
58
59
  # Messages
59
60
  system_prompt: str = GLOBAL_SYSTEM_PROMPT,
60
61
  append_setup_output: bool = True,
@@ -74,6 +75,7 @@ class MCPAgent(ABC):
74
75
  that provides `mcp_config`.
75
76
  allowed_tools: Names of tools to allow (None means allow all).
76
77
  disallowed_tools: Names of tools to always exclude.
78
+ response_tool_name: Name of the tool to use for response.
77
79
  system_prompt: System prompt to seed the conversation.
78
80
  append_setup_output: Whether to append setup tool output to the
79
81
  first turn's messages.
@@ -108,7 +110,7 @@ class MCPAgent(ABC):
108
110
 
109
111
  # Initialize these here so methods can be called before initialize()
110
112
  self._tool_map: dict[str, types.Tool] = {} # Simplified: just name to tool
111
- self.response_tool_name = None
113
+ self.response_tool_name = response_tool_name
112
114
 
113
115
  # Trace
114
116
  self._auto_trace = auto_trace
@@ -135,7 +137,11 @@ class MCPAgent(ABC):
135
137
  "No MCPClient. Please provide one when initializing the agent or pass a Task with mcp_config." # noqa: E501
136
138
  )
137
139
 
138
- await self._setup_config(self.mcp_client.mcp_config)
140
+ try:
141
+ client_cfg = getattr(self.mcp_client, "mcp_config", None)
142
+ except Exception:
143
+ client_cfg = None
144
+ await self._setup_config(client_cfg)
139
145
 
140
146
  # Initialize client if needed
141
147
  try:
@@ -168,6 +174,8 @@ class MCPAgent(ABC):
168
174
  self.disallowed_tools.extend(task.agent_config["disallowed_tools"])
169
175
  else: # If disallowed_tools is None, we overwrite it
170
176
  self.disallowed_tools = task.agent_config["disallowed_tools"]
177
+ if "response_tool_name" in task.agent_config:
178
+ self.response_tool_name = task.agent_config["response_tool_name"]
171
179
 
172
180
  all_tools = await self.mcp_client.list_tools()
173
181
  self._available_tools = []
@@ -614,8 +622,11 @@ class MCPAgent(ABC):
614
622
  except Exception as e:
615
623
  self.console.error_log(f"Response lifecycle tool failed: {e}")
616
624
 
617
- async def _setup_config(self, mcp_config: dict[str, dict[str, Any]]) -> None:
625
+ async def _setup_config(self, mcp_config: dict[str, dict[str, Any]] | None) -> None:
618
626
  """Inject metadata into the metadata of the initialize request."""
627
+ if not isinstance(mcp_config, dict):
628
+ return
629
+
619
630
  if self.metadata:
620
631
  patch_mcp_config(
621
632
  mcp_config,
@@ -47,7 +47,7 @@ class LiteAgent(GenericOpenAIChatAgent):
47
47
  **agent_kwargs,
48
48
  )
49
49
 
50
- def get_tool_schemas(self) -> list[dict]:
50
+ def get_tool_schemas(self) -> list[Any]:
51
51
  # Prefer LiteLLM's stricter transformer (handles Bedrock & friends)
52
52
  if transform_mcp_tool_to_openai_tool is not None:
53
53
  return [
@@ -20,6 +20,7 @@ import logging
20
20
  from typing import TYPE_CHECKING, Any, ClassVar, cast
21
21
 
22
22
  import mcp.types as types
23
+ from openai import AsyncOpenAI
23
24
 
24
25
  from hud import instrument
25
26
  from hud.types import AgentResponse, MCPToolCall, MCPToolResult
@@ -28,7 +29,6 @@ from hud.utils.hud_console import HUDConsole
28
29
  from .base import MCPAgent
29
30
 
30
31
  if TYPE_CHECKING:
31
- from openai import AsyncOpenAI
32
32
  from openai.types.chat import ChatCompletionToolParam
33
33
 
34
34
  logger = logging.getLogger(__name__)
@@ -42,14 +42,26 @@ class GenericOpenAIChatAgent(MCPAgent):
42
42
  def __init__(
43
43
  self,
44
44
  *,
45
- openai_client: AsyncOpenAI | None,
45
+ openai_client: AsyncOpenAI | None = None,
46
+ api_key: str | None = None,
47
+ base_url: str | None = None,
46
48
  model_name: str = "gpt-4o-mini",
47
49
  completion_kwargs: dict[str, Any] | None = None,
48
50
  **agent_kwargs: Any,
49
51
  ) -> None:
50
52
  # Accept base-agent settings via **agent_kwargs (e.g., mcp_client, system_prompt, etc.)
51
53
  super().__init__(**agent_kwargs)
52
- self.oai = openai_client
54
+
55
+ # Handle client creation - support both patterns
56
+ if openai_client is not None:
57
+ # Use provided client (backward compatibility)
58
+ self.oai = openai_client
59
+ elif api_key is not None or base_url is not None:
60
+ # Create client from config (new pattern, consistent with other agents)
61
+ self.oai = AsyncOpenAI(api_key=api_key, base_url=base_url)
62
+ else:
63
+ raise ValueError("Either openai_client or (api_key and base_url) must be provided")
64
+
53
65
  self.model_name = model_name
54
66
  self.completion_kwargs: dict[str, Any] = completion_kwargs or {}
55
67
  self.mcp_schemas = []
@@ -94,7 +94,7 @@ class TestBaseMCPAgent:
94
94
 
95
95
  assert agent.mcp_client is not None
96
96
  assert agent.allowed_tools is None
97
- assert agent.disallowed_tools == []
97
+ assert agent.disallowed_tools is None
98
98
  assert agent.initial_screenshot is True
99
99
  assert agent.system_prompt is not None # Default system prompt is set
100
100
 
@@ -241,6 +241,13 @@ class TestBaseMCPAgent:
241
241
  assert "tool2" not in tool_names # Not in allowed list
242
242
  assert "tool3" not in tool_names # In disallowed list
243
243
 
244
+ # Make sure tool schemas are correct
245
+ schemas = agent.get_tool_schemas()
246
+ assert len(schemas) == 1
247
+ assert schemas[0]["name"] == "tool1"
248
+ assert schemas[0]["description"] == "Tool 1"
249
+ assert schemas[0]["parameters"] == {"type": "object"}
250
+
244
251
  @pytest.mark.asyncio
245
252
  async def test_call_tool_success(self):
246
253
  """Test successful tool call."""
@@ -334,7 +341,7 @@ class TestBaseMCPAgent:
334
341
  schemas = agent.get_tool_schemas()
335
342
 
336
343
  # Should include non-lifecycle tools
337
- assert len(schemas) == 1
344
+ assert len(schemas) == 2
338
345
  assert schemas[0]["name"] == "tool1"
339
346
 
340
347
  def test_get_tools_by_server(self):