hud-python 0.4.35__tar.gz → 0.4.37__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (250) hide show
  1. {hud_python-0.4.35 → hud_python-0.4.37}/.gitignore +0 -1
  2. {hud_python-0.4.35 → hud_python-0.4.37}/PKG-INFO +43 -23
  3. {hud_python-0.4.35 → hud_python-0.4.37}/README.md +11 -11
  4. {hud_python-0.4.35 → hud_python-0.4.37}/environments/README.md +5 -5
  5. hud_python-0.4.37/environments/blank/README.md +108 -0
  6. hud_python-0.4.37/environments/blank/controller/README.md +16 -0
  7. hud_python-0.4.37/environments/blank/environment/README.md +16 -0
  8. hud_python-0.4.37/environments/blank/pyproject.toml +19 -0
  9. {hud_python-0.4.35 → hud_python-0.4.37}/environments/browser/README.md +67 -88
  10. hud_python-0.4.37/environments/browser/pyproject.toml +22 -0
  11. hud_python-0.4.37/environments/deepresearch/pyproject.toml +19 -0
  12. {hud_python-0.4.35 → hud_python-0.4.37}/hud/agents/__init__.py +2 -0
  13. hud_python-0.4.37/hud/agents/lite_llm.py +72 -0
  14. {hud_python-0.4.35 → hud_python-0.4.37}/hud/agents/openai_chat_generic.py +21 -7
  15. {hud_python-0.4.35 → hud_python-0.4.37}/hud/agents/tests/test_claude.py +32 -7
  16. {hud_python-0.4.35 → hud_python-0.4.37}/hud/agents/tests/test_openai.py +29 -6
  17. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/__init__.py +228 -79
  18. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/build.py +26 -6
  19. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/dev.py +21 -40
  20. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/eval.py +96 -15
  21. hud_python-0.4.37/hud/cli/flows/tasks.py +388 -0
  22. hud_python-0.4.37/hud/cli/init.py +270 -0
  23. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/pull.py +6 -0
  24. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/push.py +11 -1
  25. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/rl/__init__.py +14 -4
  26. hud_python-0.4.37/hud/cli/rl/celebrate.py +187 -0
  27. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/rl/config.py +15 -8
  28. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/rl/local_runner.py +44 -20
  29. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/rl/remote_runner.py +166 -87
  30. hud_python-0.4.37/hud/cli/rl/viewer.py +141 -0
  31. hud_python-0.4.37/hud/cli/rl/wait_utils.py +89 -0
  32. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/tests/test_build.py +3 -27
  33. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/tests/test_mcp_server.py +1 -12
  34. hud_python-0.4.37/hud/cli/utils/config.py +85 -0
  35. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/utils/docker.py +21 -39
  36. hud_python-0.4.37/hud/cli/utils/env_check.py +196 -0
  37. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/utils/environment.py +4 -3
  38. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/utils/interactive.py +2 -1
  39. hud_python-0.4.37/hud/cli/utils/local_runner.py +204 -0
  40. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/utils/metadata.py +3 -1
  41. hud_python-0.4.37/hud/cli/utils/package_runner.py +292 -0
  42. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/utils/remote_runner.py +4 -1
  43. hud_python-0.4.37/hud/cli/utils/source_hash.py +108 -0
  44. {hud_python-0.4.35 → hud_python-0.4.37}/hud/clients/base.py +1 -1
  45. {hud_python-0.4.35 → hud_python-0.4.37}/hud/clients/fastmcp.py +1 -1
  46. {hud_python-0.4.35 → hud_python-0.4.37}/hud/clients/mcp_use.py +30 -7
  47. {hud_python-0.4.35 → hud_python-0.4.37}/hud/datasets/parallel.py +3 -1
  48. {hud_python-0.4.35 → hud_python-0.4.37}/hud/datasets/runner.py +4 -1
  49. {hud_python-0.4.35 → hud_python-0.4.37}/hud/otel/config.py +1 -1
  50. {hud_python-0.4.35 → hud_python-0.4.37}/hud/otel/context.py +40 -6
  51. {hud_python-0.4.35 → hud_python-0.4.37}/hud/rl/buffer.py +3 -0
  52. {hud_python-0.4.35 → hud_python-0.4.37}/hud/rl/tests/test_learner.py +1 -1
  53. {hud_python-0.4.35 → hud_python-0.4.37}/hud/rl/vllm_adapter.py +1 -1
  54. hud_python-0.4.37/hud/server/server.py +471 -0
  55. hud_python-0.4.37/hud/server/tests/test_add_tool.py +60 -0
  56. hud_python-0.4.37/hud/server/tests/test_context.py +128 -0
  57. hud_python-0.4.37/hud/server/tests/test_mcp_server_handlers.py +44 -0
  58. hud_python-0.4.37/hud/server/tests/test_mcp_server_integration.py +405 -0
  59. hud_python-0.4.37/hud/server/tests/test_mcp_server_more.py +247 -0
  60. hud_python-0.4.37/hud/server/tests/test_run_wrapper.py +53 -0
  61. hud_python-0.4.37/hud/server/tests/test_server_extra.py +166 -0
  62. hud_python-0.4.37/hud/server/tests/test_sigterm_runner.py +78 -0
  63. {hud_python-0.4.35 → hud_python-0.4.37}/hud/settings.py +38 -0
  64. {hud_python-0.4.35 → hud_python-0.4.37}/hud/shared/hints.py +2 -2
  65. {hud_python-0.4.35 → hud_python-0.4.37}/hud/telemetry/job.py +2 -2
  66. {hud_python-0.4.35 → hud_python-0.4.37}/hud/types.py +9 -2
  67. {hud_python-0.4.35 → hud_python-0.4.37}/hud/utils/tasks.py +32 -24
  68. {hud_python-0.4.35 → hud_python-0.4.37}/hud/utils/tests/test_version.py +1 -1
  69. {hud_python-0.4.35 → hud_python-0.4.37}/hud/version.py +1 -1
  70. {hud_python-0.4.35 → hud_python-0.4.37}/pyproject.toml +22 -22
  71. hud_python-0.4.35/environments/browser/pyproject.toml +0 -22
  72. hud_python-0.4.35/hud/cli/flows/tasks.py +0 -255
  73. hud_python-0.4.35/hud/cli/init.py +0 -677
  74. hud_python-0.4.35/hud/server/server.py +0 -244
  75. {hud_python-0.4.35 → hud_python-0.4.37}/LICENSE +0 -0
  76. {hud_python-0.4.35/environments/browser/apps → hud_python-0.4.37/environments/browser/environment}/2048/README.md +0 -0
  77. {hud_python-0.4.35/environments/browser/apps → hud_python-0.4.37/environments/browser/environment}/2048/backend/pyproject.toml +0 -0
  78. {hud_python-0.4.35/environments/browser/apps → hud_python-0.4.37/environments/browser/environment}/README.md +0 -0
  79. {hud_python-0.4.35/environments/browser/apps → hud_python-0.4.37/environments/browser/environment}/todo/README.md +0 -0
  80. {hud_python-0.4.35/environments/browser/apps → hud_python-0.4.37/environments/browser/environment}/todo/backend/pyproject.toml +0 -0
  81. {hud_python-0.4.35 → hud_python-0.4.37}/environments/remote_browser/README.md +0 -0
  82. {hud_python-0.4.35 → hud_python-0.4.37}/environments/remote_browser/pyproject.toml +0 -0
  83. {hud_python-0.4.35 → hud_python-0.4.37}/environments/remote_browser/src/hud_controller/providers/README.md +0 -0
  84. {hud_python-0.4.35 → hud_python-0.4.37}/environments/text_2048/README.md +0 -0
  85. {hud_python-0.4.35 → hud_python-0.4.37}/environments/text_2048/pyproject.toml +0 -0
  86. {hud_python-0.4.35 → hud_python-0.4.37}/examples/README.md +0 -0
  87. {hud_python-0.4.35 → hud_python-0.4.37}/hud/__init__.py +0 -0
  88. {hud_python-0.4.35 → hud_python-0.4.37}/hud/__main__.py +0 -0
  89. {hud_python-0.4.35 → hud_python-0.4.37}/hud/agents/base.py +0 -0
  90. {hud_python-0.4.35 → hud_python-0.4.37}/hud/agents/claude.py +0 -0
  91. {hud_python-0.4.35 → hud_python-0.4.37}/hud/agents/grounded_openai.py +0 -0
  92. {hud_python-0.4.35 → hud_python-0.4.37}/hud/agents/langchain.py +0 -0
  93. {hud_python-0.4.35 → hud_python-0.4.37}/hud/agents/misc/__init__.py +0 -0
  94. {hud_python-0.4.35 → hud_python-0.4.37}/hud/agents/misc/response_agent.py +0 -0
  95. {hud_python-0.4.35 → hud_python-0.4.37}/hud/agents/openai.py +0 -0
  96. {hud_python-0.4.35 → hud_python-0.4.37}/hud/agents/tests/__init__.py +0 -0
  97. {hud_python-0.4.35 → hud_python-0.4.37}/hud/agents/tests/test_base.py +0 -0
  98. {hud_python-0.4.35 → hud_python-0.4.37}/hud/agents/tests/test_client.py +0 -0
  99. {hud_python-0.4.35 → hud_python-0.4.37}/hud/agents/tests/test_grounded_openai_agent.py +0 -0
  100. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/__main__.py +0 -0
  101. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/analyze.py +0 -0
  102. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/clone.py +0 -0
  103. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/debug.py +0 -0
  104. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/flows/__init__.py +0 -0
  105. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/get.py +0 -0
  106. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/list_func.py +0 -0
  107. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/remove.py +0 -0
  108. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/rl/display.py +0 -0
  109. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/rl/gpu.py +0 -0
  110. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/rl/gpu_utils.py +0 -0
  111. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/rl/presets.py +0 -0
  112. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/rl/rl_api.py +0 -0
  113. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/rl/vllm.py +0 -0
  114. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/tests/__init__.py +0 -0
  115. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/tests/test_analyze.py +0 -0
  116. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/tests/test_analyze_metadata.py +0 -0
  117. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/tests/test_cli_init.py +0 -0
  118. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/tests/test_cli_main.py +0 -0
  119. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/tests/test_clone.py +0 -0
  120. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/tests/test_cursor.py +0 -0
  121. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/tests/test_debug.py +0 -0
  122. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/tests/test_list_func.py +0 -0
  123. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/tests/test_main_module.py +0 -0
  124. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/tests/test_pull.py +0 -0
  125. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/tests/test_push.py +0 -0
  126. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/tests/test_registry.py +0 -0
  127. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/tests/test_utils.py +0 -0
  128. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/utils/__init__.py +0 -0
  129. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/utils/cursor.py +0 -0
  130. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/utils/logging.py +0 -0
  131. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/utils/registry.py +0 -0
  132. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/utils/runner.py +0 -0
  133. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/utils/server.py +0 -0
  134. {hud_python-0.4.35 → hud_python-0.4.37}/hud/cli/utils/tasks.py +0 -0
  135. {hud_python-0.4.35 → hud_python-0.4.37}/hud/clients/README.md +0 -0
  136. {hud_python-0.4.35 → hud_python-0.4.37}/hud/clients/__init__.py +0 -0
  137. {hud_python-0.4.35 → hud_python-0.4.37}/hud/clients/tests/__init__.py +0 -0
  138. {hud_python-0.4.35 → hud_python-0.4.37}/hud/clients/tests/test_client_integration.py +0 -0
  139. {hud_python-0.4.35 → hud_python-0.4.37}/hud/clients/tests/test_fastmcp.py +0 -0
  140. {hud_python-0.4.35 → hud_python-0.4.37}/hud/clients/tests/test_mcp_use_retry.py +0 -0
  141. {hud_python-0.4.35 → hud_python-0.4.37}/hud/clients/tests/test_protocol.py +0 -0
  142. {hud_python-0.4.35 → hud_python-0.4.37}/hud/clients/utils/__init__.py +0 -0
  143. {hud_python-0.4.35 → hud_python-0.4.37}/hud/clients/utils/mcp_use_retry.py +0 -0
  144. {hud_python-0.4.35 → hud_python-0.4.37}/hud/clients/utils/retry.py +0 -0
  145. {hud_python-0.4.35 → hud_python-0.4.37}/hud/clients/utils/retry_transport.py +0 -0
  146. {hud_python-0.4.35 → hud_python-0.4.37}/hud/datasets/__init__.py +0 -0
  147. {hud_python-0.4.35 → hud_python-0.4.37}/hud/datasets/utils.py +0 -0
  148. {hud_python-0.4.35 → hud_python-0.4.37}/hud/misc/__init__.py +0 -0
  149. {hud_python-0.4.35 → hud_python-0.4.37}/hud/misc/claude_plays_pokemon.py +0 -0
  150. {hud_python-0.4.35 → hud_python-0.4.37}/hud/native/__init__.py +0 -0
  151. {hud_python-0.4.35 → hud_python-0.4.37}/hud/native/comparator.py +0 -0
  152. {hud_python-0.4.35 → hud_python-0.4.37}/hud/native/tests/__init__.py +0 -0
  153. {hud_python-0.4.35 → hud_python-0.4.37}/hud/native/tests/test_comparator.py +0 -0
  154. {hud_python-0.4.35 → hud_python-0.4.37}/hud/native/tests/test_native_init.py +0 -0
  155. {hud_python-0.4.35 → hud_python-0.4.37}/hud/otel/__init__.py +0 -0
  156. {hud_python-0.4.35 → hud_python-0.4.37}/hud/otel/collector.py +0 -0
  157. {hud_python-0.4.35 → hud_python-0.4.37}/hud/otel/exporters.py +0 -0
  158. {hud_python-0.4.35 → hud_python-0.4.37}/hud/otel/instrumentation.py +0 -0
  159. {hud_python-0.4.35 → hud_python-0.4.37}/hud/otel/processors.py +0 -0
  160. {hud_python-0.4.35 → hud_python-0.4.37}/hud/otel/tests/__init__.py +0 -0
  161. {hud_python-0.4.35 → hud_python-0.4.37}/hud/otel/tests/test_processors.py +0 -0
  162. {hud_python-0.4.35 → hud_python-0.4.37}/hud/py.typed +0 -0
  163. {hud_python-0.4.35 → hud_python-0.4.37}/hud/rl/README.md +0 -0
  164. {hud_python-0.4.35 → hud_python-0.4.37}/hud/rl/__init__.py +0 -0
  165. {hud_python-0.4.35 → hud_python-0.4.37}/hud/rl/actor.py +0 -0
  166. {hud_python-0.4.35 → hud_python-0.4.37}/hud/rl/chat_template.jinja +0 -0
  167. {hud_python-0.4.35 → hud_python-0.4.37}/hud/rl/config.py +0 -0
  168. {hud_python-0.4.35 → hud_python-0.4.37}/hud/rl/distributed.py +0 -0
  169. {hud_python-0.4.35 → hud_python-0.4.37}/hud/rl/learner.py +0 -0
  170. {hud_python-0.4.35 → hud_python-0.4.37}/hud/rl/tests/__init__.py +0 -0
  171. {hud_python-0.4.35 → hud_python-0.4.37}/hud/rl/train.py +0 -0
  172. {hud_python-0.4.35 → hud_python-0.4.37}/hud/rl/types.py +0 -0
  173. {hud_python-0.4.35 → hud_python-0.4.37}/hud/rl/utils/start_vllm_server.sh +0 -0
  174. {hud_python-0.4.35 → hud_python-0.4.37}/hud/rl/utils.py +0 -0
  175. {hud_python-0.4.35 → hud_python-0.4.37}/hud/samples/__init__.py +0 -0
  176. {hud_python-0.4.35 → hud_python-0.4.37}/hud/samples/browser.py +0 -0
  177. {hud_python-0.4.35 → hud_python-0.4.37}/hud/server/__init__.py +0 -0
  178. {hud_python-0.4.35 → hud_python-0.4.37}/hud/server/context.py +0 -0
  179. {hud_python-0.4.35 → hud_python-0.4.37}/hud/server/helper/__init__.py +0 -0
  180. {hud_python-0.4.35 → hud_python-0.4.37}/hud/server/low_level.py +0 -0
  181. {hud_python-0.4.35 → hud_python-0.4.37}/hud/server/tests/__init__.py +0 -0
  182. {hud_python-0.4.35 → hud_python-0.4.37}/hud/shared/__init__.py +0 -0
  183. {hud_python-0.4.35 → hud_python-0.4.37}/hud/shared/exceptions.py +0 -0
  184. {hud_python-0.4.35 → hud_python-0.4.37}/hud/shared/requests.py +0 -0
  185. {hud_python-0.4.35 → hud_python-0.4.37}/hud/shared/tests/__init__.py +0 -0
  186. {hud_python-0.4.35 → hud_python-0.4.37}/hud/shared/tests/test_exceptions.py +0 -0
  187. {hud_python-0.4.35 → hud_python-0.4.37}/hud/shared/tests/test_requests.py +0 -0
  188. {hud_python-0.4.35 → hud_python-0.4.37}/hud/telemetry/__init__.py +0 -0
  189. {hud_python-0.4.35 → hud_python-0.4.37}/hud/telemetry/instrument.py +0 -0
  190. {hud_python-0.4.35 → hud_python-0.4.37}/hud/telemetry/replay.py +0 -0
  191. {hud_python-0.4.35 → hud_python-0.4.37}/hud/telemetry/tests/__init__.py +0 -0
  192. {hud_python-0.4.35 → hud_python-0.4.37}/hud/telemetry/tests/test_replay.py +0 -0
  193. {hud_python-0.4.35 → hud_python-0.4.37}/hud/telemetry/tests/test_trace.py +0 -0
  194. {hud_python-0.4.35 → hud_python-0.4.37}/hud/telemetry/trace.py +0 -0
  195. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/__init__.py +0 -0
  196. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/base.py +0 -0
  197. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/bash.py +0 -0
  198. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/computer/__init__.py +0 -0
  199. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/computer/anthropic.py +0 -0
  200. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/computer/hud.py +0 -0
  201. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/computer/openai.py +0 -0
  202. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/computer/settings.py +0 -0
  203. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/edit.py +0 -0
  204. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/executors/__init__.py +0 -0
  205. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/executors/base.py +0 -0
  206. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/executors/pyautogui.py +0 -0
  207. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/executors/tests/__init__.py +0 -0
  208. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/executors/tests/test_base_executor.py +0 -0
  209. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
  210. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/executors/xdo.py +0 -0
  211. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/grounding/__init__.py +0 -0
  212. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/grounding/config.py +0 -0
  213. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/grounding/grounded_tool.py +0 -0
  214. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/grounding/grounder.py +0 -0
  215. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/grounding/tests/__init__.py +0 -0
  216. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/grounding/tests/test_grounded_tool.py +0 -0
  217. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/playwright.py +0 -0
  218. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/response.py +0 -0
  219. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/submit.py +0 -0
  220. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/tests/__init__.py +0 -0
  221. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/tests/test_base.py +0 -0
  222. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/tests/test_bash.py +0 -0
  223. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/tests/test_bash_extended.py +0 -0
  224. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/tests/test_computer.py +0 -0
  225. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/tests/test_computer_actions.py +0 -0
  226. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/tests/test_edit.py +0 -0
  227. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/tests/test_init.py +0 -0
  228. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/tests/test_playwright_tool.py +0 -0
  229. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/tests/test_response.py +0 -0
  230. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/tests/test_tools.py +0 -0
  231. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/tests/test_tools_init.py +0 -0
  232. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/tests/test_utils.py +0 -0
  233. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/types.py +0 -0
  234. {hud_python-0.4.35 → hud_python-0.4.37}/hud/tools/utils.py +0 -0
  235. {hud_python-0.4.35 → hud_python-0.4.37}/hud/utils/__init__.py +0 -0
  236. {hud_python-0.4.35 → hud_python-0.4.37}/hud/utils/agent_factories.py +0 -0
  237. {hud_python-0.4.35 → hud_python-0.4.37}/hud/utils/async_utils.py +0 -0
  238. {hud_python-0.4.35 → hud_python-0.4.37}/hud/utils/group_eval.py +0 -0
  239. {hud_python-0.4.35 → hud_python-0.4.37}/hud/utils/hud_console.py +0 -0
  240. {hud_python-0.4.35 → hud_python-0.4.37}/hud/utils/mcp.py +0 -0
  241. {hud_python-0.4.35 → hud_python-0.4.37}/hud/utils/pretty_errors.py +0 -0
  242. {hud_python-0.4.35 → hud_python-0.4.37}/hud/utils/progress.py +0 -0
  243. {hud_python-0.4.35 → hud_python-0.4.37}/hud/utils/telemetry.py +0 -0
  244. {hud_python-0.4.35 → hud_python-0.4.37}/hud/utils/tests/__init__.py +0 -0
  245. {hud_python-0.4.35 → hud_python-0.4.37}/hud/utils/tests/test_async_utils.py +0 -0
  246. {hud_python-0.4.35 → hud_python-0.4.37}/hud/utils/tests/test_init.py +0 -0
  247. {hud_python-0.4.35 → hud_python-0.4.37}/hud/utils/tests/test_mcp.py +0 -0
  248. {hud_python-0.4.35 → hud_python-0.4.37}/hud/utils/tests/test_progress.py +0 -0
  249. {hud_python-0.4.35 → hud_python-0.4.37}/hud/utils/tests/test_telemetry.py +0 -0
  250. {hud_python-0.4.35 → hud_python-0.4.37}/hud/utils/tool_shorthand.py +0 -0
@@ -22,7 +22,6 @@ uv.lock
22
22
 
23
23
  # Test files
24
24
  /*.ipynb
25
- test.json
26
25
  TODO.md
27
26
 
28
27
  .coverage
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.4.35
3
+ Version: 0.4.37
4
4
  Summary: SDK for the HUD platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-python
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -36,11 +36,13 @@ Classifier: Programming Language :: Python :: 3.12
36
36
  Classifier: Programming Language :: Python :: 3.13
37
37
  Requires-Python: <3.13,>=3.11
38
38
  Requires-Dist: anthropic
39
+ Requires-Dist: blessed>=1.20.0
39
40
  Requires-Dist: datasets>=2.14.0
40
41
  Requires-Dist: httpx<1,>=0.23.0
41
42
  Requires-Dist: hud-fastmcp-python-sdk>=0.1.2
42
43
  Requires-Dist: hud-mcp-python-sdk>=3.13.2
43
- Requires-Dist: hud-mcp-use-python-sdk>=2.3.16
44
+ Requires-Dist: hud-mcp-use-python-sdk==2.3.19
45
+ Requires-Dist: litellm>=1.55.0
44
46
  Requires-Dist: numpy>=1.24.0
45
47
  Requires-Dist: openai
46
48
  Requires-Dist: opentelemetry-api>=1.34.1
@@ -50,8 +52,8 @@ Requires-Dist: opentelemetry-sdk>=1.34.1
50
52
  Requires-Dist: pathspec>=0.12.1
51
53
  Requires-Dist: pillow>=11.1.0
52
54
  Requires-Dist: prompt-toolkit==3.0.51
53
- Requires-Dist: pydantic-settings<3,>=2
54
- Requires-Dist: pydantic<3,>=2
55
+ Requires-Dist: pydantic-settings<3,>=2.2
56
+ Requires-Dist: pydantic<3,>=2.6
55
57
  Requires-Dist: questionary==2.1.0
56
58
  Requires-Dist: rich>=13.0.0
57
59
  Requires-Dist: toml>=0.10.2
@@ -59,7 +61,9 @@ Requires-Dist: typer>=0.9.0
59
61
  Requires-Dist: watchfiles>=0.21.0
60
62
  Requires-Dist: wrapt>=1.14.0
61
63
  Provides-Extra: agent
64
+ Requires-Dist: aiodocker>=0.24.0; extra == 'agent'
62
65
  Requires-Dist: dotenv>=0.9.9; extra == 'agent'
66
+ Requires-Dist: inspect-ai>=0.3.80; extra == 'agent'
63
67
  Requires-Dist: ipykernel; extra == 'agent'
64
68
  Requires-Dist: ipython<9; extra == 'agent'
65
69
  Requires-Dist: jupyter-client; extra == 'agent'
@@ -67,8 +71,21 @@ Requires-Dist: jupyter-core; extra == 'agent'
67
71
  Requires-Dist: langchain; extra == 'agent'
68
72
  Requires-Dist: langchain-anthropic; extra == 'agent'
69
73
  Requires-Dist: langchain-openai; extra == 'agent'
74
+ Requires-Dist: pillow>=11.1.0; extra == 'agent'
75
+ Requires-Dist: playwright; extra == 'agent'
76
+ Requires-Dist: pyautogui>=0.9.54; extra == 'agent'
77
+ Requires-Dist: pyright==1.1.401; extra == 'agent'
78
+ Requires-Dist: pytest-asyncio; extra == 'agent'
79
+ Requires-Dist: pytest-cov; extra == 'agent'
80
+ Requires-Dist: pytest-mock; extra == 'agent'
81
+ Requires-Dist: pytest<9,>=8.1.1; extra == 'agent'
82
+ Requires-Dist: ruff>=0.11.8; extra == 'agent'
83
+ Requires-Dist: setuptools; extra == 'agent'
84
+ Requires-Dist: textdistance<5,>=4.5.0; extra == 'agent'
70
85
  Provides-Extra: agents
86
+ Requires-Dist: aiodocker>=0.24.0; extra == 'agents'
71
87
  Requires-Dist: dotenv>=0.9.9; extra == 'agents'
88
+ Requires-Dist: inspect-ai>=0.3.80; extra == 'agents'
72
89
  Requires-Dist: ipykernel; extra == 'agents'
73
90
  Requires-Dist: ipython<9; extra == 'agents'
74
91
  Requires-Dist: jupyter-client; extra == 'agents'
@@ -76,6 +93,17 @@ Requires-Dist: jupyter-core; extra == 'agents'
76
93
  Requires-Dist: langchain; extra == 'agents'
77
94
  Requires-Dist: langchain-anthropic; extra == 'agents'
78
95
  Requires-Dist: langchain-openai; extra == 'agents'
96
+ Requires-Dist: pillow>=11.1.0; extra == 'agents'
97
+ Requires-Dist: playwright; extra == 'agents'
98
+ Requires-Dist: pyautogui>=0.9.54; extra == 'agents'
99
+ Requires-Dist: pyright==1.1.401; extra == 'agents'
100
+ Requires-Dist: pytest-asyncio; extra == 'agents'
101
+ Requires-Dist: pytest-cov; extra == 'agents'
102
+ Requires-Dist: pytest-mock; extra == 'agents'
103
+ Requires-Dist: pytest<9,>=8.1.1; extra == 'agents'
104
+ Requires-Dist: ruff>=0.11.8; extra == 'agents'
105
+ Requires-Dist: setuptools; extra == 'agents'
106
+ Requires-Dist: textdistance<5,>=4.5.0; extra == 'agents'
79
107
  Provides-Extra: dev
80
108
  Requires-Dist: aiodocker>=0.24.0; extra == 'dev'
81
109
  Requires-Dist: dotenv>=0.9.9; extra == 'dev'
@@ -100,14 +128,6 @@ Requires-Dist: setuptools; extra == 'dev'
100
128
  Requires-Dist: textdistance<5,>=4.5.0; extra == 'dev'
101
129
  Provides-Extra: rl
102
130
  Requires-Dist: bitsandbytes>=0.41.0; (sys_platform == 'linux') and extra == 'rl'
103
- Requires-Dist: dotenv>=0.9.9; extra == 'rl'
104
- Requires-Dist: ipykernel; extra == 'rl'
105
- Requires-Dist: ipython<9; extra == 'rl'
106
- Requires-Dist: jupyter-client; extra == 'rl'
107
- Requires-Dist: jupyter-core; extra == 'rl'
108
- Requires-Dist: langchain; extra == 'rl'
109
- Requires-Dist: langchain-anthropic; extra == 'rl'
110
- Requires-Dist: langchain-openai; extra == 'rl'
111
131
  Requires-Dist: liger-kernel>=0.5.0; (sys_platform == 'linux') and extra == 'rl'
112
132
  Requires-Dist: peft>=0.17.1; extra == 'rl'
113
133
  Requires-Dist: vllm==0.10.1.1; extra == 'rl'
@@ -138,8 +158,8 @@ OSS RL environment + evals toolkit. Wrap software as environments, run benchmark
138
158
  ## Highlights
139
159
 
140
160
  - 🚀 **[MCP environment skeleton](https://docs.hud.so/core-concepts/mcp-protocol)** – any agent can call any environment.
141
- - ⚡️ **[Live telemetry](https://app.hud.so)** – inspect every tool call, observation, and reward in real time.
142
- - 🗂️ **[Public benchmarks](https://app.hud.so/leaderboards)** – OSWorld-Verified, SheetBench-50, and more.
161
+ - ⚡️ **[Live telemetry](https://hud.so)** – inspect every tool call, observation, and reward in real time.
162
+ - 🗂️ **[Public benchmarks](https://hud.so/leaderboards)** – OSWorld-Verified, SheetBench-50, and more.
143
163
  - 🌱 **[Reinforcement learning built-in](rl/)** – Verifiers gym pipelines for GRPO on any environment.
144
164
  - 🌐 **[Cloud browsers](environments/remote_browser/)** – AnchorBrowser, Steel, BrowserBase integrations for browser automation.
145
165
  - 🛠️ **[Hot-reload dev loop](environments/README.md#phase-5-hot-reload-development-with-cursor-agent)** – `hud dev` for iterating on environments without rebuilds.
@@ -185,14 +205,14 @@ from hud.agents import ClaudeAgent
185
205
  from hud.datasets import Task # See docs: https://docs.hud.so/reference/tasks
186
206
 
187
207
  async def main() -> None:
188
- with hud.trace("Quick Start 2048"): # All telemetry works for any MCP-based agent (see https://app.hud.so)
208
+ with hud.trace("Quick Start 2048"): # All telemetry works for any MCP-based agent (see https://hud.so)
189
209
  task = {
190
210
  "prompt": "Reach 64 in 2048.",
191
211
  "mcp_config": {
192
212
  "hud": {
193
213
  "url": "https://mcp.hud.so/v3/mcp", # HUD's cloud MCP server (see https://docs.hud.so/core-concepts/architecture)
194
214
  "headers": {
195
- "Authorization": f"Bearer {settings.api_key}", # Get your key at https://app.hud.so
215
+ "Authorization": f"Bearer {settings.api_key}", # Get your key at https://hud.so
196
216
  "Mcp-Image": "hudpython/hud-text-2048:v1.2" # Docker image from https://hub.docker.com/u/hudpython
197
217
  }
198
218
  }
@@ -219,7 +239,7 @@ async def main() -> None:
219
239
  asyncio.run(main())
220
240
  ```
221
241
 
222
- The above example lets the agent play 2048 ([See replay](https://app.hud.so/trace/6feed7bd-5f67-4d66-b77f-eb1e3164604f))
242
+ The above example lets the agent play 2048 ([See replay](https://hud.so/trace/6feed7bd-5f67-4d66-b77f-eb1e3164604f))
223
243
 
224
244
  ![Agent playing 2048](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/2048_1.gif)
225
245
 
@@ -250,7 +270,7 @@ Supports multi‑turn RL for both:
250
270
  - Language‑only models (e.g., `Qwen/Qwen2.5-7B-Instruct`)
251
271
  - Vision‑Language models (e.g., `Qwen/Qwen2.5-VL-3B-Instruct`)
252
272
 
253
- By default, `hud rl` provisions a persistent server and trainer in the cloud, streams telemetry to `app.hud.so`, and lets you monitor/manage models at `app.hud.so/models`. Use `--local` to run entirely on your machines (typically 2+ GPUs: one for vLLM, the rest for training).
273
+ By default, `hud rl` provisions a persistent server and trainer in the cloud, streams telemetry to `hud.so`, and lets you monitor/manage models at `hud.so/models`. Use `--local` to run entirely on your machines (typically 2+ GPUs: one for vLLM, the rest for training).
254
274
 
255
275
  Any HUD MCP environment and evaluation works with our RL pipeline (including remote configurations). See the guided docs: `https://docs.hud.so/train-agents/quickstart`.
256
276
 
@@ -260,7 +280,7 @@ This is Claude Computer Use running on our proprietary financial analyst benchma
260
280
 
261
281
  ![Trace screenshot](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/trace_sheet.gif)
262
282
 
263
- > [See this trace on _app.hud.so_](https://app.hud.so/trace/9e212e9e-3627-4f1f-9eb5-c6d03c59070a)
283
+ > [See this trace on _hud.so_](https://hud.so/trace/9e212e9e-3627-4f1f-9eb5-c6d03c59070a)
264
284
 
265
285
  This example runs the full dataset (only takes ~20 minutes) using [run_evaluation.py](examples/run_evaluation.py):
266
286
 
@@ -286,7 +306,7 @@ results = await run_dataset(
286
306
  print(f"Average reward: {sum(r.reward for r in results) / len(results):.2f}")
287
307
  ```
288
308
 
289
- > Running a dataset creates a job and streams results to the [app.hud.so](https://app.hud.so) platform for analysis and [leaderboard submission](https://docs.hud.so/evaluate-agents/leaderboards).
309
+ > Running a dataset creates a job and streams results to the [hud.so](https://hud.so) platform for analysis and [leaderboard submission](https://docs.hud.so/evaluate-agents/leaderboards).
290
310
 
291
311
  ## Building Environments (MCP)
292
312
 
@@ -377,7 +397,7 @@ Tools
377
397
  hud push # needs docker login, hud api key
378
398
  ```
379
399
 
380
- 5. Now you can use `mcp.hud.so` to launch 100s of instances of this environment in parallel with any agent, and see everything live on [app.hud.so](https://app.hud.so):
400
+ 5. Now you can use `mcp.hud.so` to launch 100s of instances of this environment in parallel with any agent, and see everything live on [hud.so](https://hud.so):
381
401
 
382
402
  ```python
383
403
  from hud.agents import ClaudeAgent
@@ -408,7 +428,7 @@ result = await ClaudeAgent().run({ # See all agents: https://docs.hud.so/refere
408
428
 
409
429
  ## Leaderboards & benchmarks
410
430
 
411
- All leaderboards are publicly available on [app.hud.so/leaderboards](https://app.hud.so/leaderboards) (see [docs](https://docs.hud.so/evaluate-agents/leaderboards))
431
+ All leaderboards are publicly available on [hud.so/leaderboards](https://hud.so/leaderboards) (see [docs](https://docs.hud.so/evaluate-agents/leaderboards))
412
432
 
413
433
  ![Leaderboard](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/leaderboards_3.png)
414
434
 
@@ -422,7 +442,7 @@ Using the [`run_dataset`](https://docs.hud.so/reference/tasks#run_dataset) funct
422
442
  %%{init: {"theme": "neutral", "themeVariables": {"fontSize": "14px"}} }%%
423
443
  graph LR
424
444
  subgraph "Platform"
425
- Dashboard["📊 app.hud.so"]
445
+ Dashboard["📊 hud.so"]
426
446
  API["🔌 mcp.hud.so"]
427
447
  end
428
448
 
@@ -23,8 +23,8 @@ OSS RL environment + evals toolkit. Wrap software as environments, run benchmark
23
23
  ## Highlights
24
24
 
25
25
  - 🚀 **[MCP environment skeleton](https://docs.hud.so/core-concepts/mcp-protocol)** – any agent can call any environment.
26
- - ⚡️ **[Live telemetry](https://app.hud.so)** – inspect every tool call, observation, and reward in real time.
27
- - 🗂️ **[Public benchmarks](https://app.hud.so/leaderboards)** – OSWorld-Verified, SheetBench-50, and more.
26
+ - ⚡️ **[Live telemetry](https://hud.so)** – inspect every tool call, observation, and reward in real time.
27
+ - 🗂️ **[Public benchmarks](https://hud.so/leaderboards)** – OSWorld-Verified, SheetBench-50, and more.
28
28
  - 🌱 **[Reinforcement learning built-in](rl/)** – Verifiers gym pipelines for GRPO on any environment.
29
29
  - 🌐 **[Cloud browsers](environments/remote_browser/)** – AnchorBrowser, Steel, BrowserBase integrations for browser automation.
30
30
  - 🛠️ **[Hot-reload dev loop](environments/README.md#phase-5-hot-reload-development-with-cursor-agent)** – `hud dev` for iterating on environments without rebuilds.
@@ -70,14 +70,14 @@ from hud.agents import ClaudeAgent
70
70
  from hud.datasets import Task # See docs: https://docs.hud.so/reference/tasks
71
71
 
72
72
  async def main() -> None:
73
- with hud.trace("Quick Start 2048"): # All telemetry works for any MCP-based agent (see https://app.hud.so)
73
+ with hud.trace("Quick Start 2048"): # All telemetry works for any MCP-based agent (see https://hud.so)
74
74
  task = {
75
75
  "prompt": "Reach 64 in 2048.",
76
76
  "mcp_config": {
77
77
  "hud": {
78
78
  "url": "https://mcp.hud.so/v3/mcp", # HUD's cloud MCP server (see https://docs.hud.so/core-concepts/architecture)
79
79
  "headers": {
80
- "Authorization": f"Bearer {settings.api_key}", # Get your key at https://app.hud.so
80
+ "Authorization": f"Bearer {settings.api_key}", # Get your key at https://hud.so
81
81
  "Mcp-Image": "hudpython/hud-text-2048:v1.2" # Docker image from https://hub.docker.com/u/hudpython
82
82
  }
83
83
  }
@@ -104,7 +104,7 @@ async def main() -> None:
104
104
  asyncio.run(main())
105
105
  ```
106
106
 
107
- The above example let's the agent play 2048 ([See replay](https://app.hud.so/trace/6feed7bd-5f67-4d66-b77f-eb1e3164604f))
107
+ The above example lets the agent play 2048 ([See replay](https://hud.so/trace/6feed7bd-5f67-4d66-b77f-eb1e3164604f))
108
108
 
109
109
  ![Agent playing 2048](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/2048_1.gif)
110
110
 
@@ -135,7 +135,7 @@ Supports multi‑turn RL for both:
135
135
  - Language‑only models (e.g., `Qwen/Qwen2.5-7B-Instruct`)
136
136
  - Vision‑Language models (e.g., `Qwen/Qwen2.5-VL-3B-Instruct`)
137
137
 
138
- By default, `hud rl` provisions a persistant server and trainer in the cloud, streams telemetry to `app.hud.so`, and lets you monitor/manage models at `app.hud.so/models`. Use `--local` to run entirely on your machines (typically 2+ GPUs: one for vLLM, the rest for training).
138
+ By default, `hud rl` provisions a persistent server and trainer in the cloud, streams telemetry to `hud.so`, and lets you monitor/manage models at `hud.so/models`. Use `--local` to run entirely on your machines (typically 2+ GPUs: one for vLLM, the rest for training).
139
139
 
140
140
  Any HUD MCP environment and evaluation works with our RL pipeline (including remote configurations). See the guided docs: `https://docs.hud.so/train-agents/quickstart`.
141
141
 
@@ -145,7 +145,7 @@ This is Claude Computer Use running on our proprietary financial analyst benchma
145
145
 
146
146
  ![Trace screenshot](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/trace_sheet.gif)
147
147
 
148
- > [See this trace on _app.hud.so_](https://app.hud.so/trace/9e212e9e-3627-4f1f-9eb5-c6d03c59070a)
148
+ > [See this trace on _hud.so_](https://hud.so/trace/9e212e9e-3627-4f1f-9eb5-c6d03c59070a)
149
149
 
150
150
  This example runs the full dataset (only takes ~20 minutes) using [run_evaluation.py](examples/run_evaluation.py):
151
151
 
@@ -171,7 +171,7 @@ results = await run_dataset(
171
171
  print(f"Average reward: {sum(r.reward for r in results) / len(results):.2f}")
172
172
  ```
173
173
 
174
- > Running a dataset creates a job and streams results to the [app.hud.so](https://app.hud.so) platform for analysis and [leaderboard submission](https://docs.hud.so/evaluate-agents/leaderboards).
174
+ > Running a dataset creates a job and streams results to the [hud.so](https://hud.so) platform for analysis and [leaderboard submission](https://docs.hud.so/evaluate-agents/leaderboards).
175
175
 
176
176
  ## Building Environments (MCP)
177
177
 
@@ -262,7 +262,7 @@ Tools
262
262
  hud push # needs docker login, hud api key
263
263
  ```
264
264
 
265
- 5. Now you can use `mcp.hud.so` to launch 100s of instances of this environment in parallel with any agent, and see everything live on [app.hud.so](https://app.hud.so):
265
+ 5. Now you can use `mcp.hud.so` to launch 100s of instances of this environment in parallel with any agent, and see everything live on [hud.so](https://hud.so):
266
266
 
267
267
  ```python
268
268
  from hud.agents import ClaudeAgent
@@ -293,7 +293,7 @@ result = await ClaudeAgent().run({ # See all agents: https://docs.hud.so/refere
293
293
 
294
294
  ## Leaderboards & benchmarks
295
295
 
296
- All leaderboards are publicly available on [app.hud.so/leaderboards](https://app.hud.so/leaderboards) (see [docs](https://docs.hud.so/evaluate-agents/leaderboards))
296
+ All leaderboards are publicly available on [hud.so/leaderboards](https://hud.so/leaderboards) (see [docs](https://docs.hud.so/evaluate-agents/leaderboards))
297
297
 
298
298
  ![Leaderboard](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/leaderboards_3.png)
299
299
 
@@ -307,7 +307,7 @@ Using the [`run_dataset`](https://docs.hud.so/reference/tasks#run_dataset) funct
307
307
  %%{init: {"theme": "neutral", "themeVariables": {"fontSize": "14px"}} }%%
308
308
  graph LR
309
309
  subgraph "Platform"
310
- Dashboard["📊 app.hud.so"]
310
+ Dashboard["📊 hud.so"]
311
311
  API["🔌 mcp.hud.so"]
312
312
  end
313
313
 
@@ -495,7 +495,7 @@ from hud.agents import ClaudeAgent
495
495
  from hud.clients import MCPClient
496
496
 
497
497
  async def main():
498
- # `trace` captures *everything* that happens and sends it to app.hud.so
498
+ # `trace` captures *everything* that happens and sends it to hud.so
499
499
  with hud.trace("local_test"):
500
500
  task = Task(
501
501
  prompt="Complete the task",
@@ -524,7 +524,7 @@ async def main():
524
524
  asyncio.run(main())
525
525
  ```
526
526
 
527
- The `trace` context manager sends a full timeline of agent actions, tool calls, and rewards to app.hud.so – perfect for debugging.
527
+ The `trace` context manager sends a full timeline of agent actions, tool calls, and rewards to hud.so – perfect for debugging.
528
528
 
529
529
  See `examples/01_hello_2048.py` and `examples/task_with_setup_eval.py` for larger end-to-end demos.
530
530
 
@@ -532,7 +532,7 @@ See `examples/01_hello_2048.py` and `examples/task_with_setup_eval.py` for large
532
532
 
533
533
  ## Phase 4 – Remote Deployment & HUD Runner
534
534
 
535
- **Goal →** the exact same image runs in parallel on hundreds of instances, and exposes more telemetry so the app.hud.so can visualise the whole lifecycle.
535
+ **Goal →** the exact same image runs in parallel on hundreds of instances, and exposes more telemetry so that hud.so can visualise the whole lifecycle.
536
536
 
537
537
  ### 1. Publish your image
538
538
 
@@ -595,11 +595,11 @@ async def initialize_environment(session=None, progress_token=None):
595
595
  await send(100, "ready")
596
596
  ```
597
597
 
598
- Those messages are displayed live on app.hud.so alongside resource graphs – perfect feedback while you wait.
598
+ Those messages are displayed live on hud.so alongside resource graphs – perfect feedback while you wait.
599
599
 
600
600
  ### 4. Live telemetry (`telemetry://live`) (Optional)
601
601
 
602
- Expose a resource named `telemetry://live` exactly like in `environments/browser/src/hud_controller/server.py` to return live url to be displayed on app.hud.so.
602
+ Expose a resource named `telemetry://live` exactly like in `environments/browser/src/hud_controller/server.py` to return a live URL to be displayed on hud.so.
603
603
 
604
604
  Once all of the above works you can unleash *hundreds* of concurrent agents on your new environment.
605
605
 
@@ -0,0 +1,108 @@
1
+ # test-test
2
+
3
+ ## Environment design pattern
4
+ - Controller (Think of this as a frontend in web development)
5
+ - Creates the UX and manages the lifecycle of an app (in this case for an agent)
6
+ - Define `mcp = MCPServer()` and register `@mcp.tool` as tools the agent can interact with
7
+ - Environment (Think of this as a backend in web development)
8
+ - Owns all long‑lived states of the environment and exposes the environment data structure
9
+ - Expose simple HTTP endpoints (`/health`, `/act`, `/reset`, `/state`)
10
+
11
+ IMPORTANT: Make sure all logs go to stderr instead of stdout, which is reserved for MCP communication.
12
+
13
+ ### Testing your environment
14
+ ```bash
15
+ # 1. Configure your API keys (optional - only needed for evaluation)
16
+ # Edit .env file to add your HUD_API_KEY and ANTHROPIC_API_KEY
17
+
18
+ # 2. Start the environment (optional: with --inspector or --interactive)
19
+ hud dev --build --interactive
20
+
21
+ # 3. Choose your preferred way to test:
22
+
23
+ # Option A: Run the task with Claude (requires ANTHROPIC_API_KEY)
24
+ hud eval tasks.json --agent claude
25
+
26
+ # Option B: Interactive notebook test_env.ipynb (great for learning!)
27
+
28
+ # Option C: Simple Python script (runs all tasks from tasks.json)
29
+ python test_task.py
30
+ ```
31
+
32
+ ## Iterating on your environment
33
+ This is usually the process for making any environment better:
34
+ ```bash
35
+ # 1. Start the environment and interact with it directly (or give MCP server to an agent):
36
+ hud dev --build --interactive
37
+
38
+ # 2. If the environment cannot start or fails inexplicably:
39
+ hud debug test_env:dev # Or your env name that appears when you run hud dev
40
+ # After fixing the error, go back to 1.
41
+
42
+ # 3. When the environment is in a stable state:
43
+ hud build
44
+ hud push # Requires docker login
45
+
46
+ # 4. Once the newest version is pushed, make sure your tasks reference it, then run:
47
+ hud rl
48
+ # This is a good test to see if your environment and tasks are high quality!
49
+ ```
50
+ ## Layout
51
+ ```
52
+ controller/
53
+ __init__.py # mcp + shared HTTP client
54
+ __main__.py # python -m controller → mcp.run()
55
+ hooks.py # @mcp.initialize / @mcp.shutdown
56
+ tools.py # @mcp.tool act / setup / evaluate
57
+
58
+ ./environment
59
+ ├── __init__.py
60
+ └── server.py # FastAPI app: /health, /act, /reset, /state
61
+ ```
62
+
63
+ ## Publishing Your Environment
64
+
65
+ Once your environment is ready, you can share it with the community:
66
+
67
+ ### 1. Push to Registry
68
+ ```bash
69
+ # Build and push your environment (requires docker hub login and hud api key)
70
+ hud build
71
+ hud push
72
+ ```
73
+
74
+ ### 2. Create a Dataset
75
+
76
+ Create a dataset on HuggingFace with your tasks:
77
+
78
+ **Option A: Upload manually**
79
+ 1. Upload your `tasks.json` to HuggingFace
80
+ 2. Make sure it's **public** to appear on leaderboards
81
+
82
+ **Option B: Use the SDK**
83
+ ```python
84
+ from hud.datasets import save_tasks
85
+ import json
86
+
87
+ # Load your tasks
88
+ with open("tasks.json") as f:
89
+ tasks = json.load(f)
90
+
91
+ # Push to HuggingFace
92
+ save_tasks(tasks, repo_id="your-org/your-dataset")
93
+ ```
94
+
95
+ ### 3. Run and Track Performance
96
+
97
+ ```bash
98
+ # Run Claude on your benchmark
99
+ hud eval "your-org/your-dataset" --agent claude
100
+
101
+ # View results at:
102
+ # hud.so/leaderboards/your-org/your-dataset
103
+ ```
104
+
105
+ **Note**: Only public HuggingFace datasets appear as leaderboards!
106
+
107
+ 📚 Learn more: [Creating Benchmarks](https://docs.hud.so/evaluate-agents/create-benchmarks) | [Leaderboards](https://docs.hud.so/evaluate-agents/leaderboards)
108
+
@@ -0,0 +1,16 @@
1
+ # Controller
2
+
3
+ Frontend for the agent: defines tools, minimal state, calls the environment over HTTP.
4
+
5
+ What to implement
6
+ - Shared client in `__init__.py` (one `httpx.AsyncClient`)
7
+ - Lifecycle in `hooks.py` (`@mcp.initialize`/`@mcp.shutdown`)
8
+ - Tools in `tools.py` (`@mcp.tool`) — keep logic thin; docstrings = descriptions
9
+
10
+ Run
11
+ ```bash
12
+ hud run controller --transport http --reload
13
+ # Helper endpoints: http://localhost:8765/hud and /hud/tools
14
+ ```
15
+
16
+ Principle: the controller is UX, not state. Keep long‑lived state in the environment.
@@ -0,0 +1,16 @@
1
+ # Environment
2
+
3
+ Backend service: owns state and exposes HTTP APIs the controller calls.
4
+
5
+ Endpoints (FastAPI)
6
+ - `GET /health` → {status: ok}
7
+ - `POST /act` → increments counter and returns {count}
8
+ - `POST /reset` → resets counter
9
+ - `GET /state` → returns {count}
10
+
11
+ Run (dev)
12
+ ```bash
13
+ uv run uvicorn environment.server:app --reload --port 8005
14
+ ```
15
+
16
+ Principle: treat like a backend. Keep long‑lived state here; add endpoints as tools need them.
@@ -0,0 +1,19 @@
1
+ [project]
2
+ name = "test_test"
3
+ version = "0.1.0"
4
+ description = "A minimal HUD environment"
5
+ requires-python = ">=3.11"
6
+ dependencies = [ "hud-python==0.4.37", "fastapi", "uvicorn[standard]", "httpx>=0.28.1",]
7
+
8
+ [build-system]
9
+ requires = [ "hatchling",]
10
+ build-backend = "hatchling.build"
11
+
12
+ [tool.hud]
13
+ image = "test_test:dev"
14
+
15
+ [tool.hatch.metadata]
16
+ allow-direct-references = true
17
+
18
+ [tool.hatch.build.targets.wheel]
19
+ packages = [ "controller", "environment",]