hud-python 0.4.36__tar.gz → 0.4.38__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (248) hide show
  1. {hud_python-0.4.36 → hud_python-0.4.38}/PKG-INFO +14 -12
  2. {hud_python-0.4.36 → hud_python-0.4.38}/README.md +11 -11
  3. {hud_python-0.4.36 → hud_python-0.4.38}/environments/README.md +5 -5
  4. {hud_python-0.4.36 → hud_python-0.4.38}/environments/blank/README.md +20 -4
  5. {hud_python-0.4.36 → hud_python-0.4.38}/environments/blank/pyproject.toml +1 -1
  6. {hud_python-0.4.36 → hud_python-0.4.38}/environments/browser/README.md +1 -1
  7. {hud_python-0.4.36 → hud_python-0.4.38}/environments/browser/pyproject.toml +2 -2
  8. {hud_python-0.4.36 → hud_python-0.4.38}/environments/deepresearch/pyproject.toml +1 -1
  9. {hud_python-0.4.36 → hud_python-0.4.38}/hud/agents/__init__.py +2 -0
  10. hud_python-0.4.38/hud/agents/lite_llm.py +72 -0
  11. {hud_python-0.4.36 → hud_python-0.4.38}/hud/agents/openai_chat_generic.py +21 -7
  12. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/__init__.py +19 -4
  13. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/build.py +17 -2
  14. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/dev.py +1 -1
  15. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/eval.py +93 -13
  16. hud_python-0.4.38/hud/cli/flows/tasks.py +388 -0
  17. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/init.py +1 -1
  18. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/push.py +9 -0
  19. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/rl/__init__.py +14 -4
  20. hud_python-0.4.38/hud/cli/rl/celebrate.py +187 -0
  21. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/rl/config.py +15 -8
  22. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/rl/local_runner.py +44 -20
  23. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/rl/remote_runner.py +164 -87
  24. hud_python-0.4.38/hud/cli/rl/viewer.py +141 -0
  25. hud_python-0.4.38/hud/cli/rl/wait_utils.py +89 -0
  26. hud_python-0.4.38/hud/cli/utils/env_check.py +196 -0
  27. hud_python-0.4.38/hud/cli/utils/source_hash.py +108 -0
  28. {hud_python-0.4.36 → hud_python-0.4.38}/hud/clients/base.py +1 -1
  29. {hud_python-0.4.36 → hud_python-0.4.38}/hud/clients/fastmcp.py +1 -1
  30. {hud_python-0.4.36 → hud_python-0.4.38}/hud/otel/config.py +1 -1
  31. {hud_python-0.4.36 → hud_python-0.4.38}/hud/otel/context.py +2 -2
  32. {hud_python-0.4.36 → hud_python-0.4.38}/hud/rl/vllm_adapter.py +1 -1
  33. {hud_python-0.4.36 → hud_python-0.4.38}/hud/server/server.py +84 -13
  34. hud_python-0.4.38/hud/server/tests/test_add_tool.py +60 -0
  35. hud_python-0.4.38/hud/server/tests/test_context.py +128 -0
  36. hud_python-0.4.38/hud/server/tests/test_mcp_server_handlers.py +44 -0
  37. hud_python-0.4.38/hud/server/tests/test_mcp_server_integration.py +405 -0
  38. hud_python-0.4.38/hud/server/tests/test_mcp_server_more.py +247 -0
  39. hud_python-0.4.38/hud/server/tests/test_run_wrapper.py +53 -0
  40. hud_python-0.4.38/hud/server/tests/test_server_extra.py +166 -0
  41. hud_python-0.4.38/hud/server/tests/test_sigterm_runner.py +78 -0
  42. {hud_python-0.4.36 → hud_python-0.4.38}/hud/shared/hints.py +1 -1
  43. {hud_python-0.4.36 → hud_python-0.4.38}/hud/telemetry/job.py +2 -2
  44. {hud_python-0.4.36 → hud_python-0.4.38}/hud/types.py +9 -2
  45. {hud_python-0.4.36 → hud_python-0.4.38}/hud/utils/tasks.py +32 -24
  46. {hud_python-0.4.36 → hud_python-0.4.38}/hud/utils/tests/test_version.py +1 -1
  47. {hud_python-0.4.36 → hud_python-0.4.38}/hud/version.py +1 -1
  48. {hud_python-0.4.36 → hud_python-0.4.38}/pyproject.toml +4 -1
  49. hud_python-0.4.36/environments/browser/environment/pyproject.toml +0 -20
  50. hud_python-0.4.36/hud/cli/flows/tasks.py +0 -256
  51. {hud_python-0.4.36 → hud_python-0.4.38}/.gitignore +0 -0
  52. {hud_python-0.4.36 → hud_python-0.4.38}/LICENSE +0 -0
  53. {hud_python-0.4.36 → hud_python-0.4.38}/environments/blank/controller/README.md +0 -0
  54. {hud_python-0.4.36 → hud_python-0.4.38}/environments/blank/environment/README.md +0 -0
  55. {hud_python-0.4.36 → hud_python-0.4.38}/environments/browser/environment/2048/README.md +0 -0
  56. {hud_python-0.4.36 → hud_python-0.4.38}/environments/browser/environment/2048/backend/pyproject.toml +0 -0
  57. {hud_python-0.4.36 → hud_python-0.4.38}/environments/browser/environment/README.md +0 -0
  58. {hud_python-0.4.36 → hud_python-0.4.38}/environments/browser/environment/todo/README.md +0 -0
  59. {hud_python-0.4.36 → hud_python-0.4.38}/environments/browser/environment/todo/backend/pyproject.toml +0 -0
  60. {hud_python-0.4.36 → hud_python-0.4.38}/environments/remote_browser/README.md +0 -0
  61. {hud_python-0.4.36 → hud_python-0.4.38}/environments/remote_browser/pyproject.toml +0 -0
  62. {hud_python-0.4.36 → hud_python-0.4.38}/environments/remote_browser/src/hud_controller/providers/README.md +0 -0
  63. {hud_python-0.4.36 → hud_python-0.4.38}/environments/text_2048/README.md +0 -0
  64. {hud_python-0.4.36 → hud_python-0.4.38}/environments/text_2048/pyproject.toml +0 -0
  65. {hud_python-0.4.36 → hud_python-0.4.38}/examples/README.md +0 -0
  66. {hud_python-0.4.36 → hud_python-0.4.38}/hud/__init__.py +0 -0
  67. {hud_python-0.4.36 → hud_python-0.4.38}/hud/__main__.py +0 -0
  68. {hud_python-0.4.36 → hud_python-0.4.38}/hud/agents/base.py +0 -0
  69. {hud_python-0.4.36 → hud_python-0.4.38}/hud/agents/claude.py +0 -0
  70. {hud_python-0.4.36 → hud_python-0.4.38}/hud/agents/grounded_openai.py +0 -0
  71. {hud_python-0.4.36 → hud_python-0.4.38}/hud/agents/langchain.py +0 -0
  72. {hud_python-0.4.36 → hud_python-0.4.38}/hud/agents/misc/__init__.py +0 -0
  73. {hud_python-0.4.36 → hud_python-0.4.38}/hud/agents/misc/response_agent.py +0 -0
  74. {hud_python-0.4.36 → hud_python-0.4.38}/hud/agents/openai.py +0 -0
  75. {hud_python-0.4.36 → hud_python-0.4.38}/hud/agents/tests/__init__.py +0 -0
  76. {hud_python-0.4.36 → hud_python-0.4.38}/hud/agents/tests/test_base.py +0 -0
  77. {hud_python-0.4.36 → hud_python-0.4.38}/hud/agents/tests/test_claude.py +0 -0
  78. {hud_python-0.4.36 → hud_python-0.4.38}/hud/agents/tests/test_client.py +0 -0
  79. {hud_python-0.4.36 → hud_python-0.4.38}/hud/agents/tests/test_grounded_openai_agent.py +0 -0
  80. {hud_python-0.4.36 → hud_python-0.4.38}/hud/agents/tests/test_openai.py +0 -0
  81. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/__main__.py +0 -0
  82. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/analyze.py +0 -0
  83. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/clone.py +0 -0
  84. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/debug.py +0 -0
  85. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/flows/__init__.py +0 -0
  86. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/get.py +0 -0
  87. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/list_func.py +0 -0
  88. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/pull.py +0 -0
  89. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/remove.py +0 -0
  90. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/rl/display.py +0 -0
  91. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/rl/gpu.py +0 -0
  92. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/rl/gpu_utils.py +0 -0
  93. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/rl/presets.py +0 -0
  94. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/rl/rl_api.py +0 -0
  95. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/rl/vllm.py +0 -0
  96. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/tests/__init__.py +0 -0
  97. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/tests/test_analyze.py +0 -0
  98. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/tests/test_analyze_metadata.py +0 -0
  99. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/tests/test_build.py +0 -0
  100. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/tests/test_cli_init.py +0 -0
  101. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/tests/test_cli_main.py +0 -0
  102. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/tests/test_clone.py +0 -0
  103. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/tests/test_cursor.py +0 -0
  104. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/tests/test_debug.py +0 -0
  105. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/tests/test_list_func.py +0 -0
  106. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/tests/test_main_module.py +0 -0
  107. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/tests/test_mcp_server.py +0 -0
  108. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/tests/test_pull.py +0 -0
  109. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/tests/test_push.py +0 -0
  110. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/tests/test_registry.py +0 -0
  111. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/tests/test_utils.py +0 -0
  112. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/utils/__init__.py +0 -0
  113. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/utils/config.py +0 -0
  114. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/utils/cursor.py +0 -0
  115. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/utils/docker.py +0 -0
  116. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/utils/environment.py +0 -0
  117. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/utils/interactive.py +0 -0
  118. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/utils/local_runner.py +0 -0
  119. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/utils/logging.py +0 -0
  120. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/utils/metadata.py +0 -0
  121. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/utils/package_runner.py +0 -0
  122. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/utils/registry.py +0 -0
  123. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/utils/remote_runner.py +0 -0
  124. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/utils/runner.py +0 -0
  125. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/utils/server.py +0 -0
  126. {hud_python-0.4.36 → hud_python-0.4.38}/hud/cli/utils/tasks.py +0 -0
  127. {hud_python-0.4.36 → hud_python-0.4.38}/hud/clients/README.md +0 -0
  128. {hud_python-0.4.36 → hud_python-0.4.38}/hud/clients/__init__.py +0 -0
  129. {hud_python-0.4.36 → hud_python-0.4.38}/hud/clients/mcp_use.py +0 -0
  130. {hud_python-0.4.36 → hud_python-0.4.38}/hud/clients/tests/__init__.py +0 -0
  131. {hud_python-0.4.36 → hud_python-0.4.38}/hud/clients/tests/test_client_integration.py +0 -0
  132. {hud_python-0.4.36 → hud_python-0.4.38}/hud/clients/tests/test_fastmcp.py +0 -0
  133. {hud_python-0.4.36 → hud_python-0.4.38}/hud/clients/tests/test_mcp_use_retry.py +0 -0
  134. {hud_python-0.4.36 → hud_python-0.4.38}/hud/clients/tests/test_protocol.py +0 -0
  135. {hud_python-0.4.36 → hud_python-0.4.38}/hud/clients/utils/__init__.py +0 -0
  136. {hud_python-0.4.36 → hud_python-0.4.38}/hud/clients/utils/mcp_use_retry.py +0 -0
  137. {hud_python-0.4.36 → hud_python-0.4.38}/hud/clients/utils/retry.py +0 -0
  138. {hud_python-0.4.36 → hud_python-0.4.38}/hud/clients/utils/retry_transport.py +0 -0
  139. {hud_python-0.4.36 → hud_python-0.4.38}/hud/datasets/__init__.py +0 -0
  140. {hud_python-0.4.36 → hud_python-0.4.38}/hud/datasets/parallel.py +0 -0
  141. {hud_python-0.4.36 → hud_python-0.4.38}/hud/datasets/runner.py +0 -0
  142. {hud_python-0.4.36 → hud_python-0.4.38}/hud/datasets/utils.py +0 -0
  143. {hud_python-0.4.36 → hud_python-0.4.38}/hud/misc/__init__.py +0 -0
  144. {hud_python-0.4.36 → hud_python-0.4.38}/hud/misc/claude_plays_pokemon.py +0 -0
  145. {hud_python-0.4.36 → hud_python-0.4.38}/hud/native/__init__.py +0 -0
  146. {hud_python-0.4.36 → hud_python-0.4.38}/hud/native/comparator.py +0 -0
  147. {hud_python-0.4.36 → hud_python-0.4.38}/hud/native/tests/__init__.py +0 -0
  148. {hud_python-0.4.36 → hud_python-0.4.38}/hud/native/tests/test_comparator.py +0 -0
  149. {hud_python-0.4.36 → hud_python-0.4.38}/hud/native/tests/test_native_init.py +0 -0
  150. {hud_python-0.4.36 → hud_python-0.4.38}/hud/otel/__init__.py +0 -0
  151. {hud_python-0.4.36 → hud_python-0.4.38}/hud/otel/collector.py +0 -0
  152. {hud_python-0.4.36 → hud_python-0.4.38}/hud/otel/exporters.py +0 -0
  153. {hud_python-0.4.36 → hud_python-0.4.38}/hud/otel/instrumentation.py +0 -0
  154. {hud_python-0.4.36 → hud_python-0.4.38}/hud/otel/processors.py +0 -0
  155. {hud_python-0.4.36 → hud_python-0.4.38}/hud/otel/tests/__init__.py +0 -0
  156. {hud_python-0.4.36 → hud_python-0.4.38}/hud/otel/tests/test_processors.py +0 -0
  157. {hud_python-0.4.36 → hud_python-0.4.38}/hud/py.typed +0 -0
  158. {hud_python-0.4.36 → hud_python-0.4.38}/hud/rl/README.md +0 -0
  159. {hud_python-0.4.36 → hud_python-0.4.38}/hud/rl/__init__.py +0 -0
  160. {hud_python-0.4.36 → hud_python-0.4.38}/hud/rl/actor.py +0 -0
  161. {hud_python-0.4.36 → hud_python-0.4.38}/hud/rl/buffer.py +0 -0
  162. {hud_python-0.4.36 → hud_python-0.4.38}/hud/rl/chat_template.jinja +0 -0
  163. {hud_python-0.4.36 → hud_python-0.4.38}/hud/rl/config.py +0 -0
  164. {hud_python-0.4.36 → hud_python-0.4.38}/hud/rl/distributed.py +0 -0
  165. {hud_python-0.4.36 → hud_python-0.4.38}/hud/rl/learner.py +0 -0
  166. {hud_python-0.4.36 → hud_python-0.4.38}/hud/rl/tests/__init__.py +0 -0
  167. {hud_python-0.4.36 → hud_python-0.4.38}/hud/rl/tests/test_learner.py +0 -0
  168. {hud_python-0.4.36 → hud_python-0.4.38}/hud/rl/train.py +0 -0
  169. {hud_python-0.4.36 → hud_python-0.4.38}/hud/rl/types.py +0 -0
  170. {hud_python-0.4.36 → hud_python-0.4.38}/hud/rl/utils/start_vllm_server.sh +0 -0
  171. {hud_python-0.4.36 → hud_python-0.4.38}/hud/rl/utils.py +0 -0
  172. {hud_python-0.4.36 → hud_python-0.4.38}/hud/samples/__init__.py +0 -0
  173. {hud_python-0.4.36 → hud_python-0.4.38}/hud/samples/browser.py +0 -0
  174. {hud_python-0.4.36 → hud_python-0.4.38}/hud/server/__init__.py +0 -0
  175. {hud_python-0.4.36 → hud_python-0.4.38}/hud/server/context.py +0 -0
  176. {hud_python-0.4.36 → hud_python-0.4.38}/hud/server/helper/__init__.py +0 -0
  177. {hud_python-0.4.36 → hud_python-0.4.38}/hud/server/low_level.py +0 -0
  178. {hud_python-0.4.36 → hud_python-0.4.38}/hud/server/tests/__init__.py +0 -0
  179. {hud_python-0.4.36 → hud_python-0.4.38}/hud/settings.py +0 -0
  180. {hud_python-0.4.36 → hud_python-0.4.38}/hud/shared/__init__.py +0 -0
  181. {hud_python-0.4.36 → hud_python-0.4.38}/hud/shared/exceptions.py +0 -0
  182. {hud_python-0.4.36 → hud_python-0.4.38}/hud/shared/requests.py +0 -0
  183. {hud_python-0.4.36 → hud_python-0.4.38}/hud/shared/tests/__init__.py +0 -0
  184. {hud_python-0.4.36 → hud_python-0.4.38}/hud/shared/tests/test_exceptions.py +0 -0
  185. {hud_python-0.4.36 → hud_python-0.4.38}/hud/shared/tests/test_requests.py +0 -0
  186. {hud_python-0.4.36 → hud_python-0.4.38}/hud/telemetry/__init__.py +0 -0
  187. {hud_python-0.4.36 → hud_python-0.4.38}/hud/telemetry/instrument.py +0 -0
  188. {hud_python-0.4.36 → hud_python-0.4.38}/hud/telemetry/replay.py +0 -0
  189. {hud_python-0.4.36 → hud_python-0.4.38}/hud/telemetry/tests/__init__.py +0 -0
  190. {hud_python-0.4.36 → hud_python-0.4.38}/hud/telemetry/tests/test_replay.py +0 -0
  191. {hud_python-0.4.36 → hud_python-0.4.38}/hud/telemetry/tests/test_trace.py +0 -0
  192. {hud_python-0.4.36 → hud_python-0.4.38}/hud/telemetry/trace.py +0 -0
  193. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/__init__.py +0 -0
  194. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/base.py +0 -0
  195. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/bash.py +0 -0
  196. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/computer/__init__.py +0 -0
  197. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/computer/anthropic.py +0 -0
  198. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/computer/hud.py +0 -0
  199. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/computer/openai.py +0 -0
  200. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/computer/settings.py +0 -0
  201. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/edit.py +0 -0
  202. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/executors/__init__.py +0 -0
  203. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/executors/base.py +0 -0
  204. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/executors/pyautogui.py +0 -0
  205. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/executors/tests/__init__.py +0 -0
  206. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/executors/tests/test_base_executor.py +0 -0
  207. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
  208. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/executors/xdo.py +0 -0
  209. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/grounding/__init__.py +0 -0
  210. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/grounding/config.py +0 -0
  211. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/grounding/grounded_tool.py +0 -0
  212. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/grounding/grounder.py +0 -0
  213. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/grounding/tests/__init__.py +0 -0
  214. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/grounding/tests/test_grounded_tool.py +0 -0
  215. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/playwright.py +0 -0
  216. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/response.py +0 -0
  217. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/submit.py +0 -0
  218. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/tests/__init__.py +0 -0
  219. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/tests/test_base.py +0 -0
  220. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/tests/test_bash.py +0 -0
  221. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/tests/test_bash_extended.py +0 -0
  222. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/tests/test_computer.py +0 -0
  223. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/tests/test_computer_actions.py +0 -0
  224. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/tests/test_edit.py +0 -0
  225. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/tests/test_init.py +0 -0
  226. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/tests/test_playwright_tool.py +0 -0
  227. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/tests/test_response.py +0 -0
  228. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/tests/test_tools.py +0 -0
  229. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/tests/test_tools_init.py +0 -0
  230. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/tests/test_utils.py +0 -0
  231. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/types.py +0 -0
  232. {hud_python-0.4.36 → hud_python-0.4.38}/hud/tools/utils.py +0 -0
  233. {hud_python-0.4.36 → hud_python-0.4.38}/hud/utils/__init__.py +0 -0
  234. {hud_python-0.4.36 → hud_python-0.4.38}/hud/utils/agent_factories.py +0 -0
  235. {hud_python-0.4.36 → hud_python-0.4.38}/hud/utils/async_utils.py +0 -0
  236. {hud_python-0.4.36 → hud_python-0.4.38}/hud/utils/group_eval.py +0 -0
  237. {hud_python-0.4.36 → hud_python-0.4.38}/hud/utils/hud_console.py +0 -0
  238. {hud_python-0.4.36 → hud_python-0.4.38}/hud/utils/mcp.py +0 -0
  239. {hud_python-0.4.36 → hud_python-0.4.38}/hud/utils/pretty_errors.py +0 -0
  240. {hud_python-0.4.36 → hud_python-0.4.38}/hud/utils/progress.py +0 -0
  241. {hud_python-0.4.36 → hud_python-0.4.38}/hud/utils/telemetry.py +0 -0
  242. {hud_python-0.4.36 → hud_python-0.4.38}/hud/utils/tests/__init__.py +0 -0
  243. {hud_python-0.4.36 → hud_python-0.4.38}/hud/utils/tests/test_async_utils.py +0 -0
  244. {hud_python-0.4.36 → hud_python-0.4.38}/hud/utils/tests/test_init.py +0 -0
  245. {hud_python-0.4.36 → hud_python-0.4.38}/hud/utils/tests/test_mcp.py +0 -0
  246. {hud_python-0.4.36 → hud_python-0.4.38}/hud/utils/tests/test_progress.py +0 -0
  247. {hud_python-0.4.36 → hud_python-0.4.38}/hud/utils/tests/test_telemetry.py +0 -0
  248. {hud_python-0.4.36 → hud_python-0.4.38}/hud/utils/tool_shorthand.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.4.36
3
+ Version: 0.4.38
4
4
  Summary: SDK for the HUD platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-python
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -36,11 +36,13 @@ Classifier: Programming Language :: Python :: 3.12
36
36
  Classifier: Programming Language :: Python :: 3.13
37
37
  Requires-Python: <3.13,>=3.11
38
38
  Requires-Dist: anthropic
39
+ Requires-Dist: blessed>=1.20.0
39
40
  Requires-Dist: datasets>=2.14.0
40
41
  Requires-Dist: httpx<1,>=0.23.0
41
42
  Requires-Dist: hud-fastmcp-python-sdk>=0.1.2
42
43
  Requires-Dist: hud-mcp-python-sdk>=3.13.2
43
44
  Requires-Dist: hud-mcp-use-python-sdk==2.3.19
45
+ Requires-Dist: litellm>=1.55.0
44
46
  Requires-Dist: numpy>=1.24.0
45
47
  Requires-Dist: openai
46
48
  Requires-Dist: opentelemetry-api>=1.34.1
@@ -156,8 +158,8 @@ OSS RL environment + evals toolkit. Wrap software as environments, run benchmark
156
158
  ## Highlights
157
159
 
158
160
  - 🚀 **[MCP environment skeleton](https://docs.hud.so/core-concepts/mcp-protocol)** – any agent can call any environment.
159
- - ⚡️ **[Live telemetry](https://app.hud.so)** – inspect every tool call, observation, and reward in real time.
160
- - 🗂️ **[Public benchmarks](https://app.hud.so/leaderboards)** – OSWorld-Verified, SheetBench-50, and more.
161
+ - ⚡️ **[Live telemetry](https://hud.so)** – inspect every tool call, observation, and reward in real time.
162
+ - 🗂️ **[Public benchmarks](https://hud.so/leaderboards)** – OSWorld-Verified, SheetBench-50, and more.
161
163
  - 🌱 **[Reinforcement learning built-in](rl/)** – Verifiers gym pipelines for GRPO on any environment.
162
164
  - 🌐 **[Cloud browsers](environments/remote_browser/)** – AnchorBrowser, Steel, BrowserBase integrations for browser automation.
163
165
  - 🛠️ **[Hot-reload dev loop](environments/README.md#phase-5-hot-reload-development-with-cursor-agent)** – `hud dev` for iterating on environments without rebuilds.
@@ -203,14 +205,14 @@ from hud.agents import ClaudeAgent
203
205
  from hud.datasets import Task # See docs: https://docs.hud.so/reference/tasks
204
206
 
205
207
  async def main() -> None:
206
- with hud.trace("Quick Start 2048"): # All telemetry works for any MCP-based agent (see https://app.hud.so)
208
+ with hud.trace("Quick Start 2048"): # All telemetry works for any MCP-based agent (see https://hud.so)
207
209
  task = {
208
210
  "prompt": "Reach 64 in 2048.",
209
211
  "mcp_config": {
210
212
  "hud": {
211
213
  "url": "https://mcp.hud.so/v3/mcp", # HUD's cloud MCP server (see https://docs.hud.so/core-concepts/architecture)
212
214
  "headers": {
213
- "Authorization": f"Bearer {settings.api_key}", # Get your key at https://app.hud.so
215
+ "Authorization": f"Bearer {settings.api_key}", # Get your key at https://hud.so
214
216
  "Mcp-Image": "hudpython/hud-text-2048:v1.2" # Docker image from https://hub.docker.com/u/hudpython
215
217
  }
216
218
  }
@@ -237,7 +239,7 @@ async def main() -> None:
237
239
  asyncio.run(main())
238
240
  ```
239
241
 
240
- The above example lets the agent play 2048 ([See replay](https://app.hud.so/trace/6feed7bd-5f67-4d66-b77f-eb1e3164604f))
242
+ The above example lets the agent play 2048 ([See replay](https://hud.so/trace/6feed7bd-5f67-4d66-b77f-eb1e3164604f))
241
243
 
242
244
  ![Agent playing 2048](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/2048_1.gif)
243
245
 
@@ -268,7 +270,7 @@ Supports multi‑turn RL for both:
268
270
  - Language‑only models (e.g., `Qwen/Qwen2.5-7B-Instruct`)
269
271
  - Vision‑Language models (e.g., `Qwen/Qwen2.5-VL-3B-Instruct`)
270
272
 
271
- By default, `hud rl` provisions a persistent server and trainer in the cloud, streams telemetry to `app.hud.so`, and lets you monitor/manage models at `app.hud.so/models`. Use `--local` to run entirely on your machines (typically 2+ GPUs: one for vLLM, the rest for training).
273
+ By default, `hud rl` provisions a persistent server and trainer in the cloud, streams telemetry to `hud.so`, and lets you monitor/manage models at `hud.so/models`. Use `--local` to run entirely on your machines (typically 2+ GPUs: one for vLLM, the rest for training).
272
274
 
273
275
  Any HUD MCP environment and evaluation works with our RL pipeline (including remote configurations). See the guided docs: `https://docs.hud.so/train-agents/quickstart`.
274
276
 
@@ -278,7 +280,7 @@ This is Claude Computer Use running on our proprietary financial analyst benchma
278
280
 
279
281
  ![Trace screenshot](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/trace_sheet.gif)
280
282
 
281
- > [See this trace on _app.hud.so_](https://app.hud.so/trace/9e212e9e-3627-4f1f-9eb5-c6d03c59070a)
283
+ > [See this trace on _hud.so_](https://hud.so/trace/9e212e9e-3627-4f1f-9eb5-c6d03c59070a)
282
284
 
283
285
  This example runs the full dataset (only takes ~20 minutes) using [run_evaluation.py](examples/run_evaluation.py):
284
286
 
@@ -304,7 +306,7 @@ results = await run_dataset(
304
306
  print(f"Average reward: {sum(r.reward for r in results) / len(results):.2f}")
305
307
  ```
306
308
 
307
- > Running a dataset creates a job and streams results to the [app.hud.so](https://app.hud.so) platform for analysis and [leaderboard submission](https://docs.hud.so/evaluate-agents/leaderboards).
309
+ > Running a dataset creates a job and streams results to the [hud.so](https://hud.so) platform for analysis and [leaderboard submission](https://docs.hud.so/evaluate-agents/leaderboards).
308
310
 
309
311
  ## Building Environments (MCP)
310
312
 
@@ -395,7 +397,7 @@ Tools
395
397
  hud push # needs docker login, hud api key
396
398
  ```
397
399
 
398
- 5. Now you can use `mcp.hud.so` to launch 100s of instances of this environment in parallel with any agent, and see everything live on [app.hud.so](https://app.hud.so):
400
+ 5. Now you can use `mcp.hud.so` to launch 100s of instances of this environment in parallel with any agent, and see everything live on [hud.so](https://hud.so):
399
401
 
400
402
  ```python
401
403
  from hud.agents import ClaudeAgent
@@ -426,7 +428,7 @@ result = await ClaudeAgent().run({ # See all agents: https://docs.hud.so/refere
426
428
 
427
429
  ## Leaderboards & benchmarks
428
430
 
429
- All leaderboards are publicly available on [app.hud.so/leaderboards](https://app.hud.so/leaderboards) (see [docs](https://docs.hud.so/evaluate-agents/leaderboards))
431
+ All leaderboards are publicly available on [hud.so/leaderboards](https://hud.so/leaderboards) (see [docs](https://docs.hud.so/evaluate-agents/leaderboards))
430
432
 
431
433
  ![Leaderboard](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/leaderboards_3.png)
432
434
 
@@ -440,7 +442,7 @@ Using the [`run_dataset`](https://docs.hud.so/reference/tasks#run_dataset) funct
440
442
  %%{init: {"theme": "neutral", "themeVariables": {"fontSize": "14px"}} }%%
441
443
  graph LR
442
444
  subgraph "Platform"
443
- Dashboard["📊 app.hud.so"]
445
+ Dashboard["📊 hud.so"]
444
446
  API["🔌 mcp.hud.so"]
445
447
  end
446
448
 
@@ -23,8 +23,8 @@ OSS RL environment + evals toolkit. Wrap software as environments, run benchmark
23
23
  ## Highlights
24
24
 
25
25
  - 🚀 **[MCP environment skeleton](https://docs.hud.so/core-concepts/mcp-protocol)** – any agent can call any environment.
26
- - ⚡️ **[Live telemetry](https://app.hud.so)** – inspect every tool call, observation, and reward in real time.
27
- - 🗂️ **[Public benchmarks](https://app.hud.so/leaderboards)** – OSWorld-Verified, SheetBench-50, and more.
26
+ - ⚡️ **[Live telemetry](https://hud.so)** – inspect every tool call, observation, and reward in real time.
27
+ - 🗂️ **[Public benchmarks](https://hud.so/leaderboards)** – OSWorld-Verified, SheetBench-50, and more.
28
28
  - 🌱 **[Reinforcement learning built-in](rl/)** – Verifiers gym pipelines for GRPO on any environment.
29
29
  - 🌐 **[Cloud browsers](environments/remote_browser/)** – AnchorBrowser, Steel, BrowserBase integrations for browser automation.
30
30
  - 🛠️ **[Hot-reload dev loop](environments/README.md#phase-5-hot-reload-development-with-cursor-agent)** – `hud dev` for iterating on environments without rebuilds.
@@ -70,14 +70,14 @@ from hud.agents import ClaudeAgent
70
70
  from hud.datasets import Task # See docs: https://docs.hud.so/reference/tasks
71
71
 
72
72
  async def main() -> None:
73
- with hud.trace("Quick Start 2048"): # All telemetry works for any MCP-based agent (see https://app.hud.so)
73
+ with hud.trace("Quick Start 2048"): # All telemetry works for any MCP-based agent (see https://hud.so)
74
74
  task = {
75
75
  "prompt": "Reach 64 in 2048.",
76
76
  "mcp_config": {
77
77
  "hud": {
78
78
  "url": "https://mcp.hud.so/v3/mcp", # HUD's cloud MCP server (see https://docs.hud.so/core-concepts/architecture)
79
79
  "headers": {
80
- "Authorization": f"Bearer {settings.api_key}", # Get your key at https://app.hud.so
80
+ "Authorization": f"Bearer {settings.api_key}", # Get your key at https://hud.so
81
81
  "Mcp-Image": "hudpython/hud-text-2048:v1.2" # Docker image from https://hub.docker.com/u/hudpython
82
82
  }
83
83
  }
@@ -104,7 +104,7 @@ async def main() -> None:
104
104
  asyncio.run(main())
105
105
  ```
106
106
 
107
- The above example lets the agent play 2048 ([See replay](https://app.hud.so/trace/6feed7bd-5f67-4d66-b77f-eb1e3164604f))
107
+ The above example lets the agent play 2048 ([See replay](https://hud.so/trace/6feed7bd-5f67-4d66-b77f-eb1e3164604f))
108
108
 
109
109
  ![Agent playing 2048](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/2048_1.gif)
110
110
 
@@ -135,7 +135,7 @@ Supports multi‑turn RL for both:
135
135
  - Language‑only models (e.g., `Qwen/Qwen2.5-7B-Instruct`)
136
136
  - Vision‑Language models (e.g., `Qwen/Qwen2.5-VL-3B-Instruct`)
137
137
 
138
- By default, `hud rl` provisions a persistent server and trainer in the cloud, streams telemetry to `app.hud.so`, and lets you monitor/manage models at `app.hud.so/models`. Use `--local` to run entirely on your machines (typically 2+ GPUs: one for vLLM, the rest for training).
138
+ By default, `hud rl` provisions a persistent server and trainer in the cloud, streams telemetry to `hud.so`, and lets you monitor/manage models at `hud.so/models`. Use `--local` to run entirely on your machines (typically 2+ GPUs: one for vLLM, the rest for training).
139
139
 
140
140
  Any HUD MCP environment and evaluation works with our RL pipeline (including remote configurations). See the guided docs: `https://docs.hud.so/train-agents/quickstart`.
141
141
 
@@ -145,7 +145,7 @@ This is Claude Computer Use running on our proprietary financial analyst benchma
145
145
 
146
146
  ![Trace screenshot](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/trace_sheet.gif)
147
147
 
148
- > [See this trace on _app.hud.so_](https://app.hud.so/trace/9e212e9e-3627-4f1f-9eb5-c6d03c59070a)
148
+ > [See this trace on _hud.so_](https://hud.so/trace/9e212e9e-3627-4f1f-9eb5-c6d03c59070a)
149
149
 
150
150
  This example runs the full dataset (only takes ~20 minutes) using [run_evaluation.py](examples/run_evaluation.py):
151
151
 
@@ -171,7 +171,7 @@ results = await run_dataset(
171
171
  print(f"Average reward: {sum(r.reward for r in results) / len(results):.2f}")
172
172
  ```
173
173
 
174
- > Running a dataset creates a job and streams results to the [app.hud.so](https://app.hud.so) platform for analysis and [leaderboard submission](https://docs.hud.so/evaluate-agents/leaderboards).
174
+ > Running a dataset creates a job and streams results to the [hud.so](https://hud.so) platform for analysis and [leaderboard submission](https://docs.hud.so/evaluate-agents/leaderboards).
175
175
 
176
176
  ## Building Environments (MCP)
177
177
 
@@ -262,7 +262,7 @@ Tools
262
262
  hud push # needs docker login, hud api key
263
263
  ```
264
264
 
265
- 5. Now you can use `mcp.hud.so` to launch 100s of instances of this environment in parallel with any agent, and see everything live on [app.hud.so](https://app.hud.so):
265
+ 5. Now you can use `mcp.hud.so` to launch 100s of instances of this environment in parallel with any agent, and see everything live on [hud.so](https://hud.so):
266
266
 
267
267
  ```python
268
268
  from hud.agents import ClaudeAgent
@@ -293,7 +293,7 @@ result = await ClaudeAgent().run({ # See all agents: https://docs.hud.so/refere
293
293
 
294
294
  ## Leaderboards & benchmarks
295
295
 
296
- All leaderboards are publicly available on [app.hud.so/leaderboards](https://app.hud.so/leaderboards) (see [docs](https://docs.hud.so/evaluate-agents/leaderboards))
296
+ All leaderboards are publicly available on [hud.so/leaderboards](https://hud.so/leaderboards) (see [docs](https://docs.hud.so/evaluate-agents/leaderboards))
297
297
 
298
298
  ![Leaderboard](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/leaderboards_3.png)
299
299
 
@@ -307,7 +307,7 @@ Using the [`run_dataset`](https://docs.hud.so/reference/tasks#run_dataset) funct
307
307
  %%{init: {"theme": "neutral", "themeVariables": {"fontSize": "14px"}} }%%
308
308
  graph LR
309
309
  subgraph "Platform"
310
- Dashboard["📊 app.hud.so"]
310
+ Dashboard["📊 hud.so"]
311
311
  API["🔌 mcp.hud.so"]
312
312
  end
313
313
 
@@ -495,7 +495,7 @@ from hud.agents import ClaudeAgent
495
495
  from hud.clients import MCPClient
496
496
 
497
497
  async def main():
498
- # `trace` captures *everything* that happens and sends it to app.hud.so
498
+ # `trace` captures *everything* that happens and sends it to hud.so
499
499
  with hud.trace("local_test"):
500
500
  task = Task(
501
501
  prompt="Complete the task",
@@ -524,7 +524,7 @@ async def main():
524
524
  asyncio.run(main())
525
525
  ```
526
526
 
527
- The `trace` context manager sends a full timeline of agent actions, tool calls, and rewards to app.hud.so – perfect for debugging.
527
+ The `trace` context manager sends a full timeline of agent actions, tool calls, and rewards to hud.so – perfect for debugging.
528
528
 
529
529
  See `examples/01_hello_2048.py` and `examples/task_with_setup_eval.py` for larger end-to-end demos.
530
530
 
@@ -532,7 +532,7 @@ See `examples/01_hello_2048.py` and `examples/task_with_setup_eval.py` for large
532
532
 
533
533
  ## Phase 4 – Remote Deployment & HUD Runner
534
534
 
535
- **Goal →** the exact same image runs in parallel on hundreds of instances, and exposes more telemetry so the app.hud.so can visualise the whole lifecycle.
535
+ **Goal →** the exact same image runs in parallel on hundreds of instances, and exposes more telemetry so that hud.so can visualise the whole lifecycle.
536
536
 
537
537
  ### 1. Publish your image
538
538
 
@@ -595,11 +595,11 @@ async def initialize_environment(session=None, progress_token=None):
595
595
  await send(100, "ready")
596
596
  ```
597
597
 
598
- Those messages are displayed live on app.hud.so alongside resource graphs – perfect feedback while you wait.
598
+ Those messages are displayed live on hud.so alongside resource graphs – perfect feedback while you wait.
599
599
 
600
600
  ### 4. Live telemetry (`telemetry://live`) (Optional)
601
601
 
602
- Expose a resource named `telemetry://live` exactly like in `environments/browser/src/hud_controller/server.py` to return live url to be displayed on app.hud.so.
602
+ Expose a resource named `telemetry://live` exactly like in `environments/browser/src/hud_controller/server.py` to return a live URL to be displayed on hud.so.
603
603
 
604
604
  Once all of the above works you can unleash *hundreds* of concurrent agents on your new environment.
605
605
 
@@ -10,7 +10,7 @@
10
10
 
11
11
  IMPORTANT: Make sure all logs are going to stderr instead of stdout, which is reserved for MCP communication
12
12
 
13
- ### Interactive Development
13
+ ### Testing your environment
14
14
  ```bash
15
15
  # 1. Configure your API keys (optional - only needed for evaluation)
16
16
  # Edit .env file to add your HUD_API_KEY and ANTHROPIC_API_KEY
@@ -24,13 +24,29 @@ hud dev --build --interactive
24
24
  hud eval tasks.json --agent claude
25
25
 
26
26
  # Option B: Interactive notebook test_env.ipynb (great for learning!)
27
- # Requires installation:
28
- pip install hud-python[agents]
29
27
 
30
28
  # Option C: Simple Python script (runs all tasks from tasks.json)
31
29
  python test_task.py
32
30
  ```
33
31
 
32
+ ## Iterating on your environment
33
+ This is usually the process for making any environment better:
34
+ ```bash
35
+ # 1. Start the environment and interact with it directly (or give the MCP server to an agent):
36
+ hud dev --build --interactive
37
+
38
+ # 2. If the environment cannot start or fails inexplicably:
39
+ hud debug test_env:dev # Or your env name that appears when you run hud dev
40
+ # After fixing the error, go back to 1.
41
+
42
+ # 3. When the environment is in a stable state:
43
+ hud build
44
+ hud push # Requires docker login
45
+
46
+ # 4. As soon as it's pushed to the newest version, make sure tasks have it updated and run:
47
+ hud rl
48
+ # This is a good test to see if your environment and tasks are high quality!
49
+ ```
34
50
  ## Layout
35
51
  ```
36
52
  controller/
@@ -83,7 +99,7 @@ save_tasks(tasks, repo_id="your-org/your-dataset")
83
99
  hud eval "your-org/your-dataset" --agent claude
84
100
 
85
101
  # View results at:
86
- # app.hud.so/leaderboards/your-org/your-dataset
102
+ # hud.so/leaderboards/your-org/your-dataset
87
103
  ```
88
104
 
89
105
  **Note**: Only public HuggingFace datasets appear as leaderboards!
@@ -3,7 +3,7 @@ name = "test_test"
3
3
  version = "0.1.0"
4
4
  description = "A minimal HUD environment"
5
5
  requires-python = ">=3.11"
6
- dependencies = [ "hud-python==0.4.36", "fastapi", "uvicorn[standard]", "httpx>=0.28.1",]
6
+ dependencies = [ "hud-python==0.4.38", "fastapi", "uvicorn[standard]", "httpx>=0.28.1",]
7
7
 
8
8
  [build-system]
9
9
  requires = [ "hatchling",]
@@ -75,7 +75,7 @@ save_tasks(tasks, repo_id="your-org/your-dataset")
75
75
  hud eval "your-org/your-dataset" --agent claude
76
76
 
77
77
  # View results at:
78
- # app.hud.so/leaderboards/your-org/your-dataset
78
+ # hud.so/leaderboards/your-org/your-dataset
79
79
  ```
80
80
 
81
81
  **Note**: Only public HuggingFace datasets appear as leaderboards!
@@ -3,7 +3,7 @@ name = "hud-browser-controller"
3
3
  version = "0.1.0"
4
4
  description = "HUD Browser Controller - MCP interface for browser environments"
5
5
  requires-python = ">=3.11,<3.14"
6
- dependencies = [ "pydantic>=2.6,<3", "pydantic-settings>=2.2,<3", "hud-python@git+https://github.com/hud-evals/hud-python@env-cli-improvements", "playwright", "pyautogui", "httpx", "typer", "fastapi", "uvicorn",]
6
+ dependencies = [ "pydantic>=2.6,<3", "pydantic-settings>=2.2,<3", "hud-python@git+https://github.com/hud-evals/hud-python@env-cli-improvements", "playwright", "pyautogui", "httpx", "typer", "fastapi>=0.104.1", "uvicorn[standard]>=0.24.0", "python-multipart>=0.0.6",]
7
7
 
8
8
  [build-system]
9
9
  requires = [ "hatchling",]
@@ -19,4 +19,4 @@ image = "hud-browser:dev"
19
19
  allow-direct-references = true
20
20
 
21
21
  [tool.hatch.build.targets.wheel]
22
- packages = [ "controller", "problems",]
22
+ packages = [ "controller", "environment",]
@@ -3,7 +3,7 @@ name = "deepresearch"
3
3
  version = "0.1.0"
4
4
  description = "DeepResearch HUD environment with HTTP backend (EXA on server)"
5
5
  requires-python = ">=3.11"
6
- dependencies = [ "hud-python==0.4.36", "fastapi>=0.104.1", "uvicorn[standard]>=0.24.0", "httpx>=0.24.0",]
6
+ dependencies = [ "hud-python==0.4.38", "fastapi>=0.104.1", "uvicorn[standard]>=0.24.0", "httpx>=0.24.0",]
7
7
 
8
8
  [build-system]
9
9
  requires = [ "hatchling",]
@@ -2,12 +2,14 @@ from __future__ import annotations
2
2
 
3
3
  from .base import MCPAgent
4
4
  from .claude import ClaudeAgent
5
+ from .lite_llm import LiteAgent
5
6
  from .openai import OperatorAgent
6
7
  from .openai_chat_generic import GenericOpenAIChatAgent
7
8
 
8
9
  __all__ = [
9
10
  "ClaudeAgent",
10
11
  "GenericOpenAIChatAgent",
12
+ "LiteAgent",
11
13
  "MCPAgent",
12
14
  "OperatorAgent",
13
15
  ]
@@ -0,0 +1,72 @@
1
+ """LiteLLM MCP Agent implementation.
2
+
3
+ Same OpenAI chat-completions shape + MCP tool plumbing,
4
+ but transport is LiteLLM and (optionally) tools are shaped by LiteLLM's MCP transformer.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+ from typing import Any, ClassVar
11
+
12
+ import litellm
13
+
14
+ from .openai_chat_generic import GenericOpenAIChatAgent
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+ # Prefer LiteLLM's built-in MCP -> OpenAI tool transformer (handles Bedrock nuances)
19
+ try:
20
+ from litellm.experimental_mcp_client.tools import (
21
+ transform_mcp_tool_to_openai_tool,
22
+ )
23
+ except Exception: # pragma: no cover - optional dependency
24
+ transform_mcp_tool_to_openai_tool = None # type: ignore
25
+
26
+
27
+ class LiteAgent(GenericOpenAIChatAgent):
28
+ """
29
+ Same OpenAI chat-completions shape + MCP tool plumbing,
30
+ but transport is LiteLLM and (optionally) tools are shaped by LiteLLM's MCP transformer.
31
+ """
32
+
33
+ metadata: ClassVar[dict[str, Any]] = {}
34
+
35
+ def __init__(
36
+ self,
37
+ *,
38
+ model_name: str = "gpt-4o-mini",
39
+ completion_kwargs: dict[str, Any] | None = None,
40
+ **agent_kwargs: Any,
41
+ ) -> None:
42
+ # We don't need an OpenAI client; pass None
43
+ super().__init__(
44
+ openai_client=None,
45
+ model_name=model_name,
46
+ completion_kwargs=completion_kwargs,
47
+ **agent_kwargs,
48
+ )
49
+
50
+ def get_tool_schemas(self) -> list[dict]:
51
+ # Prefer LiteLLM's stricter transformer (handles Bedrock & friends)
52
+ if transform_mcp_tool_to_openai_tool is not None:
53
+ return [
54
+ transform_mcp_tool_to_openai_tool(t) # returns ChatCompletionToolParam-like dict
55
+ for t in self.get_available_tools()
56
+ ]
57
+ # Fallback to the generic OpenAI sanitizer
58
+ return GenericOpenAIChatAgent.get_tool_schemas(self)
59
+
60
+ async def _invoke_chat_completion(
61
+ self,
62
+ *,
63
+ messages: list[Any],
64
+ tools: list[dict] | None,
65
+ extra: dict[str, Any],
66
+ ):
67
+ return await litellm.acompletion(
68
+ model=self.model_name,
69
+ messages=messages,
70
+ tools=tools or None, # LiteLLM tolerates None better than []
71
+ **extra,
72
+ )
@@ -42,7 +42,7 @@ class GenericOpenAIChatAgent(MCPAgent):
42
42
  def __init__(
43
43
  self,
44
44
  *,
45
- openai_client: AsyncOpenAI,
45
+ openai_client: AsyncOpenAI | None,
46
46
  model_name: str = "gpt-4o-mini",
47
47
  completion_kwargs: dict[str, Any] | None = None,
48
48
  **agent_kwargs: Any,
@@ -171,6 +171,23 @@ class GenericOpenAIChatAgent(MCPAgent):
171
171
  openai_tools.append(openai_tool)
172
172
  return openai_tools
173
173
 
174
+ async def _invoke_chat_completion(
175
+ self,
176
+ *,
177
+ messages: list[Any],
178
+ tools: list[dict] | None,
179
+ extra: dict[str, Any],
180
+ ):
181
+ if self.oai is None:
182
+ raise ValueError("openai_client is required for GenericOpenAIChatAgent")
183
+ # default transport = OpenAI SDK
184
+ return await self.oai.chat.completions.create(
185
+ model=self.model_name,
186
+ messages=messages,
187
+ tools=tools, # already ChatCompletionToolParam-shaped
188
+ **extra,
189
+ )
190
+
174
191
  @instrument(
175
192
  span_type="agent",
176
193
  record_args=False,
@@ -180,17 +197,14 @@ class GenericOpenAIChatAgent(MCPAgent):
180
197
  """Send chat request to OpenAI and convert the response."""
181
198
 
182
199
  # Convert MCP tool schemas to OpenAI format
183
- mcp_schemas = self.get_tool_schemas()
200
+ tools = cast("list[ChatCompletionToolParam]", self.get_tool_schemas())
184
201
 
185
202
  protected_keys = {"model", "messages", "tools"}
186
203
  extra = {k: v for k, v in (self.completion_kwargs or {}).items() if k not in protected_keys}
187
204
 
188
205
  try:
189
- response = await self.oai.chat.completions.create(
190
- model=self.model_name,
191
- messages=messages,
192
- tools=cast("list[ChatCompletionToolParam]", mcp_schemas),
193
- **extra,
206
+ response = await self._invoke_chat_completion(
207
+ messages=messages, tools=tools, extra=extra
194
208
  )
195
209
  except Exception as e:
196
210
  error_content = f"Error getting response {e}"
@@ -912,7 +912,7 @@ def eval(
912
912
  agent: str | None = typer.Argument(
913
913
  None,
914
914
  help=(
915
- "Agent backend to use (claude, openai, or vllm). If not provided, will prompt interactively." # noqa: E501
915
+ "Agent backend to use (claude, openai, vllm, or litellm). If not provided, will prompt interactively." # noqa: E501
916
916
  ),
917
917
  ),
918
918
  full: bool = typer.Option(
@@ -960,6 +960,12 @@ def eval(
960
960
  "--verbose",
961
961
  help="Enable verbose output from the agent",
962
962
  ),
963
+ very_verbose: bool = typer.Option(
964
+ False,
965
+ "--very-verbose",
966
+ "-vv",
967
+ help="Enable debug-level logs for maximum visibility",
968
+ ),
963
969
  vllm_base_url: str | None = typer.Option(
964
970
  None,
965
971
  "--vllm-base-url",
@@ -1025,13 +1031,14 @@ def eval(
1025
1031
  {"name": "Claude 4 Sonnet", "value": "claude"},
1026
1032
  {"name": "OpenAI Computer Use", "value": "openai"},
1027
1033
  {"name": "vLLM (Local Server)", "value": "vllm"},
1034
+ {"name": "LiteLLM (Multi-provider)", "value": "litellm"},
1028
1035
  ]
1029
1036
  )
1030
1037
 
1031
1038
  agent = hud_console.select("Select an agent to use:", choices=choices, default=0)
1032
1039
 
1033
1040
  # Handle HUD model selection
1034
- if agent and agent not in ["claude", "openai", "vllm"]:
1041
+ if agent and agent not in ["claude", "openai", "vllm", "litellm"]:
1035
1042
  # Find remote model name
1036
1043
  model = agent
1037
1044
  if not vllm_base_url:
@@ -1052,7 +1059,7 @@ def eval(
1052
1059
  hud_console.info(f"Using HUD model: {model} (trained on {base_model})")
1053
1060
 
1054
1061
  # Validate agent choice
1055
- valid_agents = ["claude", "openai", "vllm"]
1062
+ valid_agents = ["claude", "openai", "vllm", "litellm"]
1056
1063
  if agent not in valid_agents:
1057
1064
  hud_console.error(f"Invalid agent: {agent}. Must be one of: {', '.join(valid_agents)}")
1058
1065
  raise typer.Exit(1)
@@ -1070,6 +1077,7 @@ def eval(
1070
1077
  max_workers=max_workers,
1071
1078
  max_concurrent_per_worker=max_concurrent_per_worker,
1072
1079
  verbose=verbose,
1080
+ very_verbose=very_verbose,
1073
1081
  vllm_base_url=vllm_base_url,
1074
1082
  group_size=group_size,
1075
1083
  )
@@ -1119,7 +1127,7 @@ def rl(
1119
1127
  ),
1120
1128
  model: str | None = typer.Argument(
1121
1129
  None,
1122
- help="Model to train (default: interactive selection)",
1130
+ help="Model to train from https://hud.so/models (default: interactive selection)",
1123
1131
  ),
1124
1132
  config_file: Path | None = typer.Option( # noqa: B008
1125
1133
  None,
@@ -1159,6 +1167,12 @@ def rl(
1159
1167
  "--ddp-gpus",
1160
1168
  help="Specific GPUs for DDP (e.g., '0,1,2,3')",
1161
1169
  ),
1170
+ yes: bool = typer.Option(
1171
+ False,
1172
+ "--yes",
1173
+ "-y",
1174
+ help="Auto-accept all prompts and use defaults (lazy mode)",
1175
+ ),
1162
1176
  vllm_gpu: int | None = typer.Option(
1163
1177
  None,
1164
1178
  "--vllm-gpu",
@@ -1180,6 +1194,7 @@ def rl(
1180
1194
  no_ddp=no_ddp,
1181
1195
  ddp_gpus=ddp_gpus,
1182
1196
  vllm_gpu=vllm_gpu,
1197
+ yes=yes,
1183
1198
  )
1184
1199
 
1185
1200
 
@@ -3,6 +3,7 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  import asyncio
6
+ import contextlib
6
7
  import hashlib
7
8
  import subprocess
8
9
  import time
@@ -13,6 +14,7 @@ from typing import Any
13
14
  import typer
14
15
  import yaml
15
16
 
17
+ from hud.cli.utils.source_hash import compute_source_hash, list_source_files
16
18
  from hud.clients import MCPClient
17
19
  from hud.utils.hud_console import HUDConsole
18
20
  from hud.version import __version__ as hud_version
@@ -341,10 +343,11 @@ def build_environment(
341
343
  required_env, optional_env = extract_env_vars_from_dockerfile(dockerfile_path)
342
344
 
343
345
  # Merge user-provided env vars with detected ones
344
- provided_env_vars = {}
346
+ provided_env_vars: dict[str, str] = {}
345
347
  missing_required = []
346
348
  if env_vars:
347
- provided_env_vars = env_vars.copy()
349
+ # Use placeholders in lock file for any provided values to avoid storing secrets
350
+ provided_env_vars = {k: f"${{{k}}}" for k in env_vars}
348
351
  # Track which required vars are still missing
349
352
  missing_required = [e for e in required_env if e not in env_vars]
350
353
 
@@ -384,6 +387,8 @@ def build_environment(
384
387
  "hudVersion": hud_version,
385
388
  "directory": str(env_dir.name),
386
389
  "version": new_version, # Internal environment version
390
+ # Fast source fingerprint for change detection
391
+ "sourceHash": compute_source_hash(env_dir),
387
392
  },
388
393
  "environment": {
389
394
  "initializeMs": analysis["initializeMs"],
@@ -424,6 +429,16 @@ def build_environment(
424
429
  with open(lock_path, "w") as f:
425
430
  yaml.dump(lock_content, f, default_flow_style=False, sort_keys=False)
426
431
 
432
+ # Also write the file list we hashed for transparency (non-essential)
433
+ with contextlib.suppress(Exception):
434
+ files = [
435
+ str(p.resolve().relative_to(env_dir)).replace("\\", "/")
436
+ for p in list_source_files(env_dir)
437
+ ]
438
+ lock_content["build"]["sourceFiles"] = files
439
+ with open(lock_path, "w") as f:
440
+ yaml.dump(lock_content, f, default_flow_style=False, sort_keys=False)
441
+
427
442
  hud_console.success("Created lock file: hud.lock.yaml")
428
443
 
429
444
  # Calculate lock file hash
@@ -530,7 +530,7 @@ async def start_mcp_proxy(
530
530
  stderr=asyncio.subprocess.DEVNULL,
531
531
  )
532
532
  await stop_result.communicate()
533
- hud_console.success("Container stopped successfully")
533
+ hud_console.success("Container stopped successfully")
534
534
  container_stopped = True
535
535
  except Exception as e:
536
536
  hud_console.warning(f"Failed to stop container: {e}")