hud-python 0.4.31__tar.gz → 0.4.33__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (224)
  1. {hud_python-0.4.31 → hud_python-0.4.33}/PKG-INFO +1 -1
  2. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/flows/tasks.py +83 -14
  3. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/push.py +1 -0
  4. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/rl/remote_runner.py +75 -62
  5. {hud_python-0.4.31 → hud_python-0.4.33}/hud/rl/buffer.py +108 -77
  6. hud_python-0.4.33/hud/samples/__init__.py +7 -0
  7. hud_python-0.4.33/hud/samples/browser.py +33 -0
  8. {hud_python-0.4.31 → hud_python-0.4.33}/hud/types.py +19 -6
  9. {hud_python-0.4.31 → hud_python-0.4.33}/hud/utils/mcp.py +6 -1
  10. {hud_python-0.4.31 → hud_python-0.4.33}/hud/utils/tests/test_version.py +1 -1
  11. hud_python-0.4.33/hud/utils/tool_shorthand.py +59 -0
  12. {hud_python-0.4.31 → hud_python-0.4.33}/hud/version.py +1 -1
  13. {hud_python-0.4.31 → hud_python-0.4.33}/pyproject.toml +1 -1
  14. {hud_python-0.4.31 → hud_python-0.4.33}/.gitignore +0 -0
  15. {hud_python-0.4.31 → hud_python-0.4.33}/LICENSE +0 -0
  16. {hud_python-0.4.31 → hud_python-0.4.33}/README.md +0 -0
  17. {hud_python-0.4.31 → hud_python-0.4.33}/environments/README.md +0 -0
  18. {hud_python-0.4.31 → hud_python-0.4.33}/environments/browser/README.md +0 -0
  19. {hud_python-0.4.31 → hud_python-0.4.33}/environments/browser/apps/2048/README.md +0 -0
  20. {hud_python-0.4.31 → hud_python-0.4.33}/environments/browser/apps/2048/backend/pyproject.toml +0 -0
  21. {hud_python-0.4.31 → hud_python-0.4.33}/environments/browser/apps/README.md +0 -0
  22. {hud_python-0.4.31 → hud_python-0.4.33}/environments/browser/apps/todo/README.md +0 -0
  23. {hud_python-0.4.31 → hud_python-0.4.33}/environments/browser/apps/todo/backend/pyproject.toml +0 -0
  24. {hud_python-0.4.31 → hud_python-0.4.33}/environments/browser/pyproject.toml +0 -0
  25. {hud_python-0.4.31 → hud_python-0.4.33}/environments/remote_browser/README.md +0 -0
  26. {hud_python-0.4.31 → hud_python-0.4.33}/environments/remote_browser/pyproject.toml +0 -0
  27. {hud_python-0.4.31 → hud_python-0.4.33}/environments/remote_browser/src/hud_controller/providers/README.md +0 -0
  28. {hud_python-0.4.31 → hud_python-0.4.33}/environments/text_2048/README.md +0 -0
  29. {hud_python-0.4.31 → hud_python-0.4.33}/environments/text_2048/pyproject.toml +0 -0
  30. {hud_python-0.4.31 → hud_python-0.4.33}/examples/README.md +0 -0
  31. {hud_python-0.4.31 → hud_python-0.4.33}/hud/__init__.py +0 -0
  32. {hud_python-0.4.31 → hud_python-0.4.33}/hud/__main__.py +0 -0
  33. {hud_python-0.4.31 → hud_python-0.4.33}/hud/agents/__init__.py +0 -0
  34. {hud_python-0.4.31 → hud_python-0.4.33}/hud/agents/base.py +0 -0
  35. {hud_python-0.4.31 → hud_python-0.4.33}/hud/agents/claude.py +0 -0
  36. {hud_python-0.4.31 → hud_python-0.4.33}/hud/agents/grounded_openai.py +0 -0
  37. {hud_python-0.4.31 → hud_python-0.4.33}/hud/agents/langchain.py +0 -0
  38. {hud_python-0.4.31 → hud_python-0.4.33}/hud/agents/misc/__init__.py +0 -0
  39. {hud_python-0.4.31 → hud_python-0.4.33}/hud/agents/misc/response_agent.py +0 -0
  40. {hud_python-0.4.31 → hud_python-0.4.33}/hud/agents/openai.py +0 -0
  41. {hud_python-0.4.31 → hud_python-0.4.33}/hud/agents/openai_chat_generic.py +0 -0
  42. {hud_python-0.4.31 → hud_python-0.4.33}/hud/agents/tests/__init__.py +0 -0
  43. {hud_python-0.4.31 → hud_python-0.4.33}/hud/agents/tests/test_base.py +0 -0
  44. {hud_python-0.4.31 → hud_python-0.4.33}/hud/agents/tests/test_claude.py +0 -0
  45. {hud_python-0.4.31 → hud_python-0.4.33}/hud/agents/tests/test_client.py +0 -0
  46. {hud_python-0.4.31 → hud_python-0.4.33}/hud/agents/tests/test_grounded_openai_agent.py +0 -0
  47. {hud_python-0.4.31 → hud_python-0.4.33}/hud/agents/tests/test_openai.py +0 -0
  48. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/__init__.py +0 -0
  49. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/__main__.py +0 -0
  50. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/analyze.py +0 -0
  51. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/build.py +0 -0
  52. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/clone.py +0 -0
  53. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/debug.py +0 -0
  54. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/dev.py +0 -0
  55. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/eval.py +0 -0
  56. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/flows/__init__.py +0 -0
  57. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/get.py +0 -0
  58. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/init.py +0 -0
  59. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/list_func.py +0 -0
  60. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/pull.py +0 -0
  61. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/remove.py +0 -0
  62. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/rl/__init__.py +0 -0
  63. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/rl/config.py +0 -0
  64. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/rl/display.py +0 -0
  65. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/rl/gpu.py +0 -0
  66. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/rl/gpu_utils.py +0 -0
  67. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/rl/local_runner.py +0 -0
  68. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/rl/presets.py +0 -0
  69. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/rl/rl_api.py +0 -0
  70. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/rl/vllm.py +0 -0
  71. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/tests/__init__.py +0 -0
  72. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/tests/test_analyze.py +0 -0
  73. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/tests/test_analyze_metadata.py +0 -0
  74. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/tests/test_build.py +0 -0
  75. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/tests/test_cli_init.py +0 -0
  76. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/tests/test_cli_main.py +0 -0
  77. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/tests/test_clone.py +0 -0
  78. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/tests/test_cursor.py +0 -0
  79. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/tests/test_debug.py +0 -0
  80. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/tests/test_list_func.py +0 -0
  81. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/tests/test_main_module.py +0 -0
  82. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/tests/test_mcp_server.py +0 -0
  83. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/tests/test_pull.py +0 -0
  84. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/tests/test_push.py +0 -0
  85. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/tests/test_registry.py +0 -0
  86. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/tests/test_utils.py +0 -0
  87. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/utils/__init__.py +0 -0
  88. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/utils/cursor.py +0 -0
  89. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/utils/docker.py +0 -0
  90. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/utils/environment.py +0 -0
  91. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/utils/interactive.py +0 -0
  92. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/utils/logging.py +0 -0
  93. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/utils/metadata.py +0 -0
  94. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/utils/registry.py +0 -0
  95. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/utils/remote_runner.py +0 -0
  96. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/utils/runner.py +0 -0
  97. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/utils/server.py +0 -0
  98. {hud_python-0.4.31 → hud_python-0.4.33}/hud/cli/utils/tasks.py +0 -0
  99. {hud_python-0.4.31 → hud_python-0.4.33}/hud/clients/README.md +0 -0
  100. {hud_python-0.4.31 → hud_python-0.4.33}/hud/clients/__init__.py +0 -0
  101. {hud_python-0.4.31 → hud_python-0.4.33}/hud/clients/base.py +0 -0
  102. {hud_python-0.4.31 → hud_python-0.4.33}/hud/clients/fastmcp.py +0 -0
  103. {hud_python-0.4.31 → hud_python-0.4.33}/hud/clients/mcp_use.py +0 -0
  104. {hud_python-0.4.31 → hud_python-0.4.33}/hud/clients/tests/__init__.py +0 -0
  105. {hud_python-0.4.31 → hud_python-0.4.33}/hud/clients/tests/test_client_integration.py +0 -0
  106. {hud_python-0.4.31 → hud_python-0.4.33}/hud/clients/tests/test_fastmcp.py +0 -0
  107. {hud_python-0.4.31 → hud_python-0.4.33}/hud/clients/tests/test_mcp_use_retry.py +0 -0
  108. {hud_python-0.4.31 → hud_python-0.4.33}/hud/clients/tests/test_protocol.py +0 -0
  109. {hud_python-0.4.31 → hud_python-0.4.33}/hud/clients/utils/__init__.py +0 -0
  110. {hud_python-0.4.31 → hud_python-0.4.33}/hud/clients/utils/mcp_use_retry.py +3 -3
  111. {hud_python-0.4.31 → hud_python-0.4.33}/hud/clients/utils/retry.py +0 -0
  112. {hud_python-0.4.31 → hud_python-0.4.33}/hud/clients/utils/retry_transport.py +0 -0
  113. {hud_python-0.4.31 → hud_python-0.4.33}/hud/datasets/__init__.py +0 -0
  114. {hud_python-0.4.31 → hud_python-0.4.33}/hud/datasets/parallel.py +0 -0
  115. {hud_python-0.4.31 → hud_python-0.4.33}/hud/datasets/runner.py +0 -0
  116. {hud_python-0.4.31 → hud_python-0.4.33}/hud/datasets/utils.py +0 -0
  117. {hud_python-0.4.31 → hud_python-0.4.33}/hud/misc/__init__.py +0 -0
  118. {hud_python-0.4.31 → hud_python-0.4.33}/hud/misc/claude_plays_pokemon.py +0 -0
  119. {hud_python-0.4.31 → hud_python-0.4.33}/hud/native/__init__.py +0 -0
  120. {hud_python-0.4.31 → hud_python-0.4.33}/hud/native/comparator.py +0 -0
  121. {hud_python-0.4.31 → hud_python-0.4.33}/hud/native/tests/__init__.py +0 -0
  122. {hud_python-0.4.31 → hud_python-0.4.33}/hud/native/tests/test_comparator.py +0 -0
  123. {hud_python-0.4.31 → hud_python-0.4.33}/hud/native/tests/test_native_init.py +0 -0
  124. {hud_python-0.4.31 → hud_python-0.4.33}/hud/otel/__init__.py +0 -0
  125. {hud_python-0.4.31 → hud_python-0.4.33}/hud/otel/collector.py +0 -0
  126. {hud_python-0.4.31 → hud_python-0.4.33}/hud/otel/config.py +0 -0
  127. {hud_python-0.4.31 → hud_python-0.4.33}/hud/otel/context.py +0 -0
  128. {hud_python-0.4.31 → hud_python-0.4.33}/hud/otel/exporters.py +0 -0
  129. {hud_python-0.4.31 → hud_python-0.4.33}/hud/otel/instrumentation.py +0 -0
  130. {hud_python-0.4.31 → hud_python-0.4.33}/hud/otel/processors.py +0 -0
  131. {hud_python-0.4.31 → hud_python-0.4.33}/hud/otel/tests/__init__.py +0 -0
  132. {hud_python-0.4.31 → hud_python-0.4.33}/hud/otel/tests/test_processors.py +0 -0
  133. {hud_python-0.4.31 → hud_python-0.4.33}/hud/py.typed +0 -0
  134. {hud_python-0.4.31 → hud_python-0.4.33}/hud/rl/README.md +0 -0
  135. {hud_python-0.4.31 → hud_python-0.4.33}/hud/rl/__init__.py +0 -0
  136. {hud_python-0.4.31 → hud_python-0.4.33}/hud/rl/actor.py +0 -0
  137. {hud_python-0.4.31 → hud_python-0.4.33}/hud/rl/chat_template.jinja +0 -0
  138. {hud_python-0.4.31 → hud_python-0.4.33}/hud/rl/config.py +0 -0
  139. {hud_python-0.4.31 → hud_python-0.4.33}/hud/rl/distributed.py +0 -0
  140. {hud_python-0.4.31 → hud_python-0.4.33}/hud/rl/learner.py +0 -0
  141. {hud_python-0.4.31 → hud_python-0.4.33}/hud/rl/tests/__init__.py +0 -0
  142. {hud_python-0.4.31 → hud_python-0.4.33}/hud/rl/tests/test_learner.py +0 -0
  143. {hud_python-0.4.31 → hud_python-0.4.33}/hud/rl/train.py +0 -0
  144. {hud_python-0.4.31 → hud_python-0.4.33}/hud/rl/types.py +0 -0
  145. {hud_python-0.4.31 → hud_python-0.4.33}/hud/rl/utils/start_vllm_server.sh +0 -0
  146. {hud_python-0.4.31 → hud_python-0.4.33}/hud/rl/utils.py +0 -0
  147. {hud_python-0.4.31 → hud_python-0.4.33}/hud/rl/vllm_adapter.py +0 -0
  148. {hud_python-0.4.31 → hud_python-0.4.33}/hud/server/__init__.py +0 -0
  149. {hud_python-0.4.31 → hud_python-0.4.33}/hud/server/context.py +0 -0
  150. {hud_python-0.4.31 → hud_python-0.4.33}/hud/server/helper/__init__.py +0 -0
  151. {hud_python-0.4.31 → hud_python-0.4.33}/hud/server/low_level.py +0 -0
  152. {hud_python-0.4.31 → hud_python-0.4.33}/hud/server/server.py +0 -0
  153. {hud_python-0.4.31 → hud_python-0.4.33}/hud/server/tests/__init__.py +0 -0
  154. {hud_python-0.4.31 → hud_python-0.4.33}/hud/settings.py +0 -0
  155. {hud_python-0.4.31 → hud_python-0.4.33}/hud/shared/__init__.py +0 -0
  156. {hud_python-0.4.31 → hud_python-0.4.33}/hud/shared/exceptions.py +0 -0
  157. {hud_python-0.4.31 → hud_python-0.4.33}/hud/shared/hints.py +0 -0
  158. {hud_python-0.4.31 → hud_python-0.4.33}/hud/shared/requests.py +0 -0
  159. {hud_python-0.4.31 → hud_python-0.4.33}/hud/shared/tests/__init__.py +0 -0
  160. {hud_python-0.4.31 → hud_python-0.4.33}/hud/shared/tests/test_exceptions.py +0 -0
  161. {hud_python-0.4.31 → hud_python-0.4.33}/hud/shared/tests/test_requests.py +0 -0
  162. {hud_python-0.4.31 → hud_python-0.4.33}/hud/telemetry/__init__.py +0 -0
  163. {hud_python-0.4.31 → hud_python-0.4.33}/hud/telemetry/instrument.py +0 -0
  164. {hud_python-0.4.31 → hud_python-0.4.33}/hud/telemetry/job.py +0 -0
  165. {hud_python-0.4.31 → hud_python-0.4.33}/hud/telemetry/replay.py +0 -0
  166. {hud_python-0.4.31 → hud_python-0.4.33}/hud/telemetry/tests/__init__.py +0 -0
  167. {hud_python-0.4.31 → hud_python-0.4.33}/hud/telemetry/tests/test_replay.py +0 -0
  168. {hud_python-0.4.31 → hud_python-0.4.33}/hud/telemetry/tests/test_trace.py +0 -0
  169. {hud_python-0.4.31 → hud_python-0.4.33}/hud/telemetry/trace.py +0 -0
  170. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/__init__.py +0 -0
  171. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/base.py +0 -0
  172. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/bash.py +0 -0
  173. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/computer/__init__.py +0 -0
  174. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/computer/anthropic.py +0 -0
  175. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/computer/hud.py +0 -0
  176. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/computer/openai.py +0 -0
  177. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/computer/settings.py +0 -0
  178. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/edit.py +0 -0
  179. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/executors/__init__.py +0 -0
  180. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/executors/base.py +0 -0
  181. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/executors/pyautogui.py +0 -0
  182. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/executors/tests/__init__.py +0 -0
  183. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/executors/tests/test_base_executor.py +0 -0
  184. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
  185. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/executors/xdo.py +0 -0
  186. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/grounding/__init__.py +0 -0
  187. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/grounding/config.py +0 -0
  188. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/grounding/grounded_tool.py +0 -0
  189. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/grounding/grounder.py +0 -0
  190. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/grounding/tests/__init__.py +0 -0
  191. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/grounding/tests/test_grounded_tool.py +0 -0
  192. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/playwright.py +0 -0
  193. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/response.py +0 -0
  194. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/submit.py +0 -0
  195. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/tests/__init__.py +0 -0
  196. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/tests/test_base.py +0 -0
  197. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/tests/test_bash.py +0 -0
  198. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/tests/test_bash_extended.py +0 -0
  199. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/tests/test_computer.py +0 -0
  200. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/tests/test_computer_actions.py +0 -0
  201. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/tests/test_edit.py +0 -0
  202. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/tests/test_init.py +0 -0
  203. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/tests/test_playwright_tool.py +0 -0
  204. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/tests/test_response.py +0 -0
  205. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/tests/test_tools.py +0 -0
  206. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/tests/test_tools_init.py +0 -0
  207. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/tests/test_utils.py +0 -0
  208. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/types.py +0 -0
  209. {hud_python-0.4.31 → hud_python-0.4.33}/hud/tools/utils.py +0 -0
  210. {hud_python-0.4.31 → hud_python-0.4.33}/hud/utils/__init__.py +0 -0
  211. {hud_python-0.4.31 → hud_python-0.4.33}/hud/utils/agent_factories.py +0 -0
  212. {hud_python-0.4.31 → hud_python-0.4.33}/hud/utils/async_utils.py +0 -0
  213. {hud_python-0.4.31 → hud_python-0.4.33}/hud/utils/group_eval.py +0 -0
  214. {hud_python-0.4.31 → hud_python-0.4.33}/hud/utils/hud_console.py +0 -0
  215. {hud_python-0.4.31 → hud_python-0.4.33}/hud/utils/pretty_errors.py +0 -0
  216. {hud_python-0.4.31 → hud_python-0.4.33}/hud/utils/progress.py +0 -0
  217. {hud_python-0.4.31 → hud_python-0.4.33}/hud/utils/tasks.py +0 -0
  218. {hud_python-0.4.31 → hud_python-0.4.33}/hud/utils/telemetry.py +0 -0
  219. {hud_python-0.4.31 → hud_python-0.4.33}/hud/utils/tests/__init__.py +0 -0
  220. {hud_python-0.4.31 → hud_python-0.4.33}/hud/utils/tests/test_async_utils.py +0 -0
  221. {hud_python-0.4.31 → hud_python-0.4.33}/hud/utils/tests/test_init.py +0 -0
  222. {hud_python-0.4.31 → hud_python-0.4.33}/hud/utils/tests/test_mcp.py +0 -0
  223. {hud_python-0.4.31 → hud_python-0.4.33}/hud/utils/tests/test_progress.py +0 -0
  224. {hud_python-0.4.31 → hud_python-0.4.33}/hud/utils/tests/test_telemetry.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.4.31
3
+ Version: 0.4.33
4
4
  Summary: SDK for the HUD platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-python
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -27,9 +27,28 @@ def _is_remote_url(url: str) -> bool:
27
27
 
28
28
 
29
29
  def _validate_tasks(tasks: list[Task]) -> bool:
30
- """Validate the tasks file."""
30
+ """Validate the tasks file: return True if tasks already reference a remote MCP URL.
31
+
32
+ A task is considered remote if any "url" field anywhere inside mcp_config
33
+ is a valid remote URL (e.g., https://mcp.hud.so/v3/mcp).
34
+ """
35
+
36
+ def _has_remote_url(obj: Any) -> bool:
37
+ if isinstance(obj, dict):
38
+ for k, v in obj.items():
39
+ if k == "url" and isinstance(v, str) and _is_remote_url(v):
40
+ return True
41
+ if _has_remote_url(v):
42
+ return True
43
+ elif isinstance(obj, list):
44
+ for item in obj:
45
+ if _has_remote_url(item):
46
+ return True
47
+ return False
48
+
31
49
  for task in tasks:
32
- if not task.mcp_config or (not _is_remote_url(task.mcp_config.get("url", ""))):
50
+ cfg = task.mcp_config or {}
51
+ if not _has_remote_url(cfg):
33
52
  return False
34
53
  return True
35
54
 
@@ -100,7 +119,7 @@ def _ensure_pushed(env_dir: Path, lock_data: dict[str, Any]) -> dict[str, Any]:
100
119
  require_docker_running()
101
120
 
102
121
  # If Docker or login is not configured, the push function will fail and halt.
103
- push_environment(str(env_dir))
122
+ push_environment(str(env_dir), yes=True)
104
123
 
105
124
  # Reload lock after push
106
125
  lock_path = env_dir / "hud.lock.yaml"
@@ -111,10 +130,24 @@ def _ensure_pushed(env_dir: Path, lock_data: dict[str, Any]) -> dict[str, Any]:
111
130
 
112
131
 
113
132
  def _derive_remote_image(lock_data: dict[str, Any]) -> str:
114
- """Derive org/name:tag from lock file image field for MCP header."""
133
+ """Derive org/name:tag from lock file for MCP header.
134
+
135
+ Preference order:
136
+ 1) lock_data["push"]["image_with_tag"] if present
137
+ 2) Derive from lock_data["image"] (may be a digest; falls back to latest)
138
+ """
139
+ push_info = lock_data.get("push", {}) if isinstance(lock_data, dict) else {}
140
+
141
+ # 1) Exact image_with_tag if present
142
+ pushed_with_tag = str(push_info.get("image_with_tag", "")).strip()
143
+ if pushed_with_tag:
144
+ name, tag = extract_name_and_tag(pushed_with_tag)
145
+ return f"{name}:{tag}"
146
+
147
+ # Base name always comes from lock_data.image to preserve org/repo
115
148
  image_ref = str(lock_data.get("image", "")).strip()
116
149
  if not image_ref:
117
- raise typer.Exit("Lock file missing image reference")
150
+ raise typer.Exit(1)
118
151
  name, tag = extract_name_and_tag(image_ref)
119
152
  return f"{name}:{tag}"
120
153
 
@@ -157,19 +190,55 @@ def convert_tasks_to_remote(tasks_file: str) -> str:
157
190
  # Derive remote image name org/name:tag
158
191
  remote_image = _derive_remote_image(lock_data)
159
192
 
193
+ # Helper to strip extra fields from tool calls
194
+ def _simplify_tool_call(tool: Any) -> Any:
195
+ def _one(x: Any) -> dict[str, Any]:
196
+ try:
197
+ data = x.model_dump() if hasattr(x, "model_dump") else dict(x)
198
+ except Exception:
199
+ try:
200
+ data = dict(x)
201
+ except Exception:
202
+ return {}
203
+ # Keep only name and arguments
204
+ name = data.get("name")
205
+ arguments = data.get("arguments", {})
206
+ return {"name": name, "arguments": arguments}
207
+
208
+ if tool is None:
209
+ return None
210
+ if isinstance(tool, list):
211
+ return [_one(x) for x in tool]
212
+ return _one(tool)
213
+
160
214
  # Convert to list[dict]
161
215
  tasks_payload: list[dict[str, Any]] = []
162
216
  for t in tasks:
163
- item = t.model_dump()
164
- item["mcp_config"] = {
165
- "hud": {
166
- "url": "https://mcp.hud.so/v3/mcp",
167
- "headers": {
168
- "Authorization": "Bearer ${HUD_API_KEY}",
169
- "Mcp-Image": remote_image,
170
- },
171
- }
217
+ item: dict[str, Any] = {
218
+ "prompt": t.prompt,
219
+ "mcp_config": {
220
+ "hud": {
221
+ "url": "https://mcp.hud.so/v3/mcp",
222
+ "headers": {
223
+ "Authorization": "Bearer ${HUD_API_KEY}",
224
+ "Mcp-Image": remote_image,
225
+ },
226
+ }
227
+ },
172
228
  }
229
+
230
+ # Optional fields, omit Nones
231
+ if t.setup_tool is not None:
232
+ item["setup_tool"] = _simplify_tool_call(t.setup_tool)
233
+ if t.evaluate_tool is not None:
234
+ item["evaluate_tool"] = _simplify_tool_call(t.evaluate_tool)
235
+ if t.agent_tools is not None:
236
+ item["agent_tools"] = t.agent_tools
237
+ if t.system_prompt is not None:
238
+ item["system_prompt"] = t.system_prompt
239
+ if t.metadata:
240
+ item["metadata"] = t.metadata
241
+
173
242
  tasks_payload.append(item)
174
243
 
175
244
  # Write new file: remote_<name>.json (always JSON array)
@@ -332,6 +332,7 @@ def push_environment(
332
332
  "source": local_image,
333
333
  "pushedAt": datetime.now(UTC).isoformat().replace("+00:00", "Z"),
334
334
  "registry": pushed_digest.split("/")[0] if "/" in pushed_digest else "docker.io",
335
+ "image_with_tag": image,
335
336
  }
336
337
 
337
338
  # Save updated lock file
@@ -9,6 +9,7 @@ from __future__ import annotations
9
9
  import os
10
10
  import subprocess
11
11
  import time
12
+ import uuid
12
13
  from pathlib import Path
13
14
 
14
15
  from rich.console import Console
@@ -29,6 +30,41 @@ GPU_PRICING = {
29
30
  }
30
31
 
31
32
 
33
+ def ensure_vllm_deployed(model_name: str, gpu_type: str = "A100", timeout: int = 600) -> None:
34
+ """Deploy vLLM for a model if needed and wait until it's ready.
35
+
36
+ Args:
37
+ model_name: The name of the model to deploy vLLM for
38
+ gpu_type: GPU type to use for deployment (e.g., A100, H100)
39
+ timeout: Max seconds to wait for vLLM to be ready
40
+ """
41
+ # Check current model status
42
+ info = rl_api.get_model(model_name)
43
+ if info.vllm_url:
44
+ hud_console.success("vLLM server already running")
45
+ return
46
+
47
+ hud_console.info(f"Deploying vLLM server for {model_name}...")
48
+ rl_api.deploy_vllm(model_name, gpu_type=gpu_type)
49
+ hud_console.success("vLLM deployment started")
50
+
51
+ hud_console.info("Waiting for vLLM server to be ready...")
52
+ start_time = time.time()
53
+ with hud_console.progress() as progress:
54
+ progress.update("Checking deployment status (see live status on https://app.hud.so/models)")
55
+ while True:
56
+ if time.time() - start_time > timeout:
57
+ hud_console.error("Timeout waiting for vLLM deployment")
58
+ raise ValueError("vLLM deployment timeout")
59
+ info = rl_api.get_model(model_name)
60
+ if info.vllm_url or info.status == "ready":
61
+ hud_console.success(
62
+ f"vLLM server ready at http://rl.hud.so/v1/models/{model_name}/vllm"
63
+ )
64
+ break
65
+ time.sleep(5)
66
+
67
+
32
68
  def run_remote_training(
33
69
  tasks_file: str | None,
34
70
  model: str | None,
@@ -128,49 +164,55 @@ def run_remote_training(
128
164
  from rich.prompt import Prompt
129
165
 
130
166
  # Ask for model name
131
- default_name = model_type.split("/")[-1].lower()
167
+ base_default = model_type.split("/")[-1].lower()
168
+ default_name = base_default
169
+ existing_names = {m.name for m in active_models}
170
+ suffix = 1
171
+ while default_name in existing_names:
172
+ default_name = f"{base_default}-{suffix}"
173
+ suffix += 1
174
+
132
175
  hud_console.info(f"Enter model name (default: {default_name}):")
133
176
  model_name = Prompt.ask("Model name", default=default_name)
134
177
  model_name = model_name.replace("/", "-").lower()
135
178
 
136
- # Create the model
179
+ # Create the model with retry on name conflict
137
180
  hud_console.info(f"Creating model: {model_name}")
138
181
  try:
139
182
  rl_api.create_model(model_name, model_type)
140
183
  hud_console.success(f"Created model: {model_name}")
184
+ ensure_vllm_deployed(model_name, gpu_type="A100")
141
185
 
142
- # Deploy vLLM automatically
143
- hud_console.info(f"Deploying vLLM server for {model_name}...")
144
- rl_api.deploy_vllm(model_name, gpu_type="A100")
145
- hud_console.success("vLLM deployment started")
146
-
147
- # Wait for deployment
148
- hud_console.info("Waiting for vLLM server to be ready...")
149
- max_wait = 600 # 10 minutes
150
- start_time = time.time()
151
-
152
- with hud_console.progress() as progress:
153
- progress.update(
154
- "Checking deployment status (see live status on https://app.hud.so/models)"
155
- )
156
-
186
+ except Exception as e:
187
+ # If the name already exists, suggest a new name and prompt once
188
+ message = str(e)
189
+ if "already exists" in message or "409" in message:
190
+ alt_name = f"{model_name}-1"
191
+ i = 1
157
192
  while True:
158
- if time.time() - start_time > max_wait:
159
- hud_console.error("Timeout waiting for vLLM deployment")
160
- raise ValueError("vLLM deployment timeout")
161
-
162
- model_info = rl_api.get_model(model_name)
163
- if model_info.status == "ready":
164
- hud_console.success(
165
- f"vLLM server ready at http://rl.hud.so/v1/models/{model_name}/vllm"
166
- )
193
+ candidate = f"{model_name}-{str(uuid.uuid4())[:4]}"
194
+ if candidate not in existing_names:
195
+ alt_name = candidate
167
196
  break
168
-
169
- time.sleep(5)
170
-
171
- except Exception as e:
172
- hud_console.error(f"Failed to create model: {e}")
173
- raise
197
+ i += 1
198
+ hud_console.warning(
199
+ f"Model '{model_name}' exists. Suggesting '{alt_name}' instead."
200
+ )
201
+ try:
202
+ from rich.prompt import Prompt as _Prompt
203
+
204
+ chosen = _Prompt.ask("Use different name", default=alt_name)
205
+ chosen = chosen.replace("/", "-").lower()
206
+ rl_api.create_model(chosen, model_type)
207
+ hud_console.success(f"Created model: {chosen}")
208
+ model_name = chosen
209
+ ensure_vllm_deployed(model_name, gpu_type="A100")
210
+ except Exception as e2:
211
+ hud_console.error(f"Failed to create model: {e2}")
212
+ raise
213
+ else:
214
+ hud_console.error(f"Failed to create model: {e}")
215
+ raise
174
216
 
175
217
  else:
176
218
  # Existing model selected
@@ -194,36 +236,7 @@ def run_remote_training(
194
236
  return
195
237
 
196
238
  # Ensure vLLM is deployed
197
- if not model_info.vllm_url:
198
- hud_console.info(f"Deploying vLLM server for {model_name}...")
199
- rl_api.deploy_vllm(model_name, gpu_type="A100")
200
- hud_console.success("vLLM deployment started")
201
-
202
- # Wait for deployment
203
- hud_console.info("Waiting for vLLM server to be ready...")
204
- max_wait = 600 # 10 minutes
205
- start_time = time.time()
206
-
207
- with hud_console.progress() as progress:
208
- progress.update(
209
- "Checking deployment status (see live status on https://app.hud.so/models)"
210
- )
211
-
212
- while True:
213
- if time.time() - start_time > max_wait:
214
- hud_console.error("Timeout waiting for vLLM deployment")
215
- raise ValueError("vLLM deployment timeout")
216
-
217
- model_info = rl_api.get_model(model_name)
218
- if model_info.vllm_url:
219
- hud_console.success(
220
- f"vLLM server ready at http://rl.hud.so/v1/models/{model_name}/vllm"
221
- )
222
- break
223
-
224
- time.sleep(5)
225
- else:
226
- hud_console.success("vLLM server already running")
239
+ ensure_vllm_deployed(model_name, gpu_type="A100")
227
240
  except KeyboardInterrupt:
228
241
  hud_console.dim_info("Training cancelled", "")
229
242
  return
@@ -219,12 +219,93 @@ class ReplayBuffer(Buffer[Trace]):
219
219
  else:
220
220
  raise ValueError(f"Invalid select strategy: {self.select_strategy}")
221
221
 
222
+ def _extract_group_key(self, trace: Trace) -> tuple[str, str]:
223
+ """Return a stable grouping key for a trace.
224
+
225
+ Preference order:
226
+ 1) task.id when present (kind='id')
227
+ 2) task.prompt exact string (kind='prompt') when id is None
228
+ 3) 'NA' for missing/errored entries (kind='NA')
229
+ """
230
+ if getattr(trace, "isError", False):
231
+ return ("NA", "NA")
232
+
233
+ task = getattr(trace, "task", None)
234
+ if task is None:
235
+ return ("NA", "NA")
236
+
237
+ tid = getattr(task, "id", None)
238
+ if tid is not None:
239
+ return ("id", str(tid))
240
+
241
+ prompt = getattr(task, "prompt", None)
242
+ if prompt:
243
+ return ("prompt", str(prompt))
244
+
245
+ return ("NA", "NA")
246
+
247
+ def _validate_and_split_groups(
248
+ self, recent_traces: list[Trace]
249
+ ) -> tuple[list[list[Trace]], list[tuple[str, str]]]:
250
+ """Validate and split recent traces into homogeneous groups by id or prompt.
251
+
252
+ - Uses id when present; otherwise falls back to prompt equality.
253
+ - Any NA/error traces are excluded and the group is filled by duplicating
254
+ existing valid members in that group.
255
+ - Always returns len == groups_per_batch groups of size == group_size.
256
+ """
257
+ from collections import Counter
258
+
259
+ groups_per_batch = self.batch_size // self.group_size
260
+
261
+ window_keys = [self._extract_group_key(t) for t in recent_traces]
262
+ window_counter = Counter(k for k in window_keys if k[0] != "NA")
263
+
264
+ validated_groups: list[list[Trace]] = []
265
+ selected_keys: list[tuple[str, str]] = []
266
+
267
+ for g_idx in range(groups_per_batch):
268
+ start = g_idx * self.group_size
269
+ end = start + self.group_size
270
+ chunk = recent_traces[start:end]
271
+
272
+ key_counts = Counter()
273
+ per_item_keys: list[tuple[str, str]] = []
274
+ for tr in chunk:
275
+ k = self._extract_group_key(tr)
276
+ per_item_keys.append(k)
277
+ if k[0] != "NA":
278
+ key_counts[k] += 1
279
+
280
+ if key_counts:
281
+ best_key = key_counts.most_common(1)[0][0]
282
+ elif window_counter:
283
+ best_key = window_counter.most_common(1)[0][0]
284
+ else:
285
+ best_key = ("NA", "NA")
286
+
287
+ homogeneous = [tr for tr, k in zip(chunk, per_item_keys, strict=False) if k == best_key]
288
+
289
+ while len(homogeneous) < self.group_size:
290
+ if homogeneous:
291
+ homogeneous.append(homogeneous[-1])
292
+ else:
293
+ idx = next((i for i, wk in enumerate(window_keys) if wk[0] != "NA"), None)
294
+ if idx is not None:
295
+ homogeneous.append(recent_traces[idx])
296
+ elif chunk:
297
+ homogeneous.append(chunk[0])
298
+ else:
299
+ homogeneous.append(recent_traces[0])
300
+
301
+ validated_groups.append(homogeneous)
302
+ selected_keys.append(best_key)
303
+
304
+ return validated_groups, selected_keys
305
+
222
306
  def _sample_high_variance_traces(self) -> list[Trace]:
223
307
  from collections import Counter, defaultdict, deque
224
308
 
225
- # Expect recent window to already be grouped by task id
226
-
227
- # Build recent window and earlier lookup (short form)
228
309
  buf_list = list(self.buffer)
229
310
  if len(buf_list) < self.batch_size:
230
311
  hud_console.warning(
@@ -234,81 +315,32 @@ class ReplayBuffer(Buffer[Trace]):
234
315
  take = min(len(buf_list) or 1, self.batch_size - len(buf_list))
235
316
  buf_list.extend(buf_list[:take])
236
317
  recent_traces = buf_list[-self.batch_size :]
237
- hud_console.info(
238
- f"[group-sampler] recent-window histogram: {Counter(getattr(t.task, 'id', 'NA') for t in recent_traces)}" # noqa: E501
239
- )
318
+
319
+ recent_keys = [self._extract_group_key(t) for t in recent_traces]
320
+ hud_console.info(f"[group-sampler] recent-window histogram: {Counter(recent_keys)}")
240
321
 
241
322
  hud_console.info(
242
323
  f"[group-sampler] Building earlier traces lookup, buffer size: {len(buf_list)}"
243
324
  )
244
- earlier_traces_by_task: dict[str, deque[Trace]] = defaultdict(deque)
325
+ earlier_traces_by_key: dict[tuple[str, str], deque[Trace]] = defaultdict(deque)
245
326
  for tr in buf_list[: -self.batch_size]:
246
- earlier_traces_by_task[getattr(tr.task, "id", "NA")].append(tr)
327
+ k = self._extract_group_key(tr)
328
+ if k[0] != "NA":
329
+ earlier_traces_by_key[k].append(tr)
330
+
331
+ groups, group_keys = self._validate_and_split_groups(recent_traces)
247
332
 
248
- # Chunk from the most-recent end
249
333
  final_traces: list[Trace] = []
250
- groups_per_batch = self.batch_size // self.group_size
251
- hud_console.info(f"[group-sampler] Processing {groups_per_batch} groups")
252
- for g_idx in range(groups_per_batch):
253
- start = g_idx * self.group_size
254
- end = start + self.group_size
255
- group = recent_traces[start:end]
256
-
257
- # Assert homogeneity: every trace in a group must share the same task id
258
- cnt = Counter(getattr(t.task, "id", "NA") for t in group)
259
- if len(cnt) != 1:
260
- raise RuntimeError(f"Group {g_idx} is not homogeneous: {dict(cnt)}")
261
- target_tid = next(iter(cnt.keys()))
262
-
263
- # Build homogeneous group of target_tid, filling from earlier traces to increase spread
264
- homogeneous: list[Trace] = [
265
- t for t in group if getattr(t.task, "id", "NA") == target_tid
266
- ]
267
- needed = self.group_size - len(homogeneous)
268
-
269
- # Greedy fill: choose earlier traces (same task-id) farthest from current mean reward
270
- def current_mean(homogeneous: list[Trace]) -> float:
271
- if not homogeneous:
334
+ for g_idx, (homogeneous, target_key) in enumerate(zip(groups, group_keys, strict=False)):
335
+
336
+ def current_mean(h: list[Trace]) -> float:
337
+ if not h:
272
338
  return 0.0
273
- vals = [float(getattr(t, "reward", 0.0) or 0.0) for t in homogeneous]
339
+ vals = [float(getattr(t, "reward", 0.0) or 0.0) for t in h]
274
340
  return sum(vals) / len(vals)
275
341
 
276
- while needed > 0:
277
- pool = earlier_traces_by_task.get(target_tid, deque())
278
- if pool:
279
- mu = current_mean(homogeneous)
280
- # pick element farthest from current mean
281
- best_i = None
282
- best_dist = -1.0
283
- for i, tr in enumerate(list(pool)):
284
- r = float(getattr(tr, "reward", 0.0) or 0.0)
285
- dist = abs(r - mu)
286
- if dist > best_dist:
287
- best_dist = dist
288
- best_i = i
289
- # pop selected
290
- chosen = list(pool)[best_i] # type: ignore[index]
291
- # remove from deque efficiently by rotating
292
- left = list(pool)
293
- if best_i is not None:
294
- left.pop(best_i) # O(n) but pool is small in practice
295
- earlier_traces_by_task[target_tid] = deque(left)
296
- homogeneous.append(chosen)
297
- else:
298
- # duplicate extreme within current homogeneous set
299
- if not homogeneous:
300
- raise RuntimeError(f"Group {g_idx} has no traces for target {target_tid}")
301
- mu = current_mean(homogeneous)
302
- extreme = max(
303
- homogeneous, key=lambda t: abs(float(getattr(t, "reward", 0.0) or 0.0) - mu)
304
- )
305
- homogeneous.append(extreme)
306
- needed -= 1
307
-
308
- # Replacement step: swap in earlier traces to increase reward spread
309
- pool = earlier_traces_by_task.get(target_tid, deque())
342
+ pool = earlier_traces_by_key.get(target_key, deque())
310
343
  if pool:
311
- # Log pool stats
312
344
  pool_vals = [float(getattr(tr, "reward", 0.0) or 0.0) for tr in list(pool)]
313
345
  if pool_vals:
314
346
  pool_mean = sum(pool_vals) / len(pool_vals)
@@ -316,16 +348,15 @@ class ReplayBuffer(Buffer[Trace]):
316
348
  pool_vals
317
349
  )
318
350
  hud_console.info(
319
- f"[group-sampler] Group {g_idx}: earlier-pool size={len(pool_vals)} mean={pool_mean:.4f} std={(pool_var**0.5):.4f}" # noqa: E501
351
+ f"[group-sampler] Group {g_idx}: earlier-pool size={len(pool_vals)} "
352
+ f"mean={pool_mean:.4f} std={(pool_var**0.5):.4f}"
320
353
  )
321
354
 
322
- # Decide how many to replace (up to 1/4 of group, at least 1)
323
355
  replace_k = max(1, self.group_size // 4)
324
356
  replace_k = min(replace_k, len(pool), self.group_size)
325
357
 
326
358
  if replace_k > 0:
327
359
  mu = current_mean(homogeneous)
328
- # Select replacement candidates from pool farthest from current mean
329
360
  pool_list = list(pool)
330
361
  pool_indices = list(range(len(pool_list)))
331
362
  pool_indices.sort(
@@ -337,12 +368,11 @@ class ReplayBuffer(Buffer[Trace]):
337
368
  chosen_pool_idx = set(pool_indices[:replace_k])
338
369
  replacements = [pool_list[i] for i in pool_indices[:replace_k]]
339
370
 
340
- # Remove chosen from pool deque
341
371
  remaining = [tr for i, tr in enumerate(pool_list) if i not in chosen_pool_idx]
342
- earlier_traces_by_task[target_tid] = deque(remaining)
372
+ earlier_traces_by_key[target_key] = deque(remaining)
343
373
 
344
- # Select current group positions closest to mean to replace
345
374
  group_indices = list(range(len(homogeneous)))
375
+ mu = current_mean(homogeneous)
346
376
  group_indices.sort(
347
377
  key=lambda i: abs(
348
378
  (float(getattr(homogeneous[i], "reward", 0.0) or 0.0)) - mu
@@ -353,18 +383,19 @@ class ReplayBuffer(Buffer[Trace]):
353
383
  for pos, new_tr in zip(target_positions, replacements, strict=False):
354
384
  homogeneous[pos] = new_tr
355
385
 
356
- # Validate homogeneity
357
- if any(getattr(t.task, "id", "NA") != target_tid for t in homogeneous):
386
+ if any(self._extract_group_key(t) != target_key for t in homogeneous):
358
387
  raise RuntimeError(f"Group {g_idx} is not homogeneous after sampling")
359
388
  final_traces.extend(homogeneous)
360
389
 
361
390
  for i in range(0, len(final_traces), self.group_size):
362
391
  block = final_traces[i : i + self.group_size]
363
- if len({getattr(t.task, "id", "NA") for t in block}) != 1:
392
+ keys = {self._extract_group_key(t) for t in block}
393
+ if len(keys) != 1:
364
394
  raise RuntimeError(f"Homogeneity validation failed for block starting at index {i}")
365
395
 
366
396
  hud_console.info(
367
- f"[group-sampler] final histogram: {Counter(getattr(t.task, 'id', 'NA') for t in final_traces)}" # noqa: E501
397
+ f"[group-sampler] final histogram: "
398
+ f"{Counter(self._extract_group_key(t) for t in final_traces)}"
368
399
  )
369
400
  return final_traces
370
401
 
@@ -0,0 +1,7 @@
1
+ """Sample tasks."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from hud.samples.browser import BrowserTask
6
+
7
+ __all__ = ["BrowserTask"]
@@ -0,0 +1,33 @@
1
+ """Sample browser task factory."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from pydantic import Field
8
+
9
+ from hud.settings import settings
10
+ from hud.types import MCPToolCall, Task
11
+
12
+
13
def _default_browser_mcp_config() -> dict[str, Any]:
    """Build the default MCP server mapping for the hosted remote browser.

    Called once per instance (as a ``default_factory``), so ``settings.api_key``
    is read when the task is created rather than at import time.
    """
    headers = {
        "Authorization": f"Bearer {settings.api_key}",
        "Mcp-Image": "hudevals/hud-remote-browser:0.1.1",
    }
    server = {"url": "https://mcp.hud.so/v3/mcp", "headers": headers}
    return {"browser": server}


def _default_browser_setup_tool() -> MCPToolCall:
    """Build the default setup call: ``setup`` wrapping ``navigate_to_url`` to Google."""
    navigate = {"name": "navigate_to_url", "arguments": {"url": "https://www.google.com"}}
    return MCPToolCall(name="setup", arguments=navigate)


class BrowserTask(Task):
    """Task subclass with browser defaults for BrowserTask(prompt=...)."""

    # Every field has a default, so a bare ``BrowserTask()`` is constructible;
    # callers typically override only ``prompt``.
    prompt: str = "Open Google and be ready to search."

    # Fresh dict per instance; see _default_browser_mcp_config for its contents.
    mcp_config: dict[str, Any] = Field(default_factory=_default_browser_mcp_config)

    # Fresh MCPToolCall per instance; see _default_browser_setup_tool.
    setup_tool: MCPToolCall | list[MCPToolCall] | None = Field(
        default_factory=_default_browser_setup_tool
    )
@@ -12,6 +12,7 @@ from mcp.types import CallToolRequestParams, CallToolResult
12
12
  from pydantic import BaseModel, ConfigDict, Field, field_validator
13
13
 
14
14
  from hud.settings import settings
15
+ from hud.utils.tool_shorthand import normalize_to_tool_call_dict
15
16
 
16
17
  logger = logging.getLogger(__name__)
17
18
 
@@ -59,8 +60,18 @@ class Task(BaseModel):
59
60
 
60
61
  @field_validator("setup_tool", "evaluate_tool", mode="before")
61
62
  @classmethod
62
- def convert_dict_to_tool_call(cls, v: Any) -> Any:
63
- """Convert dict to MCPToolCall instance, parsing JSON strings first."""
63
+ def convert_dict_to_tool_call(cls, v: Any, info: Any) -> Any:
64
+ """Convert dict (with shorthands) to MCPToolCall instance.
65
+
66
+ Supports nested forms by walking to the deepest tool name and its arguments.
67
+ Examples:
68
+ - {"name": "navigate", "arguments": {...}} -> name=navigate
69
+ - {"navigate": {...}} -> name=navigate
70
+ - {"setup": {"navigate": {...}}} -> name=navigate
71
+ - {"name": "setup", "arguments": {"name": "navigate", "arguments": {...}}}
72
+ -> name=navigate
73
+ - Lists are normalized element-wise
74
+ """
64
75
  if v is None:
65
76
  return None
66
77
 
@@ -73,10 +84,12 @@ class Task(BaseModel):
73
84
 
74
85
  raise HudConfigError(f"Invalid JSON string: {e}") from e
75
86
 
76
- if isinstance(v, dict):
77
- return MCPToolCall(**v)
78
- if isinstance(v, list):
79
- return [MCPToolCall(**item) if isinstance(item, dict) else item for item in v]
87
+ normalized = normalize_to_tool_call_dict(v)
88
+
89
+ if isinstance(normalized, dict):
90
+ return MCPToolCall(**normalized)
91
+ if isinstance(normalized, list):
92
+ return [MCPToolCall(**item) if isinstance(item, dict) else item for item in normalized]
80
93
  return v
81
94
 
82
95
  @field_validator("mcp_config", mode="before")