hud-python 0.4.51__tar.gz → 0.4.52__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (261)
  1. {hud_python-0.4.51 → hud_python-0.4.52}/PKG-INFO +2 -1
  2. {hud_python-0.4.51 → hud_python-0.4.52}/environments/blank/server/pyproject.toml +1 -1
  3. {hud_python-0.4.51 → hud_python-0.4.52}/environments/deepresearch/server/pyproject.toml +1 -1
  4. {hud_python-0.4.51 → hud_python-0.4.52}/hud/__init__.py +13 -1
  5. {hud_python-0.4.51 → hud_python-0.4.52}/hud/agents/base.py +5 -1
  6. {hud_python-0.4.51 → hud_python-0.4.52}/hud/agents/lite_llm.py +1 -1
  7. {hud_python-0.4.51 → hud_python-0.4.52}/hud/agents/tests/test_base.py +8 -16
  8. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/__init__.py +12 -22
  9. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/eval.py +53 -84
  10. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/tests/test_build.py +2 -1
  11. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/tests/test_eval.py +4 -0
  12. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/tests/test_mcp_server.py +1 -1
  13. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/utils/tasks.py +4 -1
  14. hud_python-0.4.52/hud/cli/utils/version_check.py +257 -0
  15. {hud_python-0.4.51 → hud_python-0.4.52}/hud/clients/base.py +1 -1
  16. {hud_python-0.4.51 → hud_python-0.4.52}/hud/clients/mcp_use.py +3 -1
  17. {hud_python-0.4.51 → hud_python-0.4.52}/hud/datasets/parallel.py +2 -2
  18. hud_python-0.4.52/hud/datasets/runner.py +184 -0
  19. {hud_python-0.4.51 → hud_python-0.4.52}/hud/otel/config.py +8 -6
  20. {hud_python-0.4.51 → hud_python-0.4.52}/hud/otel/context.py +4 -4
  21. {hud_python-0.4.51 → hud_python-0.4.52}/hud/otel/exporters.py +231 -57
  22. {hud_python-0.4.51 → hud_python-0.4.52}/hud/rl/learner.py +1 -1
  23. {hud_python-0.4.51 → hud_python-0.4.52}/hud/shared/exceptions.py +0 -5
  24. {hud_python-0.4.51 → hud_python-0.4.52}/hud/shared/tests/test_exceptions.py +17 -16
  25. hud_python-0.4.52/hud/telemetry/__init__.py +50 -0
  26. hud_python-0.4.52/hud/telemetry/async_context.py +331 -0
  27. {hud_python-0.4.51 → hud_python-0.4.52}/hud/telemetry/job.py +51 -12
  28. {hud_python-0.4.51 → hud_python-0.4.52}/hud/telemetry/tests/test_trace.py +4 -4
  29. {hud_python-0.4.51 → hud_python-0.4.52}/hud/telemetry/trace.py +16 -17
  30. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/computer/qwen.py +4 -1
  31. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/executors/base.py +4 -2
  32. hud_python-0.4.52/hud/utils/task_tracking.py +223 -0
  33. {hud_python-0.4.51 → hud_python-0.4.52}/hud/utils/tests/test_version.py +1 -1
  34. {hud_python-0.4.51 → hud_python-0.4.52}/hud/version.py +1 -1
  35. {hud_python-0.4.51 → hud_python-0.4.52}/pyproject.toml +2 -1
  36. hud_python-0.4.51/hud/datasets/runner.py +0 -123
  37. hud_python-0.4.51/hud/telemetry/__init__.py +0 -26
  38. {hud_python-0.4.51 → hud_python-0.4.52}/.gitignore +0 -0
  39. {hud_python-0.4.51 → hud_python-0.4.52}/LICENSE +0 -0
  40. {hud_python-0.4.51 → hud_python-0.4.52}/README.md +0 -0
  41. {hud_python-0.4.51 → hud_python-0.4.52}/environments/README.md +0 -0
  42. {hud_python-0.4.51 → hud_python-0.4.52}/environments/blank/README.md +0 -0
  43. {hud_python-0.4.51 → hud_python-0.4.52}/environments/blank/environment/README.md +0 -0
  44. {hud_python-0.4.51 → hud_python-0.4.52}/environments/blank/environment/pyproject.toml +0 -0
  45. {hud_python-0.4.51 → hud_python-0.4.52}/environments/blank/server/README.md +0 -0
  46. {hud_python-0.4.51 → hud_python-0.4.52}/environments/browser/README.md +0 -0
  47. {hud_python-0.4.51 → hud_python-0.4.52}/environments/browser/environment/2048/README.md +0 -0
  48. {hud_python-0.4.51 → hud_python-0.4.52}/environments/browser/environment/2048/backend/pyproject.toml +0 -0
  49. {hud_python-0.4.51 → hud_python-0.4.52}/environments/browser/environment/README.md +0 -0
  50. {hud_python-0.4.51 → hud_python-0.4.52}/environments/browser/environment/pyproject.toml +0 -0
  51. {hud_python-0.4.51 → hud_python-0.4.52}/environments/browser/environment/todo/README.md +0 -0
  52. {hud_python-0.4.51 → hud_python-0.4.52}/environments/browser/environment/todo/backend/pyproject.toml +0 -0
  53. {hud_python-0.4.51 → hud_python-0.4.52}/environments/browser/pyproject.toml +0 -0
  54. {hud_python-0.4.51 → hud_python-0.4.52}/environments/browser/server/pyproject.toml +0 -0
  55. {hud_python-0.4.51 → hud_python-0.4.52}/environments/deepresearch/README.md +0 -0
  56. {hud_python-0.4.51 → hud_python-0.4.52}/environments/deepresearch/environment/pyproject.toml +0 -0
  57. {hud_python-0.4.51 → hud_python-0.4.52}/environments/deepresearch/pyproject.toml +0 -0
  58. {hud_python-0.4.51 → hud_python-0.4.52}/environments/remote_browser/README.md +0 -0
  59. {hud_python-0.4.51 → hud_python-0.4.52}/environments/remote_browser/pyproject.toml +0 -0
  60. {hud_python-0.4.51 → hud_python-0.4.52}/environments/remote_browser/src/hud_controller/providers/README.md +0 -0
  61. {hud_python-0.4.51 → hud_python-0.4.52}/environments/text_2048/README.md +0 -0
  62. {hud_python-0.4.51 → hud_python-0.4.52}/environments/text_2048/pyproject.toml +0 -0
  63. {hud_python-0.4.51 → hud_python-0.4.52}/examples/README.md +0 -0
  64. {hud_python-0.4.51 → hud_python-0.4.52}/hud/__main__.py +0 -0
  65. {hud_python-0.4.51 → hud_python-0.4.52}/hud/agents/__init__.py +0 -0
  66. {hud_python-0.4.51 → hud_python-0.4.52}/hud/agents/claude.py +0 -0
  67. {hud_python-0.4.51 → hud_python-0.4.52}/hud/agents/grounded_openai.py +0 -0
  68. {hud_python-0.4.51 → hud_python-0.4.52}/hud/agents/langchain.py +0 -0
  69. {hud_python-0.4.51 → hud_python-0.4.52}/hud/agents/misc/__init__.py +0 -0
  70. {hud_python-0.4.51 → hud_python-0.4.52}/hud/agents/misc/integration_test_agent.py +0 -0
  71. {hud_python-0.4.51 → hud_python-0.4.52}/hud/agents/misc/response_agent.py +0 -0
  72. {hud_python-0.4.51 → hud_python-0.4.52}/hud/agents/openai.py +0 -0
  73. {hud_python-0.4.51 → hud_python-0.4.52}/hud/agents/openai_chat_generic.py +0 -0
  74. {hud_python-0.4.51 → hud_python-0.4.52}/hud/agents/tests/__init__.py +0 -0
  75. {hud_python-0.4.51 → hud_python-0.4.52}/hud/agents/tests/test_claude.py +0 -0
  76. {hud_python-0.4.51 → hud_python-0.4.52}/hud/agents/tests/test_client.py +0 -0
  77. {hud_python-0.4.51 → hud_python-0.4.52}/hud/agents/tests/test_grounded_openai_agent.py +0 -0
  78. {hud_python-0.4.51 → hud_python-0.4.52}/hud/agents/tests/test_openai.py +0 -0
  79. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/__main__.py +0 -0
  80. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/analyze.py +0 -0
  81. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/build.py +0 -0
  82. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/clone.py +0 -0
  83. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/debug.py +0 -0
  84. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/dev.py +0 -0
  85. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/flows/__init__.py +0 -0
  86. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/flows/tasks.py +0 -0
  87. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/get.py +0 -0
  88. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/init.py +0 -0
  89. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/list_func.py +0 -0
  90. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/pull.py +0 -0
  91. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/push.py +0 -0
  92. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/remove.py +0 -0
  93. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/rl/__init__.py +0 -0
  94. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/rl/celebrate.py +0 -0
  95. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/rl/config.py +0 -0
  96. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/rl/display.py +0 -0
  97. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/rl/gpu.py +0 -0
  98. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/rl/gpu_utils.py +0 -0
  99. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/rl/local_runner.py +0 -0
  100. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/rl/presets.py +0 -0
  101. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/rl/remote_runner.py +0 -0
  102. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/rl/rl_api.py +0 -0
  103. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/rl/viewer.py +0 -0
  104. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/rl/vllm.py +0 -0
  105. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/rl/wait_utils.py +0 -0
  106. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/tests/__init__.py +0 -0
  107. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/tests/test_analyze.py +0 -0
  108. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/tests/test_analyze_metadata.py +0 -0
  109. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/tests/test_cli_init.py +0 -0
  110. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/tests/test_cli_main.py +0 -0
  111. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/tests/test_clone.py +0 -0
  112. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/tests/test_cursor.py +0 -0
  113. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/tests/test_debug.py +0 -0
  114. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/tests/test_list_func.py +0 -0
  115. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/tests/test_main_module.py +0 -0
  116. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/tests/test_pull.py +0 -0
  117. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/tests/test_push.py +0 -0
  118. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/tests/test_registry.py +0 -0
  119. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/tests/test_utils.py +0 -0
  120. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/utils/__init__.py +0 -0
  121. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/utils/config.py +0 -0
  122. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/utils/cursor.py +0 -0
  123. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/utils/docker.py +0 -0
  124. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/utils/env_check.py +0 -0
  125. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/utils/environment.py +0 -0
  126. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/utils/interactive.py +0 -0
  127. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/utils/local_runner.py +0 -0
  128. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/utils/logging.py +0 -0
  129. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/utils/metadata.py +0 -0
  130. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/utils/package_runner.py +0 -0
  131. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/utils/registry.py +0 -0
  132. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/utils/remote_runner.py +0 -0
  133. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/utils/runner.py +0 -0
  134. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/utils/server.py +0 -0
  135. {hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/utils/source_hash.py +0 -0
  136. {hud_python-0.4.51 → hud_python-0.4.52}/hud/clients/README.md +0 -0
  137. {hud_python-0.4.51 → hud_python-0.4.52}/hud/clients/__init__.py +0 -0
  138. {hud_python-0.4.51 → hud_python-0.4.52}/hud/clients/fastmcp.py +0 -0
  139. {hud_python-0.4.51 → hud_python-0.4.52}/hud/clients/tests/__init__.py +0 -0
  140. {hud_python-0.4.51 → hud_python-0.4.52}/hud/clients/tests/test_client_integration.py +0 -0
  141. {hud_python-0.4.51 → hud_python-0.4.52}/hud/clients/tests/test_fastmcp.py +0 -0
  142. {hud_python-0.4.51 → hud_python-0.4.52}/hud/clients/tests/test_mcp_use_retry.py +0 -0
  143. {hud_python-0.4.51 → hud_python-0.4.52}/hud/clients/tests/test_protocol.py +0 -0
  144. {hud_python-0.4.51 → hud_python-0.4.52}/hud/clients/utils/__init__.py +0 -0
  145. {hud_python-0.4.51 → hud_python-0.4.52}/hud/clients/utils/mcp_use_retry.py +0 -0
  146. {hud_python-0.4.51 → hud_python-0.4.52}/hud/clients/utils/retry.py +0 -0
  147. {hud_python-0.4.51 → hud_python-0.4.52}/hud/clients/utils/retry_transport.py +0 -0
  148. {hud_python-0.4.51 → hud_python-0.4.52}/hud/datasets/__init__.py +0 -0
  149. {hud_python-0.4.51 → hud_python-0.4.52}/hud/datasets/utils.py +0 -0
  150. {hud_python-0.4.51 → hud_python-0.4.52}/hud/misc/__init__.py +0 -0
  151. {hud_python-0.4.51 → hud_python-0.4.52}/hud/misc/claude_plays_pokemon.py +0 -0
  152. {hud_python-0.4.51 → hud_python-0.4.52}/hud/native/__init__.py +0 -0
  153. {hud_python-0.4.51 → hud_python-0.4.52}/hud/native/comparator.py +0 -0
  154. {hud_python-0.4.51 → hud_python-0.4.52}/hud/native/tests/__init__.py +0 -0
  155. {hud_python-0.4.51 → hud_python-0.4.52}/hud/native/tests/test_comparator.py +0 -0
  156. {hud_python-0.4.51 → hud_python-0.4.52}/hud/native/tests/test_native_init.py +0 -0
  157. {hud_python-0.4.51 → hud_python-0.4.52}/hud/otel/__init__.py +0 -0
  158. {hud_python-0.4.51 → hud_python-0.4.52}/hud/otel/collector.py +0 -0
  159. {hud_python-0.4.51 → hud_python-0.4.52}/hud/otel/instrumentation.py +0 -0
  160. {hud_python-0.4.51 → hud_python-0.4.52}/hud/otel/processors.py +0 -0
  161. {hud_python-0.4.51 → hud_python-0.4.52}/hud/otel/tests/__init__.py +0 -0
  162. {hud_python-0.4.51 → hud_python-0.4.52}/hud/otel/tests/test_processors.py +0 -0
  163. {hud_python-0.4.51 → hud_python-0.4.52}/hud/py.typed +0 -0
  164. {hud_python-0.4.51 → hud_python-0.4.52}/hud/rl/README.md +0 -0
  165. {hud_python-0.4.51 → hud_python-0.4.52}/hud/rl/__init__.py +0 -0
  166. {hud_python-0.4.51 → hud_python-0.4.52}/hud/rl/actor.py +0 -0
  167. {hud_python-0.4.51 → hud_python-0.4.52}/hud/rl/buffer.py +0 -0
  168. {hud_python-0.4.51 → hud_python-0.4.52}/hud/rl/chat_template.jinja +0 -0
  169. {hud_python-0.4.51 → hud_python-0.4.52}/hud/rl/config.py +0 -0
  170. {hud_python-0.4.51 → hud_python-0.4.52}/hud/rl/distributed.py +0 -0
  171. {hud_python-0.4.51 → hud_python-0.4.52}/hud/rl/tests/__init__.py +0 -0
  172. {hud_python-0.4.51 → hud_python-0.4.52}/hud/rl/tests/test_learner.py +0 -0
  173. {hud_python-0.4.51 → hud_python-0.4.52}/hud/rl/train.py +0 -0
  174. {hud_python-0.4.51 → hud_python-0.4.52}/hud/rl/types.py +0 -0
  175. {hud_python-0.4.51 → hud_python-0.4.52}/hud/rl/utils/start_vllm_server.sh +0 -0
  176. {hud_python-0.4.51 → hud_python-0.4.52}/hud/rl/utils.py +0 -0
  177. {hud_python-0.4.51 → hud_python-0.4.52}/hud/rl/vllm_adapter.py +0 -0
  178. {hud_python-0.4.51 → hud_python-0.4.52}/hud/samples/__init__.py +0 -0
  179. {hud_python-0.4.51 → hud_python-0.4.52}/hud/samples/browser.py +0 -0
  180. {hud_python-0.4.51 → hud_python-0.4.52}/hud/server/__init__.py +0 -0
  181. {hud_python-0.4.51 → hud_python-0.4.52}/hud/server/context.py +0 -0
  182. {hud_python-0.4.51 → hud_python-0.4.52}/hud/server/helper/__init__.py +0 -0
  183. {hud_python-0.4.51 → hud_python-0.4.52}/hud/server/low_level.py +0 -0
  184. {hud_python-0.4.51 → hud_python-0.4.52}/hud/server/router.py +0 -0
  185. {hud_python-0.4.51 → hud_python-0.4.52}/hud/server/server.py +0 -0
  186. {hud_python-0.4.51 → hud_python-0.4.52}/hud/server/tests/__init__.py +0 -0
  187. {hud_python-0.4.51 → hud_python-0.4.52}/hud/server/tests/test_add_tool.py +0 -0
  188. {hud_python-0.4.51 → hud_python-0.4.52}/hud/server/tests/test_context.py +0 -0
  189. {hud_python-0.4.51 → hud_python-0.4.52}/hud/server/tests/test_mcp_server_handlers.py +0 -0
  190. {hud_python-0.4.51 → hud_python-0.4.52}/hud/server/tests/test_mcp_server_integration.py +0 -0
  191. {hud_python-0.4.51 → hud_python-0.4.52}/hud/server/tests/test_mcp_server_more.py +0 -0
  192. {hud_python-0.4.51 → hud_python-0.4.52}/hud/server/tests/test_run_wrapper.py +0 -0
  193. {hud_python-0.4.51 → hud_python-0.4.52}/hud/server/tests/test_server_extra.py +0 -0
  194. {hud_python-0.4.51 → hud_python-0.4.52}/hud/server/tests/test_sigterm_runner.py +0 -0
  195. {hud_python-0.4.51 → hud_python-0.4.52}/hud/settings.py +0 -0
  196. {hud_python-0.4.51 → hud_python-0.4.52}/hud/shared/__init__.py +0 -0
  197. {hud_python-0.4.51 → hud_python-0.4.52}/hud/shared/hints.py +0 -0
  198. {hud_python-0.4.51 → hud_python-0.4.52}/hud/shared/requests.py +0 -0
  199. {hud_python-0.4.51 → hud_python-0.4.52}/hud/shared/tests/__init__.py +0 -0
  200. {hud_python-0.4.51 → hud_python-0.4.52}/hud/shared/tests/test_requests.py +0 -0
  201. {hud_python-0.4.51 → hud_python-0.4.52}/hud/telemetry/instrument.py +0 -0
  202. {hud_python-0.4.51 → hud_python-0.4.52}/hud/telemetry/replay.py +0 -0
  203. {hud_python-0.4.51 → hud_python-0.4.52}/hud/telemetry/tests/__init__.py +0 -0
  204. {hud_python-0.4.51 → hud_python-0.4.52}/hud/telemetry/tests/test_replay.py +0 -0
  205. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/__init__.py +0 -0
  206. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/base.py +0 -0
  207. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/bash.py +0 -0
  208. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/computer/__init__.py +0 -0
  209. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/computer/anthropic.py +0 -0
  210. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/computer/hud.py +0 -0
  211. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/computer/openai.py +0 -0
  212. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/computer/settings.py +0 -0
  213. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/edit.py +0 -0
  214. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/executors/__init__.py +0 -0
  215. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/executors/pyautogui.py +0 -0
  216. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/executors/tests/__init__.py +0 -0
  217. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/executors/tests/test_base_executor.py +0 -0
  218. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
  219. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/executors/xdo.py +0 -0
  220. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/grounding/__init__.py +0 -0
  221. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/grounding/config.py +0 -0
  222. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/grounding/grounded_tool.py +0 -0
  223. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/grounding/grounder.py +0 -0
  224. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/grounding/tests/__init__.py +0 -0
  225. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/grounding/tests/test_grounded_tool.py +0 -0
  226. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/playwright.py +0 -0
  227. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/response.py +0 -0
  228. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/submit.py +0 -0
  229. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/tests/__init__.py +0 -0
  230. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/tests/test_base.py +0 -0
  231. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/tests/test_bash.py +0 -0
  232. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/tests/test_bash_extended.py +0 -0
  233. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/tests/test_computer.py +0 -0
  234. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/tests/test_computer_actions.py +0 -0
  235. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/tests/test_edit.py +0 -0
  236. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/tests/test_init.py +0 -0
  237. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/tests/test_playwright_tool.py +0 -0
  238. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/tests/test_response.py +0 -0
  239. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/tests/test_tools.py +0 -0
  240. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/tests/test_tools_init.py +0 -0
  241. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/tests/test_utils.py +0 -0
  242. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/types.py +0 -0
  243. {hud_python-0.4.51 → hud_python-0.4.52}/hud/tools/utils.py +0 -0
  244. {hud_python-0.4.51 → hud_python-0.4.52}/hud/types.py +0 -0
  245. {hud_python-0.4.51 → hud_python-0.4.52}/hud/utils/__init__.py +0 -0
  246. {hud_python-0.4.51 → hud_python-0.4.52}/hud/utils/agent_factories.py +0 -0
  247. {hud_python-0.4.51 → hud_python-0.4.52}/hud/utils/async_utils.py +0 -0
  248. {hud_python-0.4.51 → hud_python-0.4.52}/hud/utils/group_eval.py +0 -0
  249. {hud_python-0.4.51 → hud_python-0.4.52}/hud/utils/hud_console.py +0 -0
  250. {hud_python-0.4.51 → hud_python-0.4.52}/hud/utils/mcp.py +0 -0
  251. {hud_python-0.4.51 → hud_python-0.4.52}/hud/utils/pretty_errors.py +0 -0
  252. {hud_python-0.4.51 → hud_python-0.4.52}/hud/utils/progress.py +0 -0
  253. {hud_python-0.4.51 → hud_python-0.4.52}/hud/utils/tasks.py +0 -0
  254. {hud_python-0.4.51 → hud_python-0.4.52}/hud/utils/telemetry.py +0 -0
  255. {hud_python-0.4.51 → hud_python-0.4.52}/hud/utils/tests/__init__.py +0 -0
  256. {hud_python-0.4.51 → hud_python-0.4.52}/hud/utils/tests/test_async_utils.py +0 -0
  257. {hud_python-0.4.51 → hud_python-0.4.52}/hud/utils/tests/test_init.py +0 -0
  258. {hud_python-0.4.51 → hud_python-0.4.52}/hud/utils/tests/test_mcp.py +0 -0
  259. {hud_python-0.4.51 → hud_python-0.4.52}/hud/utils/tests/test_progress.py +0 -0
  260. {hud_python-0.4.51 → hud_python-0.4.52}/hud/utils/tests/test_telemetry.py +0 -0
  261. {hud_python-0.4.51 → hud_python-0.4.52}/hud/utils/tool_shorthand.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.4.51
3
+ Version: 0.4.52
4
4
  Summary: SDK for the HUD platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-python
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -48,6 +48,7 @@ Requires-Dist: opentelemetry-api>=1.34.1
48
48
  Requires-Dist: opentelemetry-exporter-otlp-proto-http>=1.34.1
49
49
  Requires-Dist: opentelemetry-instrumentation-mcp==0.47.0
50
50
  Requires-Dist: opentelemetry-sdk>=1.34.1
51
+ Requires-Dist: packaging>=21.0
51
52
  Requires-Dist: pathspec>=0.12.1
52
53
  Requires-Dist: pillow>=11.1.0
53
54
  Requires-Dist: prompt-toolkit==3.0.51
@@ -4,7 +4,7 @@ version = "0.1.0"
4
4
  description = "MCP server for blank environment"
5
5
  requires-python = ">=3.11"
6
6
  dependencies = [
7
- "hud-python>=0.4.51",
7
+ "hud-python>=0.4.52",
8
8
  "httpx>=0.28.1",
9
9
  ]
10
10
 
@@ -4,7 +4,7 @@ version = "0.1.0"
4
4
  description = "MCP server for DeepResearch environment"
5
5
  requires-python = ">=3.11"
6
6
  dependencies = [
7
- "hud-python>=0.4.51",
7
+ "hud-python>=0.4.52",
8
8
  "httpx>=0.24.0",
9
9
  ]
10
10
 
@@ -5,10 +5,22 @@ tools for building, evaluating, and training AI agents.
5
5
 
6
6
  from __future__ import annotations
7
7
 
8
- from .telemetry import Trace, clear_trace, create_job, get_trace, instrument, job, trace
8
+ from .telemetry import (
9
+ Trace,
10
+ async_job,
11
+ async_trace,
12
+ clear_trace,
13
+ create_job,
14
+ get_trace,
15
+ instrument,
16
+ job,
17
+ trace,
18
+ )
9
19
 
10
20
  __all__ = [
11
21
  "Trace",
22
+ "async_job",
23
+ "async_trace",
12
24
  "clear_trace",
13
25
  "create_job",
14
26
  "get_trace",
@@ -55,6 +55,7 @@ class MCPAgent(ABC):
55
55
  # Filtering
56
56
  allowed_tools: list[str] | None = None,
57
57
  disallowed_tools: list[str] | None = None,
58
+ response_tool_name: str | None = None,
58
59
  # Messages
59
60
  system_prompt: str = GLOBAL_SYSTEM_PROMPT,
60
61
  append_setup_output: bool = True,
@@ -74,6 +75,7 @@ class MCPAgent(ABC):
74
75
  that provides `mcp_config`.
75
76
  allowed_tools: Names of tools to allow (None means allow all).
76
77
  disallowed_tools: Names of tools to always exclude.
78
+ response_tool_name: Name of the tool to use for response.
77
79
  system_prompt: System prompt to seed the conversation.
78
80
  append_setup_output: Whether to append setup tool output to the
79
81
  first turn's messages.
@@ -108,7 +110,7 @@ class MCPAgent(ABC):
108
110
 
109
111
  # Initialize these here so methods can be called before initialize()
110
112
  self._tool_map: dict[str, types.Tool] = {} # Simplified: just name to tool
111
- self.response_tool_name = None
113
+ self.response_tool_name = response_tool_name
112
114
 
113
115
  # Trace
114
116
  self._auto_trace = auto_trace
@@ -168,6 +170,8 @@ class MCPAgent(ABC):
168
170
  self.disallowed_tools.extend(task.agent_config["disallowed_tools"])
169
171
  else: # If disallowed_tools is None, we overwrite it
170
172
  self.disallowed_tools = task.agent_config["disallowed_tools"]
173
+ if "response_tool_name" in task.agent_config:
174
+ self.response_tool_name = task.agent_config["response_tool_name"]
171
175
 
172
176
  all_tools = await self.mcp_client.list_tools()
173
177
  self._available_tools = []
@@ -47,7 +47,7 @@ class LiteAgent(GenericOpenAIChatAgent):
47
47
  **agent_kwargs,
48
48
  )
49
49
 
50
- def get_tool_schemas(self) -> list[dict]:
50
+ def get_tool_schemas(self) -> list[Any]:
51
51
  # Prefer LiteLLM's stricter transformer (handles Bedrock & friends)
52
52
  if transform_mcp_tool_to_openai_tool is not None:
53
53
  return [
@@ -94,7 +94,7 @@ class TestBaseMCPAgent:
94
94
 
95
95
  assert agent.mcp_client is not None
96
96
  assert agent.allowed_tools is None
97
- assert agent.disallowed_tools == []
97
+ assert agent.disallowed_tools is None
98
98
  assert agent.initial_screenshot is True
99
99
  assert agent.system_prompt is not None # Default system prompt is set
100
100
 
@@ -241,6 +241,13 @@ class TestBaseMCPAgent:
241
241
  assert "tool2" not in tool_names # Not in allowed list
242
242
  assert "tool3" not in tool_names # In disallowed list
243
243
 
244
+ # Make sure tool schemas are correct
245
+ schemas = agent.get_tool_schemas()
246
+ assert len(schemas) == 1
247
+ assert schemas[0]["name"] == "tool1"
248
+ assert schemas[0]["description"] == "Tool 1"
249
+ assert schemas[0]["parameters"] == {"type": "object"}
250
+
244
251
  @pytest.mark.asyncio
245
252
  async def test_call_tool_success(self):
246
253
  """Test successful tool call."""
@@ -322,21 +329,6 @@ class TestBaseMCPAgent:
322
329
  # call_tools doesn't validate empty names, it will return error
323
330
  await agent.call_tools(tool_call)
324
331
 
325
- def test_get_tool_schemas(self):
326
- """Test getting tool schemas."""
327
- agent = MockMCPAgent()
328
-
329
- agent._available_tools = [
330
- types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"}),
331
- types.Tool(name="setup", description="Setup", inputSchema={"type": "object"}),
332
- ]
333
-
334
- schemas = agent.get_tool_schemas()
335
-
336
- # Should include non-lifecycle tools
337
- assert len(schemas) == 1
338
- assert schemas[0]["name"] == "tool1"
339
-
340
332
  def test_get_tools_by_server(self):
341
333
  """Test getting tools grouped by server."""
342
334
  agent = MockMCPAgent()
@@ -796,33 +796,19 @@ def eval(
796
796
  help="Comma-separated list of allowed tools",
797
797
  ),
798
798
  max_concurrent: int = typer.Option(
799
- 50,
799
+ 30,
800
800
  "--max-concurrent",
801
- help="Max concurrent tasks (prevents rate limits in both asyncio and parallel modes)",
801
+ help="Maximum concurrent tasks (1-200 recommended, prevents rate limits)",
802
802
  ),
803
803
  max_steps: int | None = typer.Option(
804
804
  None,
805
805
  "--max-steps",
806
806
  help="Maximum steps per task (default: 10 for single, 50 for full)",
807
807
  ),
808
- parallel: bool = typer.Option(
809
- False,
810
- "--parallel",
811
- help="Use process-based parallel execution for large datasets (100+ tasks)",
812
- ),
813
- max_workers: int | None = typer.Option(
814
- None,
815
- "--max-workers",
816
- help="Number of worker processes for parallel mode (auto-optimized if not set)",
817
- ),
818
- max_concurrent_per_worker: int = typer.Option(
819
- 20,
820
- "--max-concurrent-per-worker",
821
- help="Maximum concurrent tasks per worker in parallel mode",
822
- ),
823
808
  verbose: bool = typer.Option(
824
809
  False,
825
810
  "--verbose",
811
+ "-v",
826
812
  help="Enable verbose output from the agent",
827
813
  ),
828
814
  very_verbose: bool = typer.Option(
@@ -867,14 +853,14 @@ def eval(
867
853
 
868
854
  source = find_tasks_file(None, msg="Select a tasks file to run")
869
855
  hud_console.success(f"Selected: {source}")
870
- except Exception as e:
856
+ except (FileNotFoundError, Exception):
871
857
  hud_console.error(
872
858
  "No source provided and no task/eval JSON files found in current directory"
873
859
  )
874
860
  hud_console.info(
875
861
  "Usage: hud eval <source> or create a task JSON file (e.g., task.json, tasks.jsonl)"
876
862
  )
877
- raise typer.Exit(1) from e
863
+ raise typer.Exit(1) from None
878
864
 
879
865
  # Import eval_command lazily to avoid importing agent dependencies
880
866
  try:
@@ -950,9 +936,6 @@ def eval(
950
936
  allowed_tools=allowed_tools,
951
937
  max_concurrent=max_concurrent,
952
938
  max_steps=max_steps,
953
- parallel=parallel,
954
- max_workers=max_workers,
955
- max_concurrent_per_worker=max_concurrent_per_worker,
956
939
  verbose=verbose,
957
940
  very_verbose=very_verbose,
958
941
  vllm_base_url=vllm_base_url,
@@ -1126,6 +1109,13 @@ def set(
1126
1109
 
1127
1110
  def main() -> None:
1128
1111
  """Main entry point for the CLI."""
1112
+ # Check for updates (including on --version command)
1113
+ # Skip only on help-only commands
1114
+ if not (len(sys.argv) == 1 or (len(sys.argv) == 2 and sys.argv[1] in ["--help", "-h"])):
1115
+ from .utils.version_check import display_update_prompt
1116
+
1117
+ display_update_prompt()
1118
+
1129
1119
  # Handle --version flag before Typer parses args
1130
1120
  if "--version" in sys.argv:
1131
1121
  try:
@@ -300,6 +300,7 @@ async def run_single_task(
300
300
  agent_config = {
301
301
  "model": model or "claude-sonnet-4-20250514",
302
302
  "verbose": verbose,
303
+ "validate_api_key": False,
303
304
  }
304
305
  if allowed_tools:
305
306
  agent_config["allowed_tools"] = allowed_tools
@@ -345,24 +346,18 @@ async def run_full_dataset(
345
346
  allowed_tools: list[str] | None = None,
346
347
  max_concurrent: int = 30,
347
348
  max_steps: int = 10,
348
- parallel: bool = False,
349
- max_workers: int | None = None,
350
- max_concurrent_per_worker: int = 25,
351
349
  verbose: bool = False,
352
350
  vllm_base_url: str | None = None,
353
351
  group_size: int = 1,
354
352
  ) -> list[Any]:
355
- """Run evaluation across the entire dataset.
356
-
357
- Uses either asyncio-based run_dataset or process-based parallel execution
358
- depending on the parallel flag."""
353
+ """Run evaluation across the entire dataset using asyncio-based concurrency."""
359
354
 
360
355
  # Provide early feedback to user
361
356
  hud_console.info("🔧 Initializing evaluation...")
362
357
 
363
358
  # Import run_dataset lazily
364
359
  try:
365
- from hud.datasets import run_dataset, run_dataset_parallel, run_dataset_parallel_manual
360
+ from hud.datasets import run_dataset
366
361
  from hud.utils.tasks import load_tasks
367
362
  except ImportError as e:
368
363
  hud_console.error(
@@ -434,7 +429,7 @@ async def run_full_dataset(
434
429
  )
435
430
  raise typer.Exit(1) from e
436
431
 
437
- agent_config = {"verbose": verbose}
432
+ agent_config = {"verbose": verbose, "validate_api_key": False}
438
433
  if allowed_tools:
439
434
  agent_config["allowed_tools"] = allowed_tools
440
435
 
@@ -472,6 +467,7 @@ async def run_full_dataset(
472
467
  agent_config = {
473
468
  "model": model or "claude-sonnet-4-20250514",
474
469
  "verbose": verbose,
470
+ "validate_api_key": False,
475
471
  }
476
472
  if allowed_tools:
477
473
  agent_config["allowed_tools"] = allowed_tools
@@ -505,9 +501,7 @@ async def run_full_dataset(
505
501
  agent_class=agent_class,
506
502
  agent_config=agent_config,
507
503
  group_size=group_size,
508
- max_parallel_episodes=max_concurrent
509
- if not parallel
510
- else max_concurrent_per_worker * (max_workers or 4),
504
+ max_parallel_episodes=max_concurrent,
511
505
  max_steps=max_steps,
512
506
  verbose=verbose,
513
507
  job_id=job.id,
@@ -519,48 +513,18 @@ async def run_full_dataset(
519
513
  # Return stats for consistency with other modes
520
514
  return stats
521
515
 
522
- # Original logic for non-grouped evaluation
523
- elif parallel:
524
- hud_console.info(
525
- f"🚀 Running PARALLEL evaluation (workers: {max_workers or 'auto'}, max_concurrent: {max_concurrent})…" # noqa: E501
526
- )
527
- if max_workers is None:
528
- # Use auto-optimization (now the default run_dataset_parallel)
529
- return await run_dataset_parallel(
530
- name=f"Evaluation {dataset_name}",
531
- dataset=dataset_or_tasks,
532
- agent_class=agent_class,
533
- agent_config=agent_config,
534
- max_concurrent=max_concurrent,
535
- metadata={"dataset": source, "parallel": True},
536
- max_steps=max_steps,
537
- auto_respond=True,
538
- )
539
- else:
540
- # Use manual configuration
541
- return await run_dataset_parallel_manual(
542
- name=f"Evaluation {dataset_name}",
543
- dataset=dataset_or_tasks,
544
- agent_class=agent_class,
545
- agent_config=agent_config,
546
- max_workers=max_workers,
547
- max_concurrent_per_worker=max_concurrent_per_worker,
548
- max_concurrent=max_concurrent,
549
- metadata={"dataset": source, "parallel": True},
550
- max_steps=max_steps,
551
- auto_respond=True,
552
- )
553
- else:
554
- hud_console.info(f"🚀 Running evaluation (max_concurrent: {max_concurrent})…")
555
- return await run_dataset(
556
- name=f"Evaluation {dataset_name}",
557
- dataset=dataset_or_tasks,
558
- agent_class=agent_class,
559
- agent_config=agent_config,
560
- max_concurrent=max_concurrent,
561
- metadata={"dataset": source},
562
- max_steps=max_steps,
563
- )
516
+ # Run evaluation with asyncio-based concurrency
517
+ hud_console.info(f"🚀 Running evaluation (max_concurrent: {max_concurrent})…")
518
+ return await run_dataset(
519
+ name=f"Evaluation {dataset_name}",
520
+ dataset=dataset_or_tasks,
521
+ agent_class=agent_class,
522
+ agent_config=agent_config,
523
+ max_concurrent=max_concurrent,
524
+ metadata={"dataset": source},
525
+ max_steps=max_steps,
526
+ auto_respond=True,
527
+ )
564
528
 
565
529
 
566
530
  def eval_command(
@@ -591,31 +555,20 @@ def eval_command(
591
555
  max_concurrent: int = typer.Option(
592
556
  30,
593
557
  "--max-concurrent",
594
- help="Concurrency level for asyncio mode (ignored in parallel mode)",
558
+ help=(
559
+ "Maximum concurrent tasks (1-200 recommended, prevents rate limits "
560
+ "and resource exhaustion)"
561
+ ),
595
562
  ),
596
563
  max_steps: int | None = typer.Option(
597
564
  None,
598
565
  "--max-steps",
599
566
  help="Maximum steps per task (default: 10 for single, 50 for full)",
600
567
  ),
601
- parallel: bool = typer.Option(
602
- False,
603
- "--parallel",
604
- help="Use process-based parallel execution for large datasets (100+ tasks)",
605
- ),
606
- max_workers: int | None = typer.Option(
607
- None,
608
- "--max-workers",
609
- help="Number of worker processes for parallel mode (auto-optimized if not set)",
610
- ),
611
- max_concurrent_per_worker: int = typer.Option(
612
- 20,
613
- "--max-concurrent-per-worker",
614
- help="Maximum concurrent tasks per worker in parallel mode",
615
- ),
616
568
  verbose: bool = typer.Option(
617
569
  False,
618
570
  "--verbose",
571
+ "-v",
619
572
  help="Enable verbose output from the agent",
620
573
  ),
621
574
  very_verbose: bool = typer.Option(
@@ -650,23 +603,20 @@ def eval_command(
650
603
  # Evaluate a single task from SheetBench
651
604
  hud eval hud-evals/SheetBench-50
652
605
 
653
- # Evaluate the FULL SheetBench dataset with Claude (asyncio mode)
606
+ # Evaluate the FULL SheetBench dataset with Claude
654
607
  hud eval hud-evals/SheetBench-50 --full --agent claude
655
608
 
656
- # Run large dataset with PARALLEL execution (auto-optimized)
657
- hud eval hud-evals/OSWorld-Verified-Gold --full --parallel
658
-
659
- # Parallel mode with manual configuration (16 workers, 25 tasks each)
660
- hud eval hud-evals/OSWorld-Verified-Gold --full --parallel --max-workers 16
609
+ # Run with higher concurrency for faster evaluation
610
+ hud eval hud-evals/OSWorld-Verified-Gold --full --max-concurrent 100
661
611
 
662
- # Limit total concurrent tasks to prevent rate limits
663
- hud eval hud-evals/SheetBench-50 --full --parallel --max-concurrent 20
612
+ # Limit concurrent tasks to prevent rate limits
613
+ hud eval hud-evals/SheetBench-50 --full --max-concurrent 20
664
614
 
665
615
  # Run a single task from a JSON file
666
616
  hud eval task.json
667
617
 
668
- # Run multiple tasks from a JSON file with parallel execution
669
- hud eval tasks.json --full --parallel
618
+ # Run multiple tasks from a JSON file
619
+ hud eval tasks.json --full
670
620
 
671
621
  # Run with OpenAI Operator agent
672
622
  hud eval hud-evals/OSWorld-Gold-Beta --agent openai
@@ -736,7 +686,11 @@ def eval_command(
736
686
 
737
687
  # Run evaluation
738
688
  if full:
739
- asyncio.run(
689
+ import time
690
+
691
+ start_time = time.time()
692
+
693
+ results = asyncio.run(
740
694
  run_full_dataset(
741
695
  source,
742
696
  agent_type=agent,
@@ -744,14 +698,29 @@ def eval_command(
744
698
  allowed_tools=allowed_tools_list,
745
699
  max_concurrent=max_concurrent,
746
700
  max_steps=max_steps,
747
- parallel=parallel,
748
- max_workers=max_workers,
749
- max_concurrent_per_worker=max_concurrent_per_worker,
750
701
  verbose=very_verbose or verbose,
751
702
  vllm_base_url=vllm_base_url,
752
703
  group_size=group_size,
753
704
  )
754
705
  )
706
+
707
+ elapsed = time.time() - start_time
708
+
709
+ # Print statistics (only for non-grouped mode)
710
+ if group_size == 1 and results:
711
+ hud_console.info("\n" + "=" * 50)
712
+ hud_console.success("📊 Evaluation Complete!")
713
+ hud_console.info("=" * 50)
714
+ hud_console.info(f"Total tasks: {len(results)}")
715
+ hud_console.info(f"Time elapsed: {elapsed:.2f} seconds")
716
+ hud_console.info(f"Throughput: {len(results) / elapsed:.2f} tasks/second")
717
+ hud_console.info(f"Execution mode: ASYNCIO (max_concurrent: {max_concurrent})")
718
+
719
+ # Count successes
720
+ successful = sum(1 for r in results if getattr(r, "reward", 0) > 0.7)
721
+ success_rate = 100 * successful / len(results)
722
+ hud_console.info(f"Successful tasks: {successful}/{len(results)} ({success_rate:.1f}%)")
723
+ hud_console.info("=" * 50)
755
724
  else:
756
725
  asyncio.run(
757
726
  run_single_task(
@@ -373,7 +373,8 @@ ENV API_KEY
373
373
  with open(lock_file) as f:
374
374
  lock_data = yaml.safe_load(f)
375
375
 
376
- assert lock_data["image"] == "test/env:latest@sha256:abc123"
376
+ assert lock_data["images"]["full"] == "test-env:0.1.0@sha256:abc123"
377
+ assert lock_data["images"]["local"] == "test-env:0.1.0"
377
378
  assert lock_data["build"]["version"] == "0.1.0"
378
379
  assert lock_data["environment"]["toolCount"] == 2
379
380
  assert len(lock_data["tools"]) == 2
@@ -332,6 +332,7 @@ class TestRunDatasetToolFiltering:
332
332
  patch.object(ClaudeAgent, "_run_context", mock_run_context),
333
333
  patch.object(ClaudeAgent, "call_tools", mock_call_tools),
334
334
  patch("hud.clients.MCPClient", return_value=mock_client_instance),
335
+ patch("hud.settings.settings.anthropic_api_key", "sk-test-key"),
335
336
  ):
336
337
  # Run the dataset
337
338
  await run_dataset(
@@ -400,6 +401,7 @@ class TestRunDatasetToolFiltering:
400
401
  patch.object(ClaudeAgent, "_run_context", mock_run_context),
401
402
  patch.object(ClaudeAgent, "call_tools", mock_call_tools),
402
403
  patch("hud.clients.MCPClient", return_value=mock_client_instance),
404
+ patch("hud.settings.settings.anthropic_api_key", "sk-test-key"),
403
405
  ):
404
406
  # Run the dataset
405
407
  await run_dataset(
@@ -500,6 +502,7 @@ class TestSystemPromptHandling:
500
502
  patch.object(ClaudeAgent, "_run_context", mock_run_context),
501
503
  patch.object(ClaudeAgent, "call_tools", mock_call_tools),
502
504
  patch("hud.clients.MCPClient", return_value=mock_mcp_client),
505
+ patch("hud.settings.settings.anthropic_api_key", "sk-test-key"),
503
506
  ):
504
507
  # Run the dataset
505
508
  await run_dataset(
@@ -551,6 +554,7 @@ class TestSystemPromptHandling:
551
554
  patch.object(ClaudeAgent, "_run_context", mock_run_context),
552
555
  patch.object(ClaudeAgent, "call_tools", mock_call_tools),
553
556
  patch("hud.clients.MCPClient", return_value=mock_mcp_client),
557
+ patch("hud.settings.settings.anthropic_api_key", "sk-test-key"),
554
558
  ):
555
559
  # Run the dataset
556
560
  await run_dataset(
@@ -19,7 +19,7 @@ class TestRunMCPDevServer:
19
19
  import click
20
20
 
21
21
  with (
22
- patch("hud.cli.dev.image_exists", return_value=False),
22
+ patch("hud.cli.utils.environment.image_exists", return_value=False),
23
23
  patch("click.confirm", return_value=False),
24
24
  pytest.raises(click.Abort),
25
25
  ):
@@ -18,9 +18,12 @@ def find_tasks_file(tasks_file: str | None, msg: str = "Select a tasks file") ->
18
18
  ]
19
19
  all_files = [file for file in all_files if file[0] != "."] # Remove all config files
20
20
 
21
+ if not all_files:
22
+ # No task files found - raise a clear exception
23
+ raise FileNotFoundError("No task JSON or JSONL files found in current directory")
24
+
21
25
  if len(all_files) == 1:
22
26
  return str(all_files[0])
23
-
24
27
  else:
25
28
  # Prompt user to select a file
26
29
  return hud_console.select(msg, choices=all_files)