hud-python 0.4.59__tar.gz → 0.4.61__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (318) hide show
  1. {hud_python-0.4.59 → hud_python-0.4.61}/PKG-INFO +2 -1
  2. {hud_python-0.4.59 → hud_python-0.4.61}/environments/README.md +1 -1
  3. {hud_python-0.4.59 → hud_python-0.4.61}/environments/browser/server/pyproject.toml +1 -1
  4. hud_python-0.4.61/environments/jupyter/README.md +68 -0
  5. hud_python-0.4.61/environments/jupyter/server/pyproject.toml +34 -0
  6. hud_python-0.4.61/environments/online_mind2web/README.md +36 -0
  7. hud_python-0.4.61/environments/online_mind2web/pyproject.toml +22 -0
  8. hud_python-0.4.61/environments/remote_browser/src/hud_controller/providers/README.md +110 -0
  9. {hud_python-0.4.59 → hud_python-0.4.61}/hud/agents/gemini.py +2 -1
  10. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/__init__.py +7 -3
  11. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/build.py +93 -5
  12. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/eval.py +21 -16
  13. {hud_python-0.4.59 → hud_python-0.4.61}/hud/datasets/parallel.py +1 -1
  14. {hud_python-0.4.59 → hud_python-0.4.61}/hud/datasets/runner.py +4 -53
  15. hud_python-0.4.61/hud/datasets/tests/test_runner.py +67 -0
  16. {hud_python-0.4.59 → hud_python-0.4.61}/hud/otel/context.py +16 -59
  17. {hud_python-0.4.59 → hud_python-0.4.61}/hud/rl/actor.py +1 -1
  18. {hud_python-0.4.59 → hud_python-0.4.61}/hud/telemetry/__init__.py +14 -17
  19. {hud_python-0.4.59 → hud_python-0.4.61}/hud/telemetry/async_context.py +77 -85
  20. {hud_python-0.4.59 → hud_python-0.4.61}/hud/telemetry/job.py +8 -44
  21. hud_python-0.4.61/hud/telemetry/tests/test_async_context.py +515 -0
  22. {hud_python-0.4.59 → hud_python-0.4.61}/hud/telemetry/tests/test_job.py +0 -46
  23. {hud_python-0.4.59 → hud_python-0.4.61}/hud/telemetry/trace.py +5 -7
  24. hud_python-0.4.61/hud/telemetry/utils.py +42 -0
  25. hud_python-0.4.61/hud/tools/jupyter.py +313 -0
  26. hud_python-0.4.61/hud/tools/tests/test_jupyter_tool.py +176 -0
  27. {hud_python-0.4.59 → hud_python-0.4.61}/hud/utils/group_eval.py +19 -11
  28. {hud_python-0.4.59 → hud_python-0.4.61}/hud/utils/tests/test_version.py +1 -1
  29. {hud_python-0.4.59 → hud_python-0.4.61}/hud/version.py +1 -1
  30. {hud_python-0.4.59 → hud_python-0.4.61}/pyproject.toml +2 -1
  31. hud_python-0.4.59/hud/datasets/tests/test_runner.py +0 -106
  32. hud_python-0.4.59/hud/telemetry/tests/test_async_context.py +0 -242
  33. {hud_python-0.4.59 → hud_python-0.4.61}/.gitignore +0 -0
  34. {hud_python-0.4.59 → hud_python-0.4.61}/LICENSE +0 -0
  35. {hud_python-0.4.59 → hud_python-0.4.61}/README.md +0 -0
  36. {hud_python-0.4.59 → hud_python-0.4.61}/environments/blank/README.md +0 -0
  37. {hud_python-0.4.59 → hud_python-0.4.61}/environments/blank/environment/README.md +0 -0
  38. {hud_python-0.4.59 → hud_python-0.4.61}/environments/blank/environment/pyproject.toml +0 -0
  39. {hud_python-0.4.59 → hud_python-0.4.61}/environments/blank/server/README.md +0 -0
  40. {hud_python-0.4.59 → hud_python-0.4.61}/environments/blank/server/pyproject.toml +0 -0
  41. {hud_python-0.4.59 → hud_python-0.4.61}/environments/browser/README.md +0 -0
  42. {hud_python-0.4.59 → hud_python-0.4.61}/environments/browser/browser-base/README.md +0 -0
  43. {hud_python-0.4.59 → hud_python-0.4.61}/environments/browser/environment/2048/README.md +0 -0
  44. {hud_python-0.4.59 → hud_python-0.4.61}/environments/browser/environment/2048/backend/pyproject.toml +0 -0
  45. {hud_python-0.4.59 → hud_python-0.4.61}/environments/browser/environment/README.md +0 -0
  46. {hud_python-0.4.59 → hud_python-0.4.61}/environments/browser/environment/pyproject.toml +0 -0
  47. {hud_python-0.4.59 → hud_python-0.4.61}/environments/browser/environment/todo/README.md +0 -0
  48. {hud_python-0.4.59 → hud_python-0.4.61}/environments/browser/environment/todo/backend/pyproject.toml +0 -0
  49. {hud_python-0.4.59 → hud_python-0.4.61}/environments/browser/pyproject.toml +0 -0
  50. {hud_python-0.4.59 → hud_python-0.4.61}/environments/deepresearch/README.md +0 -0
  51. {hud_python-0.4.59 → hud_python-0.4.61}/environments/deepresearch/environment/pyproject.toml +0 -0
  52. {hud_python-0.4.59 → hud_python-0.4.61}/environments/deepresearch/pyproject.toml +0 -0
  53. {hud_python-0.4.59 → hud_python-0.4.61}/environments/deepresearch/server/pyproject.toml +0 -0
  54. {hud_python-0.4.59/environments/remote_browser → hud_python-0.4.61/environments/online_mind2web}/src/hud_controller/providers/README.md +0 -0
  55. {hud_python-0.4.59 → hud_python-0.4.61}/environments/remote_browser/README.md +0 -0
  56. {hud_python-0.4.59 → hud_python-0.4.61}/environments/remote_browser/pyproject.toml +0 -0
  57. {hud_python-0.4.59 → hud_python-0.4.61}/environments/rubrics/README.md +0 -0
  58. {hud_python-0.4.59 → hud_python-0.4.61}/environments/rubrics/environment/pyproject.toml +0 -0
  59. {hud_python-0.4.59 → hud_python-0.4.61}/environments/rubrics/pyproject.toml +0 -0
  60. {hud_python-0.4.59 → hud_python-0.4.61}/environments/rubrics/server/pyproject.toml +0 -0
  61. {hud_python-0.4.59 → hud_python-0.4.61}/environments/text_2048/README.md +0 -0
  62. {hud_python-0.4.59 → hud_python-0.4.61}/environments/text_2048/pyproject.toml +0 -0
  63. {hud_python-0.4.59 → hud_python-0.4.61}/examples/README.md +0 -0
  64. {hud_python-0.4.59 → hud_python-0.4.61}/hud/__init__.py +0 -0
  65. {hud_python-0.4.59 → hud_python-0.4.61}/hud/__main__.py +0 -0
  66. {hud_python-0.4.59 → hud_python-0.4.61}/hud/agents/__init__.py +0 -0
  67. {hud_python-0.4.59 → hud_python-0.4.61}/hud/agents/base.py +0 -0
  68. {hud_python-0.4.59 → hud_python-0.4.61}/hud/agents/claude.py +0 -0
  69. {hud_python-0.4.59 → hud_python-0.4.61}/hud/agents/grounded_openai.py +0 -0
  70. {hud_python-0.4.59 → hud_python-0.4.61}/hud/agents/langchain.py +0 -0
  71. {hud_python-0.4.59 → hud_python-0.4.61}/hud/agents/lite_llm.py +0 -0
  72. {hud_python-0.4.59 → hud_python-0.4.61}/hud/agents/misc/__init__.py +0 -0
  73. {hud_python-0.4.59 → hud_python-0.4.61}/hud/agents/misc/integration_test_agent.py +0 -0
  74. {hud_python-0.4.59 → hud_python-0.4.61}/hud/agents/misc/response_agent.py +0 -0
  75. {hud_python-0.4.59 → hud_python-0.4.61}/hud/agents/openai.py +0 -0
  76. {hud_python-0.4.59 → hud_python-0.4.61}/hud/agents/openai_chat_generic.py +0 -0
  77. {hud_python-0.4.59 → hud_python-0.4.61}/hud/agents/tests/__init__.py +0 -0
  78. {hud_python-0.4.59 → hud_python-0.4.61}/hud/agents/tests/test_base.py +0 -0
  79. {hud_python-0.4.59 → hud_python-0.4.61}/hud/agents/tests/test_base_runtime.py +0 -0
  80. {hud_python-0.4.59 → hud_python-0.4.61}/hud/agents/tests/test_claude.py +0 -0
  81. {hud_python-0.4.59 → hud_python-0.4.61}/hud/agents/tests/test_client.py +0 -0
  82. {hud_python-0.4.59 → hud_python-0.4.61}/hud/agents/tests/test_gemini.py +0 -0
  83. {hud_python-0.4.59 → hud_python-0.4.61}/hud/agents/tests/test_grounded_openai_agent.py +0 -0
  84. {hud_python-0.4.59 → hud_python-0.4.61}/hud/agents/tests/test_openai.py +0 -0
  85. {hud_python-0.4.59 → hud_python-0.4.61}/hud/agents/utils.py +0 -0
  86. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/__main__.py +0 -0
  87. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/analyze.py +0 -0
  88. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/clone.py +0 -0
  89. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/debug.py +0 -0
  90. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/dev.py +0 -0
  91. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/flows/__init__.py +0 -0
  92. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/flows/dev.py +0 -0
  93. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/flows/tasks.py +0 -0
  94. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/get.py +0 -0
  95. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/init.py +0 -0
  96. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/list_func.py +0 -0
  97. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/pull.py +0 -0
  98. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/push.py +0 -0
  99. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/remove.py +0 -0
  100. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/rl/__init__.py +0 -0
  101. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/rl/celebrate.py +0 -0
  102. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/rl/config.py +0 -0
  103. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/rl/display.py +0 -0
  104. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/rl/gpu.py +0 -0
  105. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/rl/gpu_utils.py +0 -0
  106. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/rl/local_runner.py +0 -0
  107. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/rl/presets.py +0 -0
  108. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/rl/remote_runner.py +0 -0
  109. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/rl/rl_api.py +0 -0
  110. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/rl/viewer.py +0 -0
  111. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/rl/vllm.py +0 -0
  112. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/rl/wait_utils.py +0 -0
  113. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/tests/__init__.py +0 -0
  114. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/tests/test_analyze.py +0 -0
  115. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/tests/test_analyze_metadata.py +0 -0
  116. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/tests/test_analyze_module.py +0 -0
  117. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/tests/test_build.py +0 -0
  118. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/tests/test_build_failure.py +0 -0
  119. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/tests/test_build_module.py +0 -0
  120. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/tests/test_cli_init.py +0 -0
  121. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/tests/test_cli_main.py +0 -0
  122. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/tests/test_cli_more_wrappers.py +0 -0
  123. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/tests/test_cli_root.py +0 -0
  124. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/tests/test_clone.py +0 -0
  125. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/tests/test_convert.py +0 -0
  126. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/tests/test_cursor.py +0 -0
  127. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/tests/test_debug.py +0 -0
  128. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/tests/test_eval.py +0 -0
  129. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/tests/test_list_func.py +0 -0
  130. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/tests/test_main_module.py +0 -0
  131. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/tests/test_mcp_server.py +0 -0
  132. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/tests/test_pull.py +0 -0
  133. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/tests/test_push.py +0 -0
  134. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/tests/test_push_happy.py +0 -0
  135. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/tests/test_push_wrapper.py +0 -0
  136. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/tests/test_registry.py +0 -0
  137. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/tests/test_utils.py +0 -0
  138. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/utils/__init__.py +0 -0
  139. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/utils/config.py +0 -0
  140. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/utils/cursor.py +0 -0
  141. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/utils/docker.py +0 -0
  142. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/utils/env_check.py +0 -0
  143. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/utils/environment.py +0 -0
  144. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/utils/interactive.py +0 -0
  145. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/utils/local_runner.py +0 -0
  146. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/utils/logging.py +0 -0
  147. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/utils/metadata.py +0 -0
  148. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/utils/package_runner.py +0 -0
  149. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/utils/registry.py +0 -0
  150. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/utils/remote_runner.py +0 -0
  151. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/utils/runner.py +0 -0
  152. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/utils/server.py +0 -0
  153. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/utils/source_hash.py +0 -0
  154. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/utils/tasks.py +0 -0
  155. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/utils/tests/__init__.py +0 -0
  156. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/utils/tests/test_config.py +0 -0
  157. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/utils/tests/test_docker.py +0 -0
  158. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/utils/tests/test_docker_hints.py +0 -0
  159. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/utils/tests/test_env_check.py +0 -0
  160. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/utils/tests/test_environment.py +0 -0
  161. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/utils/tests/test_interactive_module.py +0 -0
  162. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/utils/tests/test_local_runner.py +0 -0
  163. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/utils/tests/test_logging_utils.py +0 -0
  164. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/utils/tests/test_metadata.py +0 -0
  165. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/utils/tests/test_package_runner.py +0 -0
  166. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/utils/tests/test_registry_utils.py +0 -0
  167. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/utils/tests/test_remote_runner.py +0 -0
  168. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/utils/tests/test_runner_modules.py +0 -0
  169. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/utils/tests/test_source_hash.py +0 -0
  170. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/utils/tests/test_tasks.py +0 -0
  171. {hud_python-0.4.59 → hud_python-0.4.61}/hud/cli/utils/version_check.py +0 -0
  172. {hud_python-0.4.59 → hud_python-0.4.61}/hud/clients/README.md +0 -0
  173. {hud_python-0.4.59 → hud_python-0.4.61}/hud/clients/__init__.py +0 -0
  174. {hud_python-0.4.59 → hud_python-0.4.61}/hud/clients/base.py +0 -0
  175. {hud_python-0.4.59 → hud_python-0.4.61}/hud/clients/fastmcp.py +0 -0
  176. {hud_python-0.4.59 → hud_python-0.4.61}/hud/clients/mcp_use.py +0 -0
  177. {hud_python-0.4.59 → hud_python-0.4.61}/hud/clients/tests/__init__.py +0 -0
  178. {hud_python-0.4.59 → hud_python-0.4.61}/hud/clients/tests/test_client_integration.py +0 -0
  179. {hud_python-0.4.59 → hud_python-0.4.61}/hud/clients/tests/test_fastmcp.py +0 -0
  180. {hud_python-0.4.59 → hud_python-0.4.61}/hud/clients/tests/test_mcp_use_retry.py +0 -0
  181. {hud_python-0.4.59 → hud_python-0.4.61}/hud/clients/tests/test_protocol.py +0 -0
  182. {hud_python-0.4.59 → hud_python-0.4.61}/hud/clients/utils/__init__.py +0 -0
  183. {hud_python-0.4.59 → hud_python-0.4.61}/hud/clients/utils/mcp_use_retry.py +0 -0
  184. {hud_python-0.4.59 → hud_python-0.4.61}/hud/clients/utils/retry.py +0 -0
  185. {hud_python-0.4.59 → hud_python-0.4.61}/hud/clients/utils/retry_transport.py +0 -0
  186. {hud_python-0.4.59 → hud_python-0.4.61}/hud/datasets/__init__.py +0 -0
  187. {hud_python-0.4.59 → hud_python-0.4.61}/hud/datasets/tests/__init__.py +0 -0
  188. {hud_python-0.4.59 → hud_python-0.4.61}/hud/datasets/tests/test_utils.py +0 -0
  189. {hud_python-0.4.59 → hud_python-0.4.61}/hud/datasets/utils.py +0 -0
  190. {hud_python-0.4.59 → hud_python-0.4.61}/hud/misc/__init__.py +0 -0
  191. {hud_python-0.4.59 → hud_python-0.4.61}/hud/misc/claude_plays_pokemon.py +0 -0
  192. {hud_python-0.4.59 → hud_python-0.4.61}/hud/native/__init__.py +0 -0
  193. {hud_python-0.4.59 → hud_python-0.4.61}/hud/native/comparator.py +0 -0
  194. {hud_python-0.4.59 → hud_python-0.4.61}/hud/native/tests/__init__.py +0 -0
  195. {hud_python-0.4.59 → hud_python-0.4.61}/hud/native/tests/test_comparator.py +0 -0
  196. {hud_python-0.4.59 → hud_python-0.4.61}/hud/native/tests/test_native_init.py +0 -0
  197. {hud_python-0.4.59 → hud_python-0.4.61}/hud/otel/__init__.py +0 -0
  198. {hud_python-0.4.59 → hud_python-0.4.61}/hud/otel/collector.py +0 -0
  199. {hud_python-0.4.59 → hud_python-0.4.61}/hud/otel/config.py +0 -0
  200. {hud_python-0.4.59 → hud_python-0.4.61}/hud/otel/exporters.py +0 -0
  201. {hud_python-0.4.59 → hud_python-0.4.61}/hud/otel/instrumentation.py +0 -0
  202. {hud_python-0.4.59 → hud_python-0.4.61}/hud/otel/processors.py +0 -0
  203. {hud_python-0.4.59 → hud_python-0.4.61}/hud/otel/tests/__init__.py +0 -0
  204. {hud_python-0.4.59 → hud_python-0.4.61}/hud/otel/tests/test_instrumentation.py +0 -0
  205. {hud_python-0.4.59 → hud_python-0.4.61}/hud/otel/tests/test_processors.py +0 -0
  206. {hud_python-0.4.59 → hud_python-0.4.61}/hud/py.typed +0 -0
  207. {hud_python-0.4.59 → hud_python-0.4.61}/hud/rl/README.md +0 -0
  208. {hud_python-0.4.59 → hud_python-0.4.61}/hud/rl/__init__.py +0 -0
  209. {hud_python-0.4.59 → hud_python-0.4.61}/hud/rl/buffer.py +0 -0
  210. {hud_python-0.4.59 → hud_python-0.4.61}/hud/rl/chat_template.jinja +0 -0
  211. {hud_python-0.4.59 → hud_python-0.4.61}/hud/rl/config.py +0 -0
  212. {hud_python-0.4.59 → hud_python-0.4.61}/hud/rl/distributed.py +0 -0
  213. {hud_python-0.4.59 → hud_python-0.4.61}/hud/rl/learner.py +0 -0
  214. {hud_python-0.4.59 → hud_python-0.4.61}/hud/rl/tests/__init__.py +0 -0
  215. {hud_python-0.4.59 → hud_python-0.4.61}/hud/rl/tests/test_learner.py +0 -0
  216. {hud_python-0.4.59 → hud_python-0.4.61}/hud/rl/train.py +0 -0
  217. {hud_python-0.4.59 → hud_python-0.4.61}/hud/rl/types.py +0 -0
  218. {hud_python-0.4.59 → hud_python-0.4.61}/hud/rl/utils/start_vllm_server.sh +0 -0
  219. {hud_python-0.4.59 → hud_python-0.4.61}/hud/rl/utils.py +0 -0
  220. {hud_python-0.4.59 → hud_python-0.4.61}/hud/rl/vllm_adapter.py +0 -0
  221. {hud_python-0.4.59 → hud_python-0.4.61}/hud/samples/__init__.py +0 -0
  222. {hud_python-0.4.59 → hud_python-0.4.61}/hud/samples/browser.py +0 -0
  223. {hud_python-0.4.59 → hud_python-0.4.61}/hud/server/__init__.py +0 -0
  224. {hud_python-0.4.59 → hud_python-0.4.61}/hud/server/context.py +0 -0
  225. {hud_python-0.4.59 → hud_python-0.4.61}/hud/server/helper/__init__.py +0 -0
  226. {hud_python-0.4.59 → hud_python-0.4.61}/hud/server/low_level.py +0 -0
  227. {hud_python-0.4.59 → hud_python-0.4.61}/hud/server/router.py +0 -0
  228. {hud_python-0.4.59 → hud_python-0.4.61}/hud/server/server.py +0 -0
  229. {hud_python-0.4.59 → hud_python-0.4.61}/hud/server/tests/__init__.py +0 -0
  230. {hud_python-0.4.59 → hud_python-0.4.61}/hud/server/tests/test_add_tool.py +0 -0
  231. {hud_python-0.4.59 → hud_python-0.4.61}/hud/server/tests/test_context.py +0 -0
  232. {hud_python-0.4.59 → hud_python-0.4.61}/hud/server/tests/test_mcp_server_handlers.py +0 -0
  233. {hud_python-0.4.59 → hud_python-0.4.61}/hud/server/tests/test_mcp_server_integration.py +0 -0
  234. {hud_python-0.4.59 → hud_python-0.4.61}/hud/server/tests/test_mcp_server_more.py +0 -0
  235. {hud_python-0.4.59 → hud_python-0.4.61}/hud/server/tests/test_run_wrapper.py +0 -0
  236. {hud_python-0.4.59 → hud_python-0.4.61}/hud/server/tests/test_server_extra.py +0 -0
  237. {hud_python-0.4.59 → hud_python-0.4.61}/hud/server/tests/test_sigterm_runner.py +0 -0
  238. {hud_python-0.4.59 → hud_python-0.4.61}/hud/settings.py +0 -0
  239. {hud_python-0.4.59 → hud_python-0.4.61}/hud/shared/__init__.py +0 -0
  240. {hud_python-0.4.59 → hud_python-0.4.61}/hud/shared/exceptions.py +0 -0
  241. {hud_python-0.4.59 → hud_python-0.4.61}/hud/shared/hints.py +0 -0
  242. {hud_python-0.4.59 → hud_python-0.4.61}/hud/shared/requests.py +0 -0
  243. {hud_python-0.4.59 → hud_python-0.4.61}/hud/shared/tests/__init__.py +0 -0
  244. {hud_python-0.4.59 → hud_python-0.4.61}/hud/shared/tests/test_exceptions.py +0 -0
  245. {hud_python-0.4.59 → hud_python-0.4.61}/hud/shared/tests/test_hints.py +0 -0
  246. {hud_python-0.4.59 → hud_python-0.4.61}/hud/shared/tests/test_requests.py +0 -0
  247. {hud_python-0.4.59 → hud_python-0.4.61}/hud/telemetry/instrument.py +0 -0
  248. {hud_python-0.4.59 → hud_python-0.4.61}/hud/telemetry/replay.py +0 -0
  249. {hud_python-0.4.59 → hud_python-0.4.61}/hud/telemetry/tests/__init__.py +0 -0
  250. {hud_python-0.4.59 → hud_python-0.4.61}/hud/telemetry/tests/test_instrument.py +0 -0
  251. {hud_python-0.4.59 → hud_python-0.4.61}/hud/telemetry/tests/test_replay.py +0 -0
  252. {hud_python-0.4.59 → hud_python-0.4.61}/hud/telemetry/tests/test_trace.py +0 -0
  253. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/__init__.py +0 -0
  254. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/base.py +0 -0
  255. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/bash.py +0 -0
  256. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/computer/__init__.py +0 -0
  257. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/computer/anthropic.py +0 -0
  258. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/computer/gemini.py +0 -0
  259. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/computer/hud.py +0 -0
  260. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/computer/openai.py +0 -0
  261. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/computer/qwen.py +0 -0
  262. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/computer/settings.py +0 -0
  263. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/edit.py +0 -0
  264. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/executors/__init__.py +0 -0
  265. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/executors/base.py +0 -0
  266. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/executors/pyautogui.py +0 -0
  267. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/executors/tests/__init__.py +0 -0
  268. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/executors/tests/test_base_executor.py +0 -0
  269. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
  270. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/executors/xdo.py +0 -0
  271. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/grounding/__init__.py +0 -0
  272. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/grounding/config.py +0 -0
  273. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/grounding/grounded_tool.py +0 -0
  274. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/grounding/grounder.py +0 -0
  275. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/grounding/tests/__init__.py +0 -0
  276. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/grounding/tests/test_grounded_tool.py +0 -0
  277. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/playwright.py +0 -0
  278. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/response.py +0 -0
  279. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/submit.py +0 -0
  280. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/tests/__init__.py +0 -0
  281. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/tests/test_base.py +0 -0
  282. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/tests/test_bash.py +0 -0
  283. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/tests/test_bash_extended.py +0 -0
  284. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/tests/test_computer.py +0 -0
  285. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/tests/test_computer_actions.py +0 -0
  286. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/tests/test_edit.py +0 -0
  287. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/tests/test_init.py +0 -0
  288. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/tests/test_playwright_tool.py +0 -0
  289. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/tests/test_response.py +0 -0
  290. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/tests/test_submit.py +0 -0
  291. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/tests/test_tools.py +0 -0
  292. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/tests/test_tools_init.py +0 -0
  293. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/tests/test_types.py +0 -0
  294. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/tests/test_utils.py +0 -0
  295. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/types.py +0 -0
  296. {hud_python-0.4.59 → hud_python-0.4.61}/hud/tools/utils.py +0 -0
  297. {hud_python-0.4.59 → hud_python-0.4.61}/hud/types.py +0 -0
  298. {hud_python-0.4.59 → hud_python-0.4.61}/hud/utils/__init__.py +0 -0
  299. {hud_python-0.4.59 → hud_python-0.4.61}/hud/utils/agent_factories.py +0 -0
  300. {hud_python-0.4.59 → hud_python-0.4.61}/hud/utils/async_utils.py +0 -0
  301. {hud_python-0.4.59 → hud_python-0.4.61}/hud/utils/hud_console.py +0 -0
  302. {hud_python-0.4.59 → hud_python-0.4.61}/hud/utils/mcp.py +0 -0
  303. {hud_python-0.4.59 → hud_python-0.4.61}/hud/utils/pretty_errors.py +0 -0
  304. {hud_python-0.4.59 → hud_python-0.4.61}/hud/utils/progress.py +0 -0
  305. {hud_python-0.4.59 → hud_python-0.4.61}/hud/utils/task_tracking.py +0 -0
  306. {hud_python-0.4.59 → hud_python-0.4.61}/hud/utils/tasks.py +0 -0
  307. {hud_python-0.4.59 → hud_python-0.4.61}/hud/utils/telemetry.py +0 -0
  308. {hud_python-0.4.59 → hud_python-0.4.61}/hud/utils/tests/__init__.py +0 -0
  309. {hud_python-0.4.59 → hud_python-0.4.61}/hud/utils/tests/test_agent_factories.py +0 -0
  310. {hud_python-0.4.59 → hud_python-0.4.61}/hud/utils/tests/test_async_utils.py +0 -0
  311. {hud_python-0.4.59 → hud_python-0.4.61}/hud/utils/tests/test_init.py +0 -0
  312. {hud_python-0.4.59 → hud_python-0.4.61}/hud/utils/tests/test_mcp.py +0 -0
  313. {hud_python-0.4.59 → hud_python-0.4.61}/hud/utils/tests/test_pretty_errors.py +0 -0
  314. {hud_python-0.4.59 → hud_python-0.4.61}/hud/utils/tests/test_progress.py +0 -0
  315. {hud_python-0.4.59 → hud_python-0.4.61}/hud/utils/tests/test_tasks.py +0 -0
  316. {hud_python-0.4.59 → hud_python-0.4.61}/hud/utils/tests/test_telemetry.py +0 -0
  317. {hud_python-0.4.59 → hud_python-0.4.61}/hud/utils/tests/test_tool_shorthand.py +0 -0
  318. {hud_python-0.4.59 → hud_python-0.4.61}/hud/utils/tool_shorthand.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.4.59
3
+ Version: 0.4.61
4
4
  Summary: SDK for the HUD platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-python
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -59,6 +59,7 @@ Requires-Dist: pydantic<3,>=2.6
59
59
  Requires-Dist: questionary==2.1.0
60
60
  Requires-Dist: rich>=13.0.0
61
61
  Requires-Dist: toml>=0.10.2
62
+ Requires-Dist: tornado>=6.5.2
62
63
  Requires-Dist: typer>=0.9.0
63
64
  Requires-Dist: watchfiles>=0.21.0
64
65
  Requires-Dist: wrapt>=1.14.0
@@ -496,7 +496,7 @@ from hud.clients import MCPClient
496
496
 
497
497
  async def main():
498
498
  # `trace` captures *everything* that happens and sends it to hud.ai
499
- with hud.trace("local_test"):
499
+ async with hud.async_trace("local_test"):
500
500
  task = Task(
501
501
  prompt="Complete the task",
502
502
  mcp_config={
@@ -4,7 +4,7 @@ version = "0.1.0"
4
4
  description = "HUD Browser MCP Server"
5
5
  requires-python = ">=3.11,<3.14"
6
6
  dependencies = [
7
- "hud-python>=0.4.59",
7
+ "hud-python>=0.4.61",
8
8
  "httpx",
9
9
  "playwright",
10
10
  "pyautogui",
@@ -0,0 +1,68 @@
1
+ # Jupyter Env (for SpreadSheetBench)
2
+
3
+ ## QuickStart
4
+
5
+ ### MCP Server from Dockerhub (Don't Have to Build Docker Image)
6
+
7
+ Run task by
8
+ ```
9
+ hud eval Genteki/SpreadSheetBench
10
+ ```
11
+
12
+ ### Local MCP Server
13
+
14
+ First we build the docker image with
15
+ ```
16
+ docker build -t <image/name> .
17
+ ```
18
+ Then modify the docker image name in `test_task.json`. Finally, load every `api_key` needed into environment variables and run
19
+
20
+ ```
21
+ hud eval
22
+ ```
23
+
24
+ ## File Structure
25
+
26
+ `environments/jupyter` file structure:
27
+ ```
28
+ ├── Dockerfile
29
+ ├── server
30
+ │ ├── config.py
31
+ │ ├── evaluate
32
+ │ │ ├── compare.py
33
+ │ │ ├── dumb.py
34
+ │ │ ├── eval_all.py
35
+ │ │ ├── eval_single.py
36
+ │ │ ├── generalize.py
37
+ │ │ └── __init__.py
38
+ │ ├── __init__.py
39
+ │ ├── main.py
40
+ │ ├── pyproject.toml
41
+ │ ├── setup
42
+ │ │ ├── __init__.py
43
+ │ │ └── load_spreadsheet.py
44
+ │ └── tools
45
+ │ ├── __init__.py
46
+ │ └── jupyter_with_record.py
47
+ └── test_task.json
48
+ ```
49
+ Here we introduce the main parts of the environments
50
+ * `main.py` start point of MCP server
51
+ * `tools/jupyter_with_record.py`: offers an `execute_code` method that lets the agent interact with the Jupyter kernel and records the solution
52
+ * `setup/`: setup methods for eval task
53
+ * `evaluate/`: evaluation methods for eval task
54
+
55
+
56
+ ## Related Links
57
+ ### Hugging Face:
58
+ * [Genteki/SpreadSheetBench-Tiny](https://huggingface.co/datasets/Genteki/SpreadSheetBench-Tiny) (Size: 10)
59
+ * [Genteki/SpreadSheetBench-200](https://huggingface.co/datasets/Genteki/SpreadSheetBench-200) (Size: 200)
60
+ * [Genteki/SpreadSheetBench](https://huggingface.co/datasets/Genteki/SpreadSheetBench) (Size: 912)
61
+
62
+ ### Example Traces (May require permission)
63
+ * [Single Test Task](https://www.hud.so/trace/d31de170-e70a-4abb-8f95-70512515dade)
64
+ * [Genteki/SpreadSheetBench-Tiny Test](https://www.hud.so/jobs/2c426368-e352-4c79-af4a-aefb136e3f58)
65
+
66
+ ### Github
67
+
68
+ * Feature Branch: [New-Env-Jupyter](https://github.com/Genteki/hud-python/tree/New-Env-Jupyter)
@@ -0,0 +1,34 @@
1
+ [project]
2
+ name = "sheet-mcp-server"
3
+ version = "0.1.0"
4
+ description = "MCP server for XLSX spreadsheet manipulation"
5
+ authors = [{name = "HUD Team"}]
6
+ requires-python = ">=3.11"
7
+ dependencies = [
8
+ "hud-python==0.4.61",
9
+ "pandas>=2.0.0",
10
+ "openpyxl>=3.1.0",
11
+ "xlsxwriter>=3.1.0",
12
+ "jupyter-client>=8.0.0",
13
+ "jupyter-kernel-gateway>=3.0.0",
14
+ "ipython>=8.0.0",
15
+ "nbformat>=5.7.0",
16
+ "fastapi>=0.100.0",
17
+ "uvicorn>=0.23.0",
18
+ "tornado>=6.0.0",
19
+ "aiohttp>=3.8.0",
20
+ ]
21
+
22
+ [project.optional-dependencies]
23
+ dev = [
24
+ "pytest>=7.0.0",
25
+ "pytest-asyncio>=0.21.0",
26
+ ]
27
+
28
+ [build-system]
29
+ requires = ["setuptools>=61.0", "wheel"]
30
+ build-backend = "setuptools.build_meta"
31
+
32
+ [tool.setuptools.packages.find]
33
+ where = ["."]
34
+ include = ["server*"]
@@ -0,0 +1,36 @@
1
+ # HUD Online Mind2Web Taskset
2
+
3
+ Based on hud remote-browser, this MCP server provides an environment for Online-Mind2Web task execution and evaluation.
4
+
5
+ ## Running with Docker
6
+
7
+ The Docker image supports both production and development modes using the same Dockerfile.
8
+
9
+ ### Building the Image
10
+
11
+ ```bash
12
+ # Production build (default)
13
+ docker build -t hud-om2w:latest .
14
+ ```
15
+
16
+ ### Running the Test Task
17
+ ```bash
18
+ hud eval ./test_task.json
19
+ ```
20
+
21
+ ### Running Whole Online-Mind2Web Dataset From HuggingFace
22
+ ```bash
23
+ hud eval Genteki/Online-Mind2Web --full --max-concurrent=5
24
+ ```
25
+
26
+ ### Different Evaluation Method
27
+
28
+ To choose a different evaluation method, change the `task["evaluate_tool"]["evaluate"]["name"]` value in the task JSON file. These are the evaluation methods we support:
29
+
30
+ | Evaluation Method | Final Screenshot | Screenshot History | Action History |
31
+ |:---|:---:|:---:| :---: |
32
+ | `autonomous` | ✔ | ✗ | ✔ |
33
+ | `webjudge` | ✔ | ✔ | ✔ |
34
+ | `overall_judge`[^1] | - | - | - |
35
+
36
+ [^1]: `overall_judge` will execute all evaluation methods above and return the average of the rewards of them.
@@ -0,0 +1,22 @@
1
+ [project]
2
+ name = "hud-om2w"
3
+ version = "0.1.0"
4
+ description = "HUD Remote Browser Controller with MCP tools for cloud browser providers"
5
+ requires-python = ">=3.11,<3.13"
6
+ dependencies = [ "hud-python==0.4.61", "pyautogui", "playwright", "httpx", "typer", "google-api-python-client", "google-auth",]
7
+
8
+ [build-system]
9
+ requires = [ "hatchling",]
10
+ build-backend = "hatchling.build"
11
+
12
+ [project.scripts]
13
+ hud-om2w = "hud_controller.__main__:main"
14
+
15
+ [tool.hud]
16
+ image = "hud-om2w:dev"
17
+
18
+ [tool.hatch.metadata]
19
+ allow-direct-references = true
20
+
21
+ [tool.hatch.build.targets.wheel]
22
+ packages = [ "src/hud_controller",]
@@ -0,0 +1,110 @@
1
+ # Remote Browser Providers
2
+
3
+ This directory contains implementations for various cloud browser providers that can be used with the HUD Remote Browser environment.
4
+
5
+ ## Supported Providers
6
+
7
+ ### 1. **AnchorBrowser** ✅ (Implemented)
8
+ - **API Endpoint**: `https://api.anchorbrowser.io/v1/sessions`
9
+ - **Features**:
10
+ - Residential proxy support
11
+ - CAPTCHA solving
12
+ - Ad blocking
13
+ - Popup blocking
14
+ - **API Key**: `ANCHOR_API_KEY` environment variable
15
+ - **Documentation**: Internal
16
+
17
+ ### 2. **BrowserBase** 🚧 (To be implemented)
18
+ - **API Endpoint**: `https://api.browserbase.com/v1/sessions`
19
+ - **Features**:
20
+ - Multiple regions support
21
+ - Context persistence
22
+ - Live view URLs
23
+ - Session recordings
24
+ - Proxy support
25
+ - **API Key**: `X-BB-API-Key` header
26
+ - **Documentation**: https://docs.browserbase.com/reference/api/create-a-session
27
+
28
+ ### 3. **HyperBrowser** 🚧 (To be implemented)
29
+ - **API Endpoint**: `https://api.hyperbrowser.ai/api/session`
30
+ - **Features**:
31
+ - Stealth mode
32
+ - Advanced proxy configuration (country/state/city)
33
+ - Profile management
34
+ - Web recording
35
+ - CAPTCHA solving
36
+ - Ad blocking
37
+ - Browser fingerprinting
38
+ - **API Key**: `x-api-key` header
39
+ - **Documentation**: https://docs.hyperbrowser.ai/reference/api-reference/sessions
40
+
41
+ ### 4. **Steel** 🚧 (To be implemented)
42
+ - **API Endpoint**: `https://api.steel.dev/v1/sessions`
43
+ - **Features**:
44
+ - Session management
45
+ - Browser automation
46
+ - Proxy support
47
+ - **API Key**: `steel_api_key` header or `STEEL_API_KEY` env variable
48
+ - **Documentation**: https://docs.steel.dev/api-reference
49
+
50
+ ### 5. **Kernel** ❌ (Not yet available)
51
+ - **Status**: API not yet available for browser sessions
52
+ - **Documentation**: N/A
53
+
54
+ ## Provider Lifecycle
55
+
56
+ Each provider follows a similar lifecycle pattern:
57
+
58
+ 1. **Initialization**
59
+ - Set up API credentials
60
+ - Configure base URLs and default options
61
+
62
+ 2. **Session Creation** (`launch()`)
63
+ - Make API request to create a new browser session
64
+ - Handle provider-specific options (proxy, stealth, etc.)
65
+ - Return CDP WebSocket URL for Playwright connection
66
+
67
+ 3. **Session Management**
68
+ - Track session IDs and metadata
69
+ - Provide status checks
70
+ - Handle session-specific features (live view, recordings, etc.)
71
+
72
+ 4. **Session Termination** (`close()`)
73
+ - Clean up resources
74
+ - End the browser session via API
75
+ - Handle any provider-specific cleanup
76
+
77
+ ## Implementation Guide
78
+
79
+ To add a new provider:
80
+
81
+ 1. Create a new file in this directory (e.g., `browserbase.py`)
82
+ 2. Inherit from `BrowserProvider` base class
83
+ 3. Implement required methods:
84
+ - `__init__()` - Initialize with API credentials
85
+ - `launch()` - Create a new session and return CDP URL
86
+ - `close()` - Terminate the session
87
+ - `get_status()` - Return session status
88
+ 4. Add provider to the registry in `__init__.py`
89
+ 5. Update environment variables in the main README
90
+
91
+ ## Environment Variables
92
+
93
+ Each provider uses specific environment variables:
94
+
95
+ - **AnchorBrowser**: `ANCHOR_API_KEY`
96
+ - **BrowserBase**: `BROWSERBASE_API_KEY`
97
+ - **HyperBrowser**: `HYPERBROWSER_API_KEY`
98
+ - **Steel**: `STEEL_API_KEY`
99
+
100
+ ## Common Features Across Providers
101
+
102
+ | Feature | AnchorBrowser | BrowserBase | HyperBrowser | Steel |
103
+ |---------|---------------|-------------|--------------|-------|
104
+ | Proxy Support | ✅ | ✅ | ✅ | ✅ |
105
+ | CAPTCHA Solving | ✅ | ❓ | ✅ | ❓ |
106
+ | Ad Blocking | ✅ | ❓ | ✅ | ❓ |
107
+ | Session Recording | ❌ | ✅ | ✅ | ❓ |
108
+ | Live View | ✅ | ✅ | ✅ | ❓ |
109
+ | Profile Persistence | ❌ | ✅ | ✅ | ❓ |
110
+ | Multi-Region | ❌ | ✅ | ✅ | ❓ |
@@ -461,7 +461,8 @@ class GeminiAgent(MCPAgent):
461
461
  def _remove_old_screenshots(self, messages: list[genai_types.Content]) -> None:
462
462
  """
463
463
  Remove screenshots from old turns to manage context length.
464
- Keeps only the last N turns with screenshots (configured via self.max_recent_turn_with_screenshots).
464
+ Keeps only the last N turns with screenshots (configured via
465
+ self.max_recent_turn_with_screenshots).
465
466
  """
466
467
  turn_with_screenshots_found = 0
467
468
 
@@ -602,6 +602,9 @@ def build(
602
602
  platform: str | None = typer.Option(
603
603
  None, "--platform", help="Set Docker target platform (e.g., linux/amd64)"
604
604
  ),
605
+ remote_cache: str | None = typer.Option(
606
+ None, "--remote-cache", help="Enable remote cache using Amazon ECR with specified repo name"
607
+ ),
605
608
  ) -> None:
606
609
  """🏗️ Build a HUD environment and generate lock file.
607
610
 
@@ -614,8 +617,9 @@ def build(
614
617
  hud build # Build current directory
615
618
  hud build environments/text_2048 -e API_KEY=secret
616
619
  hud build . --tag my-env:v1.0 -e VAR1=value1 -e VAR2=value2
617
- hud build . --no-cache # Force rebuild[/not dim]
618
- """
620
+ hud build . --no-cache # Force rebuild
621
+ hud build . --remote-cache my-cache-repo # Use ECR remote cache (requires AWS_ACCOUNT_ID and AWS_DEFAULT_REGION)[/not dim]
622
+ """ # noqa: E501
619
623
  # Parse directory and extra arguments
620
624
  if params:
621
625
  directory = params[0]
@@ -652,7 +656,7 @@ def build(
652
656
  else:
653
657
  i += 1
654
658
 
655
- build_command(directory, tag, no_cache, verbose, env_vars, platform)
659
+ build_command(directory, tag, no_cache, verbose, env_vars, platform, remote_cache)
656
660
 
657
661
 
658
662
  @app.command()
@@ -365,6 +365,7 @@ def build_docker_image(
365
365
  verbose: bool = False,
366
366
  build_args: dict[str, str] | None = None,
367
367
  platform: str | None = None,
368
+ remote_cache: str | None = None,
368
369
  ) -> bool:
369
370
  """Build a Docker image from a directory."""
370
371
  hud_console = HUDConsole()
@@ -376,17 +377,62 @@ def build_docker_image(
376
377
  hud_console.error(f"No Dockerfile found in {directory}")
377
378
  return False
378
379
 
379
- # Default platform to match RL pipeline unless explicitly overridden
380
+ # Build command - use buildx when remote cache is enabled
380
381
  effective_platform = platform if platform is not None else "linux/amd64"
382
+ cmd = ["docker", "buildx", "build"] if remote_cache else ["docker", "build"]
381
383
 
382
- # Build command
383
- cmd = ["docker", "build"]
384
384
  if effective_platform:
385
385
  cmd.extend(["--platform", effective_platform])
386
386
  cmd.extend(["-t", tag])
387
387
  if no_cache:
388
388
  cmd.append("--no-cache")
389
389
 
390
+ # Add remote cache support for ECR
391
+ if remote_cache:
392
+ try:
393
+ import os
394
+ import re
395
+
396
+ # Validate ECR repo name
397
+ if not re.match(r"^[a-z0-9]([a-z0-9\-_]*[a-z0-9])?$", remote_cache):
398
+ hud_console.error(f"Invalid ECR repo name: {remote_cache}")
399
+ hud_console.info(
400
+ "ECR repo names must contain only lowercase letters, numbers, hyphens, and underscores" # noqa: E501
401
+ )
402
+ return False
403
+
404
+ # Get required environment variables
405
+ aws_account_id = os.getenv("AWS_ACCOUNT_ID")
406
+ aws_region = os.getenv("AWS_DEFAULT_REGION", "us-east-1")
407
+
408
+ if not aws_account_id:
409
+ hud_console.error("AWS_ACCOUNT_ID environment variable not set")
410
+ return False
411
+
412
+ # ECR cache image reference
413
+ cache_image = (
414
+ f"{aws_account_id}.dkr.ecr.{aws_region}.amazonaws.com/{remote_cache}:cache"
415
+ )
416
+
417
+ # Add cache arguments with proper ECR format
418
+ cmd.extend(
419
+ [
420
+ "--cache-from",
421
+ f"type=registry,ref={cache_image}",
422
+ "--cache-to",
423
+ f"mode=max,image-manifest=true,oci-mediatypes=true,type=registry,ref={cache_image}",
424
+ "--load", # Load image to local Docker after build
425
+ ]
426
+ )
427
+
428
+ hud_console.success(f"Remote cache configured: {cache_image}")
429
+
430
+ except typer.Exit:
431
+ raise
432
+ except Exception as e:
433
+ hud_console.error(f"Remote cache setup error: {e}")
434
+ return False
435
+
390
436
  # Add build args
391
437
  for key, value in build_args.items():
392
438
  cmd.extend(["--build-arg", f"{key}={value}"])
@@ -412,6 +458,7 @@ def build_environment(
412
458
  verbose: bool = False,
413
459
  env_vars: dict[str, str] | None = None,
414
460
  platform: str | None = None,
461
+ remote_cache: str | None = None,
415
462
  ) -> None:
416
463
  """Build a HUD environment and generate lock file."""
417
464
  hud_console = HUDConsole()
@@ -482,6 +529,7 @@ def build_environment(
482
529
  verbose,
483
530
  build_args=None,
484
531
  platform=platform,
532
+ remote_cache=remote_cache,
485
533
  ):
486
534
  hud_console.error("Docker build failed")
487
535
  raise typer.Exit(1)
@@ -655,11 +703,50 @@ def build_environment(
655
703
  version_tag = f"{base_name}:{new_version}"
656
704
  latest_tag = f"{base_name}:latest"
657
705
 
658
- label_cmd = ["docker", "build"]
706
+ # Build command - use buildx when remote cache is enabled
707
+ label_cmd = ["docker", "buildx", "build"] if remote_cache else ["docker", "build"]
708
+
659
709
  # Use same defaulting for the second build step
660
710
  label_platform = platform if platform is not None else "linux/amd64"
661
711
  if label_platform:
662
712
  label_cmd.extend(["--platform", label_platform])
713
+
714
+ # Add remote cache support for final build
715
+ if remote_cache:
716
+ try:
717
+ import os
718
+ import re
719
+
720
+ if not re.match(r"^[a-z0-9]([a-z0-9\-_]*[a-z0-9])?$", remote_cache):
721
+ hud_console.error(f"Invalid ECR repo name: {remote_cache}")
722
+ raise typer.Exit(1)
723
+
724
+ aws_account_id = os.getenv("AWS_ACCOUNT_ID")
725
+ aws_region = os.getenv("AWS_DEFAULT_REGION", "us-east-1")
726
+
727
+ if not aws_account_id:
728
+ hud_console.error("AWS_ACCOUNT_ID environment variable not set")
729
+ raise typer.Exit(1)
730
+
731
+ cache_image = (
732
+ f"{aws_account_id}.dkr.ecr.{aws_region}.amazonaws.com/{remote_cache}:cache"
733
+ )
734
+
735
+ label_cmd.extend(
736
+ [
737
+ "--cache-from",
738
+ f"type=registry,ref={cache_image}",
739
+ "--cache-to",
740
+ f"mode=max,image-manifest=true,oci-mediatypes=true,type=registry,ref={cache_image}",
741
+ "--load", # Load image to local Docker after build
742
+ ]
743
+ )
744
+ except typer.Exit:
745
+ raise
746
+ except Exception as e:
747
+ hud_console.error(f"Remote cache setup error: {e}")
748
+ raise typer.Exit(1) from e
749
+
663
750
  label_cmd.extend(
664
751
  [
665
752
  "--label",
@@ -780,6 +867,7 @@ def build_command(
780
867
  verbose: bool = typer.Option(False, "--verbose", "-v", help="Show detailed output"),
781
868
  env_vars: dict[str, str] | None = None,
782
869
  platform: str | None = None,
870
+ remote_cache: str | None = None,
783
871
  ) -> None:
784
872
  """Build a HUD environment and generate lock file."""
785
- build_environment(directory, tag, no_cache, verbose, env_vars, platform)
873
+ build_environment(directory, tag, no_cache, verbose, env_vars, platform, remote_cache)
@@ -260,9 +260,8 @@ async def run_single_task(
260
260
  ) -> None:
261
261
  """Load one task and execute it, or detect if JSON contains a list and run as dataset."""
262
262
 
263
- # Provide early feedback to user
264
263
  hud_console.info("🔧 Initializing evaluation...")
265
- # Import Task and run_dataset lazily
264
+
266
265
  try:
267
266
  from hud.utils.tasks import load_tasks
268
267
  except ImportError as e:
@@ -399,23 +398,31 @@ async def run_single_task(
399
398
 
400
399
  if group_size > 1:
401
400
  hud_console.info(f"🔄 Running task with group_size={group_size}")
402
- # Run with grouping
403
- stats = await run_tasks_grouped(
404
- tasks=[task],
405
- agent_class=agent_class,
406
- agent_config=agent_config,
407
- group_size=group_size,
408
- max_parallel_episodes=48, # Same as RL default
409
- max_steps=max_steps,
410
- verbose=verbose,
411
- )
401
+ async with hud.async_job(
402
+ name=f"Group Eval: {task_prompt[:50]}... (x{group_size})",
403
+ metadata={
404
+ "task_id": getattr(task, "id", None),
405
+ "group_size": group_size,
406
+ "total_episodes": group_size,
407
+ },
408
+ ) as job:
409
+ stats = await run_tasks_grouped(
410
+ tasks=[task],
411
+ agent_class=agent_class,
412
+ agent_config=agent_config,
413
+ group_size=group_size,
414
+ max_parallel_episodes=48,
415
+ max_steps=max_steps,
416
+ verbose=verbose,
417
+ job_id=job.id,
418
+ )
412
419
  display_group_statistics(stats, show_details=True)
413
420
  else:
414
421
  # Enable agent step logging for single task mode
415
422
  logging.getLogger("hud.agents").setLevel(logging.INFO)
416
423
  logging.getLogger("hud.agents.base").setLevel(logging.INFO)
417
424
 
418
- with hud.trace(name=task_prompt):
425
+ async with hud.async_trace(name=task_prompt):
419
426
  agent = build_agent(
420
427
  agent_type,
421
428
  model=model,
@@ -442,10 +449,8 @@ async def run_full_dataset(
442
449
  ) -> list[Any]:
443
450
  """Run evaluation across the entire dataset using asyncio-based concurrency."""
444
451
 
445
- # Provide early feedback to user
446
452
  hud_console.info("🔧 Initializing evaluation...")
447
453
 
448
- # Import run_dataset lazily
449
454
  try:
450
455
  from hud.datasets import run_dataset
451
456
  from hud.utils.tasks import load_tasks
@@ -627,7 +632,7 @@ async def run_full_dataset(
627
632
  hud_console.info(f"🔄 Running dataset with group_size={group_size}")
628
633
 
629
634
  # Run with job tracking
630
- with hud.job(
635
+ async with hud.async_job(
631
636
  name=f"Evaluation {dataset_name} (group_size={group_size})",
632
637
  metadata={
633
638
  "dataset": source,
@@ -371,7 +371,7 @@ async def run_dataset_parallel_manual(
371
371
  logger.warning("Failed to extract dataset verification info")
372
372
 
373
373
  # Create job context
374
- with hud.job(name, metadata=job_metadata, dataset_link=dataset_link) as job_obj:
374
+ async with hud.async_job(name, metadata=job_metadata, dataset_link=dataset_link) as job_obj:
375
375
  # Prepare agent class info for pickling
376
376
  agent_module = agent_class.__module__
377
377
  agent_name = agent_class.__name__
@@ -30,20 +30,14 @@ async def run_dataset(
30
30
  ) -> list[Any]:
31
31
  """Run all tasks in a dataset with automatic job and telemetry tracking.
32
32
 
33
- This function handles concurrent task execution with proper telemetry collection.
34
- All tasks are executed in parallel up to `max_concurrent`, with full telemetry
35
- automatically uploaded to the HUD platform.
36
-
37
33
  Args:
38
34
  name: Name for the job
39
35
  dataset: HuggingFace dataset identifier (e.g. "hud-evals/SheetBench-50"),
40
36
  Dataset object, OR list of Task objects
41
37
  agent_class: Agent class to instantiate (e.g., ClaudeAgent)
42
- agent_config: Configuration/kwargs for agent (model, etc.)
43
- max_concurrent: Maximum parallel task execution. Higher values improve throughput
44
- but may increase memory usage. Recommended: 30-200 depending on
45
- task complexity and available resources.
46
- metadata: Optional metadata for the job
38
+ agent_config: Configuration kwargs for agent initialization
39
+ max_concurrent: Maximum concurrent tasks (recommended: 50-200)
40
+ metadata: Optional job metadata
47
41
  max_steps: Maximum steps per task
48
42
  split: Dataset split to use when loading from string (default: "train")
49
43
  auto_respond: Whether to use auto-response agent
@@ -101,7 +95,6 @@ async def run_dataset(
101
95
  except Exception:
102
96
  logger.warning("Failed to extract dataset verification info")
103
97
 
104
- # Use async job context manager for high-concurrency telemetry
105
98
  async with hud.async_job(name, metadata=job_metadata, dataset_link=dataset_link) as job_obj:
106
99
  # Run tasks with semaphore for concurrency control
107
100
  sem = asyncio.Semaphore(max_concurrent)
@@ -112,12 +105,10 @@ async def run_dataset(
112
105
  try:
113
106
  # Create trace for this task
114
107
  task_name = task_dict.get("prompt") or f"Task {index}"
115
-
116
- # Ensure task_id is a string for baggage propagation
117
108
  raw_task_id = task_dict.get("id")
118
109
  safe_task_id = str(raw_task_id) if raw_task_id is not None else None
110
+
119
111
  async with hud.async_trace(task_name, job_id=job_obj.id, task_id=safe_task_id):
120
- # with hud.trace(task_name, job_id=job_obj.id, task_id=safe_task_id):
121
112
  # Convert dict to Task here, at trace level
122
113
  task = Task(**task_dict)
123
114
 
@@ -141,44 +132,4 @@ async def run_dataset(
141
132
  if isinstance(result, Exception):
142
133
  logger.error("Worker %s failed with exception: %s", i, result, exc_info=result)
143
134
 
144
- # Ensure all telemetry is uploaded before returning
145
- await _flush_telemetry()
146
-
147
135
  return results
148
-
149
-
150
- async def _flush_telemetry() -> None:
151
- """Flush all pending telemetry operations.
152
-
153
- Ensures complete telemetry upload by:
154
- 1. Waiting for all async status updates to complete
155
- 2. Forcing OpenTelemetry span processor to export remaining spans
156
-
157
- This prevents telemetry loss at high concurrency (200+ tasks) by ensuring
158
- all operations complete before process exit.
159
- """
160
- from hud.otel.config import is_telemetry_configured
161
- from hud.utils import hud_console
162
- from hud.utils.task_tracking import wait_all_tasks
163
-
164
- hud_console.info("Uploading telemetry...")
165
-
166
- # Step 1: Wait for async status updates (job/trace status)
167
- completed_tasks = await wait_all_tasks(timeout_seconds=20.0)
168
- if completed_tasks > 0:
169
- hud_console.info(f"Completed {completed_tasks} pending telemetry tasks")
170
-
171
- # Step 2: Flush OpenTelemetry span exports
172
- if is_telemetry_configured():
173
- try:
174
- from opentelemetry import trace
175
- from opentelemetry.sdk.trace import TracerProvider
176
-
177
- provider = trace.get_tracer_provider()
178
- if isinstance(provider, TracerProvider):
179
- provider.force_flush(timeout_millis=20000)
180
- logger.debug("OpenTelemetry spans flushed successfully")
181
- except Exception as e:
182
- logger.warning("Failed to flush OpenTelemetry: %s", e)
183
-
184
- hud_console.info("Telemetry uploaded successfully")