hud-python 0.4.48__tar.gz → 0.4.50__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (261)
  1. {hud_python-0.4.48 → hud_python-0.4.50}/PKG-INFO +1 -1
  2. {hud_python-0.4.48 → hud_python-0.4.50}/environments/README.md +56 -45
  3. hud_python-0.4.50/environments/blank/README.md +121 -0
  4. {hud_python-0.4.48 → hud_python-0.4.50}/environments/blank/environment/README.md +1 -1
  5. hud_python-0.4.50/environments/blank/environment/pyproject.toml +16 -0
  6. hud_python-0.4.50/environments/blank/server/README.md +21 -0
  7. hud_python-0.4.50/environments/blank/server/pyproject.toml +19 -0
  8. {hud_python-0.4.48 → hud_python-0.4.50}/environments/browser/README.md +24 -25
  9. hud_python-0.4.50/environments/browser/environment/pyproject.toml +23 -0
  10. hud_python-0.4.50/environments/browser/server/pyproject.toml +21 -0
  11. hud_python-0.4.50/environments/deepresearch/README.md +165 -0
  12. hud_python-0.4.50/environments/deepresearch/environment/pyproject.toml +17 -0
  13. {hud_python-0.4.48 → hud_python-0.4.50}/environments/deepresearch/pyproject.toml +1 -1
  14. hud_python-0.4.50/environments/deepresearch/server/pyproject.toml +19 -0
  15. {hud_python-0.4.48 → hud_python-0.4.50}/hud/agents/base.py +40 -34
  16. {hud_python-0.4.48 → hud_python-0.4.50}/hud/agents/grounded_openai.py +1 -1
  17. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/__init__.py +78 -213
  18. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/build.py +105 -45
  19. hud_python-0.4.50/hud/cli/dev.py +699 -0
  20. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/flows/tasks.py +98 -17
  21. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/init.py +18 -14
  22. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/push.py +27 -9
  23. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/rl/local_runner.py +3 -3
  24. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/tests/test_eval.py +168 -119
  25. hud_python-0.4.50/hud/cli/tests/test_mcp_server.py +36 -0
  26. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/utils/env_check.py +9 -9
  27. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/utils/source_hash.py +1 -1
  28. {hud_python-0.4.48 → hud_python-0.4.50}/hud/server/__init__.py +2 -1
  29. hud_python-0.4.50/hud/server/router.py +160 -0
  30. {hud_python-0.4.48 → hud_python-0.4.50}/hud/server/server.py +246 -79
  31. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/base.py +9 -1
  32. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/bash.py +2 -2
  33. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/edit.py +3 -7
  34. {hud_python-0.4.48 → hud_python-0.4.50}/hud/utils/hud_console.py +43 -0
  35. {hud_python-0.4.48 → hud_python-0.4.50}/hud/utils/tests/test_version.py +1 -1
  36. {hud_python-0.4.48 → hud_python-0.4.50}/hud/version.py +1 -1
  37. {hud_python-0.4.48 → hud_python-0.4.50}/pyproject.toml +1 -4
  38. hud_python-0.4.48/environments/blank/README.md +0 -108
  39. hud_python-0.4.48/environments/blank/controller/README.md +0 -16
  40. hud_python-0.4.48/environments/blank/pyproject.toml +0 -19
  41. hud_python-0.4.48/hud/cli/dev.py +0 -828
  42. hud_python-0.4.48/hud/cli/tests/test_mcp_server.py +0 -125
  43. {hud_python-0.4.48 → hud_python-0.4.50}/.gitignore +0 -0
  44. {hud_python-0.4.48 → hud_python-0.4.50}/LICENSE +0 -0
  45. {hud_python-0.4.48 → hud_python-0.4.50}/README.md +0 -0
  46. {hud_python-0.4.48 → hud_python-0.4.50}/environments/browser/environment/2048/README.md +0 -0
  47. {hud_python-0.4.48 → hud_python-0.4.50}/environments/browser/environment/2048/backend/pyproject.toml +0 -0
  48. {hud_python-0.4.48 → hud_python-0.4.50}/environments/browser/environment/README.md +0 -0
  49. {hud_python-0.4.48 → hud_python-0.4.50}/environments/browser/environment/todo/README.md +0 -0
  50. {hud_python-0.4.48 → hud_python-0.4.50}/environments/browser/environment/todo/backend/pyproject.toml +0 -0
  51. {hud_python-0.4.48 → hud_python-0.4.50}/environments/browser/pyproject.toml +0 -0
  52. {hud_python-0.4.48 → hud_python-0.4.50}/environments/remote_browser/README.md +0 -0
  53. {hud_python-0.4.48 → hud_python-0.4.50}/environments/remote_browser/pyproject.toml +0 -0
  54. {hud_python-0.4.48 → hud_python-0.4.50}/environments/remote_browser/src/hud_controller/providers/README.md +0 -0
  55. {hud_python-0.4.48 → hud_python-0.4.50}/environments/text_2048/README.md +0 -0
  56. {hud_python-0.4.48 → hud_python-0.4.50}/environments/text_2048/pyproject.toml +0 -0
  57. {hud_python-0.4.48 → hud_python-0.4.50}/examples/README.md +0 -0
  58. {hud_python-0.4.48 → hud_python-0.4.50}/hud/__init__.py +0 -0
  59. {hud_python-0.4.48 → hud_python-0.4.50}/hud/__main__.py +0 -0
  60. {hud_python-0.4.48 → hud_python-0.4.50}/hud/agents/__init__.py +0 -0
  61. {hud_python-0.4.48 → hud_python-0.4.50}/hud/agents/claude.py +0 -0
  62. {hud_python-0.4.48 → hud_python-0.4.50}/hud/agents/langchain.py +0 -0
  63. {hud_python-0.4.48 → hud_python-0.4.50}/hud/agents/lite_llm.py +0 -0
  64. {hud_python-0.4.48 → hud_python-0.4.50}/hud/agents/misc/__init__.py +0 -0
  65. {hud_python-0.4.48 → hud_python-0.4.50}/hud/agents/misc/integration_test_agent.py +0 -0
  66. {hud_python-0.4.48 → hud_python-0.4.50}/hud/agents/misc/response_agent.py +0 -0
  67. {hud_python-0.4.48 → hud_python-0.4.50}/hud/agents/openai.py +0 -0
  68. {hud_python-0.4.48 → hud_python-0.4.50}/hud/agents/openai_chat_generic.py +0 -0
  69. {hud_python-0.4.48 → hud_python-0.4.50}/hud/agents/tests/__init__.py +0 -0
  70. {hud_python-0.4.48 → hud_python-0.4.50}/hud/agents/tests/test_base.py +0 -0
  71. {hud_python-0.4.48 → hud_python-0.4.50}/hud/agents/tests/test_claude.py +0 -0
  72. {hud_python-0.4.48 → hud_python-0.4.50}/hud/agents/tests/test_client.py +0 -0
  73. {hud_python-0.4.48 → hud_python-0.4.50}/hud/agents/tests/test_grounded_openai_agent.py +0 -0
  74. {hud_python-0.4.48 → hud_python-0.4.50}/hud/agents/tests/test_openai.py +0 -0
  75. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/__main__.py +0 -0
  76. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/analyze.py +0 -0
  77. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/clone.py +0 -0
  78. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/debug.py +0 -0
  79. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/eval.py +0 -0
  80. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/flows/__init__.py +0 -0
  81. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/get.py +0 -0
  82. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/list_func.py +0 -0
  83. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/pull.py +0 -0
  84. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/remove.py +0 -0
  85. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/rl/__init__.py +0 -0
  86. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/rl/celebrate.py +0 -0
  87. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/rl/config.py +0 -0
  88. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/rl/display.py +0 -0
  89. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/rl/gpu.py +0 -0
  90. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/rl/gpu_utils.py +0 -0
  91. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/rl/presets.py +0 -0
  92. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/rl/remote_runner.py +0 -0
  93. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/rl/rl_api.py +0 -0
  94. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/rl/viewer.py +0 -0
  95. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/rl/vllm.py +0 -0
  96. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/rl/wait_utils.py +0 -0
  97. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/tests/__init__.py +0 -0
  98. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/tests/test_analyze.py +0 -0
  99. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/tests/test_analyze_metadata.py +0 -0
  100. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/tests/test_build.py +0 -0
  101. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/tests/test_cli_init.py +0 -0
  102. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/tests/test_cli_main.py +0 -0
  103. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/tests/test_clone.py +0 -0
  104. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/tests/test_cursor.py +0 -0
  105. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/tests/test_debug.py +0 -0
  106. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/tests/test_list_func.py +0 -0
  107. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/tests/test_main_module.py +0 -0
  108. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/tests/test_pull.py +0 -0
  109. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/tests/test_push.py +0 -0
  110. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/tests/test_registry.py +0 -0
  111. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/tests/test_utils.py +0 -0
  112. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/utils/__init__.py +0 -0
  113. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/utils/config.py +0 -0
  114. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/utils/cursor.py +0 -0
  115. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/utils/docker.py +0 -0
  116. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/utils/environment.py +0 -0
  117. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/utils/interactive.py +0 -0
  118. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/utils/local_runner.py +0 -0
  119. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/utils/logging.py +0 -0
  120. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/utils/metadata.py +0 -0
  121. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/utils/package_runner.py +0 -0
  122. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/utils/registry.py +0 -0
  123. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/utils/remote_runner.py +0 -0
  124. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/utils/runner.py +0 -0
  125. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/utils/server.py +0 -0
  126. {hud_python-0.4.48 → hud_python-0.4.50}/hud/cli/utils/tasks.py +0 -0
  127. {hud_python-0.4.48 → hud_python-0.4.50}/hud/clients/README.md +0 -0
  128. {hud_python-0.4.48 → hud_python-0.4.50}/hud/clients/__init__.py +0 -0
  129. {hud_python-0.4.48 → hud_python-0.4.50}/hud/clients/base.py +0 -0
  130. {hud_python-0.4.48 → hud_python-0.4.50}/hud/clients/fastmcp.py +0 -0
  131. {hud_python-0.4.48 → hud_python-0.4.50}/hud/clients/mcp_use.py +0 -0
  132. {hud_python-0.4.48 → hud_python-0.4.50}/hud/clients/tests/__init__.py +0 -0
  133. {hud_python-0.4.48 → hud_python-0.4.50}/hud/clients/tests/test_client_integration.py +0 -0
  134. {hud_python-0.4.48 → hud_python-0.4.50}/hud/clients/tests/test_fastmcp.py +0 -0
  135. {hud_python-0.4.48 → hud_python-0.4.50}/hud/clients/tests/test_mcp_use_retry.py +0 -0
  136. {hud_python-0.4.48 → hud_python-0.4.50}/hud/clients/tests/test_protocol.py +0 -0
  137. {hud_python-0.4.48 → hud_python-0.4.50}/hud/clients/utils/__init__.py +0 -0
  138. {hud_python-0.4.48 → hud_python-0.4.50}/hud/clients/utils/mcp_use_retry.py +0 -0
  139. {hud_python-0.4.48 → hud_python-0.4.50}/hud/clients/utils/retry.py +0 -0
  140. {hud_python-0.4.48 → hud_python-0.4.50}/hud/clients/utils/retry_transport.py +0 -0
  141. {hud_python-0.4.48 → hud_python-0.4.50}/hud/datasets/__init__.py +0 -0
  142. {hud_python-0.4.48 → hud_python-0.4.50}/hud/datasets/parallel.py +0 -0
  143. {hud_python-0.4.48 → hud_python-0.4.50}/hud/datasets/runner.py +0 -0
  144. {hud_python-0.4.48 → hud_python-0.4.50}/hud/datasets/utils.py +0 -0
  145. {hud_python-0.4.48 → hud_python-0.4.50}/hud/misc/__init__.py +0 -0
  146. {hud_python-0.4.48 → hud_python-0.4.50}/hud/misc/claude_plays_pokemon.py +0 -0
  147. {hud_python-0.4.48 → hud_python-0.4.50}/hud/native/__init__.py +0 -0
  148. {hud_python-0.4.48 → hud_python-0.4.50}/hud/native/comparator.py +0 -0
  149. {hud_python-0.4.48 → hud_python-0.4.50}/hud/native/tests/__init__.py +0 -0
  150. {hud_python-0.4.48 → hud_python-0.4.50}/hud/native/tests/test_comparator.py +0 -0
  151. {hud_python-0.4.48 → hud_python-0.4.50}/hud/native/tests/test_native_init.py +0 -0
  152. {hud_python-0.4.48 → hud_python-0.4.50}/hud/otel/__init__.py +0 -0
  153. {hud_python-0.4.48 → hud_python-0.4.50}/hud/otel/collector.py +0 -0
  154. {hud_python-0.4.48 → hud_python-0.4.50}/hud/otel/config.py +0 -0
  155. {hud_python-0.4.48 → hud_python-0.4.50}/hud/otel/context.py +0 -0
  156. {hud_python-0.4.48 → hud_python-0.4.50}/hud/otel/exporters.py +0 -0
  157. {hud_python-0.4.48 → hud_python-0.4.50}/hud/otel/instrumentation.py +0 -0
  158. {hud_python-0.4.48 → hud_python-0.4.50}/hud/otel/processors.py +0 -0
  159. {hud_python-0.4.48 → hud_python-0.4.50}/hud/otel/tests/__init__.py +0 -0
  160. {hud_python-0.4.48 → hud_python-0.4.50}/hud/otel/tests/test_processors.py +0 -0
  161. {hud_python-0.4.48 → hud_python-0.4.50}/hud/py.typed +0 -0
  162. {hud_python-0.4.48 → hud_python-0.4.50}/hud/rl/README.md +0 -0
  163. {hud_python-0.4.48 → hud_python-0.4.50}/hud/rl/__init__.py +0 -0
  164. {hud_python-0.4.48 → hud_python-0.4.50}/hud/rl/actor.py +0 -0
  165. {hud_python-0.4.48 → hud_python-0.4.50}/hud/rl/buffer.py +0 -0
  166. {hud_python-0.4.48 → hud_python-0.4.50}/hud/rl/chat_template.jinja +0 -0
  167. {hud_python-0.4.48 → hud_python-0.4.50}/hud/rl/config.py +0 -0
  168. {hud_python-0.4.48 → hud_python-0.4.50}/hud/rl/distributed.py +0 -0
  169. {hud_python-0.4.48 → hud_python-0.4.50}/hud/rl/learner.py +0 -0
  170. {hud_python-0.4.48 → hud_python-0.4.50}/hud/rl/tests/__init__.py +0 -0
  171. {hud_python-0.4.48 → hud_python-0.4.50}/hud/rl/tests/test_learner.py +0 -0
  172. {hud_python-0.4.48 → hud_python-0.4.50}/hud/rl/train.py +0 -0
  173. {hud_python-0.4.48 → hud_python-0.4.50}/hud/rl/types.py +0 -0
  174. {hud_python-0.4.48 → hud_python-0.4.50}/hud/rl/utils/start_vllm_server.sh +0 -0
  175. {hud_python-0.4.48 → hud_python-0.4.50}/hud/rl/utils.py +0 -0
  176. {hud_python-0.4.48 → hud_python-0.4.50}/hud/rl/vllm_adapter.py +0 -0
  177. {hud_python-0.4.48 → hud_python-0.4.50}/hud/samples/__init__.py +0 -0
  178. {hud_python-0.4.48 → hud_python-0.4.50}/hud/samples/browser.py +0 -0
  179. {hud_python-0.4.48 → hud_python-0.4.50}/hud/server/context.py +0 -0
  180. {hud_python-0.4.48 → hud_python-0.4.50}/hud/server/helper/__init__.py +0 -0
  181. {hud_python-0.4.48 → hud_python-0.4.50}/hud/server/low_level.py +0 -0
  182. {hud_python-0.4.48 → hud_python-0.4.50}/hud/server/tests/__init__.py +0 -0
  183. {hud_python-0.4.48 → hud_python-0.4.50}/hud/server/tests/test_add_tool.py +0 -0
  184. {hud_python-0.4.48 → hud_python-0.4.50}/hud/server/tests/test_context.py +0 -0
  185. {hud_python-0.4.48 → hud_python-0.4.50}/hud/server/tests/test_mcp_server_handlers.py +0 -0
  186. {hud_python-0.4.48 → hud_python-0.4.50}/hud/server/tests/test_mcp_server_integration.py +0 -0
  187. {hud_python-0.4.48 → hud_python-0.4.50}/hud/server/tests/test_mcp_server_more.py +0 -0
  188. {hud_python-0.4.48 → hud_python-0.4.50}/hud/server/tests/test_run_wrapper.py +0 -0
  189. {hud_python-0.4.48 → hud_python-0.4.50}/hud/server/tests/test_server_extra.py +0 -0
  190. {hud_python-0.4.48 → hud_python-0.4.50}/hud/server/tests/test_sigterm_runner.py +0 -0
  191. {hud_python-0.4.48 → hud_python-0.4.50}/hud/settings.py +0 -0
  192. {hud_python-0.4.48 → hud_python-0.4.50}/hud/shared/__init__.py +0 -0
  193. {hud_python-0.4.48 → hud_python-0.4.50}/hud/shared/exceptions.py +0 -0
  194. {hud_python-0.4.48 → hud_python-0.4.50}/hud/shared/hints.py +0 -0
  195. {hud_python-0.4.48 → hud_python-0.4.50}/hud/shared/requests.py +0 -0
  196. {hud_python-0.4.48 → hud_python-0.4.50}/hud/shared/tests/__init__.py +0 -0
  197. {hud_python-0.4.48 → hud_python-0.4.50}/hud/shared/tests/test_exceptions.py +0 -0
  198. {hud_python-0.4.48 → hud_python-0.4.50}/hud/shared/tests/test_requests.py +0 -0
  199. {hud_python-0.4.48 → hud_python-0.4.50}/hud/telemetry/__init__.py +0 -0
  200. {hud_python-0.4.48 → hud_python-0.4.50}/hud/telemetry/instrument.py +0 -0
  201. {hud_python-0.4.48 → hud_python-0.4.50}/hud/telemetry/job.py +0 -0
  202. {hud_python-0.4.48 → hud_python-0.4.50}/hud/telemetry/replay.py +0 -0
  203. {hud_python-0.4.48 → hud_python-0.4.50}/hud/telemetry/tests/__init__.py +0 -0
  204. {hud_python-0.4.48 → hud_python-0.4.50}/hud/telemetry/tests/test_replay.py +0 -0
  205. {hud_python-0.4.48 → hud_python-0.4.50}/hud/telemetry/tests/test_trace.py +0 -0
  206. {hud_python-0.4.48 → hud_python-0.4.50}/hud/telemetry/trace.py +0 -0
  207. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/__init__.py +0 -0
  208. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/computer/__init__.py +0 -0
  209. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/computer/anthropic.py +0 -0
  210. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/computer/hud.py +0 -0
  211. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/computer/openai.py +0 -0
  212. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/computer/qwen.py +0 -0
  213. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/computer/settings.py +0 -0
  214. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/executors/__init__.py +0 -0
  215. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/executors/base.py +0 -0
  216. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/executors/pyautogui.py +0 -0
  217. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/executors/tests/__init__.py +0 -0
  218. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/executors/tests/test_base_executor.py +0 -0
  219. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
  220. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/executors/xdo.py +0 -0
  221. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/grounding/__init__.py +0 -0
  222. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/grounding/config.py +0 -0
  223. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/grounding/grounded_tool.py +0 -0
  224. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/grounding/grounder.py +0 -0
  225. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/grounding/tests/__init__.py +0 -0
  226. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/grounding/tests/test_grounded_tool.py +0 -0
  227. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/playwright.py +0 -0
  228. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/response.py +0 -0
  229. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/submit.py +0 -0
  230. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/tests/__init__.py +0 -0
  231. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/tests/test_base.py +0 -0
  232. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/tests/test_bash.py +0 -0
  233. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/tests/test_bash_extended.py +0 -0
  234. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/tests/test_computer.py +0 -0
  235. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/tests/test_computer_actions.py +0 -0
  236. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/tests/test_edit.py +0 -0
  237. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/tests/test_init.py +0 -0
  238. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/tests/test_playwright_tool.py +0 -0
  239. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/tests/test_response.py +0 -0
  240. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/tests/test_tools.py +0 -0
  241. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/tests/test_tools_init.py +0 -0
  242. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/tests/test_utils.py +0 -0
  243. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/types.py +0 -0
  244. {hud_python-0.4.48 → hud_python-0.4.50}/hud/tools/utils.py +0 -0
  245. {hud_python-0.4.48 → hud_python-0.4.50}/hud/types.py +0 -0
  246. {hud_python-0.4.48 → hud_python-0.4.50}/hud/utils/__init__.py +0 -0
  247. {hud_python-0.4.48 → hud_python-0.4.50}/hud/utils/agent_factories.py +0 -0
  248. {hud_python-0.4.48 → hud_python-0.4.50}/hud/utils/async_utils.py +0 -0
  249. {hud_python-0.4.48 → hud_python-0.4.50}/hud/utils/group_eval.py +0 -0
  250. {hud_python-0.4.48 → hud_python-0.4.50}/hud/utils/mcp.py +0 -0
  251. {hud_python-0.4.48 → hud_python-0.4.50}/hud/utils/pretty_errors.py +0 -0
  252. {hud_python-0.4.48 → hud_python-0.4.50}/hud/utils/progress.py +0 -0
  253. {hud_python-0.4.48 → hud_python-0.4.50}/hud/utils/tasks.py +0 -0
  254. {hud_python-0.4.48 → hud_python-0.4.50}/hud/utils/telemetry.py +0 -0
  255. {hud_python-0.4.48 → hud_python-0.4.50}/hud/utils/tests/__init__.py +0 -0
  256. {hud_python-0.4.48 → hud_python-0.4.50}/hud/utils/tests/test_async_utils.py +0 -0
  257. {hud_python-0.4.48 → hud_python-0.4.50}/hud/utils/tests/test_init.py +0 -0
  258. {hud_python-0.4.48 → hud_python-0.4.50}/hud/utils/tests/test_mcp.py +0 -0
  259. {hud_python-0.4.48 → hud_python-0.4.50}/hud/utils/tests/test_progress.py +0 -0
  260. {hud_python-0.4.48 → hud_python-0.4.50}/hud/utils/tests/test_telemetry.py +0 -0
  261. {hud_python-0.4.48 → hud_python-0.4.50}/hud/utils/tool_shorthand.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.4.48
3
+ Version: 0.4.50
4
4
  Summary: SDK for the HUD platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-python
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -156,24 +156,24 @@ For Python-based MCP environments, use this standard structure:
156
156
  ```
157
157
  my-environment/
158
158
  ├── Dockerfile
159
- ├── pyproject.toml # Package definition with dependencies
160
- ├── README.md # Environment documentation
161
- └── src/
162
- └── my_module/ # Your Python package
163
- ├── __init__.py
164
- ├── server.py # MCP server entry point
165
- ├── context.py # Core stateful environment logic (optional)
166
- ├── tools/ # Interactive tools (move, click, type, etc.)
167
- │ ├── __init__.py
168
- │ └── move.py # Example: custom tool inheriting from BaseTool
169
- ├── setup/ # Setup functions (modular approach)
170
- ├── __init__.py # Creates SetupTool instance & exports decorator
171
- ├── basic.py # Basic setup functions
172
- └── advanced.py # Advanced setup functions
173
- └── evaluate/ # Evaluator functions (modular approach)
174
- ├── __init__.py # Creates EvaluateTool instance & exports decorator
175
- ├── checks.py # Basic evaluation checks
176
- └── metrics.py # Advanced metrics evaluators
159
+ ├── README.md
160
+ ├── server/ # MCP server package
161
+ │   ├── pyproject.toml # MCP dependencies (hud-python, etc.)
162
+ │   ├── __init__.py # Empty package marker
163
+ │   ├── main.py # mcp = MCPServer() + lifecycle hooks
164
+ │   ├── tools.py # router = MCPRouter() + @router.tool decorators
165
+ │   ├── setup/ # Setup router (modular approach)
166
+ │   │   ├── __init__.py
167
+ │   │   ├── basic.py # Basic setup functions
168
+ │   │   └── advanced.py # Advanced setup functions
169
+ │   └── evaluate/ # Evaluate router (modular approach)
170
+ │       ├── __init__.py
171
+ │       ├── checks.py # Basic evaluation checks
172
+ │       └── metrics.py # Advanced metrics evaluators
173
+ └── environment/ # Backend service package
174
+     ├── pyproject.toml # Backend dependencies (fastapi, uvicorn)
175
+     ├── __init__.py
176
+     └── server.py # FastAPI app with /health, /act, /reset, /state
177
177
  ```
178
178
 
179
179
  This structure enables:
@@ -607,51 +607,62 @@ Once all of the above works you can unleash *hundreds* of concurrent agents on y
607
607
 
608
608
  ## Phase 5 – Hot-Reload Development
609
609
 
610
- To enable rapid development without Docker rebuilds, we can mount the source code and use hot-reload. The HUD CLI provides a built-in development proxy that handles all the complexity:
610
+ For rapid local development, run the MCP server and the environment backend separately. This enables instant code updates without Docker rebuilds.
611
611
 
612
+ ### Development Setup
613
+
614
+ You'll need **two terminal windows** for local development:
615
+
616
+ #### Terminal 1: MCP Server
612
617
  ```bash
613
- # Navigate to your environment directory
614
- cd environments/my-environment
618
+ cd environments/my-environment/server
619
+ hud dev # Auto-detects and runs with hot-reload
615
620
 
616
- # Start the development proxy with hot-reload
617
- hud dev --build
621
+ # Optional flags:
622
+ hud dev --inspector # Launch MCP Inspector
623
+ hud dev --interactive # Launch interactive testing mode
624
+ hud dev --stdio # Use stdio transport (default: HTTP)
625
+ hud dev --watch ../shared # Watch additional directories
626
+ ```
627
+
628
+ The `hud dev` command:
629
+ - Auto-detects the MCP module in the current directory
630
+ - Watches for file changes and reloads automatically
631
+ - Runs on HTTP by default (http://localhost:8765/mcp)
632
+ - Can launch MCP Inspector for testing tools
633
+ - Can launch interactive mode for manual testing
618
634
 
619
- # Output:
620
- # 📦 Using cached image: hud-my-environment:dev
621
- # "hud-my-environment": {
622
- # "url": "http://localhost:8765/mcp"
623
- # }
624
- # ✨ Add to Cursor: cursor://anysphere.cursor-deeplink/mcp/install?name=...
625
- # 🌐 Reloading proxy live, press Ctrl+C to stop
635
+ #### Terminal 2: Environment Server (Backend)
636
+ ```bash
637
+ cd environments/my-environment/environment
638
+ uvicorn server:app --reload # Standard uvicorn with hot-reload
626
639
  ```
627
640
 
628
- This command:
629
- - Auto-detects or builds your Docker image with `:dev` tag
630
- - Mounts `./src` to `/app/src` for instant code updates
631
- - Uses watchfiles to monitor file changes and restart automatically
632
- - Exposes an HTTP endpoint for Cursor integration
633
- - Caches the image name in `pyproject.toml` for faster subsequent runs
641
+ For the backend, we simply use `uvicorn` directly since it already provides excellent hot-reload capabilities.
642
+
643
+ ### Development Workflow
644
+
645
+ 1. Start both servers in separate terminals
646
+ 2. Edit code in either `server/` or `environment/` - changes reload automatically
647
+ 3. Test changes immediately without rebuilding Docker images
648
+ 4. Use MCP Inspector or interactive mode to test tools
649
+ 5. When ready, build the complete Docker image: `hud build`
634
650
 
635
- #### Quick Cursor Setup
651
+ ### Quick Cursor Setup
636
652
 
637
- Either click the deeplink URL from the output, or manually add to `.cursor/mcp.json`:
653
+ Add to `.cursor/mcp.json` (or use the deeplink from `hud dev` output):
638
654
 
639
655
  ```json
640
656
  {
641
657
  "mcpServers": {
642
- "hud-my-environment": {
658
+ "my-environment-dev": {
643
659
  "url": "http://localhost:8765/mcp"
644
660
  }
645
661
  }
646
662
  }
647
663
  ```
648
664
 
649
- ### Development Workflow
650
-
651
- 1. Keep `hud dev` running in one terminal - it automatically handles reloads
652
- 2. Edit your code in `src/` - changes take effect immediately
653
- 3. Test changes in another terminal with `hud analyze` or the interactive mode
654
- 4. Use Cursor/Claude to iterate quickly on your environment
665
+ **Note**: Make sure both the MCP server and the environment backend are running when using them with Cursor or agents.
655
666
 
656
667
  ### Process Separation for Stateful Environments
657
668
 
@@ -0,0 +1,121 @@
1
+ # Blank Environment
2
+
3
+ Minimal starter template for building HUD environments.
4
+ See [docs](https://docs.hud.so/build-environments) for the complete environment design workflow.
5
+
6
+ ## Architecture
7
+
8
+ **`environment/`** - Produces structured data
9
+ - Owns all state (game logic, browser sessions, databases, etc.)
10
+ - Exposes HTTP endpoints `/health`, `/act`, `/reset`, `/state` that return structured information about the environment state
11
+
12
+ **`server/`** - Wraps data in MCP tools
13
+ - Calls environment endpoints to get structured data for the agent and to perform environment setup/evaluation
14
+ - Agents and tasks interact only with these tools!
15
+
16
+ **Why separate?** Edit tools for the agent or tasks without restarting the heavy environment backend.
17
+
18
+ ## Development
19
+
20
+ ```bash
21
+ # Terminal 1 - Environment backend
22
+ cd environment
23
+ uv run uvicorn server:app --reload
24
+
25
+ # Terminal 2 - MCP server
26
+ cd server
27
+ uv run hud dev
28
+ ```
29
+
30
+ Uncomment the `setup` tool in `server/tools.py`, save, and watch it reload.
31
+ Visit http://localhost:8765/docs to see the new tool appear instantly.
32
+
33
+ In general, we recommend starting work on the environment backend first, then developing the MCP server to expose the right things to the agent.
34
+
35
+ For complex environments that require many dependencies, we recommend running `hud dev` in the environment root:
36
+ ```bash
37
+ cd ..
38
+ hud dev
39
+ ```
40
+
41
+ ## Tasks & Evaluation
42
+ ```bash
43
+ # Build first from the environment root folder that contains the Dockerfile (creates blank:0.1.0)
44
+ hud build
45
+ ```
46
+
47
+ Your `tasks.json` uses `docker run` to launch the environment:
48
+
49
+ ```json
50
+ {
51
+ "prompt": "Your task prompt",
52
+ "mcp_config": {
53
+ "local": {
54
+ "command": "docker",
55
+ "args": ["run", "--rm", "-i", "blank:0.1.0"]
56
+ }
57
+ }
58
+ }
59
+ ```
60
+
61
+ **Commands:**
62
+ ```bash
63
+ # Build first
64
+ hud build
65
+
66
+ # Test task locally
67
+ hud eval tasks.json
68
+
69
+ # Push environment for remote running
70
+ hud push
71
+
72
+ # Production RL training
73
+ hud rl tasks.json # Auto-converts docker→remote, builds & pushes if needed
74
+ ```
75
+
76
+ ## Publishing Your Environment
77
+
78
+ Once your environment is ready, you can share it with the community:
79
+
80
+ ### 1. Push to Registry
81
+ ```bash
82
+ # Build and push your environment (requires docker hub login and hud api key)
83
+ hud build
84
+ hud push
85
+ ```
86
+
87
+ ### 2. Create a Dataset
88
+
89
+ Create a dataset on HuggingFace with your tasks:
90
+
91
+ **Option A: Upload manually**
92
+ 1. Upload your `tasks.json` to HuggingFace
93
+ 2. Make sure it's **public** to appear on leaderboards
94
+
95
+ **Option B: Use the SDK**
96
+ ```python
97
+ from hud.datasets import save_tasks
98
+ import json
99
+
100
+ # Load your tasks
101
+ with open("tasks.json") as f:
102
+ tasks = json.load(f)
103
+
104
+ # Push to HuggingFace
105
+ save_tasks(tasks, repo_id="your-org/your-dataset")
106
+ ```
107
+
108
+ ### 3. Run and Track Performance
109
+
110
+ ```bash
111
+ # Run Claude on your benchmark
112
+ hud eval "your-org/your-dataset" --agent claude
113
+
114
+ # View results at:
115
+ # hud.so/leaderboards/your-org/your-dataset
116
+ ```
117
+
118
+ **Note**: Only public HuggingFace datasets appear as leaderboards!
119
+
120
+ 📚 Learn more: [Creating Benchmarks](https://docs.hud.so/evaluate-agents/create-benchmarks) | [Leaderboards](https://docs.hud.so/evaluate-agents/leaderboards)
121
+
@@ -10,7 +10,7 @@ Endpoints (FastAPI)
10
10
 
11
11
  Run (dev)
12
12
  ```bash
13
- uv run uvicorn environment.server:app --reload --port 8005
13
+ uv run uvicorn server:app --reload --port 8005
14
14
  ```
15
15
 
16
16
  Principle: treat like a backend. Keep long‑lived state here; add endpoints as tools need them.
@@ -0,0 +1,16 @@
1
+ [project]
2
+ name = "blank-environment"
3
+ version = "0.1.0"
4
+ description = "Backend service for blank environment"
5
+ requires-python = ">=3.11"
6
+ dependencies = [
7
+ "fastapi",
8
+ "uvicorn[standard]",
9
+ ]
10
+
11
+ [build-system]
12
+ requires = ["hatchling"]
13
+ build-backend = "hatchling.build"
14
+
15
+ [tool.hatch.build.targets.wheel]
16
+ packages = ["."]
@@ -0,0 +1,21 @@
1
+ # MCP Server
2
+
3
+ MCP layer that wraps environment data in tools for agent interaction.
4
+
5
+ ## Structure
6
+
7
+ - `main.py` - Server initialization, imports routers
8
+ - `tools.py` - MCP tools that call environment HTTP endpoints
9
+
10
+ ## Development
11
+
12
+ ```bash
13
+ # Start MCP server with hot-reload
14
+ uv run hud dev
15
+ ```
16
+
17
+ ## Key Principles
18
+
19
+ - Keep tools thin - call environment HTTP endpoints
20
+ - Use routers for organization
21
+ - All long-lived state lives in `environment/`, not here
@@ -0,0 +1,19 @@
1
+ [project]
2
+ name = "blank-server"
3
+ version = "0.1.0"
4
+ description = "MCP server for blank environment"
5
+ requires-python = ">=3.11"
6
+ dependencies = [
7
+ "hud-python>=0.4.50",
8
+ "httpx>=0.28.1",
9
+ ]
10
+
11
+ [build-system]
12
+ requires = ["hatchling"]
13
+ build-backend = "hatchling.build"
14
+
15
+ [tool.hatch.metadata]
16
+ allow-direct-references = true
17
+
18
+ [tool.hatch.build.targets.wheel]
19
+ packages = ["."]
@@ -1,40 +1,39 @@
1
1
  # Browser Environment
2
2
 
3
- A browser automation environment for HUD that provides GUI access and web app interaction capabilities. This environment supports hot-reloading during development while maintaining persistent state.
3
+ Browser automation environment with GUI access for testing web applications. Includes sample apps (2048, Todo) and supports hot-reload development.
4
4
 
5
- ## Quick Start
5
+ ## Architecture
6
6
 
7
- ### Interactive Development
8
- ```bash
9
- # 1. Configure your API keys (optional - only needed for evaluation)
10
- # Edit .env file to add your HUD_API_KEY and ANTHROPIC_API_KEY
11
-
12
- # 2. Start the environment (optional: with inspector)
13
- hud dev --build --inspector
7
+ **`environment/`** - Produces structured data
8
+ - FastAPI backend with X11/VNC services (Linux-only)
9
+ - Launches and manages web apps (Next.js frontends + Python backends)
10
+ - Exposes HTTP endpoints for app control and state
14
11
 
15
- # 3. Choose your preferred way to test:
12
+ **`server/`** - Wraps data in MCP tools
13
+ - Browser automation tools (Playwright, computer vision)
14
+ - Setup tools (launch apps, seed data)
15
+ - Evaluation tools (check game state, todo completion)
16
16
 
17
- # Option A: Run the task with Claude (requires ANTHROPIC_API_KEY)
18
- hud eval tasks.json --agent claude
17
+ **Why separate?** The environment backend requires X11/VNC/Chromium (Docker-only). The MCP server tools can be edited with hot-reload, while the heavy environment stays running.
19
18
 
20
- # Option B: Interactive notebook test_env.ipynb (great for learning!)
21
- # Requires installation:
22
- pip install hud-python[agents]
23
-
24
- # Option C: Simple Python script (runs all tasks from tasks.json)
25
- python test_task.py
26
- ```
19
+ ## Development
27
20
 
28
- ## How HUD Environments Work
21
+ This environment **requires Docker** due to X11/VNC dependencies.
29
22
 
30
- The environment is split into two components:
23
+ ```bash
24
+ # Build first (creates hud-browser:0.1.0)
25
+ hud build
31
26
 
32
- - **`env.py`** - Stateful logic that persists across reloads
33
- - **`server.py`** - MCP server with tools (reloads on file changes)
27
+ # Start with hot-reload
28
+ hud dev
29
+ ```
34
30
 
35
- This separation is crucial for `hud dev` - it allows you to modify the MCP tools and see changes immediately without losing the environment state. The environment runs as a separate process and communicates via socket, while the server can be restarted freely.
31
+ When you run `hud dev` in an environment with a Dockerfile, it automatically:
32
+ - Detects that Docker mode is needed
33
+ - Mounts `server/` and `environment/` as volumes
34
+ - Enables hot-reload for both layers
36
35
 
37
- If you are ever seeing issues with the environment itself, running `hud dev --full-reload` will reload both the environment and the server.
36
+ Edit files in `server/` or `environment/` and they reload inside the container!
38
37
 
39
38
  ## Publishing Your Environment
40
39
 
@@ -0,0 +1,23 @@
1
+ [project]
2
+ name = "hud-browser-environment"
3
+ version = "0.1.0"
4
+ description = "HUD Browser Environment Backend"
5
+ requires-python = ">=3.11,<3.14"
6
+ dependencies = [
7
+ "fastapi>=0.104.1",
8
+ "uvicorn[standard]>=0.24.0",
9
+ "python-multipart>=0.0.6",
10
+ "pydantic>=2.6,<3",
11
+ "pydantic-settings>=2.2,<3",
12
+ "httpx",
13
+ ]
14
+
15
+ [build-system]
16
+ requires = ["hatchling"]
17
+ build-backend = "hatchling.build"
18
+
19
+ [tool.hatch.metadata]
20
+ allow-direct-references = true
21
+
22
+ [tool.hatch.build.targets.wheel]
23
+ packages = ["environment"]
@@ -0,0 +1,21 @@
1
+ [project]
2
+ name = "hud-browser-server"
3
+ version = "0.1.0"
4
+ description = "HUD Browser MCP Server"
5
+ requires-python = ">=3.11,<3.14"
6
+ dependencies = [
7
+ "hud-python@git+https://github.com/hud-evals/hud-python@cli-dev",
8
+ "httpx",
9
+ "playwright",
10
+ "pyautogui",
11
+ ]
12
+
13
+ [build-system]
14
+ requires = ["hatchling"]
15
+ build-backend = "hatchling.build"
16
+
17
+ [tool.hatch.metadata]
18
+ allow-direct-references = true
19
+
20
+ [tool.hatch.build.targets.wheel]
21
+ packages = ["server"]
@@ -0,0 +1,165 @@
1
+ # Deep Research Environment
2
+
3
+ Web research environment powered by Exa API for searching and fetching content.
4
+ See [docs](https://docs.hud.so/build-environments) for the complete environment design workflow.
5
+
6
+ ## Architecture
7
+
8
+ **`environment/`** - Manages Exa API integration and state
9
+ - Holds the Exa API key server-side
10
+ - Exposes HTTP endpoints `/search`, `/fetch`, `/answer`, `/evaluate` for research workflows
11
+ - Implements exponential backoff for rate limiting
12
+
13
+ **`server/`** - Wraps data in MCP tools
14
+ - Provides `search()`, `fetch()`, `answer()`, `evaluate()` tools for agents
15
+ - Agents and tasks interact only with these tools
16
+
17
+ **Why separate?** Edit tools for the agent or tasks without restarting the environment backend.
18
+
19
+ ## Tools
20
+
21
+ - **`search(query: str)`** - Search the web using Exa API, returns list of results with titles and URLs
22
+ - **`fetch(url: str)`** - Fetch full content from a URL, returns summary, highlights, and text
23
+ - **`answer(final_answer: str)`** - Submit the final research answer
24
+ - **`evaluate(expected_answer: str)`** - Evaluate submitted answer against expected result
25
+
26
+ ## Setup
27
+
28
+ ### Requirements
29
+ - Exa API key (get one at [exa.ai](https://exa.ai))
30
+
31
+ ### Environment Variables
32
+ ```bash
33
+ export EXA_API_KEY="your_exa_api_key_here"
34
+ ```
35
+
36
+ ## Development
37
+
38
+ ```bash
39
+ # Terminal 1 - Environment backend
40
+ cd environment
41
+ export EXA_API_KEY="your_key"
42
+ uv run uvicorn server:app --reload
43
+
44
+ # Terminal 2 - MCP server
45
+ cd server
46
+ uv run hud dev
47
+ ```
48
+
49
+ The environment includes exponential backoff for rate limiting, so API calls will automatically retry on 429 errors.
50
+
51
+ In general, we recommend building the environment backend first, then developing the MCP server to expose the appropriate tools to the agent.
52
+
53
+ For complex environments that require many dependencies, we recommend running `hud dev` in the environment root:
54
+ ```bash
55
+ cd ..
56
+ export EXA_API_KEY="your_key"
57
+ hud dev
58
+ ```
59
+
60
+ ## Tasks & Evaluation
61
+
62
+ ```bash
63
+ # Build first from the environment root folder, which contains the Dockerfile (creates deepresearch:0.1.0)
64
+ hud build
65
+ ```
66
+
67
+ Your `tasks.json` uses `docker run` to launch the environment:
68
+
69
+ ```json
70
+ {
71
+ "prompt": "Research and answer: What is the capital of France?",
72
+ "mcp_config": {
73
+ "local": {
74
+ "command": "docker",
75
+ "args": ["run", "--rm", "-i", "-e", "EXA_API_KEY", "deepresearch:0.1.0"]
76
+ }
77
+ },
78
+ "evaluator": {
79
+ "tool_name": "evaluate",
80
+ "tool_params": {
81
+ "expected_answer": "Paris"
82
+ }
83
+ }
84
+ }
85
+ ```
86
+
87
+ **Note:** The `-e EXA_API_KEY` flag passes your local API key to the container.
88
+
89
+ **Commands:**
90
+ ```bash
91
+ # Build first
92
+ hud build
93
+
94
+ # Test task locally
95
+ export EXA_API_KEY="your_key"
96
+ hud eval tasks.json
97
+
98
+ # Push environment for remote running
99
+ hud push
100
+
101
+ # Production RL training
102
+ hud rl tasks.json # Auto-converts docker→remote, builds & pushes if needed
103
+ ```
104
+
105
+ ## Publishing Your Environment
106
+
107
+ Once your environment is ready, you can share it with the community:
108
+
109
+ ### 1. Push to Registry
110
+ ```bash
111
+ # Build and push your environment (requires a Docker Hub login and a HUD API key)
112
+ hud build
113
+ hud push
114
+ ```
115
+
116
+ ### 2. Create a Dataset
117
+
118
+ Create a dataset on HuggingFace with your tasks:
119
+
120
+ **Option A: Upload manually**
121
+ 1. Upload your `tasks.json` to HuggingFace
122
+ 2. Make sure it's **public** to appear on leaderboards
123
+
124
+ **Option B: Use the SDK**
125
+ ```python
126
+ from hud.datasets import save_tasks
127
+ import json
128
+
129
+ # Load your tasks
130
+ with open("tasks.json") as f:
131
+ tasks = json.load(f)
132
+
133
+ # Push to HuggingFace
134
+ save_tasks(tasks, repo_id="your-org/your-dataset")
135
+ ```
136
+
137
+ ### 3. Run and Track Performance
138
+
139
+ ```bash
140
+ # Run Claude on your benchmark
141
+ hud eval "your-org/your-dataset" --agent claude
142
+
143
+ # View results at:
144
+ # hud.so/leaderboards/your-org/your-dataset
145
+ ```
146
+
147
+ **Note**: Only public HuggingFace datasets appear as leaderboards!
148
+
149
+ 📚 Learn more: [Creating Benchmarks](https://docs.hud.so/evaluate-agents/create-benchmarks) | [Leaderboards](https://docs.hud.so/evaluate-agents/leaderboards)
150
+
151
+ ## Example Research Workflow
152
+
153
+ ```python
154
+ # Agent searches for information
155
+ results = search("latest AI developments 2024")
156
+
157
+ # Agent fetches detailed content from top result
158
+ content = fetch(results[0]["url"])
159
+
160
+ # Agent submits final answer
161
+ answer("Based on research, AI developments in 2024 include...")
162
+
163
+ # Evaluate answer
164
+ result = evaluate(expected_answer="AI developments")
165
+ ```
@@ -0,0 +1,17 @@
1
+ [project]
2
+ name = "deepresearch-environment"
3
+ version = "0.1.0"
4
+ description = "Backend service for DeepResearch environment"
5
+ requires-python = ">=3.11"
6
+ dependencies = [
7
+ "fastapi>=0.104.1",
8
+ "uvicorn[standard]>=0.24.0",
9
+ "httpx>=0.24.0",
10
+ ]
11
+
12
+ [build-system]
13
+ requires = ["hatchling"]
14
+ build-backend = "hatchling.build"
15
+
16
+ [tool.hatch.build.targets.wheel]
17
+ packages = ["environment"]
@@ -3,7 +3,7 @@ name = "deepresearch"
3
3
  version = "0.1.0"
4
4
  description = "DeepResearch HUD environment with HTTP backend (EXA on server)"
5
5
  requires-python = ">=3.11"
6
- dependencies = [ "hud-python==0.4.41", "fastapi>=0.104.1", "uvicorn[standard]>=0.24.0", "httpx>=0.24.0",]
6
+ dependencies = [ "hud-python==0.4.42", "fastapi>=0.104.1", "uvicorn[standard]>=0.24.0", "httpx>=0.24.0",]
7
7
 
8
8
  [build-system]
9
9
  requires = [ "hatchling",]
@@ -0,0 +1,19 @@
1
+ [project]
2
+ name = "deepresearch-mcp"
3
+ version = "0.1.0"
4
+ description = "MCP server for DeepResearch environment"
5
+ requires-python = ">=3.11"
6
+ dependencies = [
7
+ "hud-python>=0.4.50",
8
+ "httpx>=0.24.0",
9
+ ]
10
+
11
+ [build-system]
12
+ requires = ["hatchling"]
13
+ build-backend = "hatchling.build"
14
+
15
+ [tool.hatch.metadata]
16
+ allow-direct-references = true
17
+
18
+ [tool.hatch.build.targets.wheel]
19
+ packages = ["mcp"]