hud-python 0.4.11__tar.gz → 0.4.13__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (173) hide show
  1. {hud_python-0.4.11 → hud_python-0.4.13}/PKG-INFO +16 -13
  2. {hud_python-0.4.11 → hud_python-0.4.13}/README.md +2 -6
  3. {hud_python-0.4.11 → hud_python-0.4.13}/environments/README.md +15 -15
  4. hud_python-0.4.13/environments/browser/README.md +213 -0
  5. {hud_python-0.4.11 → hud_python-0.4.13}/environments/remote_browser/README.md +3 -0
  6. {hud_python-0.4.11 → hud_python-0.4.13}/environments/remote_browser/pyproject.toml +11 -16
  7. hud_python-0.4.13/hud/__main__.py +8 -0
  8. {hud_python-0.4.11 → hud_python-0.4.13}/hud/agents/base.py +7 -8
  9. {hud_python-0.4.11 → hud_python-0.4.13}/hud/agents/langchain.py +2 -2
  10. {hud_python-0.4.11 → hud_python-0.4.13}/hud/agents/tests/test_openai.py +3 -1
  11. {hud_python-0.4.11 → hud_python-0.4.13}/hud/cli/__init__.py +114 -52
  12. {hud_python-0.4.11 → hud_python-0.4.13}/hud/cli/build.py +121 -71
  13. {hud_python-0.4.11 → hud_python-0.4.13}/hud/cli/debug.py +2 -2
  14. hud_python-0.4.11/hud/cli/mcp_server.py → hud_python-0.4.13/hud/cli/dev.py +101 -38
  15. {hud_python-0.4.11 → hud_python-0.4.13}/hud/cli/eval.py +175 -90
  16. hud_python-0.4.13/hud/cli/init.py +658 -0
  17. {hud_python-0.4.11 → hud_python-0.4.13}/hud/cli/list_func.py +72 -71
  18. {hud_python-0.4.11 → hud_python-0.4.13}/hud/cli/pull.py +1 -2
  19. {hud_python-0.4.11 → hud_python-0.4.13}/hud/cli/push.py +35 -23
  20. {hud_python-0.4.11 → hud_python-0.4.13}/hud/cli/remove.py +35 -41
  21. {hud_python-0.4.11 → hud_python-0.4.13}/hud/cli/tests/test_analyze.py +2 -1
  22. {hud_python-0.4.11 → hud_python-0.4.13}/hud/cli/tests/test_analyze_metadata.py +42 -49
  23. {hud_python-0.4.11 → hud_python-0.4.13}/hud/cli/tests/test_build.py +28 -52
  24. {hud_python-0.4.11 → hud_python-0.4.13}/hud/cli/tests/test_cursor.py +1 -1
  25. {hud_python-0.4.11 → hud_python-0.4.13}/hud/cli/tests/test_debug.py +1 -1
  26. {hud_python-0.4.11 → hud_python-0.4.13}/hud/cli/tests/test_list_func.py +75 -64
  27. hud_python-0.4.13/hud/cli/tests/test_main_module.py +30 -0
  28. {hud_python-0.4.11 → hud_python-0.4.13}/hud/cli/tests/test_mcp_server.py +3 -3
  29. {hud_python-0.4.11 → hud_python-0.4.13}/hud/cli/tests/test_pull.py +30 -61
  30. {hud_python-0.4.11 → hud_python-0.4.13}/hud/cli/tests/test_push.py +70 -89
  31. {hud_python-0.4.11 → hud_python-0.4.13}/hud/cli/tests/test_registry.py +36 -38
  32. {hud_python-0.4.11 → hud_python-0.4.13}/hud/cli/tests/test_utils.py +1 -1
  33. hud_python-0.4.13/hud/cli/utils/__init__.py +1 -0
  34. hud_python-0.4.11/hud/cli/docker_utils.py → hud_python-0.4.13/hud/cli/utils/docker.py +36 -0
  35. hud_python-0.4.11/hud/cli/env_utils.py → hud_python-0.4.13/hud/cli/utils/environment.py +7 -7
  36. {hud_python-0.4.11/hud/cli → hud_python-0.4.13/hud/cli/utils}/interactive.py +91 -19
  37. hud_python-0.4.11/hud/cli/analyze_metadata.py → hud_python-0.4.13/hud/cli/utils/metadata.py +12 -8
  38. {hud_python-0.4.11/hud/cli → hud_python-0.4.13/hud/cli/utils}/registry.py +28 -30
  39. {hud_python-0.4.11/hud/cli → hud_python-0.4.13/hud/cli/utils}/remote_runner.py +1 -1
  40. hud_python-0.4.13/hud/cli/utils/runner.py +134 -0
  41. hud_python-0.4.13/hud/cli/utils/server.py +250 -0
  42. {hud_python-0.4.11 → hud_python-0.4.13}/hud/clients/base.py +1 -1
  43. {hud_python-0.4.11 → hud_python-0.4.13}/hud/clients/fastmcp.py +5 -13
  44. {hud_python-0.4.11 → hud_python-0.4.13}/hud/clients/mcp_use.py +6 -10
  45. {hud_python-0.4.11 → hud_python-0.4.13}/hud/server/server.py +35 -5
  46. {hud_python-0.4.11 → hud_python-0.4.13}/hud/shared/exceptions.py +11 -0
  47. {hud_python-0.4.11 → hud_python-0.4.13}/hud/shared/tests/test_exceptions.py +22 -0
  48. hud_python-0.4.13/hud/telemetry/tests/test_replay.py +40 -0
  49. hud_python-0.4.13/hud/telemetry/tests/test_trace.py +63 -0
  50. {hud_python-0.4.11 → hud_python-0.4.13}/hud/tools/base.py +20 -3
  51. {hud_python-0.4.11 → hud_python-0.4.13}/hud/tools/computer/hud.py +15 -6
  52. {hud_python-0.4.11 → hud_python-0.4.13}/hud/tools/executors/tests/test_base_executor.py +27 -0
  53. {hud_python-0.4.11 → hud_python-0.4.13}/hud/tools/response.py +12 -8
  54. hud_python-0.4.13/hud/tools/tests/test_response.py +60 -0
  55. hud_python-0.4.13/hud/tools/tests/test_tools_init.py +49 -0
  56. {hud_python-0.4.11 → hud_python-0.4.13}/hud/utils/design.py +19 -8
  57. {hud_python-0.4.11 → hud_python-0.4.13}/hud/utils/mcp.py +17 -5
  58. hud_python-0.4.13/hud/utils/tests/__init__.py +0 -0
  59. hud_python-0.4.13/hud/utils/tests/test_mcp.py +112 -0
  60. {hud_python-0.4.11 → hud_python-0.4.13}/hud/utils/tests/test_version.py +1 -1
  61. {hud_python-0.4.11 → hud_python-0.4.13}/hud/version.py +1 -1
  62. {hud_python-0.4.11 → hud_python-0.4.13}/pyproject.toml +10 -11
  63. hud_python-0.4.11/environments/browser/README.md +0 -447
  64. hud_python-0.4.11/environments/browser/src/hud_controller/README.md +0 -117
  65. hud_python-0.4.11/hud/cli/init.py +0 -280
  66. hud_python-0.4.11/hud/cli/runner.py +0 -160
  67. {hud_python-0.4.11 → hud_python-0.4.13}/.gitignore +0 -0
  68. {hud_python-0.4.11 → hud_python-0.4.13}/LICENSE +0 -0
  69. {hud_python-0.4.11 → hud_python-0.4.13}/environments/browser/apps/2048/README.md +0 -0
  70. {hud_python-0.4.11 → hud_python-0.4.13}/environments/browser/apps/2048/backend/pyproject.toml +0 -0
  71. {hud_python-0.4.11 → hud_python-0.4.13}/environments/browser/apps/README.md +0 -0
  72. {hud_python-0.4.11 → hud_python-0.4.13}/environments/browser/apps/todo/README.md +0 -0
  73. {hud_python-0.4.11 → hud_python-0.4.13}/environments/browser/apps/todo/backend/pyproject.toml +0 -0
  74. {hud_python-0.4.11 → hud_python-0.4.13}/environments/browser/pyproject.toml +0 -0
  75. {hud_python-0.4.11 → hud_python-0.4.13}/environments/remote_browser/src/hud_controller/providers/README.md +0 -0
  76. {hud_python-0.4.11 → hud_python-0.4.13}/environments/text_2048/README.md +0 -0
  77. {hud_python-0.4.11 → hud_python-0.4.13}/environments/text_2048/pyproject.toml +0 -0
  78. {hud_python-0.4.11 → hud_python-0.4.13}/examples/README.md +0 -0
  79. {hud_python-0.4.11 → hud_python-0.4.13}/hud/__init__.py +0 -0
  80. {hud_python-0.4.11 → hud_python-0.4.13}/hud/agents/__init__.py +0 -0
  81. {hud_python-0.4.11 → hud_python-0.4.13}/hud/agents/claude.py +0 -0
  82. {hud_python-0.4.11 → hud_python-0.4.13}/hud/agents/misc/__init__.py +0 -0
  83. {hud_python-0.4.11 → hud_python-0.4.13}/hud/agents/misc/response_agent.py +0 -0
  84. {hud_python-0.4.11 → hud_python-0.4.13}/hud/agents/openai.py +0 -0
  85. {hud_python-0.4.11 → hud_python-0.4.13}/hud/agents/openai_chat_generic.py +0 -0
  86. {hud_python-0.4.11 → hud_python-0.4.13}/hud/agents/tests/__init__.py +0 -0
  87. {hud_python-0.4.11 → hud_python-0.4.13}/hud/agents/tests/test_base.py +0 -0
  88. {hud_python-0.4.11 → hud_python-0.4.13}/hud/agents/tests/test_claude.py +0 -0
  89. {hud_python-0.4.11 → hud_python-0.4.13}/hud/agents/tests/test_client.py +0 -0
  90. {hud_python-0.4.11 → hud_python-0.4.13}/hud/cli/__main__.py +0 -0
  91. {hud_python-0.4.11 → hud_python-0.4.13}/hud/cli/analyze.py +0 -0
  92. {hud_python-0.4.11 → hud_python-0.4.13}/hud/cli/clone.py +0 -0
  93. {hud_python-0.4.11 → hud_python-0.4.13}/hud/cli/tests/__init__.py +0 -0
  94. {hud_python-0.4.11 → hud_python-0.4.13}/hud/cli/tests/test_cli_init.py +0 -0
  95. {hud_python-0.4.11 → hud_python-0.4.13}/hud/cli/tests/test_cli_main.py +0 -0
  96. {hud_python-0.4.11 → hud_python-0.4.13}/hud/cli/tests/test_clone.py +0 -0
  97. {hud_python-0.4.11/hud/cli → hud_python-0.4.13/hud/cli/utils}/cursor.py +0 -0
  98. /hud_python-0.4.11/hud/cli/utils.py → /hud_python-0.4.13/hud/cli/utils/logging.py +0 -0
  99. {hud_python-0.4.11 → hud_python-0.4.13}/hud/clients/README.md +0 -0
  100. {hud_python-0.4.11 → hud_python-0.4.13}/hud/clients/__init__.py +0 -0
  101. {hud_python-0.4.11 → hud_python-0.4.13}/hud/clients/tests/__init__.py +0 -0
  102. {hud_python-0.4.11 → hud_python-0.4.13}/hud/clients/tests/test_client_integration.py +0 -0
  103. {hud_python-0.4.11 → hud_python-0.4.13}/hud/clients/tests/test_fastmcp.py +0 -0
  104. {hud_python-0.4.11 → hud_python-0.4.13}/hud/clients/tests/test_protocol.py +0 -0
  105. {hud_python-0.4.11 → hud_python-0.4.13}/hud/clients/utils/__init__.py +0 -0
  106. {hud_python-0.4.11 → hud_python-0.4.13}/hud/clients/utils/retry_transport.py +0 -0
  107. {hud_python-0.4.11 → hud_python-0.4.13}/hud/datasets.py +0 -0
  108. {hud_python-0.4.11 → hud_python-0.4.13}/hud/misc/__init__.py +0 -0
  109. {hud_python-0.4.11 → hud_python-0.4.13}/hud/misc/claude_plays_pokemon.py +0 -0
  110. {hud_python-0.4.11 → hud_python-0.4.13}/hud/otel/__init__.py +0 -0
  111. {hud_python-0.4.11 → hud_python-0.4.13}/hud/otel/collector.py +0 -0
  112. {hud_python-0.4.11 → hud_python-0.4.13}/hud/otel/config.py +0 -0
  113. {hud_python-0.4.11 → hud_python-0.4.13}/hud/otel/context.py +0 -0
  114. {hud_python-0.4.11 → hud_python-0.4.13}/hud/otel/exporters.py +0 -0
  115. {hud_python-0.4.11 → hud_python-0.4.13}/hud/otel/instrumentation.py +0 -0
  116. {hud_python-0.4.11 → hud_python-0.4.13}/hud/otel/processors.py +0 -0
  117. {hud_python-0.4.11 → hud_python-0.4.13}/hud/otel/tests/__init__.py +0 -0
  118. {hud_python-0.4.11 → hud_python-0.4.13}/hud/otel/tests/test_processors.py +0 -0
  119. {hud_python-0.4.11 → hud_python-0.4.13}/hud/py.typed +0 -0
  120. {hud_python-0.4.11 → hud_python-0.4.13}/hud/server/__init__.py +0 -0
  121. {hud_python-0.4.11 → hud_python-0.4.13}/hud/server/context.py +0 -0
  122. {hud_python-0.4.11 → hud_python-0.4.13}/hud/server/helper/__init__.py +0 -0
  123. {hud_python-0.4.11 → hud_python-0.4.13}/hud/server/low_level.py +0 -0
  124. {hud_python-0.4.11 → hud_python-0.4.13}/hud/server/tests/__init__.py +0 -0
  125. {hud_python-0.4.11 → hud_python-0.4.13}/hud/settings.py +0 -0
  126. {hud_python-0.4.11 → hud_python-0.4.13}/hud/shared/__init__.py +0 -0
  127. {hud_python-0.4.11 → hud_python-0.4.13}/hud/shared/requests.py +0 -0
  128. {hud_python-0.4.11 → hud_python-0.4.13}/hud/shared/tests/__init__.py +0 -0
  129. {hud_python-0.4.11 → hud_python-0.4.13}/hud/shared/tests/test_requests.py +0 -0
  130. {hud_python-0.4.11 → hud_python-0.4.13}/hud/telemetry/__init__.py +0 -0
  131. {hud_python-0.4.11 → hud_python-0.4.13}/hud/telemetry/instrument.py +0 -0
  132. {hud_python-0.4.11 → hud_python-0.4.13}/hud/telemetry/job.py +0 -0
  133. {hud_python-0.4.11 → hud_python-0.4.13}/hud/telemetry/replay.py +0 -0
  134. {hud_python-0.4.11/hud/utils → hud_python-0.4.13/hud/telemetry}/tests/__init__.py +0 -0
  135. {hud_python-0.4.11 → hud_python-0.4.13}/hud/telemetry/trace.py +0 -0
  136. {hud_python-0.4.11 → hud_python-0.4.13}/hud/tools/__init__.py +0 -0
  137. {hud_python-0.4.11 → hud_python-0.4.13}/hud/tools/bash.py +0 -0
  138. {hud_python-0.4.11 → hud_python-0.4.13}/hud/tools/computer/__init__.py +0 -0
  139. {hud_python-0.4.11 → hud_python-0.4.13}/hud/tools/computer/anthropic.py +0 -0
  140. {hud_python-0.4.11 → hud_python-0.4.13}/hud/tools/computer/openai.py +0 -0
  141. {hud_python-0.4.11 → hud_python-0.4.13}/hud/tools/computer/settings.py +0 -0
  142. {hud_python-0.4.11 → hud_python-0.4.13}/hud/tools/edit.py +0 -0
  143. {hud_python-0.4.11 → hud_python-0.4.13}/hud/tools/executors/__init__.py +0 -0
  144. {hud_python-0.4.11 → hud_python-0.4.13}/hud/tools/executors/base.py +0 -0
  145. {hud_python-0.4.11 → hud_python-0.4.13}/hud/tools/executors/pyautogui.py +0 -0
  146. {hud_python-0.4.11 → hud_python-0.4.13}/hud/tools/executors/tests/__init__.py +0 -0
  147. {hud_python-0.4.11 → hud_python-0.4.13}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
  148. {hud_python-0.4.11 → hud_python-0.4.13}/hud/tools/executors/xdo.py +0 -0
  149. {hud_python-0.4.11 → hud_python-0.4.13}/hud/tools/playwright.py +0 -0
  150. {hud_python-0.4.11 → hud_python-0.4.13}/hud/tools/tests/__init__.py +0 -0
  151. {hud_python-0.4.11 → hud_python-0.4.13}/hud/tools/tests/test_base.py +0 -0
  152. {hud_python-0.4.11 → hud_python-0.4.13}/hud/tools/tests/test_bash.py +0 -0
  153. {hud_python-0.4.11 → hud_python-0.4.13}/hud/tools/tests/test_bash_extended.py +0 -0
  154. {hud_python-0.4.11 → hud_python-0.4.13}/hud/tools/tests/test_computer.py +0 -0
  155. {hud_python-0.4.11 → hud_python-0.4.13}/hud/tools/tests/test_computer_actions.py +0 -0
  156. {hud_python-0.4.11 → hud_python-0.4.13}/hud/tools/tests/test_edit.py +0 -0
  157. {hud_python-0.4.11 → hud_python-0.4.13}/hud/tools/tests/test_init.py +0 -0
  158. {hud_python-0.4.11 → hud_python-0.4.13}/hud/tools/tests/test_playwright_tool.py +0 -0
  159. {hud_python-0.4.11 → hud_python-0.4.13}/hud/tools/tests/test_tools.py +0 -0
  160. {hud_python-0.4.11 → hud_python-0.4.13}/hud/tools/tests/test_utils.py +0 -0
  161. {hud_python-0.4.11 → hud_python-0.4.13}/hud/tools/types.py +0 -0
  162. {hud_python-0.4.11 → hud_python-0.4.13}/hud/tools/utils.py +0 -0
  163. {hud_python-0.4.11 → hud_python-0.4.13}/hud/types.py +0 -0
  164. {hud_python-0.4.11 → hud_python-0.4.13}/hud/utils/__init__.py +0 -0
  165. {hud_python-0.4.11 → hud_python-0.4.13}/hud/utils/async_utils.py +0 -0
  166. {hud_python-0.4.11 → hud_python-0.4.13}/hud/utils/progress.py +0 -0
  167. {hud_python-0.4.11 → hud_python-0.4.13}/hud/utils/telemetry.py +0 -0
  168. {hud_python-0.4.11 → hud_python-0.4.13}/hud/utils/tests/test_async_utils.py +0 -0
  169. {hud_python-0.4.11 → hud_python-0.4.13}/hud/utils/tests/test_init.py +0 -0
  170. {hud_python-0.4.11 → hud_python-0.4.13}/hud/utils/tests/test_progress.py +0 -0
  171. {hud_python-0.4.11 → hud_python-0.4.13}/hud/utils/tests/test_telemetry.py +0 -0
  172. {hud_python-0.4.11 → hud_python-0.4.13}/rl/README.md +0 -0
  173. {hud_python-0.4.11 → hud_python-0.4.13}/rl/pyproject.toml +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.4.11
3
+ Version: 0.4.13
4
4
  Summary: SDK for the HUD platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-python
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -35,10 +35,9 @@ Classifier: Programming Language :: Python :: 3.11
35
35
  Classifier: Programming Language :: Python :: 3.12
36
36
  Classifier: Programming Language :: Python :: 3.13
37
37
  Requires-Python: <3.14,>=3.11
38
- Requires-Dist: fastmcp>=2.11.2
39
38
  Requires-Dist: httpx<1,>=0.23.0
40
- Requires-Dist: hud-mcp-python-sdk>=0.1.0
41
- Requires-Dist: mcp>=1.13.1
39
+ Requires-Dist: hud-fastmcp-python-sdk>=0.1.2
40
+ Requires-Dist: hud-mcp-python-sdk>=3.13.2
42
41
  Requires-Dist: opentelemetry-api>=1.34.1
43
42
  Requires-Dist: opentelemetry-exporter-otlp-proto-http>=1.34.1
44
43
  Requires-Dist: opentelemetry-instrumentation-mcp>=0.44.1
@@ -56,7 +55,11 @@ Provides-Extra: agent
56
55
  Requires-Dist: anthropic; extra == 'agent'
57
56
  Requires-Dist: datasets>=2.14.0; extra == 'agent'
58
57
  Requires-Dist: dotenv>=0.9.9; extra == 'agent'
59
- Requires-Dist: hud-mcp-use-python-sdk>=0.1.0; extra == 'agent'
58
+ Requires-Dist: hud-mcp-use-python-sdk>=2.3.13; extra == 'agent'
59
+ Requires-Dist: ipykernel; extra == 'agent'
60
+ Requires-Dist: ipython<9; extra == 'agent'
61
+ Requires-Dist: jupyter-client; extra == 'agent'
62
+ Requires-Dist: jupyter-core; extra == 'agent'
60
63
  Requires-Dist: langchain; extra == 'agent'
61
64
  Requires-Dist: langchain-anthropic; extra == 'agent'
62
65
  Requires-Dist: langchain-openai; extra == 'agent'
@@ -66,7 +69,11 @@ Provides-Extra: agents
66
69
  Requires-Dist: anthropic; extra == 'agents'
67
70
  Requires-Dist: datasets>=2.14.0; extra == 'agents'
68
71
  Requires-Dist: dotenv>=0.9.9; extra == 'agents'
69
- Requires-Dist: hud-mcp-use-python-sdk>=0.1.0; extra == 'agents'
72
+ Requires-Dist: hud-mcp-use-python-sdk>=2.3.13; extra == 'agents'
73
+ Requires-Dist: ipykernel; extra == 'agents'
74
+ Requires-Dist: ipython<9; extra == 'agents'
75
+ Requires-Dist: jupyter-client; extra == 'agents'
76
+ Requires-Dist: jupyter-core; extra == 'agents'
70
77
  Requires-Dist: langchain; extra == 'agents'
71
78
  Requires-Dist: langchain-anthropic; extra == 'agents'
72
79
  Requires-Dist: langchain-openai; extra == 'agents'
@@ -77,7 +84,7 @@ Requires-Dist: aiodocker>=0.24.0; extra == 'dev'
77
84
  Requires-Dist: anthropic; extra == 'dev'
78
85
  Requires-Dist: datasets>=2.14.0; extra == 'dev'
79
86
  Requires-Dist: dotenv>=0.9.9; extra == 'dev'
80
- Requires-Dist: hud-mcp-use-python-sdk>=0.1.0; extra == 'dev'
87
+ Requires-Dist: hud-mcp-use-python-sdk>=2.3.13; extra == 'dev'
81
88
  Requires-Dist: inspect-ai>=0.3.80; extra == 'dev'
82
89
  Requires-Dist: ipykernel; extra == 'dev'
83
90
  Requires-Dist: ipython<9; extra == 'dev'
@@ -233,7 +240,7 @@ Any hud MCP environment and evaluation works with our RL pipeline. Even our remo
233
240
 
234
241
  This is Claude Computer Use running on our proprietary financial analyst benchmark [SheetBench-50](https://huggingface.co/datasets/hud-evals/SheetBench-50):
235
242
 
236
- ![Trace screenshot](https://raw.githubusercontent.com/hud-evals/hud-python/l/text-2048/docs/src/images/trace_sheet.gif)
243
+ ![Trace screenshot](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/trace_sheet.gif)
237
244
 
238
245
  > [See this trace on _app.hud.so_](https://app.hud.so/trace/9e212e9e-3627-4f1f-9eb5-c6d03c59070a)
239
246
 
@@ -385,7 +392,7 @@ result = await ClaudeAgent().run({ # See all agents: https://docs.hud.so/refere
385
392
 
386
393
  All leaderboards are publicly available on [app.hud.so/leaderboards](https://app.hud.so/leaderboards) (see [docs](https://docs.hud.so/evaluate-agents/leaderboards))
387
394
 
388
- ![Leaderboard](https://raw.githubusercontent.com/hud-evals/hud-python/l/text-2048/docs/src/images/leaderboards_2.png)
395
+ ![Leaderboard](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/leaderboards_3.png)
389
396
 
390
397
  We highly suggest running 3-5 evaluations per dataset for the most consistent results across multiple jobs.
391
398
 
@@ -430,10 +437,6 @@ graph LR
430
437
  Trace --> Dashboard
431
438
  AnyMCP -->|"MCP"| API
432
439
 
433
- style Dashboard fill:#e0e7ff,stroke:#6366f1,stroke-width:2px
434
- style SDK fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
435
- style RemoteEnv fill:#d1fae5,stroke:#10b981,stroke-width:2px
436
- style AnyMCP fill:#fce7f3,stroke:#ec4899,stroke-width:2px,stroke-dasharray: 5 5
437
440
  ```
438
441
 
439
442
  ## CLI reference
@@ -130,7 +130,7 @@ Any hud MCP environment and evaluation works with our RL pipeline. Even our remo
130
130
 
131
131
  This is Claude Computer Use running on our proprietary financial analyst benchmark [SheetBench-50](https://huggingface.co/datasets/hud-evals/SheetBench-50):
132
132
 
133
- ![Trace screenshot](https://raw.githubusercontent.com/hud-evals/hud-python/l/text-2048/docs/src/images/trace_sheet.gif)
133
+ ![Trace screenshot](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/trace_sheet.gif)
134
134
 
135
135
  > [See this trace on _app.hud.so_](https://app.hud.so/trace/9e212e9e-3627-4f1f-9eb5-c6d03c59070a)
136
136
 
@@ -282,7 +282,7 @@ result = await ClaudeAgent().run({ # See all agents: https://docs.hud.so/refere
282
282
 
283
283
  All leaderboards are publicly available on [app.hud.so/leaderboards](https://app.hud.so/leaderboards) (see [docs](https://docs.hud.so/evaluate-agents/leaderboards))
284
284
 
285
- ![Leaderboard](https://raw.githubusercontent.com/hud-evals/hud-python/l/text-2048/docs/src/images/leaderboards_2.png)
285
+ ![Leaderboard](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/leaderboards_3.png)
286
286
 
287
287
  We highly suggest running 3-5 evaluations per dataset for the most consistent results across multiple jobs.
288
288
 
@@ -327,10 +327,6 @@ graph LR
327
327
  Trace --> Dashboard
328
328
  AnyMCP -->|"MCP"| API
329
329
 
330
- style Dashboard fill:#e0e7ff,stroke:#6366f1,stroke-width:2px
331
- style SDK fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
332
- style RemoteEnv fill:#d1fae5,stroke:#10b981,stroke-width:2px
333
- style AnyMCP fill:#fce7f3,stroke:#ec4899,stroke-width:2px,stroke-dasharray: 5 5
334
330
  ```
335
331
 
336
332
  ## CLI reference
@@ -351,7 +351,7 @@ from . import basic, advanced # This registers all @setup.tool() decorated func
351
351
 
352
352
  # In setup/basic.py
353
353
  from . import setup
354
- from hud.tools.types import SetupResult
354
+ from mcp.types import TextContent
355
355
 
356
356
  @setup.tool()
357
357
  async def reset(**kwargs):
@@ -361,14 +361,14 @@ async def reset(**kwargs):
361
361
  **kwargs: Additional parameters
362
362
 
363
363
  Returns:
364
- SetupResult
364
+ TextContent
365
365
  """
366
366
  # Access environment from the hub
367
367
  env = setup.env
368
368
  await env.reset_state()
369
- return SetupResult(
370
- content="Environment reset to initial state",
371
- info={"status": "success"}
369
+ return TextContent(
370
+ text="Environment reset to initial state",
371
+ type="text"
372
372
  )
373
373
 
374
374
  @setup.tool()
@@ -379,14 +379,14 @@ async def seed_data(num_items: int = 5):
379
379
  num_items: Number of items to create
380
380
 
381
381
  Returns:
382
- SetupResult
382
+ TextContent
383
383
  """
384
384
  # Access environment from the hub
385
385
  env = setup.env
386
386
  items = await env.create_items(num_items)
387
- return SetupResult(
388
- content=f"Created {len(items)} items",
389
- info={"items_created": len(items)}
387
+ return TextContent(
388
+ text=f"Created {len(items)} items",
389
+ type="text"
390
390
  )
391
391
 
392
392
  # In evaluate/__init__.py
@@ -735,7 +735,7 @@ See the `browser` environment for a complete production example of this pattern.
735
735
 
736
736
  ### 4. Cursor rules – paste this once
737
737
 
738
- Inside `.cursor/rules/hud_environment_iteration.mdc` add (or verify) the following so the agent always knows the expected iteration loop:
738
+ Inside `.cursor/rules/mcp_environment_iteration.mdc` add (or verify) the following so the agent always knows the expected iteration loop:
739
739
 
740
740
  ```mdc
741
741
  ---
@@ -743,7 +743,7 @@ description: Improve an MCP environment
743
743
  alwaysApply: false
744
744
  ---
745
745
  Setup
746
- 1. Make sure the user has started the development server with `hud dev --build` and that you can connect to the environment through the provided HTTP endpoint. Check that you have access to the environment's tools.
746
+ 1. Make sure the user has set up the MCP config for the environment by checking that you have access to the tools under the given name (e.g. my-environment-dev), and make sure the title is in dev mode. If not, ask the user to make a dev version!
747
747
  2. Make sure you can find the source folder for this environment. Explore its contents and README.
748
748
  3. Clarify the objectives and ask follow up questions on the initial query to determine precise implementation details.
749
749
 
@@ -760,7 +760,7 @@ Iteration
760
760
  Context: In the my-environment folder, I have a browser app environment. I've built a tool to interact with it called my-environment-dev.
761
761
  Interaction: There are multiple tools to setup and evaluate the environment. There are also interaction tools for you to be able to move around it, and a screenshot tool to see the state. Use all of the available tools.
762
762
  Objective: Please test if all setup, evaluation functions are working. This means you should come up with new problem definitions to test all functionality on. Be creative in how you pick edge cases to test on.
763
- Rules: @hud_environment_iteration.mdc
763
+ Rules: @mcp_environment_iteration.mdc
764
764
  ```
765
765
 
766
766
  ---
@@ -827,13 +827,13 @@ Before making changes:
827
827
  ```python
828
828
  # In setup/my_new_setup.py
829
829
  from . import setup
830
- from hud.tools import BaseSetup, SetupResult
830
+ from hud.tools import BaseSetup, TextContent
831
831
 
832
832
  @setup("my_new_setup", description="Clear description of what this does")
833
833
  class MyNewSetup(BaseSetup):
834
- async def __call__(self, context, param1: str, param2: int = 10) -> SetupResult:
834
+ async def __call__(self, context, param1: str, param2: int = 10) -> TextContent:
835
835
  # Implementation
836
- return {"status": "success", "details": "..."}
836
+ return TextContent(...)
837
837
  ```
838
838
 
839
839
  **Adding New Evaluators**
@@ -0,0 +1,213 @@
1
+ # Browser Environment
2
+
3
+ A browser automation environment for HUD that provides GUI access and web app interaction capabilities. This environment supports hot-reloading during development while maintaining persistent state.
4
+
5
+ ## Architecture Overview
6
+
7
+ The browser environment uses a two-process architecture:
8
+
9
+ 1. **Context Server** (`context.py`): Long-running process that maintains persistent state
10
+ 2. **MCP Server** (`server.py`): Hot-reloadable process that handles tool requests
11
+
12
+ ### Key Components
13
+
14
+ - **BrowserContext**: Stores persistent state (running apps, ports, playwright instance)
15
+ - **ServiceManager**: Manages X11, VNC, and app processes
16
+ - **BaseHub Tools**: Setup and evaluate tools organized by app (2048, todo)
17
+ - **Multiprocessing Proxy**: Enables state sharing between processes
18
+
19
+ ## Context Management and Common Pitfalls
20
+
21
+ ### Understanding the Proxy System
22
+
23
+ The browser environment uses Python's `multiprocessing.Manager` to share state between the context server and MCP server. This introduces important constraints:
24
+
25
+ #### ❌ Common Pitfall: Unpicklable Objects
26
+
27
+ ```python
28
+ # BAD: This will fail with "cannot pickle 'coroutine' object"
29
+ @setup.tool("my_tool")
30
+ async def my_tool():
31
+ env = setup.env
32
+ result = await env.call_app_api("app", "/api/endpoint") # Returns coroutine
33
+ # The coroutine can't be serialized through the proxy!
34
+ ```
35
+
36
+ #### ✅ Solution: Direct HTTP Calls
37
+
38
+ ```python
39
+ # GOOD: Make HTTP calls directly
40
+ @setup.tool("my_tool")
41
+ async def my_tool():
42
+ import httpx
43
+
44
+ # Get the backend port from persistent context
45
+ persistent_ctx = setup.env
46
+ backend_port = persistent_ctx.get_app_backend_port("app")
47
+
48
+ # Make API call directly
49
+ url = f"http://localhost:{backend_port}/api/endpoint"
50
+ async with httpx.AsyncClient() as client:
51
+ response = await client.get(url)
52
+ response.raise_for_status()
53
+ result = response.json()
54
+ ```
55
+
56
+ ### State Synchronization Issues
57
+
58
+ #### ❌ Common Pitfall: Direct List/Dict Manipulation
59
+
60
+ ```python
61
+ # BAD: Regular Python lists don't sync through proxy
62
+ class ServiceManager:
63
+ def __init__(self):
64
+ self._launched_apps = [] # Won't sync!
65
+ ```
66
+
67
+ #### ✅ Solution: Store State in Persistent Context
68
+
69
+ ```python
70
+ # GOOD: Use the persistent context for shared state
71
+ class BrowserContext:
72
+ def __init__(self):
73
+ self._running_apps: List[str] = []
74
+ self._app_ports: Dict[str, Dict[str, int]] = {}
75
+
76
+ def add_running_app(self, app_name: str) -> None:
77
+ """Add app to running list."""
78
+ if app_name not in self._running_apps:
79
+ self._running_apps.append(app_name)
80
+ ```
81
+
82
+ ### Accessing Shared Resources
83
+
84
+ #### ❌ Common Pitfall: Direct Attribute Access
85
+
86
+ ```python
87
+ # BAD: Direct attribute access on proxy objects
88
+ playwright_tool = env.playwright # May not work with proxy
89
+ ```
90
+
91
+ #### ✅ Solution: Use Getter Methods
92
+
93
+ ```python
94
+ # GOOD: Use proxy-friendly getter methods
95
+ playwright_tool = persistent_ctx.get_playwright_tool()
96
+ ```
97
+
98
+ ## Best Practices
99
+
100
+ ### 1. Tool Implementation Pattern
101
+
102
+ All setup and evaluate tools should follow this pattern:
103
+
104
+ ```python
105
+ @setup.tool("tool_name")
106
+ async def tool_name(param1: type, param2: type):
107
+ """Tool description."""
108
+ try:
109
+ # Get persistent context
110
+ persistent_ctx = setup.env # or evaluate.env
111
+
112
+ # Get app ports
113
+ backend_port = persistent_ctx.get_app_backend_port("app_name")
114
+
115
+ # Make HTTP request
116
+ url = f"http://localhost:{backend_port}/api/endpoint"
117
+ async with httpx.AsyncClient() as client:
118
+ response = await client.method(url, json=data)
119
+ response.raise_for_status()
120
+ result = response.json()
121
+
122
+ # Return result
123
+ return TextContent(
124
+ text=f"Success message",
125
+ type="text"
126
+ )
127
+ except Exception as e:
128
+ logger.error(f"tool_name failed: {e}")
129
+ return TextContent(
130
+ text=f"Failed: {str(e)}",
131
+ type="text"
132
+ )
133
+ ```
134
+
135
+ ### 2. App Launch Pattern
136
+
137
+ When launching apps, ensure ports are stored in the persistent context:
138
+
139
+ ```python
140
+ # In launch_app tool
141
+ app_info = await service_manager.launch_app(app_name)
142
+
143
+ # Store ports in persistent context for later access
144
+ try:
145
+ backend_port = service_manager.get_app_port(app_name)
146
+ frontend_port = service_manager.get_app_frontend_port(app_name)
147
+ persistent_ctx.set_app_ports(app_name, frontend_port, backend_port)
148
+ except Exception as e:
149
+ logger.error(f"Failed to store ports: {e}")
150
+
151
+ # Track app in persistent context
152
+ persistent_ctx.add_running_app(app_name)
153
+ ```
154
+
155
+ ### 3. Import Organization
156
+
157
+ Keep imports at module level:
158
+
159
+ ```python
160
+ # At top of file
161
+ import logging
162
+ import httpx
163
+ from mcp.types import TextContent
164
+ from . import setup
165
+
166
+ # Not inside functions
167
+ ```
168
+
169
+ ## Troubleshooting
170
+
171
+ ### "Cannot pickle 'coroutine' object"
172
+
173
+ **Cause**: Calling an async method on a proxied object produces a coroutine, which cannot be pickled and sent through the proxy.
174
+
175
+ **Fix**: Don't use async methods on proxied objects. Make direct HTTP calls instead.
176
+
177
+ ### "App not launched" errors
178
+
179
+ **Cause**: State synchronization issue between ServiceManager and persistent context.
180
+
181
+ **Fix**: Ensure that `launch_app` stores app info in the persistent context, and that setup/evaluate tools check the persistent context's app list.
182
+
183
+ ### "Object has no attribute" on proxy objects
184
+
185
+ **Cause**: Direct attribute access on multiprocessing proxy objects.
186
+
187
+ **Fix**: Use getter/setter methods instead of direct attribute access.
188
+
189
+ ## Development Workflow
190
+
191
+ 1. **Start the environment**: `hud dev`
192
+ 2. **Make changes**: Edit tools in `src/hud_controller/`
193
+ 3. **Test immediately**: The MCP server hot-reloads automatically
194
+ 4. **Check logs**: Look for serialization or proxy errors
195
+
196
+ ## Adding New Apps
197
+
198
+ 1. Create app directory in `apps/`
199
+ 2. Add setup tools in `src/hud_controller/setup/app_name.py`
200
+ 3. Add evaluate tools in `src/hud_controller/evaluate/app_name.py`
201
+ 4. Follow the HTTP pattern - no `call_app_api` usage
202
+ 5. Store app ports in persistent context when launching
203
+
204
+ ## Key Files
205
+
206
+ - `context.py`: Persistent state management
207
+ - `server.py`: MCP server and tool definitions
208
+ - `services.py`: Process management for X11, VNC, apps
209
+ - `setup/`: Setup tools organized by app
210
+ - `evaluate/`: Evaluation tools organized by app
211
+
212
+ Remember: When in doubt, make direct HTTP calls and store state in the persistent context!
213
+
@@ -52,10 +52,13 @@ hud dev . --build
52
52
  # - Provide HTTP endpoint for Cursor
53
53
  # - Auto-restart on file changes
54
54
  # - Pass through environment variables
55
+ # - Keep browser sessions alive across restarts
55
56
  ```
56
57
 
57
58
  Add the URL from output to Cursor or click the deeplink.
58
59
 
60
+ **Note**: With hot-reload enabled, your browser session persists across code changes. This means you can modify your code and the server will restart automatically without losing your browser state, tabs, or navigation history.
61
+
59
62
  #### Option 2: Manual Docker Run
60
63
 
61
64
  For direct control over the development environment:
@@ -3,25 +3,20 @@ name = "hud-remote-browser"
3
3
  version = "0.1.0"
4
4
  description = "HUD Remote Browser Controller with MCP tools for cloud browser providers"
5
5
  requires-python = ">=3.11,<3.13"
6
- dependencies = [
7
- "hud-python @ git+https://github.com/hud-evals/hud-python.git@l/text-2048",
8
- "pyautogui",
9
- "playwright",
10
- "httpx",
11
- "typer",
12
- "google-api-python-client",
13
- "google-auth",
14
- ]
15
-
16
- [project.scripts]
17
- hud-remote-browser = "hud_controller.__main__:main"
6
+ dependencies = [ "hud-python>=0.4.12", "pyautogui", "playwright", "httpx", "typer", "google-api-python-client", "google-auth",]
18
7
 
19
8
  [build-system]
20
- requires = ["hatchling"]
9
+ requires = [ "hatchling",]
21
10
  build-backend = "hatchling.build"
22
11
 
23
- [tool.hatch.build.targets.wheel]
24
- packages = ["src/hud_controller"]
12
+ [project.scripts]
13
+ hud-remote-browser = "hud_controller.__main__:main"
14
+
15
+ [tool.hud]
16
+ image = "hud-remote-browser:dev"
25
17
 
26
18
  [tool.hatch.metadata]
27
- allow-direct-references = true
19
+ allow-direct-references = true
20
+
21
+ [tool.hatch.build.targets.wheel]
22
+ packages = [ "src/hud_controller",]
@@ -0,0 +1,8 @@
1
+ """Allow running CLI with python -m hud."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from hud.cli import main
6
+
7
+ if __name__ == "__main__":
8
+ main()
@@ -306,7 +306,7 @@ class MCPAgent(ABC):
306
306
  if decision == "STOP":
307
307
  # Try to submit response through lifecycle tool
308
308
  await self._maybe_submit_response(response, messages)
309
-
309
+
310
310
  logger.info("Stopping execution")
311
311
  final_response = response
312
312
  break
@@ -487,7 +487,7 @@ class MCPAgent(ABC):
487
487
  self._available_tools.append(tool)
488
488
  # Simplified mapping - just tool name to tool
489
489
  self._tool_map[tool.name] = tool
490
-
490
+
491
491
  # Auto-detect response tool as a lifecycle tool
492
492
  if tool.name == "response" and "response" not in self.lifecycle_tools:
493
493
  logger.debug("Auto-detected 'response' tool as a lifecycle tool")
@@ -495,7 +495,7 @@ class MCPAgent(ABC):
495
495
 
496
496
  async def _maybe_submit_response(self, response: AgentResponse, messages: list[Any]) -> None:
497
497
  """Submit response through lifecycle tool if available.
498
-
498
+
499
499
  Args:
500
500
  response: The agent's response
501
501
  messages: The current message history (will be modified in-place)
@@ -506,17 +506,16 @@ class MCPAgent(ABC):
506
506
  try:
507
507
  # Call the response tool with the agent's response
508
508
  response_tool_call = MCPToolCall(
509
- name="response",
510
- arguments={"response": response.content, "messages": messages}
509
+ name="response", arguments={"response": response.content, "messages": messages}
511
510
  )
512
511
  response_results = await self.call_tools(response_tool_call)
513
-
512
+
514
513
  # Format and add the response tool results to messages
515
514
  response_messages = await self.format_tool_results(
516
515
  [response_tool_call], response_results
517
516
  )
518
517
  messages.extend(response_messages)
519
-
518
+
520
519
  # Mark the task as done
521
520
  logger.info("Response lifecycle tool executed, marking task as done")
522
521
  except Exception as e:
@@ -579,7 +578,7 @@ class MCPAgent(ABC):
579
578
  logger.warning("Failed to close auto-created trace: %s", e)
580
579
  finally:
581
580
  self._auto_trace_cm = None
582
-
581
+
583
582
  # Clean up auto-created client
584
583
  if self._auto_created_client and self.mcp_client:
585
584
  try:
@@ -15,10 +15,10 @@ import hud
15
15
  if TYPE_CHECKING:
16
16
  from langchain.schema.language_model import BaseLanguageModel
17
17
  from langchain_core.tools import BaseTool
18
- from mcp_use.adapters.langchain_adapter import LangChainAdapter
18
+ from mcp_use.adapters.langchain_adapter import LangChainAdapter # type: ignore[attr-defined]
19
19
 
20
20
  try:
21
- from mcp_use.adapters.langchain_adapter import LangChainAdapter
21
+ from mcp_use.adapters.langchain_adapter import LangChainAdapter # type: ignore[attr-defined]
22
22
  except ImportError:
23
23
  LangChainAdapter = None # type: ignore[misc, assignment]
24
24
 
@@ -17,7 +17,9 @@ class TestOperatorAgent:
17
17
  @pytest.fixture
18
18
  def mock_mcp_client(self):
19
19
  """Create a mock MCP client."""
20
- mcp_client = MagicMock()
20
+ mcp_client = AsyncMock()
21
+ # Set up the mcp_config attribute as a regular dict, not a coroutine
22
+ mcp_client.mcp_config = {"test_server": {"url": "http://test"}}
21
23
  return mcp_client
22
24
 
23
25
  @pytest.fixture