hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (282) hide show
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +70 -5
  3. hud/agents/base.py +238 -500
  4. hud/agents/claude.py +236 -247
  5. hud/agents/gateway.py +42 -0
  6. hud/agents/gemini.py +264 -0
  7. hud/agents/gemini_cua.py +324 -0
  8. hud/agents/grounded_openai.py +98 -100
  9. hud/agents/misc/integration_test_agent.py +51 -20
  10. hud/agents/misc/response_agent.py +48 -36
  11. hud/agents/openai.py +282 -296
  12. hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
  13. hud/agents/operator.py +199 -0
  14. hud/agents/resolver.py +70 -0
  15. hud/agents/tests/conftest.py +133 -0
  16. hud/agents/tests/test_base.py +300 -622
  17. hud/agents/tests/test_base_runtime.py +233 -0
  18. hud/agents/tests/test_claude.py +381 -214
  19. hud/agents/tests/test_client.py +9 -10
  20. hud/agents/tests/test_gemini.py +369 -0
  21. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  22. hud/agents/tests/test_openai.py +377 -140
  23. hud/agents/tests/test_operator.py +362 -0
  24. hud/agents/tests/test_resolver.py +192 -0
  25. hud/agents/tests/test_run_eval.py +179 -0
  26. hud/agents/types.py +148 -0
  27. hud/cli/__init__.py +493 -546
  28. hud/cli/analyze.py +43 -5
  29. hud/cli/build.py +699 -113
  30. hud/cli/debug.py +8 -5
  31. hud/cli/dev.py +889 -732
  32. hud/cli/eval.py +793 -667
  33. hud/cli/flows/dev.py +167 -0
  34. hud/cli/flows/init.py +191 -0
  35. hud/cli/flows/tasks.py +153 -56
  36. hud/cli/flows/templates.py +151 -0
  37. hud/cli/flows/tests/__init__.py +1 -0
  38. hud/cli/flows/tests/test_dev.py +126 -0
  39. hud/cli/init.py +60 -58
  40. hud/cli/pull.py +1 -1
  41. hud/cli/push.py +38 -13
  42. hud/cli/rft.py +311 -0
  43. hud/cli/rft_status.py +145 -0
  44. hud/cli/tests/test_analyze.py +5 -5
  45. hud/cli/tests/test_analyze_metadata.py +3 -2
  46. hud/cli/tests/test_analyze_module.py +120 -0
  47. hud/cli/tests/test_build.py +110 -8
  48. hud/cli/tests/test_build_failure.py +41 -0
  49. hud/cli/tests/test_build_module.py +50 -0
  50. hud/cli/tests/test_cli_init.py +6 -1
  51. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  52. hud/cli/tests/test_cli_root.py +140 -0
  53. hud/cli/tests/test_convert.py +361 -0
  54. hud/cli/tests/test_debug.py +12 -10
  55. hud/cli/tests/test_dev.py +197 -0
  56. hud/cli/tests/test_eval.py +251 -0
  57. hud/cli/tests/test_eval_bedrock.py +51 -0
  58. hud/cli/tests/test_init.py +124 -0
  59. hud/cli/tests/test_main_module.py +11 -5
  60. hud/cli/tests/test_mcp_server.py +12 -100
  61. hud/cli/tests/test_push.py +1 -1
  62. hud/cli/tests/test_push_happy.py +74 -0
  63. hud/cli/tests/test_push_wrapper.py +23 -0
  64. hud/cli/tests/test_registry.py +1 -1
  65. hud/cli/tests/test_utils.py +1 -1
  66. hud/cli/{rl → utils}/celebrate.py +14 -12
  67. hud/cli/utils/config.py +18 -1
  68. hud/cli/utils/docker.py +130 -4
  69. hud/cli/utils/env_check.py +9 -9
  70. hud/cli/utils/git.py +136 -0
  71. hud/cli/utils/interactive.py +39 -5
  72. hud/cli/utils/metadata.py +70 -1
  73. hud/cli/utils/runner.py +1 -1
  74. hud/cli/utils/server.py +2 -2
  75. hud/cli/utils/source_hash.py +3 -3
  76. hud/cli/utils/tasks.py +4 -1
  77. hud/cli/utils/tests/__init__.py +0 -0
  78. hud/cli/utils/tests/test_config.py +58 -0
  79. hud/cli/utils/tests/test_docker.py +93 -0
  80. hud/cli/utils/tests/test_docker_hints.py +71 -0
  81. hud/cli/utils/tests/test_env_check.py +74 -0
  82. hud/cli/utils/tests/test_environment.py +42 -0
  83. hud/cli/utils/tests/test_git.py +142 -0
  84. hud/cli/utils/tests/test_interactive_module.py +60 -0
  85. hud/cli/utils/tests/test_local_runner.py +50 -0
  86. hud/cli/utils/tests/test_logging_utils.py +23 -0
  87. hud/cli/utils/tests/test_metadata.py +49 -0
  88. hud/cli/utils/tests/test_package_runner.py +35 -0
  89. hud/cli/utils/tests/test_registry_utils.py +49 -0
  90. hud/cli/utils/tests/test_remote_runner.py +25 -0
  91. hud/cli/utils/tests/test_runner_modules.py +52 -0
  92. hud/cli/utils/tests/test_source_hash.py +36 -0
  93. hud/cli/utils/tests/test_tasks.py +80 -0
  94. hud/cli/utils/version_check.py +258 -0
  95. hud/cli/{rl → utils}/viewer.py +2 -2
  96. hud/clients/README.md +12 -11
  97. hud/clients/__init__.py +4 -3
  98. hud/clients/base.py +166 -26
  99. hud/clients/environment.py +51 -0
  100. hud/clients/fastmcp.py +13 -6
  101. hud/clients/mcp_use.py +45 -15
  102. hud/clients/tests/test_analyze_scenarios.py +206 -0
  103. hud/clients/tests/test_protocol.py +9 -3
  104. hud/datasets/__init__.py +23 -20
  105. hud/datasets/loader.py +326 -0
  106. hud/datasets/runner.py +198 -105
  107. hud/datasets/tests/__init__.py +0 -0
  108. hud/datasets/tests/test_loader.py +221 -0
  109. hud/datasets/tests/test_utils.py +315 -0
  110. hud/datasets/utils.py +270 -90
  111. hud/environment/__init__.py +52 -0
  112. hud/environment/connection.py +258 -0
  113. hud/environment/connectors/__init__.py +33 -0
  114. hud/environment/connectors/base.py +68 -0
  115. hud/environment/connectors/local.py +177 -0
  116. hud/environment/connectors/mcp_config.py +137 -0
  117. hud/environment/connectors/openai.py +101 -0
  118. hud/environment/connectors/remote.py +172 -0
  119. hud/environment/environment.py +835 -0
  120. hud/environment/integrations/__init__.py +45 -0
  121. hud/environment/integrations/adk.py +67 -0
  122. hud/environment/integrations/anthropic.py +196 -0
  123. hud/environment/integrations/gemini.py +92 -0
  124. hud/environment/integrations/langchain.py +82 -0
  125. hud/environment/integrations/llamaindex.py +68 -0
  126. hud/environment/integrations/openai.py +238 -0
  127. hud/environment/mock.py +306 -0
  128. hud/environment/router.py +263 -0
  129. hud/environment/scenarios.py +620 -0
  130. hud/environment/tests/__init__.py +1 -0
  131. hud/environment/tests/test_connection.py +317 -0
  132. hud/environment/tests/test_connectors.py +205 -0
  133. hud/environment/tests/test_environment.py +593 -0
  134. hud/environment/tests/test_integrations.py +257 -0
  135. hud/environment/tests/test_local_connectors.py +242 -0
  136. hud/environment/tests/test_scenarios.py +1086 -0
  137. hud/environment/tests/test_tools.py +208 -0
  138. hud/environment/types.py +23 -0
  139. hud/environment/utils/__init__.py +35 -0
  140. hud/environment/utils/formats.py +215 -0
  141. hud/environment/utils/schema.py +171 -0
  142. hud/environment/utils/tool_wrappers.py +113 -0
  143. hud/eval/__init__.py +67 -0
  144. hud/eval/context.py +727 -0
  145. hud/eval/display.py +299 -0
  146. hud/eval/instrument.py +187 -0
  147. hud/eval/manager.py +533 -0
  148. hud/eval/parallel.py +268 -0
  149. hud/eval/task.py +372 -0
  150. hud/eval/tests/__init__.py +1 -0
  151. hud/eval/tests/test_context.py +178 -0
  152. hud/eval/tests/test_eval.py +210 -0
  153. hud/eval/tests/test_manager.py +152 -0
  154. hud/eval/tests/test_parallel.py +168 -0
  155. hud/eval/tests/test_task.py +291 -0
  156. hud/eval/types.py +65 -0
  157. hud/eval/utils.py +194 -0
  158. hud/patches/__init__.py +19 -0
  159. hud/patches/mcp_patches.py +308 -0
  160. hud/patches/warnings.py +54 -0
  161. hud/samples/browser.py +4 -4
  162. hud/server/__init__.py +2 -1
  163. hud/server/low_level.py +2 -1
  164. hud/server/router.py +164 -0
  165. hud/server/server.py +567 -80
  166. hud/server/tests/test_mcp_server_integration.py +11 -11
  167. hud/server/tests/test_mcp_server_more.py +1 -1
  168. hud/server/tests/test_server_extra.py +2 -0
  169. hud/settings.py +45 -3
  170. hud/shared/exceptions.py +36 -10
  171. hud/shared/hints.py +26 -1
  172. hud/shared/requests.py +15 -3
  173. hud/shared/tests/test_exceptions.py +40 -31
  174. hud/shared/tests/test_hints.py +167 -0
  175. hud/telemetry/__init__.py +20 -19
  176. hud/telemetry/exporter.py +201 -0
  177. hud/telemetry/instrument.py +165 -253
  178. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  179. hud/telemetry/tests/test_exporter.py +258 -0
  180. hud/telemetry/tests/test_instrument.py +401 -0
  181. hud/tools/__init__.py +18 -2
  182. hud/tools/agent.py +223 -0
  183. hud/tools/apply_patch.py +639 -0
  184. hud/tools/base.py +54 -4
  185. hud/tools/bash.py +2 -2
  186. hud/tools/computer/__init__.py +36 -3
  187. hud/tools/computer/anthropic.py +2 -2
  188. hud/tools/computer/gemini.py +385 -0
  189. hud/tools/computer/hud.py +23 -6
  190. hud/tools/computer/openai.py +20 -21
  191. hud/tools/computer/qwen.py +434 -0
  192. hud/tools/computer/settings.py +37 -0
  193. hud/tools/edit.py +3 -7
  194. hud/tools/executors/base.py +4 -2
  195. hud/tools/executors/pyautogui.py +1 -1
  196. hud/tools/grounding/grounded_tool.py +13 -18
  197. hud/tools/grounding/grounder.py +10 -31
  198. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  199. hud/tools/jupyter.py +330 -0
  200. hud/tools/playwright.py +18 -3
  201. hud/tools/shell.py +308 -0
  202. hud/tools/tests/test_agent_tool.py +355 -0
  203. hud/tools/tests/test_apply_patch.py +718 -0
  204. hud/tools/tests/test_computer.py +4 -9
  205. hud/tools/tests/test_computer_actions.py +24 -2
  206. hud/tools/tests/test_jupyter_tool.py +181 -0
  207. hud/tools/tests/test_shell.py +596 -0
  208. hud/tools/tests/test_submit.py +85 -0
  209. hud/tools/tests/test_types.py +193 -0
  210. hud/tools/types.py +21 -1
  211. hud/types.py +194 -56
  212. hud/utils/__init__.py +2 -0
  213. hud/utils/env.py +67 -0
  214. hud/utils/hud_console.py +89 -18
  215. hud/utils/mcp.py +15 -58
  216. hud/utils/strict_schema.py +162 -0
  217. hud/utils/tests/test_init.py +1 -2
  218. hud/utils/tests/test_mcp.py +1 -28
  219. hud/utils/tests/test_pretty_errors.py +186 -0
  220. hud/utils/tests/test_tool_shorthand.py +154 -0
  221. hud/utils/tests/test_version.py +1 -1
  222. hud/utils/types.py +20 -0
  223. hud/version.py +1 -1
  224. hud_python-0.5.13.dist-info/METADATA +264 -0
  225. hud_python-0.5.13.dist-info/RECORD +305 -0
  226. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
  227. hud/agents/langchain.py +0 -261
  228. hud/agents/lite_llm.py +0 -72
  229. hud/cli/rl/__init__.py +0 -180
  230. hud/cli/rl/config.py +0 -101
  231. hud/cli/rl/display.py +0 -133
  232. hud/cli/rl/gpu.py +0 -63
  233. hud/cli/rl/gpu_utils.py +0 -321
  234. hud/cli/rl/local_runner.py +0 -595
  235. hud/cli/rl/presets.py +0 -96
  236. hud/cli/rl/remote_runner.py +0 -463
  237. hud/cli/rl/rl_api.py +0 -150
  238. hud/cli/rl/vllm.py +0 -177
  239. hud/cli/rl/wait_utils.py +0 -89
  240. hud/datasets/parallel.py +0 -687
  241. hud/misc/__init__.py +0 -1
  242. hud/misc/claude_plays_pokemon.py +0 -292
  243. hud/otel/__init__.py +0 -35
  244. hud/otel/collector.py +0 -142
  245. hud/otel/config.py +0 -181
  246. hud/otel/context.py +0 -570
  247. hud/otel/exporters.py +0 -369
  248. hud/otel/instrumentation.py +0 -135
  249. hud/otel/processors.py +0 -121
  250. hud/otel/tests/__init__.py +0 -1
  251. hud/otel/tests/test_processors.py +0 -197
  252. hud/rl/README.md +0 -30
  253. hud/rl/__init__.py +0 -1
  254. hud/rl/actor.py +0 -176
  255. hud/rl/buffer.py +0 -405
  256. hud/rl/chat_template.jinja +0 -101
  257. hud/rl/config.py +0 -192
  258. hud/rl/distributed.py +0 -132
  259. hud/rl/learner.py +0 -637
  260. hud/rl/tests/__init__.py +0 -1
  261. hud/rl/tests/test_learner.py +0 -186
  262. hud/rl/train.py +0 -382
  263. hud/rl/types.py +0 -101
  264. hud/rl/utils/start_vllm_server.sh +0 -30
  265. hud/rl/utils.py +0 -524
  266. hud/rl/vllm_adapter.py +0 -143
  267. hud/telemetry/job.py +0 -352
  268. hud/telemetry/replay.py +0 -74
  269. hud/telemetry/tests/test_replay.py +0 -40
  270. hud/telemetry/tests/test_trace.py +0 -63
  271. hud/telemetry/trace.py +0 -158
  272. hud/utils/agent_factories.py +0 -86
  273. hud/utils/async_utils.py +0 -65
  274. hud/utils/group_eval.py +0 -223
  275. hud/utils/progress.py +0 -149
  276. hud/utils/tasks.py +0 -127
  277. hud/utils/tests/test_async_utils.py +0 -173
  278. hud/utils/tests/test_progress.py +0 -261
  279. hud_python-0.4.45.dist-info/METADATA +0 -552
  280. hud_python-0.4.45.dist-info/RECORD +0 -228
  281. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
  282. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
@@ -1,552 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: hud-python
3
- Version: 0.4.45
4
- Summary: SDK for the HUD platform.
5
- Project-URL: Homepage, https://github.com/hud-evals/hud-python
6
- Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
7
- Project-URL: Documentation, https://docs.hud.so
8
- Author-email: HUD SDK <founders@hud.so>
9
- License: MIT License
10
-
11
- Copyright (c) 2025 Human Union Data, Inc
12
-
13
- Permission is hereby granted, free of charge, to any person obtaining a copy
14
- of this software and associated documentation files (the "Software"), to deal
15
- in the Software without restriction, including without limitation the rights
16
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17
- copies of the Software, and to permit persons to whom the Software is
18
- furnished to do so, subject to the following conditions:
19
-
20
- The above copyright notice and this permission notice shall be included in all
21
- copies or substantial portions of the Software.
22
-
23
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
29
- SOFTWARE.
30
- License-File: LICENSE
31
- Classifier: Development Status :: 4 - Beta
32
- Classifier: Intended Audience :: Developers
33
- Classifier: Programming Language :: Python :: 3
34
- Classifier: Programming Language :: Python :: 3.11
35
- Classifier: Programming Language :: Python :: 3.12
36
- Classifier: Programming Language :: Python :: 3.13
37
- Requires-Python: <3.13,>=3.11
38
- Requires-Dist: anthropic
39
- Requires-Dist: blessed>=1.20.0
40
- Requires-Dist: datasets>=2.14.0
41
- Requires-Dist: httpx<1,>=0.23.0
42
- Requires-Dist: hud-fastmcp-python-sdk>=0.1.2
43
- Requires-Dist: hud-mcp-python-sdk>=3.13.2
44
- Requires-Dist: hud-mcp-use-python-sdk==2.3.20
45
- Requires-Dist: numpy>=1.24.0
46
- Requires-Dist: openai
47
- Requires-Dist: opentelemetry-api>=1.34.1
48
- Requires-Dist: opentelemetry-exporter-otlp-proto-http>=1.34.1
49
- Requires-Dist: opentelemetry-instrumentation-mcp==0.47.0
50
- Requires-Dist: opentelemetry-sdk>=1.34.1
51
- Requires-Dist: pathspec>=0.12.1
52
- Requires-Dist: pillow>=11.1.0
53
- Requires-Dist: prompt-toolkit==3.0.51
54
- Requires-Dist: pydantic-settings<3,>=2.2
55
- Requires-Dist: pydantic<3,>=2.6
56
- Requires-Dist: questionary==2.1.0
57
- Requires-Dist: rich>=13.0.0
58
- Requires-Dist: toml>=0.10.2
59
- Requires-Dist: typer>=0.9.0
60
- Requires-Dist: watchfiles>=0.21.0
61
- Requires-Dist: wrapt>=1.14.0
62
- Provides-Extra: agent
63
- Requires-Dist: aiodocker>=0.24.0; extra == 'agent'
64
- Requires-Dist: dotenv>=0.9.9; extra == 'agent'
65
- Requires-Dist: inspect-ai>=0.3.80; extra == 'agent'
66
- Requires-Dist: ipykernel; extra == 'agent'
67
- Requires-Dist: ipython<9; extra == 'agent'
68
- Requires-Dist: jupyter-client; extra == 'agent'
69
- Requires-Dist: jupyter-core; extra == 'agent'
70
- Requires-Dist: langchain; extra == 'agent'
71
- Requires-Dist: langchain-anthropic; extra == 'agent'
72
- Requires-Dist: langchain-openai; extra == 'agent'
73
- Requires-Dist: litellm>=1.55.0; extra == 'agent'
74
- Requires-Dist: pillow>=11.1.0; extra == 'agent'
75
- Requires-Dist: playwright; extra == 'agent'
76
- Requires-Dist: pyautogui>=0.9.54; extra == 'agent'
77
- Requires-Dist: pyright==1.1.401; extra == 'agent'
78
- Requires-Dist: pytest-asyncio; extra == 'agent'
79
- Requires-Dist: pytest-cov; extra == 'agent'
80
- Requires-Dist: pytest-mock; extra == 'agent'
81
- Requires-Dist: pytest<9,>=8.1.1; extra == 'agent'
82
- Requires-Dist: ruff>=0.11.8; extra == 'agent'
83
- Requires-Dist: setuptools; extra == 'agent'
84
- Requires-Dist: textdistance<5,>=4.5.0; extra == 'agent'
85
- Provides-Extra: agents
86
- Requires-Dist: aiodocker>=0.24.0; extra == 'agents'
87
- Requires-Dist: dotenv>=0.9.9; extra == 'agents'
88
- Requires-Dist: inspect-ai>=0.3.80; extra == 'agents'
89
- Requires-Dist: ipykernel; extra == 'agents'
90
- Requires-Dist: ipython<9; extra == 'agents'
91
- Requires-Dist: jupyter-client; extra == 'agents'
92
- Requires-Dist: jupyter-core; extra == 'agents'
93
- Requires-Dist: langchain; extra == 'agents'
94
- Requires-Dist: langchain-anthropic; extra == 'agents'
95
- Requires-Dist: langchain-openai; extra == 'agents'
96
- Requires-Dist: litellm>=1.55.0; extra == 'agents'
97
- Requires-Dist: pillow>=11.1.0; extra == 'agents'
98
- Requires-Dist: playwright; extra == 'agents'
99
- Requires-Dist: pyautogui>=0.9.54; extra == 'agents'
100
- Requires-Dist: pyright==1.1.401; extra == 'agents'
101
- Requires-Dist: pytest-asyncio; extra == 'agents'
102
- Requires-Dist: pytest-cov; extra == 'agents'
103
- Requires-Dist: pytest-mock; extra == 'agents'
104
- Requires-Dist: pytest<9,>=8.1.1; extra == 'agents'
105
- Requires-Dist: ruff>=0.11.8; extra == 'agents'
106
- Requires-Dist: setuptools; extra == 'agents'
107
- Requires-Dist: textdistance<5,>=4.5.0; extra == 'agents'
108
- Provides-Extra: dev
109
- Requires-Dist: aiodocker>=0.24.0; extra == 'dev'
110
- Requires-Dist: dotenv>=0.9.9; extra == 'dev'
111
- Requires-Dist: inspect-ai>=0.3.80; extra == 'dev'
112
- Requires-Dist: ipykernel; extra == 'dev'
113
- Requires-Dist: ipython<9; extra == 'dev'
114
- Requires-Dist: jupyter-client; extra == 'dev'
115
- Requires-Dist: jupyter-core; extra == 'dev'
116
- Requires-Dist: langchain; extra == 'dev'
117
- Requires-Dist: langchain-anthropic; extra == 'dev'
118
- Requires-Dist: langchain-openai; extra == 'dev'
119
- Requires-Dist: litellm>=1.55.0; extra == 'dev'
120
- Requires-Dist: pillow>=11.1.0; extra == 'dev'
121
- Requires-Dist: playwright; extra == 'dev'
122
- Requires-Dist: pyautogui>=0.9.54; extra == 'dev'
123
- Requires-Dist: pyright==1.1.401; extra == 'dev'
124
- Requires-Dist: pytest-asyncio; extra == 'dev'
125
- Requires-Dist: pytest-cov; extra == 'dev'
126
- Requires-Dist: pytest-mock; extra == 'dev'
127
- Requires-Dist: pytest<9,>=8.1.1; extra == 'dev'
128
- Requires-Dist: ruff>=0.11.8; extra == 'dev'
129
- Requires-Dist: setuptools; extra == 'dev'
130
- Requires-Dist: textdistance<5,>=4.5.0; extra == 'dev'
131
- Provides-Extra: rl
132
- Requires-Dist: bitsandbytes>=0.41.0; (sys_platform == 'linux') and extra == 'rl'
133
- Requires-Dist: liger-kernel>=0.5.0; (sys_platform == 'linux') and extra == 'rl'
134
- Requires-Dist: peft>=0.17.1; extra == 'rl'
135
- Requires-Dist: vllm==0.10.1.1; extra == 'rl'
136
- Description-Content-Type: text/markdown
137
-
138
- <div align="left">
139
- <picture>
140
- <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/logo/hud_logo_dark.svg">
141
- <source media="(prefers-color-scheme: light)" srcset="https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/logo/hud_logo.svg">
142
- <img src="https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/logo/hud_logo.svg" alt="HUD" width="150" style="margin-bottom: 24px;"/>
143
- </picture>
144
- </div>
145
-
146
- OSS RL environment + evals toolkit. Wrap software as environments, run benchmarks, and train with RL – locally or at scale.
147
-
148
- [![PyPI version](https://img.shields.io/pypi/v/hud-python?style=flat-square)](https://pypi.org/project/hud-python/)
149
- [![License](https://img.shields.io/badge/license-MIT-green?style=flat-square)](LICENSE)
150
- [![Add docs to Cursor](https://img.shields.io/badge/Add%20docs%20to-Cursor-black?style=flat-square)](https://cursor.com/en/install-mcp?name=docs-hud-python&config=eyJ1cmwiOiJodHRwczovL2RvY3MuaHVkLnNvL21jcCJ9)
151
- [![Discord](https://img.shields.io/discord/1327447144772407390?label=Discord&logo=discord&style=flat-square)](https://discord.gg/wkjtmHYYjm)
152
- [![X Follow](https://img.shields.io/twitter/follow/hud_evals?style=social)](https://x.com/intent/user?screen_name=hud_evals)
153
- [![Shop](https://img.shields.io/badge/_-white.svg?label=shop&logo=&style=social)](https://shop.hud.so)
154
-
155
-
156
- ### Are you a startup building agents?
157
-
158
- [📅 Hop on a call](https://cal.com/jay-ram-z6st6w/demo) or [📧 founders@hud.so](mailto:founders@hud.so)
159
-
160
- ## Highlights
161
-
162
- - 🎓 **[One-click RL](https://hud.so/models)** – Run `hud rl` to get a trained model on any environment.
163
- - 🚀 **[MCP environment skeleton](https://docs.hud.so/core-concepts/mcp-protocol)** – any agent can call any environment.
164
- - ⚡️ **[Live telemetry](https://hud.so)** – inspect every tool call, observation, and reward in real time.
165
- - 🗂️ **[Public benchmarks](https://hud.so/leaderboards)** – OSWorld-Verified, SheetBench-50, and more.
166
- - 🌐 **[Cloud browsers](environments/remote_browser/)** – AnchorBrowser, Steel, BrowserBase integrations for browser automation.
167
- - 🛠️ **[Hot-reload dev loop](environments/README.md#phase-5-hot-reload-development-with-cursor-agent)** – `hud dev` for iterating on environments without rebuilds.
168
-
169
- > We welcome contributors and feature requests – open an issue or hop on a call to discuss improvements!
170
-
171
- ## Installation
172
-
173
- ```bash
174
- # SDK - MCP servers, telemetry, evaluation
175
- pip install hud-python
176
-
177
- # CLI - RL pipeline, environment design
178
- uv tool install hud-python
179
- # uv tool update-shell
180
- ```
181
-
182
- > See [docs.hud.so](https://docs.hud.so), or add docs to any MCP client:
183
- > `claude mcp add --transport http docs-hud https://docs.hud.so/mcp`
184
-
185
- Before starting, get your HUD_API_KEY at [hud.so](https://hud.so).
186
-
187
-
188
- ## Quickstart: Training
189
-
190
- Train a Qwen2.5-VL model with RL (GRPO) on any hud dataset:
191
-
192
- ```bash
193
- hud get hud-evals/basic-2048 # from HF
194
- hud rl basic-2048.json
195
- ```
196
-
197
- > See [agent training docs](https://docs.hud.so/train-agents/quickstart)
198
-
199
- Or make your own environment and dataset:
200
-
201
- ```bash
202
- hud init my-env && cd my-env
203
- hud dev --interactive
204
- # When ready to run:
205
- hud rl
206
- ```
207
-
208
- > See [environment design docs](https://docs.hud.so/build-environments)
209
-
210
-
211
- ## Quickstart: Evals
212
-
213
- For a tutorial that explains the agent and evaluation design, run:
214
-
215
- ```bash
216
- uvx hud-python quickstart
217
- ```
218
-
219
- Or just write your own agent loop (more [examples here](examples/)).
220
-
221
- ```python
222
- import asyncio, hud, os
223
- from hud.settings import settings
224
- from hud.clients import MCPClient
225
- from hud.agents import ClaudeAgent
226
- from hud.datasets import Task # See docs: https://docs.hud.so/reference/tasks
227
-
228
- async def main() -> None:
229
- with hud.trace("Quick Start 2048"): # All telemetry works for any MCP-based agent (see https://hud.so)
230
- task = {
231
- "prompt": "Reach 64 in 2048.",
232
- "mcp_config": {
233
- "hud": {
234
- "url": "https://mcp.hud.so/v3/mcp", # HUD's cloud MCP server (see https://docs.hud.so/core-concepts/architecture)
235
- "headers": {
236
- "Authorization": f"Bearer {settings.api_key}", # Get your key at https://hud.so
237
- "Mcp-Image": "hudpython/hud-text-2048:v1.2" # Docker image from https://hub.docker.com/u/hudpython
238
- }
239
- }
240
- },
241
- "evaluate_tool": {"name": "evaluate", "arguments": {"name": "max_number", "arguments": {"target": 64}}},
242
- }
243
- task = Task(**task)
244
-
245
- # 1. Define the client explicitly:
246
- client = MCPClient(mcp_config=task.mcp_config)
247
- agent = ClaudeAgent(
248
- mcp_client=client,
249
- model="claude-sonnet-4-20250514", # requires ANTHROPIC_API_KEY
250
- )
251
-
252
- result = await agent.run(task)
253
-
254
- # 2. Or just:
255
- # result = await ClaudeAgent().run(task)
256
-
257
- print(f"Reward: {result.reward}")
258
- await client.shutdown()
259
-
260
- asyncio.run(main())
261
- ```
262
-
263
- The above example lets the agent play 2048 ([See replay](https://hud.so/trace/6feed7bd-5f67-4d66-b77f-eb1e3164604f))
264
-
265
- ![Agent playing 2048](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/2048_1.gif)
266
-
267
- ## Reinforcement Learning with GRPO
268
-
269
- This is a Qwen‑2.5‑VL‑3B agent training a policy on the 2048-basic browser environment:
270
-
271
- ![RL curve](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/rl_2.png)
272
-
273
- Train with the new interactive `hud rl` flow:
274
-
275
- ```bash
276
- # Install CLI
277
- uv tool install hud-python
278
-
279
- # Option A: Run directly from a HuggingFace dataset
280
- hud rl hud-evals/basic-2048
281
-
282
- # Option B: Download first, modify, then train
283
- hud get hud-evals/basic-2048
284
- hud rl basic-2048.json
285
-
286
- # Optional: baseline evaluation
287
- hud eval basic-2048.json
288
- ```
289
-
290
- Supports multi‑turn RL for both:
291
- - Language‑only models (e.g., `Qwen/Qwen2.5-7B-Instruct`)
292
- - Vision‑Language models (e.g., `Qwen/Qwen2.5-VL-3B-Instruct`)
293
-
294
- By default, `hud rl` provisions a persistent server and trainer in the cloud, streams telemetry to `hud.so`, and lets you monitor/manage models at `hud.so/models`. Use `--local` to run entirely on your machines (typically 2+ GPUs: one for vLLM, the rest for training).
295
-
296
- Any HUD MCP environment and evaluation works with our RL pipeline (including remote configurations). See the guided docs: `https://docs.hud.so/train-agents/quickstart`.
297
-
298
- Pricing: Hosted vLLM and training GPU rates are listed in the [Training Quickstart → Pricing](https://docs.hud.so/train-agents/quickstart#pricing). Manage billing at the [HUD billing dashboard](https://hud.so/project/billing).
299
-
300
- ## Benchmarking Agents
301
-
302
- This is Claude Computer Use running on our proprietary financial analyst benchmark [SheetBench-50](https://huggingface.co/datasets/hud-evals/SheetBench-50):
303
-
304
- ![Trace screenshot](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/trace_sheet.gif)
305
-
306
- > [See this trace on _hud.so_](https://hud.so/trace/9e212e9e-3627-4f1f-9eb5-c6d03c59070a)
307
-
308
- This example runs the full dataset (only takes ~20 minutes) using [run_evaluation.py](examples/run_evaluation.py):
309
-
310
- ```bash
311
- python examples/run_evaluation.py hud-evals/SheetBench-50 --full --agent claude
312
- ```
313
-
314
- Or in code:
315
-
316
- ```python
317
- import asyncio
318
- from hud.datasets import run_dataset
319
- from hud.agents import ClaudeAgent
320
-
321
- results = await run_dataset(
322
- name="My SheetBench-50 Evaluation",
323
- dataset="hud-evals/SheetBench-50", # <-- HuggingFace dataset
324
- agent_class=ClaudeAgent, # <-- Your custom agent can replace this (see https://docs.hud.so/evaluate-agents/create-agents)
325
- agent_config={"model": "claude-sonnet-4-20250514"},
326
- max_concurrent=50,
327
- max_steps=30,
328
- )
329
- print(f"Average reward: {sum(r.reward for r in results) / len(results):.2f}")
330
- ```
331
-
332
- > Running a dataset creates a job and streams results to the [hud.so](https://hud.so) platform for analysis and [leaderboard submission](https://docs.hud.so/evaluate-agents/leaderboards).
333
-
334
- ## Building Environments (MCP)
335
-
336
- This is how you can make any environment into an interactable one in 5 steps:
337
-
338
- 1. Define MCP server layer using [`MCPServer`](https://docs.hud.so/reference/environments)
339
-
340
- ```python
341
- from hud.server import MCPServer
342
- from hud.tools import HudComputerTool
343
-
344
- mcp = MCPServer("My Environment")
345
-
346
- # Add hud tools (see all tools: https://docs.hud.so/reference/tools)
347
- mcp.tool(HudComputerTool())
348
-
349
- # Or custom tools (see https://docs.hud.so/build-environments/adapting-software)
350
- @mcp.tool("launch_app")
351
- def launch_app(name: str = "Gmail"):
352
- ...
353
-
354
- if __name__ == "__main__":
355
- mcp.run()
356
- ```
357
-
358
- 2. Write a simple Dockerfile that installs packages and runs:
359
-
360
- ```dockerfile
361
- CMD ["python", "-m", "hud_controller.server"]
362
- ```
363
-
364
- And build the image:
365
-
366
- ```bash
367
- hud build # runs docker build under the hood
368
- ```
369
-
370
- Or run it in interactive mode:
371
-
372
- ```bash
373
- hud dev
374
- ```
375
-
376
- 3. Debug it with the CLI to see if it launches:
377
-
378
- ```console
379
- $ hud debug my-name/my-environment:latest
380
-
381
- ✓ Phase 1: Docker image exists
382
- ✓ Phase 2: MCP server responds to initialize
383
- ✓ Phase 3: Tools are discoverable
384
- ✓ Phase 4: Basic tool execution works
385
- ✓ Phase 5: Parallel performance is good
386
-
387
- Progress: [█████████████████████] 5/5 phases (100%)
388
- ✅ All phases completed successfully!
389
- ```
390
-
391
- Analyze it to see if all tools appear:
392
-
393
- ```console
394
- $ hud analyze hudpython/hud-remote-browser:latest
395
- ⠏ ✓ Analysis complete
396
- ...
397
- Tools
398
- ├── Regular Tools
399
- │ ├── computer
400
- │ │ └── Control computer with mouse, keyboard, and screenshots
401
- ...
402
- └── Hub Tools
403
- ├── setup
404
- │ ├── navigate_to_url
405
- │ ├── set_cookies
406
- │ ├── ...
407
- └── evaluate
408
- ├── url_match
409
- ├── page_contains
410
- ├── cookie_exists
411
- ├── ...
412
-
413
- 📡 Telemetry Data
414
- Live URL https://live.anchorbrowser.io?sessionId=abc123def456
415
- ```
416
-
417
- 4. When the tests pass, push it up to the docker registry:
418
-
419
- ```bash
420
- hud push # needs docker login, hud api key
421
- ```
422
-
423
- 5. Now you can use `mcp.hud.so` to launch 100s of instances of this environment in parallel with any agent, and see everything live on [hud.so](https://hud.so):
424
-
425
- ```python
426
- from hud.agents import ClaudeAgent
427
-
428
- result = await ClaudeAgent().run({ # See all agents: https://docs.hud.so/reference/agents
429
- "prompt": "Please explore this environment",
430
- "mcp_config": {
431
- "my-environment": {
432
- "url": "https://mcp.hud.so/v3/mcp",
433
- "headers": {
434
- "Authorization": f"Bearer {os.getenv('HUD_API_KEY')}",
435
- "Mcp-Image": "my-name/my-environment:latest"
436
- }
437
- }
438
- # "my-environment": { # or use hud run which wraps local and remote running
439
- # "cmd": "hud",
440
- # "args": [
441
- # "run",
442
- # "my-name/my-environment:latest",
443
- # ]
444
- # }
445
- }
446
- })
447
-
448
- ```
449
-
450
- > See the full environment design guide and common pitfalls in [`environments/README.md`](environments/README.md)
451
-
452
- ## Leaderboards & benchmarks
453
-
454
- All leaderboards are publicly available on [hud.so/leaderboards](https://hud.so/leaderboards) (see [docs](https://docs.hud.so/evaluate-agents/leaderboards))
455
-
456
- ![Leaderboard](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/leaderboards_3.png)
457
-
458
- We highly suggest running 3-5 evaluations per dataset for the most consistent results across multiple jobs.
459
-
460
- Using the [`run_dataset`](https://docs.hud.so/reference/tasks#run_dataset) function with a HuggingFace dataset automatically assigns your job to that leaderboard page, and allows you to create a scorecard out of it.
461
-
462
- ## Architecture
463
-
464
- ```mermaid
465
- %%{init: {"theme": "neutral", "themeVariables": {"fontSize": "14px"}} }%%
466
- graph LR
467
- subgraph "Platform"
468
- Dashboard["📊 hud.so"]
469
- API["🔌 mcp.hud.so"]
470
- end
471
-
472
- subgraph "hud"
473
- Agent["🤖 Agent"]
474
- Task["📋 Task"]
475
- SDK["📦 SDK"]
476
- end
477
-
478
- subgraph "Environments"
479
- LocalEnv["🖥️ Local Docker<br/>(Development)"]
480
- RemoteEnv["☁️ Remote Docker<br/>(100s Parallel)"]
481
- end
482
-
483
- subgraph "otel"
484
- Trace["📡 Traces & Metrics"]
485
- end
486
-
487
- Dataset["📚 Dataset<br/>(HuggingFace)"]
488
-
489
- AnyMCP["🔗 Any MCP Client<br/>(Cursor, Claude, Custom)"]
490
-
491
- Agent <--> SDK
492
- Task --> SDK
493
- Dataset <-.-> Task
494
- SDK <-->|"MCP"| LocalEnv
495
- SDK <-->|"MCP"| API
496
- API <-->|"MCP"| RemoteEnv
497
- SDK --> Trace
498
- Trace --> Dashboard
499
- AnyMCP -->|"MCP"| API
500
-
501
- ```
502
-
503
- ## CLI reference
504
-
505
- | Command | Purpose | Docs |
506
- | ----------------------- | ------------------------------------------ | ---- |
507
- | [`hud init`](https://docs.hud.so/reference/cli/init) | Create new environment with boilerplate. | [📖](https://docs.hud.so/reference/cli/init) |
508
- | [`hud dev`](https://docs.hud.so/reference/cli/dev) | Hot-reload development with Docker. | [📖](https://docs.hud.so/reference/cli/dev) |
509
- | [`hud build`](https://docs.hud.so/reference/cli/build) | Build image and generate lock file. | [📖](https://docs.hud.so/reference/cli/build) |
510
- | [`hud push`](https://docs.hud.so/reference/cli/push) | Share environment to registry. | [📖](https://docs.hud.so/reference/cli/push) |
511
- | [`hud pull <target>`](https://docs.hud.so/reference/cli/pull) | Get environment from registry. | [📖](https://docs.hud.so/reference/cli/pull) |
512
- | [`hud analyze <image>`](https://docs.hud.so/reference/cli/analyze) | Discover tools, resources, and metadata. | [📖](https://docs.hud.so/reference/cli/analyze) |
513
- | [`hud debug <image>`](https://docs.hud.so/reference/cli/debug) | Five-phase health check of an environment. | [📖](https://docs.hud.so/reference/cli/debug) |
514
- | [`hud run <image>`](https://docs.hud.so/reference/cli/run) | Run MCP server locally or remotely. | [📖](https://docs.hud.so/reference/cli/run) |
515
-
516
- ## Roadmap
517
-
518
- - Merging our forks into the main `mcp`, `mcp_use` repositories
519
- - Helpers for building new environments (see [current guide](environments/README.md))
520
- - Integrations with every major agent framework
521
- - Evaluation environment registry
522
- - MCP opentelemetry standard
523
-
524
- ## Contributing
525
-
526
- We welcome contributions! See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
527
-
528
- Key areas:
529
- - [Environment examples](environments/) - Add new MCP environments
530
- - [Agent implementations](hud/agents/) - Add support for new LLM providers
531
- - [Tool library](hud/tools/) - Extend the built-in tool collection
532
- - [RL training](hud/rl/) - Improve reinforcement learning pipelines
533
-
534
- Thanks to all our contributors!
535
-
536
- <a href="https://github.com/hud-evals/hud-python/graphs/contributors">
537
- <img src="https://contrib.rocks/image?repo=hud-evals/hud-python&max=50" />
538
- </a>
539
-
540
- ## Citation
541
-
542
- ```bibtex
543
- @software{hud2025agentevalplatform,
544
- author = {HUD and Jay Ram and Lorenss Martinsons and Parth Patel and Oskars Putans and Govind Pimpale and Mayank Singamreddy and Nguyen Nhat Minh},
545
- title = {HUD: An Evaluation Platform for Agents},
546
- date = {2025-04},
547
- url = {https://github.com/hud-evals/hud-python},
548
- langid = {en}
549
- }
550
- ```
551
-
552
- > **License**: HUD is released under the MIT License – see the [LICENSE](LICENSE) file for details.