hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (274)
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +11 -5
  3. hud/agents/base.py +220 -500
  4. hud/agents/claude.py +200 -240
  5. hud/agents/gemini.py +275 -0
  6. hud/agents/gemini_cua.py +335 -0
  7. hud/agents/grounded_openai.py +98 -100
  8. hud/agents/misc/integration_test_agent.py +51 -20
  9. hud/agents/misc/response_agent.py +41 -36
  10. hud/agents/openai.py +291 -292
  11. hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
  12. hud/agents/operator.py +211 -0
  13. hud/agents/tests/conftest.py +133 -0
  14. hud/agents/tests/test_base.py +300 -622
  15. hud/agents/tests/test_base_runtime.py +233 -0
  16. hud/agents/tests/test_claude.py +379 -210
  17. hud/agents/tests/test_client.py +9 -10
  18. hud/agents/tests/test_gemini.py +369 -0
  19. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  20. hud/agents/tests/test_openai.py +376 -140
  21. hud/agents/tests/test_operator.py +362 -0
  22. hud/agents/tests/test_run_eval.py +179 -0
  23. hud/cli/__init__.py +461 -545
  24. hud/cli/analyze.py +43 -5
  25. hud/cli/build.py +664 -110
  26. hud/cli/debug.py +8 -5
  27. hud/cli/dev.py +882 -734
  28. hud/cli/eval.py +782 -668
  29. hud/cli/flows/dev.py +167 -0
  30. hud/cli/flows/init.py +191 -0
  31. hud/cli/flows/tasks.py +153 -56
  32. hud/cli/flows/templates.py +151 -0
  33. hud/cli/flows/tests/__init__.py +1 -0
  34. hud/cli/flows/tests/test_dev.py +126 -0
  35. hud/cli/init.py +60 -58
  36. hud/cli/push.py +29 -11
  37. hud/cli/rft.py +311 -0
  38. hud/cli/rft_status.py +145 -0
  39. hud/cli/tests/test_analyze.py +5 -5
  40. hud/cli/tests/test_analyze_metadata.py +3 -2
  41. hud/cli/tests/test_analyze_module.py +120 -0
  42. hud/cli/tests/test_build.py +108 -6
  43. hud/cli/tests/test_build_failure.py +41 -0
  44. hud/cli/tests/test_build_module.py +50 -0
  45. hud/cli/tests/test_cli_init.py +6 -1
  46. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  47. hud/cli/tests/test_cli_root.py +140 -0
  48. hud/cli/tests/test_convert.py +361 -0
  49. hud/cli/tests/test_debug.py +12 -10
  50. hud/cli/tests/test_dev.py +197 -0
  51. hud/cli/tests/test_eval.py +251 -0
  52. hud/cli/tests/test_eval_bedrock.py +51 -0
  53. hud/cli/tests/test_init.py +124 -0
  54. hud/cli/tests/test_main_module.py +11 -5
  55. hud/cli/tests/test_mcp_server.py +12 -100
  56. hud/cli/tests/test_push_happy.py +74 -0
  57. hud/cli/tests/test_push_wrapper.py +23 -0
  58. hud/cli/tests/test_registry.py +1 -1
  59. hud/cli/tests/test_utils.py +1 -1
  60. hud/cli/{rl → utils}/celebrate.py +14 -12
  61. hud/cli/utils/config.py +18 -1
  62. hud/cli/utils/docker.py +130 -4
  63. hud/cli/utils/env_check.py +9 -9
  64. hud/cli/utils/git.py +136 -0
  65. hud/cli/utils/interactive.py +39 -5
  66. hud/cli/utils/metadata.py +69 -0
  67. hud/cli/utils/runner.py +1 -1
  68. hud/cli/utils/server.py +2 -2
  69. hud/cli/utils/source_hash.py +3 -3
  70. hud/cli/utils/tasks.py +4 -1
  71. hud/cli/utils/tests/__init__.py +0 -0
  72. hud/cli/utils/tests/test_config.py +58 -0
  73. hud/cli/utils/tests/test_docker.py +93 -0
  74. hud/cli/utils/tests/test_docker_hints.py +71 -0
  75. hud/cli/utils/tests/test_env_check.py +74 -0
  76. hud/cli/utils/tests/test_environment.py +42 -0
  77. hud/cli/utils/tests/test_git.py +142 -0
  78. hud/cli/utils/tests/test_interactive_module.py +60 -0
  79. hud/cli/utils/tests/test_local_runner.py +50 -0
  80. hud/cli/utils/tests/test_logging_utils.py +23 -0
  81. hud/cli/utils/tests/test_metadata.py +49 -0
  82. hud/cli/utils/tests/test_package_runner.py +35 -0
  83. hud/cli/utils/tests/test_registry_utils.py +49 -0
  84. hud/cli/utils/tests/test_remote_runner.py +25 -0
  85. hud/cli/utils/tests/test_runner_modules.py +52 -0
  86. hud/cli/utils/tests/test_source_hash.py +36 -0
  87. hud/cli/utils/tests/test_tasks.py +80 -0
  88. hud/cli/utils/version_check.py +258 -0
  89. hud/cli/{rl → utils}/viewer.py +2 -2
  90. hud/clients/README.md +12 -11
  91. hud/clients/__init__.py +4 -3
  92. hud/clients/base.py +166 -26
  93. hud/clients/environment.py +51 -0
  94. hud/clients/fastmcp.py +13 -6
  95. hud/clients/mcp_use.py +40 -15
  96. hud/clients/tests/test_analyze_scenarios.py +206 -0
  97. hud/clients/tests/test_protocol.py +9 -3
  98. hud/datasets/__init__.py +23 -20
  99. hud/datasets/loader.py +327 -0
  100. hud/datasets/runner.py +192 -105
  101. hud/datasets/tests/__init__.py +0 -0
  102. hud/datasets/tests/test_loader.py +221 -0
  103. hud/datasets/tests/test_utils.py +315 -0
  104. hud/datasets/utils.py +270 -90
  105. hud/environment/__init__.py +50 -0
  106. hud/environment/connection.py +206 -0
  107. hud/environment/connectors/__init__.py +33 -0
  108. hud/environment/connectors/base.py +68 -0
  109. hud/environment/connectors/local.py +177 -0
  110. hud/environment/connectors/mcp_config.py +109 -0
  111. hud/environment/connectors/openai.py +101 -0
  112. hud/environment/connectors/remote.py +172 -0
  113. hud/environment/environment.py +694 -0
  114. hud/environment/integrations/__init__.py +45 -0
  115. hud/environment/integrations/adk.py +67 -0
  116. hud/environment/integrations/anthropic.py +196 -0
  117. hud/environment/integrations/gemini.py +92 -0
  118. hud/environment/integrations/langchain.py +82 -0
  119. hud/environment/integrations/llamaindex.py +68 -0
  120. hud/environment/integrations/openai.py +238 -0
  121. hud/environment/mock.py +306 -0
  122. hud/environment/router.py +112 -0
  123. hud/environment/scenarios.py +493 -0
  124. hud/environment/tests/__init__.py +1 -0
  125. hud/environment/tests/test_connection.py +317 -0
  126. hud/environment/tests/test_connectors.py +218 -0
  127. hud/environment/tests/test_environment.py +161 -0
  128. hud/environment/tests/test_integrations.py +257 -0
  129. hud/environment/tests/test_local_connectors.py +201 -0
  130. hud/environment/tests/test_scenarios.py +280 -0
  131. hud/environment/tests/test_tools.py +208 -0
  132. hud/environment/types.py +23 -0
  133. hud/environment/utils/__init__.py +35 -0
  134. hud/environment/utils/formats.py +215 -0
  135. hud/environment/utils/schema.py +171 -0
  136. hud/environment/utils/tool_wrappers.py +113 -0
  137. hud/eval/__init__.py +67 -0
  138. hud/eval/context.py +674 -0
  139. hud/eval/display.py +299 -0
  140. hud/eval/instrument.py +185 -0
  141. hud/eval/manager.py +466 -0
  142. hud/eval/parallel.py +268 -0
  143. hud/eval/task.py +340 -0
  144. hud/eval/tests/__init__.py +1 -0
  145. hud/eval/tests/test_context.py +178 -0
  146. hud/eval/tests/test_eval.py +210 -0
  147. hud/eval/tests/test_manager.py +152 -0
  148. hud/eval/tests/test_parallel.py +168 -0
  149. hud/eval/tests/test_task.py +145 -0
  150. hud/eval/types.py +63 -0
  151. hud/eval/utils.py +183 -0
  152. hud/patches/__init__.py +19 -0
  153. hud/patches/mcp_patches.py +151 -0
  154. hud/patches/warnings.py +54 -0
  155. hud/samples/browser.py +4 -4
  156. hud/server/__init__.py +2 -1
  157. hud/server/low_level.py +2 -1
  158. hud/server/router.py +164 -0
  159. hud/server/server.py +567 -80
  160. hud/server/tests/test_mcp_server_integration.py +11 -11
  161. hud/server/tests/test_mcp_server_more.py +1 -1
  162. hud/server/tests/test_server_extra.py +2 -0
  163. hud/settings.py +45 -3
  164. hud/shared/exceptions.py +36 -10
  165. hud/shared/hints.py +26 -1
  166. hud/shared/requests.py +15 -3
  167. hud/shared/tests/test_exceptions.py +40 -31
  168. hud/shared/tests/test_hints.py +167 -0
  169. hud/telemetry/__init__.py +20 -19
  170. hud/telemetry/exporter.py +201 -0
  171. hud/telemetry/instrument.py +158 -253
  172. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  173. hud/telemetry/tests/test_exporter.py +258 -0
  174. hud/telemetry/tests/test_instrument.py +401 -0
  175. hud/tools/__init__.py +16 -2
  176. hud/tools/apply_patch.py +639 -0
  177. hud/tools/base.py +54 -4
  178. hud/tools/bash.py +2 -2
  179. hud/tools/computer/__init__.py +4 -0
  180. hud/tools/computer/anthropic.py +2 -2
  181. hud/tools/computer/gemini.py +385 -0
  182. hud/tools/computer/hud.py +23 -6
  183. hud/tools/computer/openai.py +20 -21
  184. hud/tools/computer/qwen.py +434 -0
  185. hud/tools/computer/settings.py +37 -0
  186. hud/tools/edit.py +3 -7
  187. hud/tools/executors/base.py +4 -2
  188. hud/tools/executors/pyautogui.py +1 -1
  189. hud/tools/grounding/grounded_tool.py +13 -18
  190. hud/tools/grounding/grounder.py +10 -31
  191. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  192. hud/tools/jupyter.py +330 -0
  193. hud/tools/playwright.py +18 -3
  194. hud/tools/shell.py +308 -0
  195. hud/tools/tests/test_apply_patch.py +718 -0
  196. hud/tools/tests/test_computer.py +4 -9
  197. hud/tools/tests/test_computer_actions.py +24 -2
  198. hud/tools/tests/test_jupyter_tool.py +181 -0
  199. hud/tools/tests/test_shell.py +596 -0
  200. hud/tools/tests/test_submit.py +85 -0
  201. hud/tools/tests/test_types.py +193 -0
  202. hud/tools/types.py +21 -1
  203. hud/types.py +167 -57
  204. hud/utils/__init__.py +2 -0
  205. hud/utils/env.py +67 -0
  206. hud/utils/hud_console.py +61 -3
  207. hud/utils/mcp.py +15 -58
  208. hud/utils/strict_schema.py +162 -0
  209. hud/utils/tests/test_init.py +1 -2
  210. hud/utils/tests/test_mcp.py +1 -28
  211. hud/utils/tests/test_pretty_errors.py +186 -0
  212. hud/utils/tests/test_tool_shorthand.py +154 -0
  213. hud/utils/tests/test_version.py +1 -1
  214. hud/utils/types.py +20 -0
  215. hud/version.py +1 -1
  216. hud_python-0.5.1.dist-info/METADATA +264 -0
  217. hud_python-0.5.1.dist-info/RECORD +299 -0
  218. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
  219. hud/agents/langchain.py +0 -261
  220. hud/agents/lite_llm.py +0 -72
  221. hud/cli/rl/__init__.py +0 -180
  222. hud/cli/rl/config.py +0 -101
  223. hud/cli/rl/display.py +0 -133
  224. hud/cli/rl/gpu.py +0 -63
  225. hud/cli/rl/gpu_utils.py +0 -321
  226. hud/cli/rl/local_runner.py +0 -595
  227. hud/cli/rl/presets.py +0 -96
  228. hud/cli/rl/remote_runner.py +0 -463
  229. hud/cli/rl/rl_api.py +0 -150
  230. hud/cli/rl/vllm.py +0 -177
  231. hud/cli/rl/wait_utils.py +0 -89
  232. hud/datasets/parallel.py +0 -687
  233. hud/misc/__init__.py +0 -1
  234. hud/misc/claude_plays_pokemon.py +0 -292
  235. hud/otel/__init__.py +0 -35
  236. hud/otel/collector.py +0 -142
  237. hud/otel/config.py +0 -181
  238. hud/otel/context.py +0 -570
  239. hud/otel/exporters.py +0 -369
  240. hud/otel/instrumentation.py +0 -135
  241. hud/otel/processors.py +0 -121
  242. hud/otel/tests/__init__.py +0 -1
  243. hud/otel/tests/test_processors.py +0 -197
  244. hud/rl/README.md +0 -30
  245. hud/rl/__init__.py +0 -1
  246. hud/rl/actor.py +0 -176
  247. hud/rl/buffer.py +0 -405
  248. hud/rl/chat_template.jinja +0 -101
  249. hud/rl/config.py +0 -192
  250. hud/rl/distributed.py +0 -132
  251. hud/rl/learner.py +0 -637
  252. hud/rl/tests/__init__.py +0 -1
  253. hud/rl/tests/test_learner.py +0 -186
  254. hud/rl/train.py +0 -382
  255. hud/rl/types.py +0 -101
  256. hud/rl/utils/start_vllm_server.sh +0 -30
  257. hud/rl/utils.py +0 -524
  258. hud/rl/vllm_adapter.py +0 -143
  259. hud/telemetry/job.py +0 -352
  260. hud/telemetry/replay.py +0 -74
  261. hud/telemetry/tests/test_replay.py +0 -40
  262. hud/telemetry/tests/test_trace.py +0 -63
  263. hud/telemetry/trace.py +0 -158
  264. hud/utils/agent_factories.py +0 -86
  265. hud/utils/async_utils.py +0 -65
  266. hud/utils/group_eval.py +0 -223
  267. hud/utils/progress.py +0 -149
  268. hud/utils/tasks.py +0 -127
  269. hud/utils/tests/test_async_utils.py +0 -173
  270. hud/utils/tests/test_progress.py +0 -261
  271. hud_python-0.4.45.dist-info/METADATA +0 -552
  272. hud_python-0.4.45.dist-info/RECORD +0 -228
  273. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
  274. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
hud/utils/tests/test_tool_shorthand.py ADDED
@@ -0,0 +1,154 @@
1
+ from __future__ import annotations
2
+
3
+ from hud.utils.tool_shorthand import (
4
+ _is_call_like,
5
+ _to_call_dict,
6
+ normalize_to_tool_call_dict,
7
+ )
8
+
9
+
10
+ def test_is_call_like_with_name_and_arguments():
11
+ """Test _is_call_like with name and arguments keys."""
12
+ obj = {"name": "test_tool", "arguments": {"key": "value"}}
13
+ assert _is_call_like(obj) is True
14
+
15
+
16
+ def test_is_call_like_with_single_key_dict_value():
17
+ """Test _is_call_like with single key dict containing dict value."""
18
+ obj = {"tool": {"name": "test"}}
19
+ assert _is_call_like(obj) is True
20
+
21
+
22
+ def test_is_call_like_with_nested_single_key():
23
+ """Test _is_call_like with nested single key dict."""
24
+ obj = {"tool": {"inner": {"key": "value"}}}
25
+ assert _is_call_like(obj) is True
26
+
27
+
28
+ def test_is_call_like_not_dict():
29
+ """Test _is_call_like returns False for non-dict."""
30
+ assert _is_call_like("string") is False
31
+ assert _is_call_like(123) is False
32
+ assert _is_call_like(None) is False
33
+ assert _is_call_like([]) is False
34
+
35
+
36
+ def test_is_call_like_empty_dict():
37
+ """Test _is_call_like returns False for empty dict."""
38
+ assert _is_call_like({}) is False
39
+
40
+
41
+ def test_is_call_like_multi_key_dict():
42
+ """Test _is_call_like returns False for multi-key dict without name/arguments."""
43
+ obj = {"key1": "value1", "key2": "value2"}
44
+ assert _is_call_like(obj) is False
45
+
46
+
47
+ def test_to_call_dict_with_name_arguments():
48
+ """Test _to_call_dict preserves name and arguments."""
49
+ obj = {"name": "test_tool", "arguments": {"param": "value"}}
50
+ result = _to_call_dict(obj)
51
+ assert result == {"name": "test_tool", "arguments": {"param": "value"}}
52
+
53
+
54
+ def test_to_call_dict_with_nested_call():
55
+ """Test _to_call_dict with nested call-like arguments."""
56
+ obj = {"name": "outer", "arguments": {"name": "inner", "arguments": {"x": 1}}}
57
+ result = _to_call_dict(obj)
58
+ assert result == {"name": "outer", "arguments": {"name": "inner", "arguments": {"x": 1}}}
59
+
60
+
61
+ def test_to_call_dict_shorthand_single_key():
62
+ """Test _to_call_dict converts shorthand single-key dict."""
63
+ obj = {"tool_name": {"name": "inner", "arguments": {}}}
64
+ result = _to_call_dict(obj)
65
+ assert result == {"name": "tool_name", "arguments": {"name": "inner", "arguments": {}}}
66
+
67
+
68
+ def test_to_call_dict_non_call_arguments():
69
+ """Test _to_call_dict with non-call-like arguments."""
70
+ obj = {"name": "test", "arguments": {"simple": "value"}}
71
+ result = _to_call_dict(obj)
72
+ assert result == {"name": "test", "arguments": {"simple": "value"}}
73
+
74
+
75
+ def test_to_call_dict_non_dict():
76
+ """Test _to_call_dict returns non-dict unchanged."""
77
+ assert _to_call_dict("string") == "string"
78
+ assert _to_call_dict(123) == 123
79
+ assert _to_call_dict(None) is None
80
+
81
+
82
+ def test_to_call_dict_single_key_non_call():
83
+ """Test _to_call_dict with single key but non-call value."""
84
+ obj = {"key": "simple_value"}
85
+ result = _to_call_dict(obj)
86
+ assert result == {"key": "simple_value"}
87
+
88
+
89
+ def test_normalize_to_tool_call_dict_none():
90
+ """Test normalize_to_tool_call_dict with None."""
91
+ assert normalize_to_tool_call_dict(None) is None
92
+
93
+
94
+ def test_normalize_to_tool_call_dict_simple_dict():
95
+ """Test normalize_to_tool_call_dict with simple dict."""
96
+ obj = {"name": "tool", "arguments": {"x": 1}}
97
+ result = normalize_to_tool_call_dict(obj)
98
+ assert result == {"name": "tool", "arguments": {"x": 1}}
99
+
100
+
101
+ def test_normalize_to_tool_call_dict_shorthand():
102
+ """Test normalize_to_tool_call_dict with shorthand notation."""
103
+ obj = {"tool_name": {"name": "inner", "arguments": {}}}
104
+ result = normalize_to_tool_call_dict(obj)
105
+ assert result == {"name": "tool_name", "arguments": {"name": "inner", "arguments": {}}}
106
+
107
+
108
+ def test_normalize_to_tool_call_dict_list():
109
+ """Test normalize_to_tool_call_dict with list of dicts."""
110
+ obj = [
111
+ {"name": "tool1", "arguments": {"a": 1}},
112
+ {"name": "tool2", "arguments": {"b": 2}},
113
+ ]
114
+ result = normalize_to_tool_call_dict(obj)
115
+ assert len(result) == 2
116
+ assert result[0] == {"name": "tool1", "arguments": {"a": 1}}
117
+ assert result[1] == {"name": "tool2", "arguments": {"b": 2}}
118
+
119
+
120
+ def test_normalize_to_tool_call_dict_list_shorthand():
121
+ """Test normalize_to_tool_call_dict with list of shorthand dicts."""
122
+ obj = [
123
+ {"tool1": {"name": "inner1", "arguments": {}}},
124
+ {"tool2": {"name": "inner2", "arguments": {}}},
125
+ ]
126
+ result = normalize_to_tool_call_dict(obj)
127
+ assert len(result) == 2
128
+ assert result[0]["name"] == "tool1"
129
+ assert result[1]["name"] == "tool2"
130
+
131
+
132
+ def test_normalize_to_tool_call_dict_non_dict_non_list():
133
+ """Test normalize_to_tool_call_dict with non-dict, non-list value."""
134
+ assert normalize_to_tool_call_dict("string") == "string"
135
+ assert normalize_to_tool_call_dict(123) == 123
136
+
137
+
138
+ def test_normalize_to_tool_call_dict_empty_list():
139
+ """Test normalize_to_tool_call_dict with empty list."""
140
+ assert normalize_to_tool_call_dict([]) == []
141
+
142
+
143
+ def test_normalize_to_tool_call_dict_complex_nested():
144
+ """Test normalize_to_tool_call_dict with complex nested structure."""
145
+ obj = {
146
+ "outer_tool": {
147
+ "name": "middle_tool",
148
+ "arguments": {"name": "inner_tool", "arguments": {"x": 1}},
149
+ }
150
+ }
151
+ result = normalize_to_tool_call_dict(obj)
152
+ assert result["name"] == "outer_tool"
153
+ assert result["arguments"]["name"] == "middle_tool"
154
+ assert result["arguments"]["arguments"]["name"] == "inner_tool"
hud/utils/tests/test_version.py CHANGED
@@ -5,4 +5,4 @@ def test_import():
5
5
  """Test that the package can be imported."""
6
6
  import hud
7
7
 
8
- assert hud.__version__ == "0.4.45"
8
+ assert hud.__version__ == "0.5.1"
hud/utils/types.py ADDED
@@ -0,0 +1,20 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any, ParamSpec, TypeVar
4
+
5
+ if TYPE_CHECKING:
6
+ from collections.abc import Callable
7
+
8
+ P = ParamSpec("P")
9
+ R = TypeVar("R")
10
+
11
+
12
+ def with_signature(
13
+ params_cls: Callable[P, Any],
14
+ ) -> Callable[[Callable[..., R]], Callable[P, R]]:
15
+ """Decorator that gives a method the signature of a Pydantic model."""
16
+
17
+ def decorator(method: Callable[..., R]) -> Callable[P, R]:
18
+ return method # type: ignore[return-value]
19
+
20
+ return decorator
hud/version.py CHANGED
@@ -4,4 +4,4 @@ Version information for the HUD SDK.
4
4
 
5
5
  from __future__ import annotations
6
6
 
7
- __version__ = "0.4.45"
7
+ __version__ = "0.5.1"
hud_python-0.5.1.dist-info/METADATA ADDED
@@ -0,0 +1,264 @@
1
+ Metadata-Version: 2.4
2
+ Name: hud-python
3
+ Version: 0.5.1
4
+ Summary: SDK for the HUD platform.
5
+ Project-URL: Homepage, https://github.com/hud-evals/hud-python
6
+ Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
7
+ Project-URL: Documentation, https://docs.hud.ai
8
+ Author-email: HUD <founders@hud.ai>
9
+ License: MIT License
10
+
11
+ Copyright (c) 2025 Human Union Data, Inc
12
+
13
+ Permission is hereby granted, free of charge, to any person obtaining a copy
14
+ of this software and associated documentation files (the "Software"), to deal
15
+ in the Software without restriction, including without limitation the rights
16
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17
+ copies of the Software, and to permit persons to whom the Software is
18
+ furnished to do so, subject to the following conditions:
19
+
20
+ The above copyright notice and this permission notice shall be included in all
21
+ copies or substantial portions of the Software.
22
+
23
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
29
+ SOFTWARE.
30
+ License-File: LICENSE
31
+ Classifier: Development Status :: 4 - Beta
32
+ Classifier: Intended Audience :: Developers
33
+ Classifier: Programming Language :: Python :: 3
34
+ Classifier: Programming Language :: Python :: 3.11
35
+ Classifier: Programming Language :: Python :: 3.12
36
+ Classifier: Programming Language :: Python :: 3.13
37
+ Requires-Python: <3.13,>=3.11
38
+ Requires-Dist: blessed>=1.20.0
39
+ Requires-Dist: fastmcp==2.13.3
40
+ Requires-Dist: httpx<1,>=0.23.0
41
+ Requires-Dist: mcp<1.23,>1.21.1
42
+ Requires-Dist: openai>=2.8.1
43
+ Requires-Dist: packaging>=21.0
44
+ Requires-Dist: prompt-toolkit==3.0.51
45
+ Requires-Dist: pydantic-settings<3,>=2.2
46
+ Requires-Dist: pydantic<3,>=2.6
47
+ Requires-Dist: questionary==2.1.0
48
+ Requires-Dist: rich>=13.0.0
49
+ Requires-Dist: scarf-sdk>=0.1.0
50
+ Requires-Dist: toml>=0.10.2
51
+ Requires-Dist: typer>=0.9.0
52
+ Requires-Dist: watchfiles>=0.21.0
53
+ Provides-Extra: agent
54
+ Requires-Dist: anthropic>=0.75; extra == 'agent'
55
+ Requires-Dist: datasets>=2.14.0; extra == 'agent'
56
+ Requires-Dist: google-genai; extra == 'agent'
57
+ Requires-Dist: langchain>=1.1.0; extra == 'agent'
58
+ Requires-Dist: mcp-use==1.5.0; extra == 'agent'
59
+ Requires-Dist: openai-agents; extra == 'agent'
60
+ Requires-Dist: pillow>=11.1.0; extra == 'agent'
61
+ Requires-Dist: tornado>=6.5.2; extra == 'agent'
62
+ Provides-Extra: agents
63
+ Requires-Dist: anthropic>=0.75; extra == 'agents'
64
+ Requires-Dist: datasets>=2.14.0; extra == 'agents'
65
+ Requires-Dist: google-genai; extra == 'agents'
66
+ Requires-Dist: langchain>=1.1.0; extra == 'agents'
67
+ Requires-Dist: mcp-use==1.5.0; extra == 'agents'
68
+ Requires-Dist: openai-agents; extra == 'agents'
69
+ Requires-Dist: pillow>=11.1.0; extra == 'agents'
70
+ Requires-Dist: tornado>=6.5.2; extra == 'agents'
71
+ Provides-Extra: bedrock
72
+ Requires-Dist: anthropic[bedrock]>=0.75; extra == 'bedrock'
73
+ Provides-Extra: dev
74
+ Requires-Dist: anthropic>=0.75; extra == 'dev'
75
+ Requires-Dist: datasets>=2.14.0; extra == 'dev'
76
+ Requires-Dist: dotenv>=0.9.9; extra == 'dev'
77
+ Requires-Dist: google-adk; extra == 'dev'
78
+ Requires-Dist: google-genai; extra == 'dev'
79
+ Requires-Dist: ipykernel; extra == 'dev'
80
+ Requires-Dist: ipython<9; extra == 'dev'
81
+ Requires-Dist: jupyter-client; extra == 'dev'
82
+ Requires-Dist: jupyter-core; extra == 'dev'
83
+ Requires-Dist: langchain>=1.1.0; extra == 'dev'
84
+ Requires-Dist: llama-index-core; extra == 'dev'
85
+ Requires-Dist: mcp-use==1.5.0; extra == 'dev'
86
+ Requires-Dist: openai-agents; extra == 'dev'
87
+ Requires-Dist: pillow>=11.1.0; extra == 'dev'
88
+ Requires-Dist: playwright; extra == 'dev'
89
+ Requires-Dist: pyautogui>=0.9.54; extra == 'dev'
90
+ Requires-Dist: pyright==1.1.407; extra == 'dev'
91
+ Requires-Dist: pytest-asyncio; extra == 'dev'
92
+ Requires-Dist: pytest-cov; extra == 'dev'
93
+ Requires-Dist: pytest-mock; extra == 'dev'
94
+ Requires-Dist: pytest<9,>=8.1.1; extra == 'dev'
95
+ Requires-Dist: ruff>=0.11.8; extra == 'dev'
96
+ Requires-Dist: tornado>=6.5.2; extra == 'dev'
97
+ Description-Content-Type: text/markdown
98
+
99
+ <div align="left">
100
+ <picture>
101
+ <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/logo/hud_logo_dark.svg">
102
+ <source media="(prefers-color-scheme: light)" srcset="https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/logo/hud_logo.svg">
103
+ <img src="https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/logo/hud_logo.svg" alt="HUD" width="150" style="margin-bottom: 24px;"/>
104
+ </picture>
105
+ </div>
106
+
107
+ The HUD SDK is an open-source Python toolkit for building, evaluating, and training AI agents. Use a unified API for any model provider, wrap your code as MCP environments, run A/B evals at scale, and train with reinforcement learning.
108
+
109
+ To learn more, check out our [Documentation](https://docs.hud.ai) and [API Reference](https://docs.hud.ai/reference).
110
+
111
+ [![PyPI](https://img.shields.io/pypi/v/hud-python?style=flat-square)](https://pypi.org/project/hud-python/)
112
+ [![License](https://img.shields.io/badge/license-MIT-green?style=flat-square)](LICENSE)
113
+ [![Add docs to Cursor](https://img.shields.io/badge/Add%20docs%20to-Cursor-black?style=flat-square)](https://cursor.com/en/install-mcp?name=docs-hud-python&config=eyJ1cmwiOiJodHRwczovL2RvY3MuaHVkLmFpL21jcCJ9)
114
+ [![Discord](https://img.shields.io/discord/1327447144772407390?label=Discord&logo=discord&style=flat-square)](https://discord.gg/wkjtmHYYjm)
115
+ [![X Follow](https://img.shields.io/twitter/follow/hud_evals?style=social)](https://x.com/intent/user?screen_name=hud_evals)
116
+ [![Shop](https://img.shields.io/badge/_-white.svg?label=shop&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAAJCAYAAAAywQxIAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAACxMAAAsTAQCanBgAAAF6SURBVChTlZA9ixNhFIWf8yaTpFHRRMXCKpAZhCAYFvwoLHZhwUKw9A9YCJb+Bq0sxGbBQrTxX1j41dvIRAjGZbdwRUUGIzPMeyw2swS3WZ/ynHvP5VylafoAWAd+5Xm+wX+SpukmcMf29RDCZrD9BViz3f53+CjYngKZpD5A2/Y7SQBMJpOkKIprdV1vdzqdHzHGblmW9Ww2+5pl2TmAxWKxmM/nP8fj8cmqqtZijJ9sb0u6ABBWjh0riuIt8CqE8LGu66e2d5MkeQ8QY3xme7fb7T4ZjUbrZVl+jjFuSXoEXGxCDgIl9WzfAO5LSmzvNB771R6vzG4Bx0MIt/M8vwV8aLyDQNt70+n0G1AspaTxVln+aghQluVsKbvxVysflT9NQK/XO7R/SGiQ9Nt2aftElmWXJd1kv0kbeANQVdWl4XB4XtJouXaqNRgMHkrqS+r0+/3XwD1JXdungRfAVWBi+6WkK8D3EMJz22cl3W21WgNgx3YAzvwFd0Chdq03gKUAAAAASUVORK5CYII=&style=social)](https://shop.hud.ai)
117
+ [![Scarf](https://static.scarf.sh/a.png?x-pxid=6530ff33-4945-452b-81f9-626872593933)](https://scarf.sh)
118
+ [![Docs](https://img.shields.io/badge/docs-hud.ai-blue?style=flat-square)](https://docs.hud.ai)
119
+
120
+ ## Install
121
+
122
+ ```bash
123
+ pip install hud-python
124
+ ```
125
+
126
+ Get your API key at [hud.ai](https://hud.ai) and set it:
127
+
128
+ ```bash
129
+ export HUD_API_KEY=your-key-here
130
+ ```
131
+
132
+ > For CLI tools (`hud init`, `hud dev`, etc.): `uv tool install hud-python --python 3.12`
133
+
134
+ ![Agent running on SheetBench](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/trace_sheet.gif)
135
+
136
+ ## Usage
137
+
138
+ ### Unified Model API
139
+
140
+ Use Claude, GPT, Gemini, or Grok through one OpenAI-compatible endpoint:
141
+
142
+ ```python
143
+ from openai import AsyncOpenAI
144
+ import os
145
+
146
+ client = AsyncOpenAI(
147
+ base_url="https://inference.hud.ai",
148
+ api_key=os.environ["HUD_API_KEY"]
149
+ )
150
+
151
+ response = await client.chat.completions.create(
152
+ model="claude-sonnet-4-5", # or gpt-4o, gemini-2.5-pro (https://hud.ai/models)
153
+ messages=[{"role": "user", "content": "Hello!"}]
154
+ )
155
+ ```
156
+
157
+ Every call is traced at [hud.ai](https://hud.ai). → [Docs](https://docs.hud.ai/quick-links/gateway)
158
+
159
+ ### Environments
160
+
161
+ Turn your code into tools agents can call. Define how to evaluate them:
162
+
163
+ ```python
164
+ from hud import Environment
165
+
166
+ env = Environment("my-env")
167
+
168
+ @env.tool()
169
+ def add(a: int, b: int) -> int:
170
+ """Add two numbers."""
171
+ return a + b
172
+
173
+ @env.scenario("solve-math")
174
+ async def solve_math(problem: str, answer: int):
175
+ response = yield problem # Prompt
176
+ yield 1.0 if str(answer) in response else 0.0 # Reward
177
+
178
+ async with env("solve-math", problem="What is 2+2?", answer=4) as ctx:
179
+ # Your agent logic here - call tools, get response
180
+ result = await ctx.call_tool("add", a=2, b=2)
181
+ await ctx.submit(f"The answer is {result}")
182
+
183
+ print(ctx.reward) # 1.0
184
+ ```
185
+
186
+ The agent runs between the yields. First yield sends the prompt, second yield scores the result. → [Docs](https://docs.hud.ai/quick-links/environments) · [Templates](https://hud.ai/environments)
187
+
188
+ ### A/B Evals
189
+
190
+ Test different models. Repeat runs to see the distribution:
191
+
192
+ ```python
193
+ from openai import AsyncOpenAI
194
+ import os
195
+
196
+ client = AsyncOpenAI(
197
+ base_url="https://inference.hud.ai",
198
+ api_key=os.environ["HUD_API_KEY"]
199
+ )
200
+
201
+ # Using the env from above
202
+ async with env("solve-math", problem="What is 2+2?", answer=4, variants={"model": ["gpt-4o", "claude-sonnet-4-5"]}, group=5) as ctx:
203
+ response = await client.chat.completions.create(
204
+ model=ctx.variants["model"],
205
+ messages=[{"role": "user", "content": ctx.prompt}],
206
+ tools=ctx.tools # Environment tools available to the model
207
+ )
208
+ await ctx.submit(response.choices[0].message.content)
209
+ ```
210
+
211
+ **Variants** test configurations. **Groups** repeat for distribution. Results stream to [hud.ai](https://hud.ai). → [Docs](https://docs.hud.ai/quick-links/ab-testing)
212
+
213
+ ### Deploy & Train
214
+
215
+ Push to GitHub, connect on hud.ai, run at scale:
216
+
217
+ ```bash
218
+ hud init # Scaffold environment
219
+ git push # Push to GitHub
220
+ # Connect on hud.ai → New → Environment
221
+ hud eval my-eval --model gpt-4o --group-size 100
222
+ # Or create and run tasks on the platform
223
+ ```
224
+
225
+ Every run generates training data. Use it to fine-tune or run RL. → [Docs](https://docs.hud.ai/quick-links/deploy)
226
+
227
+ ## Links
228
+
229
+ - 📖 [Documentation](https://docs.hud.ai)
230
+ - ⌨️ [CLI Reference](https://docs.hud.ai/reference/cli/overview)
231
+ - 🏆 [Leaderboards](https://hud.ai/leaderboards)
232
+ - 🌐 [Environment Templates](https://hud.ai/environments)
233
+ - 🤖 [Supported Models](https://hud.ai/models)
234
+ - 💬 [Discord](https://discord.gg/wkjtmHYYjm)
235
+
236
+ ## Enterprise
237
+
238
+ Building agents at scale? We work with teams on custom environments, benchmarks, and training.
239
+
240
+ [📅 Book a call](https://cal.com/jay-hud) · [📧 founders@hud.ai](mailto:founders@hud.ai)
241
+
242
+ ## Contributing
243
+
244
+ We welcome contributions! See [CONTRIBUTING.md](CONTRIBUTING.md).
245
+
246
+ Key areas: [Agents](hud/agents/) · [Tools](hud/tools/) · [Environments](https://hud.ai/environments)
247
+
248
+ <a href="https://github.com/hud-evals/hud-python/graphs/contributors">
249
+ <img src="https://contrib.rocks/image?repo=hud-evals/hud-python&max=50" />
250
+ </a>
251
+
252
+ ## Citation
253
+
254
+ ```bibtex
255
+ @software{hud2025agentevalplatform,
256
+ author = {HUD and Jay Ram and Lorenss Martinsons and Parth Patel and Govind Pimpale and Dylan Bowman and Jaideep and Nguyen Nhat Minh},
257
+ title = {HUD: An Evaluation and RL Environments Platform for Agents},
258
+ date = {2025-04},
259
+ url = {https://github.com/hud-evals/hud-python},
260
+ langid = {en}
261
+ }
262
+ ```
263
+
264
+ MIT License · [LICENSE](LICENSE)