hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (282)
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +70 -5
  3. hud/agents/base.py +238 -500
  4. hud/agents/claude.py +236 -247
  5. hud/agents/gateway.py +42 -0
  6. hud/agents/gemini.py +264 -0
  7. hud/agents/gemini_cua.py +324 -0
  8. hud/agents/grounded_openai.py +98 -100
  9. hud/agents/misc/integration_test_agent.py +51 -20
  10. hud/agents/misc/response_agent.py +48 -36
  11. hud/agents/openai.py +282 -296
  12. hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
  13. hud/agents/operator.py +199 -0
  14. hud/agents/resolver.py +70 -0
  15. hud/agents/tests/conftest.py +133 -0
  16. hud/agents/tests/test_base.py +300 -622
  17. hud/agents/tests/test_base_runtime.py +233 -0
  18. hud/agents/tests/test_claude.py +381 -214
  19. hud/agents/tests/test_client.py +9 -10
  20. hud/agents/tests/test_gemini.py +369 -0
  21. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  22. hud/agents/tests/test_openai.py +377 -140
  23. hud/agents/tests/test_operator.py +362 -0
  24. hud/agents/tests/test_resolver.py +192 -0
  25. hud/agents/tests/test_run_eval.py +179 -0
  26. hud/agents/types.py +148 -0
  27. hud/cli/__init__.py +493 -546
  28. hud/cli/analyze.py +43 -5
  29. hud/cli/build.py +699 -113
  30. hud/cli/debug.py +8 -5
  31. hud/cli/dev.py +889 -732
  32. hud/cli/eval.py +793 -667
  33. hud/cli/flows/dev.py +167 -0
  34. hud/cli/flows/init.py +191 -0
  35. hud/cli/flows/tasks.py +153 -56
  36. hud/cli/flows/templates.py +151 -0
  37. hud/cli/flows/tests/__init__.py +1 -0
  38. hud/cli/flows/tests/test_dev.py +126 -0
  39. hud/cli/init.py +60 -58
  40. hud/cli/pull.py +1 -1
  41. hud/cli/push.py +38 -13
  42. hud/cli/rft.py +311 -0
  43. hud/cli/rft_status.py +145 -0
  44. hud/cli/tests/test_analyze.py +5 -5
  45. hud/cli/tests/test_analyze_metadata.py +3 -2
  46. hud/cli/tests/test_analyze_module.py +120 -0
  47. hud/cli/tests/test_build.py +110 -8
  48. hud/cli/tests/test_build_failure.py +41 -0
  49. hud/cli/tests/test_build_module.py +50 -0
  50. hud/cli/tests/test_cli_init.py +6 -1
  51. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  52. hud/cli/tests/test_cli_root.py +140 -0
  53. hud/cli/tests/test_convert.py +361 -0
  54. hud/cli/tests/test_debug.py +12 -10
  55. hud/cli/tests/test_dev.py +197 -0
  56. hud/cli/tests/test_eval.py +251 -0
  57. hud/cli/tests/test_eval_bedrock.py +51 -0
  58. hud/cli/tests/test_init.py +124 -0
  59. hud/cli/tests/test_main_module.py +11 -5
  60. hud/cli/tests/test_mcp_server.py +12 -100
  61. hud/cli/tests/test_push.py +1 -1
  62. hud/cli/tests/test_push_happy.py +74 -0
  63. hud/cli/tests/test_push_wrapper.py +23 -0
  64. hud/cli/tests/test_registry.py +1 -1
  65. hud/cli/tests/test_utils.py +1 -1
  66. hud/cli/{rl → utils}/celebrate.py +14 -12
  67. hud/cli/utils/config.py +18 -1
  68. hud/cli/utils/docker.py +130 -4
  69. hud/cli/utils/env_check.py +9 -9
  70. hud/cli/utils/git.py +136 -0
  71. hud/cli/utils/interactive.py +39 -5
  72. hud/cli/utils/metadata.py +70 -1
  73. hud/cli/utils/runner.py +1 -1
  74. hud/cli/utils/server.py +2 -2
  75. hud/cli/utils/source_hash.py +3 -3
  76. hud/cli/utils/tasks.py +4 -1
  77. hud/cli/utils/tests/__init__.py +0 -0
  78. hud/cli/utils/tests/test_config.py +58 -0
  79. hud/cli/utils/tests/test_docker.py +93 -0
  80. hud/cli/utils/tests/test_docker_hints.py +71 -0
  81. hud/cli/utils/tests/test_env_check.py +74 -0
  82. hud/cli/utils/tests/test_environment.py +42 -0
  83. hud/cli/utils/tests/test_git.py +142 -0
  84. hud/cli/utils/tests/test_interactive_module.py +60 -0
  85. hud/cli/utils/tests/test_local_runner.py +50 -0
  86. hud/cli/utils/tests/test_logging_utils.py +23 -0
  87. hud/cli/utils/tests/test_metadata.py +49 -0
  88. hud/cli/utils/tests/test_package_runner.py +35 -0
  89. hud/cli/utils/tests/test_registry_utils.py +49 -0
  90. hud/cli/utils/tests/test_remote_runner.py +25 -0
  91. hud/cli/utils/tests/test_runner_modules.py +52 -0
  92. hud/cli/utils/tests/test_source_hash.py +36 -0
  93. hud/cli/utils/tests/test_tasks.py +80 -0
  94. hud/cli/utils/version_check.py +258 -0
  95. hud/cli/{rl → utils}/viewer.py +2 -2
  96. hud/clients/README.md +12 -11
  97. hud/clients/__init__.py +4 -3
  98. hud/clients/base.py +166 -26
  99. hud/clients/environment.py +51 -0
  100. hud/clients/fastmcp.py +13 -6
  101. hud/clients/mcp_use.py +45 -15
  102. hud/clients/tests/test_analyze_scenarios.py +206 -0
  103. hud/clients/tests/test_protocol.py +9 -3
  104. hud/datasets/__init__.py +23 -20
  105. hud/datasets/loader.py +326 -0
  106. hud/datasets/runner.py +198 -105
  107. hud/datasets/tests/__init__.py +0 -0
  108. hud/datasets/tests/test_loader.py +221 -0
  109. hud/datasets/tests/test_utils.py +315 -0
  110. hud/datasets/utils.py +270 -90
  111. hud/environment/__init__.py +52 -0
  112. hud/environment/connection.py +258 -0
  113. hud/environment/connectors/__init__.py +33 -0
  114. hud/environment/connectors/base.py +68 -0
  115. hud/environment/connectors/local.py +177 -0
  116. hud/environment/connectors/mcp_config.py +137 -0
  117. hud/environment/connectors/openai.py +101 -0
  118. hud/environment/connectors/remote.py +172 -0
  119. hud/environment/environment.py +835 -0
  120. hud/environment/integrations/__init__.py +45 -0
  121. hud/environment/integrations/adk.py +67 -0
  122. hud/environment/integrations/anthropic.py +196 -0
  123. hud/environment/integrations/gemini.py +92 -0
  124. hud/environment/integrations/langchain.py +82 -0
  125. hud/environment/integrations/llamaindex.py +68 -0
  126. hud/environment/integrations/openai.py +238 -0
  127. hud/environment/mock.py +306 -0
  128. hud/environment/router.py +263 -0
  129. hud/environment/scenarios.py +620 -0
  130. hud/environment/tests/__init__.py +1 -0
  131. hud/environment/tests/test_connection.py +317 -0
  132. hud/environment/tests/test_connectors.py +205 -0
  133. hud/environment/tests/test_environment.py +593 -0
  134. hud/environment/tests/test_integrations.py +257 -0
  135. hud/environment/tests/test_local_connectors.py +242 -0
  136. hud/environment/tests/test_scenarios.py +1086 -0
  137. hud/environment/tests/test_tools.py +208 -0
  138. hud/environment/types.py +23 -0
  139. hud/environment/utils/__init__.py +35 -0
  140. hud/environment/utils/formats.py +215 -0
  141. hud/environment/utils/schema.py +171 -0
  142. hud/environment/utils/tool_wrappers.py +113 -0
  143. hud/eval/__init__.py +67 -0
  144. hud/eval/context.py +727 -0
  145. hud/eval/display.py +299 -0
  146. hud/eval/instrument.py +187 -0
  147. hud/eval/manager.py +533 -0
  148. hud/eval/parallel.py +268 -0
  149. hud/eval/task.py +372 -0
  150. hud/eval/tests/__init__.py +1 -0
  151. hud/eval/tests/test_context.py +178 -0
  152. hud/eval/tests/test_eval.py +210 -0
  153. hud/eval/tests/test_manager.py +152 -0
  154. hud/eval/tests/test_parallel.py +168 -0
  155. hud/eval/tests/test_task.py +291 -0
  156. hud/eval/types.py +65 -0
  157. hud/eval/utils.py +194 -0
  158. hud/patches/__init__.py +19 -0
  159. hud/patches/mcp_patches.py +308 -0
  160. hud/patches/warnings.py +54 -0
  161. hud/samples/browser.py +4 -4
  162. hud/server/__init__.py +2 -1
  163. hud/server/low_level.py +2 -1
  164. hud/server/router.py +164 -0
  165. hud/server/server.py +567 -80
  166. hud/server/tests/test_mcp_server_integration.py +11 -11
  167. hud/server/tests/test_mcp_server_more.py +1 -1
  168. hud/server/tests/test_server_extra.py +2 -0
  169. hud/settings.py +45 -3
  170. hud/shared/exceptions.py +36 -10
  171. hud/shared/hints.py +26 -1
  172. hud/shared/requests.py +15 -3
  173. hud/shared/tests/test_exceptions.py +40 -31
  174. hud/shared/tests/test_hints.py +167 -0
  175. hud/telemetry/__init__.py +20 -19
  176. hud/telemetry/exporter.py +201 -0
  177. hud/telemetry/instrument.py +165 -253
  178. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  179. hud/telemetry/tests/test_exporter.py +258 -0
  180. hud/telemetry/tests/test_instrument.py +401 -0
  181. hud/tools/__init__.py +18 -2
  182. hud/tools/agent.py +223 -0
  183. hud/tools/apply_patch.py +639 -0
  184. hud/tools/base.py +54 -4
  185. hud/tools/bash.py +2 -2
  186. hud/tools/computer/__init__.py +36 -3
  187. hud/tools/computer/anthropic.py +2 -2
  188. hud/tools/computer/gemini.py +385 -0
  189. hud/tools/computer/hud.py +23 -6
  190. hud/tools/computer/openai.py +20 -21
  191. hud/tools/computer/qwen.py +434 -0
  192. hud/tools/computer/settings.py +37 -0
  193. hud/tools/edit.py +3 -7
  194. hud/tools/executors/base.py +4 -2
  195. hud/tools/executors/pyautogui.py +1 -1
  196. hud/tools/grounding/grounded_tool.py +13 -18
  197. hud/tools/grounding/grounder.py +10 -31
  198. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  199. hud/tools/jupyter.py +330 -0
  200. hud/tools/playwright.py +18 -3
  201. hud/tools/shell.py +308 -0
  202. hud/tools/tests/test_agent_tool.py +355 -0
  203. hud/tools/tests/test_apply_patch.py +718 -0
  204. hud/tools/tests/test_computer.py +4 -9
  205. hud/tools/tests/test_computer_actions.py +24 -2
  206. hud/tools/tests/test_jupyter_tool.py +181 -0
  207. hud/tools/tests/test_shell.py +596 -0
  208. hud/tools/tests/test_submit.py +85 -0
  209. hud/tools/tests/test_types.py +193 -0
  210. hud/tools/types.py +21 -1
  211. hud/types.py +194 -56
  212. hud/utils/__init__.py +2 -0
  213. hud/utils/env.py +67 -0
  214. hud/utils/hud_console.py +89 -18
  215. hud/utils/mcp.py +15 -58
  216. hud/utils/strict_schema.py +162 -0
  217. hud/utils/tests/test_init.py +1 -2
  218. hud/utils/tests/test_mcp.py +1 -28
  219. hud/utils/tests/test_pretty_errors.py +186 -0
  220. hud/utils/tests/test_tool_shorthand.py +154 -0
  221. hud/utils/tests/test_version.py +1 -1
  222. hud/utils/types.py +20 -0
  223. hud/version.py +1 -1
  224. hud_python-0.5.13.dist-info/METADATA +264 -0
  225. hud_python-0.5.13.dist-info/RECORD +305 -0
  226. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
  227. hud/agents/langchain.py +0 -261
  228. hud/agents/lite_llm.py +0 -72
  229. hud/cli/rl/__init__.py +0 -180
  230. hud/cli/rl/config.py +0 -101
  231. hud/cli/rl/display.py +0 -133
  232. hud/cli/rl/gpu.py +0 -63
  233. hud/cli/rl/gpu_utils.py +0 -321
  234. hud/cli/rl/local_runner.py +0 -595
  235. hud/cli/rl/presets.py +0 -96
  236. hud/cli/rl/remote_runner.py +0 -463
  237. hud/cli/rl/rl_api.py +0 -150
  238. hud/cli/rl/vllm.py +0 -177
  239. hud/cli/rl/wait_utils.py +0 -89
  240. hud/datasets/parallel.py +0 -687
  241. hud/misc/__init__.py +0 -1
  242. hud/misc/claude_plays_pokemon.py +0 -292
  243. hud/otel/__init__.py +0 -35
  244. hud/otel/collector.py +0 -142
  245. hud/otel/config.py +0 -181
  246. hud/otel/context.py +0 -570
  247. hud/otel/exporters.py +0 -369
  248. hud/otel/instrumentation.py +0 -135
  249. hud/otel/processors.py +0 -121
  250. hud/otel/tests/__init__.py +0 -1
  251. hud/otel/tests/test_processors.py +0 -197
  252. hud/rl/README.md +0 -30
  253. hud/rl/__init__.py +0 -1
  254. hud/rl/actor.py +0 -176
  255. hud/rl/buffer.py +0 -405
  256. hud/rl/chat_template.jinja +0 -101
  257. hud/rl/config.py +0 -192
  258. hud/rl/distributed.py +0 -132
  259. hud/rl/learner.py +0 -637
  260. hud/rl/tests/__init__.py +0 -1
  261. hud/rl/tests/test_learner.py +0 -186
  262. hud/rl/train.py +0 -382
  263. hud/rl/types.py +0 -101
  264. hud/rl/utils/start_vllm_server.sh +0 -30
  265. hud/rl/utils.py +0 -524
  266. hud/rl/vllm_adapter.py +0 -143
  267. hud/telemetry/job.py +0 -352
  268. hud/telemetry/replay.py +0 -74
  269. hud/telemetry/tests/test_replay.py +0 -40
  270. hud/telemetry/tests/test_trace.py +0 -63
  271. hud/telemetry/trace.py +0 -158
  272. hud/utils/agent_factories.py +0 -86
  273. hud/utils/async_utils.py +0 -65
  274. hud/utils/group_eval.py +0 -223
  275. hud/utils/progress.py +0 -149
  276. hud/utils/tasks.py +0 -127
  277. hud/utils/tests/test_async_utils.py +0 -173
  278. hud/utils/tests/test_progress.py +0 -261
  279. hud_python-0.4.45.dist-info/METADATA +0 -552
  280. hud_python-0.4.45.dist-info/RECORD +0 -228
  281. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
  282. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,221 @@
1
+ """Tests for hud.datasets.loader module."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from unittest.mock import MagicMock, patch
6
+
7
+ import pytest
8
+
9
+ from hud.datasets.loader import load_tasks
10
+
11
+
12
class TestLoadTasks:
    """Exercise load_tasks() with ``httpx.Client`` and ``settings`` patched out.

    Every test replaces ``hud.datasets.loader.httpx.Client`` and
    ``hud.datasets.loader.settings`` with mocks, so no real request is made.
    """

    @staticmethod
    def _use_settings(settings_mock: MagicMock, api_key: str | None) -> None:
        """Point the patched settings at the fake API host with *api_key*."""
        settings_mock.hud_api_url = "https://api.hud.ai"
        settings_mock.api_key = api_key

    @staticmethod
    def _json_response(payload: dict) -> MagicMock:
        """Build a response mock whose ``json()`` returns *payload*."""
        reply = MagicMock()
        reply.json.return_value = payload
        reply.raise_for_status = MagicMock()
        return reply

    @staticmethod
    def _install_client(client_factory: MagicMock) -> MagicMock:
        """Make the patched client class return one context-manager mock.

        The mock returns itself from ``__enter__`` so that code using
        ``with Client(...) as c`` talks to the same object the test inspects.
        """
        http = MagicMock()
        http.__enter__.return_value = http
        http.__exit__.return_value = None
        client_factory.return_value = http
        return http

    @patch("hud.datasets.loader.httpx.Client")
    @patch("hud.datasets.loader.settings")
    def test_load_tasks_success(
        self, mock_settings: MagicMock, mock_client_class: MagicMock
    ) -> None:
        """load_tasks() successfully loads tasks from API."""
        self._use_settings(mock_settings, "test_key")
        # EvalsetTasksResponse format: tasks keyed by task ID.
        payload = {
            "evalset_id": "evalset-123",
            "evalset_name": "test-dataset",
            "tasks": {
                "task-1": {
                    "env": {"name": "test"},
                    "scenario": "checkout",
                    "args": {"user": "alice"},
                },
                "task-2": {
                    "env": {"name": "test"},
                    "scenario": "login",
                    "args": {"user": "bob"},
                },
            },
        }
        http = self._install_client(mock_client_class)
        http.get.return_value = self._json_response(payload)

        loaded = load_tasks("test-org/test-dataset")

        assert len(loaded) == 2
        # The payload is a dict keyed by ID, so compare as sets instead of
        # relying on any particular ordering of the result.
        assert {task.scenario for task in loaded} == {"checkout", "login"}
        # Task IDs are expected to come from the dict keys.
        assert {task.id for task in loaded} == {"task-1", "task-2"}
        http.get.assert_called_once_with(
            "https://api.hud.ai/tasks/evalset/test-org/test-dataset",
            headers={"Authorization": "Bearer test_key"},
            params={"all": "true"},
        )

    @patch("hud.datasets.loader.httpx.Client")
    @patch("hud.datasets.loader.settings")
    def test_load_tasks_single_task(
        self, mock_settings: MagicMock, mock_client_class: MagicMock
    ) -> None:
        """load_tasks() handles single task in EvalsetTasksResponse."""
        self._use_settings(mock_settings, "test_key")
        payload = {
            "evalset_id": "evalset-123",
            "evalset_name": "test-dataset",
            "tasks": {
                "task-1": {
                    "env": {"name": "test"},
                    "scenario": "checkout",
                    "args": {"user": "alice"},
                },
            },
        }
        http = self._install_client(mock_client_class)
        http.get.return_value = self._json_response(payload)

        loaded = load_tasks("test-org/test-dataset")

        assert len(loaded) == 1
        only_task = loaded[0]
        assert only_task.scenario == "checkout"
        assert only_task.id == "task-1"

    @patch("hud.datasets.loader.httpx.Client")
    @patch("hud.datasets.loader.settings")
    def test_load_tasks_no_api_key(
        self, mock_settings: MagicMock, mock_client_class: MagicMock
    ) -> None:
        """load_tasks() works without API key."""
        self._use_settings(mock_settings, None)
        payload = {
            "evalset_id": "evalset-123",
            "evalset_name": "test-dataset",
            "tasks": {},
        }
        http = self._install_client(mock_client_class)
        http.get.return_value = self._json_response(payload)

        loaded = load_tasks("test-org/test-dataset")

        assert len(loaded) == 0
        # With no key configured, the request must carry no auth header.
        http.get.assert_called_once_with(
            "https://api.hud.ai/tasks/evalset/test-org/test-dataset",
            headers={},
            params={"all": "true"},
        )

    @patch("hud.datasets.loader.httpx.Client")
    @patch("hud.datasets.loader.settings")
    def test_load_tasks_http_error(
        self, mock_settings: MagicMock, mock_client_class: MagicMock
    ) -> None:
        """load_tasks() raises ValueError on HTTP error."""
        import httpx

        self._use_settings(mock_settings, "test_key")
        http = self._install_client(mock_client_class)
        # The request itself fails; no response object is ever produced.
        http.get.side_effect = httpx.HTTPError("Network error")

        with pytest.raises(ValueError, match="Failed to load tasks"):
            load_tasks("test-org/test-dataset")

    @patch("hud.datasets.loader.httpx.Client")
    @patch("hud.datasets.loader.settings")
    def test_load_tasks_json_error(
        self, mock_settings: MagicMock, mock_client_class: MagicMock
    ) -> None:
        """load_tasks() raises ValueError on JSON processing error."""
        self._use_settings(mock_settings, "test_key")

        # The request succeeds, but decoding the body blows up.
        broken_reply = MagicMock()
        broken_reply.json.side_effect = Exception("Invalid JSON")
        broken_reply.raise_for_status = MagicMock()

        http = self._install_client(mock_client_class)
        http.get.return_value = broken_reply

        with pytest.raises(ValueError, match="Failed to load tasks"):
            load_tasks("test-org/test-dataset")

    @patch("hud.datasets.loader.httpx.Client")
    @patch("hud.datasets.loader.settings")
    def test_load_tasks_empty(self, mock_settings: MagicMock, mock_client_class: MagicMock) -> None:
        """load_tasks() handles empty dataset."""
        self._use_settings(mock_settings, "test_key")
        http = self._install_client(mock_client_class)
        http.get.return_value = self._json_response({"tasks": {}})

        loaded = load_tasks("test-org/test-dataset")

        assert len(loaded) == 0

    @patch("hud.datasets.loader.httpx.Client")
    @patch("hud.datasets.loader.settings")
    def test_load_tasks_missing_fields(
        self, mock_settings: MagicMock, mock_client_class: MagicMock
    ) -> None:
        """load_tasks() handles tasks with missing optional fields (but env is required)."""
        self._use_settings(mock_settings, "test_key")
        # The task entry deliberately omits "args".
        payload = {
            "tasks": {"task-1": {"env": {"name": "test-env"}, "scenario": "test"}},
        }
        http = self._install_client(mock_client_class)
        http.get.return_value = self._json_response(payload)

        loaded = load_tasks("test-org/test-dataset")

        assert len(loaded) == 1
        only_task = loaded[0]
        assert only_task.scenario == "test"
        assert only_task.id == "task-1"
        # The omitted "args" is expected to come back as an empty dict.
        assert only_task.args == {}
@@ -0,0 +1,315 @@
1
+ """Tests for hud.datasets.utils module."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from unittest.mock import AsyncMock, MagicMock, patch
6
+
7
+ import pytest
8
+
9
+ from hud.datasets.utils import (
10
+ BatchRequest,
11
+ SingleTaskRequest,
12
+ cancel_all_jobs,
13
+ cancel_job,
14
+ cancel_task,
15
+ submit_rollouts,
16
+ )
17
+ from hud.eval.display import display_results
18
+ from hud.types import AgentType, LegacyTask, Trace
19
+
20
+
21
class TestSingleTaskRequest:
    """Validation tests for the SingleTaskRequest schema."""

    @staticmethod
    def _make(task, **overrides):
        """Construct a SingleTaskRequest for *task* with the shared defaults.

        Keyword *overrides* replace or extend the default field values, so a
        test only spells out what is specific to it.
        """
        fields = {
            "agent_type": AgentType.CLAUDE,
            "job_id": "job-123",
            "task_id": "task-1",
            "trace_name": "Test",
        }
        fields.update(overrides)
        return SingleTaskRequest(task=task, **fields)

    def test_valid_request(self):
        """Test creating a valid SingleTaskRequest with v5 task."""
        built = self._make(
            {"env": {"name": "browser"}, "scenario": "checkout"},
            agent_params={"checkpoint_name": "claude-sonnet-4-5"},
            max_steps=10,
            trace_name="Test trace",
        )
        assert built.task_id == "task-1"
        assert built.agent_type == AgentType.CLAUDE

    def test_empty_job_id_rejected(self):
        """Test that empty job_id is rejected."""
        with pytest.raises(ValueError, match="job_id must be a non-empty string"):
            self._make({"prompt": "test", "mcp_config": {}}, job_id="")

    def test_invalid_task_rejected(self):
        """Test that invalid task payload is rejected (neither v4 nor v5)."""
        # The payload carries none of the required task fields.
        bogus_task = {"invalid_field": "test"}
        with pytest.raises(ValueError, match="Task must have 'env'"):
            self._make(bogus_task)

    def test_incomplete_v4_task_rejected(self):
        """Test that incomplete v4 task (missing evaluate_tool) is rejected."""
        # prompt + mcp_config are present but evaluate_tool is not; the
        # expected error message shows this is treated as an incomplete v4 task.
        partial_v4 = {
            "prompt": "test",
            "mcp_config": {"server": {"url": "http://localhost"}},
        }
        with pytest.raises(ValueError, match="v4 task missing required fields"):
            self._make(partial_v4)

    def test_valid_v4_task_accepted(self):
        """Test that complete v4 task is accepted."""
        complete_v4 = {
            "prompt": "test",
            "mcp_config": {"server": {"url": "http://localhost"}},
            "evaluate_tool": {"name": "check", "arguments": {}},
        }
        built = self._make(complete_v4)
        assert built.task_id == "task-1"

    def test_valid_v5_task_accepted(self):
        """Test that v5 task with env is accepted."""
        built = self._make({"env": {"name": "browser"}, "scenario": "login"})
        assert built.task_id == "task-1"
102
+
103
+
104
class TestBatchRequest:
    """Tests for the BatchRequest schema."""

    def test_valid_batch(self):
        """Test creating a valid batch request."""
        members = []
        for index in range(3):
            # Each member shares the job but has its own task ID and trace name.
            members.append(
                SingleTaskRequest(
                    task={"env": {"name": "browser"}, "scenario": "test"},
                    agent_type=AgentType.CLAUDE,
                    job_id="job-123",
                    task_id=f"task-{index}",
                    trace_name=f"Trace {index}",
                )
            )

        batch = BatchRequest(requests=members)

        assert len(batch.requests) == 3
121
+
122
+
123
class TestCancellationFunctions:
    """Tests for cancel_task, cancel_job and cancel_all_jobs.

    ``hud.datasets.utils.httpx.AsyncClient`` and ``hud.datasets.utils.settings``
    are patched in every test, so no real HTTP request is issued.
    """

    @staticmethod
    def _post_client(client_cls: MagicMock, payload: dict) -> AsyncMock:
        """Wire *client_cls* to return an async client whose POST yields *payload*.

        The client returns itself from ``__aenter__`` so code using
        ``async with AsyncClient(...) as c`` reaches the mock the test inspects.
        """
        reply = MagicMock()
        reply.json.return_value = payload
        reply.raise_for_status = MagicMock()

        http = AsyncMock()
        http.post.return_value = reply
        http.__aenter__.return_value = http
        http.__aexit__.return_value = None
        client_cls.return_value = http
        return http

    @staticmethod
    def _use_settings(settings_mock: MagicMock) -> None:
        """Give the patched settings a fake API host and key."""
        settings_mock.hud_api_url = "https://api.hud.ai"
        settings_mock.api_key = "test-key"

    @pytest.mark.asyncio
    async def test_cancel_task(self):
        """Test cancel_task makes correct API call."""
        with patch("hud.datasets.utils.httpx.AsyncClient") as client_cls, patch(
            "hud.datasets.utils.settings"
        ) as settings_mock:
            http = self._post_client(client_cls, {"cancelled": True, "task_id": "task-1"})
            self._use_settings(settings_mock)

            outcome = await cancel_task("job-123", "task-1")

            assert outcome["cancelled"] is True
            http.post.assert_called_once()
            recorded = http.post.call_args
            # First positional argument is the URL; the body is passed as json=.
            assert "cancel" in recorded.args[0]
            body = recorded.kwargs["json"]
            assert body["job_id"] == "job-123"
            assert body["task_id"] == "task-1"

    @pytest.mark.asyncio
    async def test_cancel_job(self):
        """Test cancel_job makes correct API call."""
        with patch("hud.datasets.utils.httpx.AsyncClient") as client_cls, patch(
            "hud.datasets.utils.settings"
        ) as settings_mock:
            http = self._post_client(client_cls, {"cancelled": 5, "job_id": "job-123"})
            self._use_settings(settings_mock)

            outcome = await cancel_job("job-123")

            assert outcome["cancelled"] == 5
            http.post.assert_called_once()

    @pytest.mark.asyncio
    async def test_cancel_all_jobs(self):
        """Test cancel_all_jobs makes correct API call."""
        with patch("hud.datasets.utils.httpx.AsyncClient") as client_cls, patch(
            "hud.datasets.utils.settings"
        ) as settings_mock:
            self._post_client(client_cls, {"jobs_cancelled": 3, "total_tasks_cancelled": 10})
            self._use_settings(settings_mock)

            outcome = await cancel_all_jobs()

            assert outcome["jobs_cancelled"] == 3
            assert outcome["total_tasks_cancelled"] == 10
198
+
199
+
200
class TestDisplayResults:
    """Smoke tests for display_results: each call only has to complete."""

    def test_display_with_traces(self):
        """Test displaying single-run trace results."""
        first = LegacyTask(id="t1", prompt="Test task 1", mcp_config={})
        second = LegacyTask(id="t2", prompt="Test task 2", mcp_config={})
        traces = [Trace(reward=0.9, done=True), Trace(reward=0.5, done=True)]

        # Passing means no exception was raised; nothing is returned to check.
        display_results(traces, tasks=[first, second])

    def test_display_with_group_stats(self):
        """Test displaying group statistics."""
        task_list = [LegacyTask(id="t1", prompt="Test task 1", mcp_config={})]
        # Results given as a plain dict of per-task statistics, not a Trace.
        stats = {
            "task_id": "t1",
            "prompt": "Test task 1",
            "mean_reward": 0.85,
            "std_reward": 0.1,
            "min_reward": 0.7,
            "max_reward": 1.0,
            "success_rate": 0.9,
            "group_size": 3,
            "rewards": [0.8, 0.85, 0.9],
        }

        # Passing means no exception was raised.
        display_results([stats], tasks=task_list)

    def test_display_empty_results(self):
        """Test displaying when no valid results."""
        task_list = [LegacyTask(prompt="Test", mcp_config={})]
        # A lone None entry stands in for a task that produced no trace.
        missing: list[Trace | None] = [None]

        # Passing means no exception was raised.
        display_results(missing, tasks=task_list)
246
+
247
+
248
class TestSubmitRollouts:
    """Tests for submit_rollouts with the HTTP client and settings patched."""

    @staticmethod
    def _post_client(client_cls: MagicMock, payload: dict) -> AsyncMock:
        """Wire *client_cls* to return an async client whose POST yields *payload*.

        The client returns itself from ``__aenter__`` so code using
        ``async with AsyncClient(...) as c`` reaches the mock the test inspects.
        """
        reply = MagicMock()
        reply.json.return_value = payload
        reply.raise_for_status = MagicMock()

        http = AsyncMock()
        http.post.return_value = reply
        http.__aenter__.return_value = http
        http.__aexit__.return_value = None
        client_cls.return_value = http
        return http

    @staticmethod
    def _use_settings(settings_mock: MagicMock) -> None:
        """Give the patched settings a fake API host and key."""
        settings_mock.hud_api_url = "https://api.hud.ai"
        settings_mock.api_key = "test-key"

    @pytest.mark.asyncio
    async def test_submit_single_task(self):
        """Test submitting a single task (v5 format)."""
        from hud.eval.task import Task

        single = Task(env={"name": "browser"}, scenario="test", id="task-1")

        with patch("hud.datasets.utils.httpx.AsyncClient") as client_cls, patch(
            "hud.datasets.utils.settings"
        ) as settings_mock:
            http = self._post_client(client_cls, {"accepted": 1, "rejected": 0})
            self._use_settings(settings_mock)

            # The return value is not used; only the outgoing POST is checked.
            await submit_rollouts(
                tasks=[single],
                agent_type=AgentType.CLAUDE,
                job_id="job-123",
            )

            http.post.assert_called_once()

    @pytest.mark.asyncio
    async def test_submit_with_group_size(self):
        """Test submitting with group_size > 1 creates multiple requests per task."""
        from hud.eval.task import Task

        single = Task(env={"name": "browser"}, scenario="test", id="task-1")

        with patch("hud.datasets.utils.httpx.AsyncClient") as client_cls, patch(
            "hud.datasets.utils.settings"
        ) as settings_mock:
            http = self._post_client(client_cls, {"accepted": 3, "rejected": 0})
            self._use_settings(settings_mock)

            await submit_rollouts(
                tasks=[single],
                agent_type=AgentType.CLAUDE,
                job_id="job-123",
                group_size=3,
            )

            # One task times group_size=3 should give three entries in the batch.
            recorded = http.post.call_args
            assert recorded is not None
            submitted = recorded.kwargs["json"]
            assert len(submitted["requests"]) == 3