hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (282) hide show
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +70 -5
  3. hud/agents/base.py +238 -500
  4. hud/agents/claude.py +236 -247
  5. hud/agents/gateway.py +42 -0
  6. hud/agents/gemini.py +264 -0
  7. hud/agents/gemini_cua.py +324 -0
  8. hud/agents/grounded_openai.py +98 -100
  9. hud/agents/misc/integration_test_agent.py +51 -20
  10. hud/agents/misc/response_agent.py +48 -36
  11. hud/agents/openai.py +282 -296
  12. hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
  13. hud/agents/operator.py +199 -0
  14. hud/agents/resolver.py +70 -0
  15. hud/agents/tests/conftest.py +133 -0
  16. hud/agents/tests/test_base.py +300 -622
  17. hud/agents/tests/test_base_runtime.py +233 -0
  18. hud/agents/tests/test_claude.py +381 -214
  19. hud/agents/tests/test_client.py +9 -10
  20. hud/agents/tests/test_gemini.py +369 -0
  21. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  22. hud/agents/tests/test_openai.py +377 -140
  23. hud/agents/tests/test_operator.py +362 -0
  24. hud/agents/tests/test_resolver.py +192 -0
  25. hud/agents/tests/test_run_eval.py +179 -0
  26. hud/agents/types.py +148 -0
  27. hud/cli/__init__.py +493 -546
  28. hud/cli/analyze.py +43 -5
  29. hud/cli/build.py +699 -113
  30. hud/cli/debug.py +8 -5
  31. hud/cli/dev.py +889 -732
  32. hud/cli/eval.py +793 -667
  33. hud/cli/flows/dev.py +167 -0
  34. hud/cli/flows/init.py +191 -0
  35. hud/cli/flows/tasks.py +153 -56
  36. hud/cli/flows/templates.py +151 -0
  37. hud/cli/flows/tests/__init__.py +1 -0
  38. hud/cli/flows/tests/test_dev.py +126 -0
  39. hud/cli/init.py +60 -58
  40. hud/cli/pull.py +1 -1
  41. hud/cli/push.py +38 -13
  42. hud/cli/rft.py +311 -0
  43. hud/cli/rft_status.py +145 -0
  44. hud/cli/tests/test_analyze.py +5 -5
  45. hud/cli/tests/test_analyze_metadata.py +3 -2
  46. hud/cli/tests/test_analyze_module.py +120 -0
  47. hud/cli/tests/test_build.py +110 -8
  48. hud/cli/tests/test_build_failure.py +41 -0
  49. hud/cli/tests/test_build_module.py +50 -0
  50. hud/cli/tests/test_cli_init.py +6 -1
  51. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  52. hud/cli/tests/test_cli_root.py +140 -0
  53. hud/cli/tests/test_convert.py +361 -0
  54. hud/cli/tests/test_debug.py +12 -10
  55. hud/cli/tests/test_dev.py +197 -0
  56. hud/cli/tests/test_eval.py +251 -0
  57. hud/cli/tests/test_eval_bedrock.py +51 -0
  58. hud/cli/tests/test_init.py +124 -0
  59. hud/cli/tests/test_main_module.py +11 -5
  60. hud/cli/tests/test_mcp_server.py +12 -100
  61. hud/cli/tests/test_push.py +1 -1
  62. hud/cli/tests/test_push_happy.py +74 -0
  63. hud/cli/tests/test_push_wrapper.py +23 -0
  64. hud/cli/tests/test_registry.py +1 -1
  65. hud/cli/tests/test_utils.py +1 -1
  66. hud/cli/{rl → utils}/celebrate.py +14 -12
  67. hud/cli/utils/config.py +18 -1
  68. hud/cli/utils/docker.py +130 -4
  69. hud/cli/utils/env_check.py +9 -9
  70. hud/cli/utils/git.py +136 -0
  71. hud/cli/utils/interactive.py +39 -5
  72. hud/cli/utils/metadata.py +70 -1
  73. hud/cli/utils/runner.py +1 -1
  74. hud/cli/utils/server.py +2 -2
  75. hud/cli/utils/source_hash.py +3 -3
  76. hud/cli/utils/tasks.py +4 -1
  77. hud/cli/utils/tests/__init__.py +0 -0
  78. hud/cli/utils/tests/test_config.py +58 -0
  79. hud/cli/utils/tests/test_docker.py +93 -0
  80. hud/cli/utils/tests/test_docker_hints.py +71 -0
  81. hud/cli/utils/tests/test_env_check.py +74 -0
  82. hud/cli/utils/tests/test_environment.py +42 -0
  83. hud/cli/utils/tests/test_git.py +142 -0
  84. hud/cli/utils/tests/test_interactive_module.py +60 -0
  85. hud/cli/utils/tests/test_local_runner.py +50 -0
  86. hud/cli/utils/tests/test_logging_utils.py +23 -0
  87. hud/cli/utils/tests/test_metadata.py +49 -0
  88. hud/cli/utils/tests/test_package_runner.py +35 -0
  89. hud/cli/utils/tests/test_registry_utils.py +49 -0
  90. hud/cli/utils/tests/test_remote_runner.py +25 -0
  91. hud/cli/utils/tests/test_runner_modules.py +52 -0
  92. hud/cli/utils/tests/test_source_hash.py +36 -0
  93. hud/cli/utils/tests/test_tasks.py +80 -0
  94. hud/cli/utils/version_check.py +258 -0
  95. hud/cli/{rl → utils}/viewer.py +2 -2
  96. hud/clients/README.md +12 -11
  97. hud/clients/__init__.py +4 -3
  98. hud/clients/base.py +166 -26
  99. hud/clients/environment.py +51 -0
  100. hud/clients/fastmcp.py +13 -6
  101. hud/clients/mcp_use.py +45 -15
  102. hud/clients/tests/test_analyze_scenarios.py +206 -0
  103. hud/clients/tests/test_protocol.py +9 -3
  104. hud/datasets/__init__.py +23 -20
  105. hud/datasets/loader.py +326 -0
  106. hud/datasets/runner.py +198 -105
  107. hud/datasets/tests/__init__.py +0 -0
  108. hud/datasets/tests/test_loader.py +221 -0
  109. hud/datasets/tests/test_utils.py +315 -0
  110. hud/datasets/utils.py +270 -90
  111. hud/environment/__init__.py +52 -0
  112. hud/environment/connection.py +258 -0
  113. hud/environment/connectors/__init__.py +33 -0
  114. hud/environment/connectors/base.py +68 -0
  115. hud/environment/connectors/local.py +177 -0
  116. hud/environment/connectors/mcp_config.py +137 -0
  117. hud/environment/connectors/openai.py +101 -0
  118. hud/environment/connectors/remote.py +172 -0
  119. hud/environment/environment.py +835 -0
  120. hud/environment/integrations/__init__.py +45 -0
  121. hud/environment/integrations/adk.py +67 -0
  122. hud/environment/integrations/anthropic.py +196 -0
  123. hud/environment/integrations/gemini.py +92 -0
  124. hud/environment/integrations/langchain.py +82 -0
  125. hud/environment/integrations/llamaindex.py +68 -0
  126. hud/environment/integrations/openai.py +238 -0
  127. hud/environment/mock.py +306 -0
  128. hud/environment/router.py +263 -0
  129. hud/environment/scenarios.py +620 -0
  130. hud/environment/tests/__init__.py +1 -0
  131. hud/environment/tests/test_connection.py +317 -0
  132. hud/environment/tests/test_connectors.py +205 -0
  133. hud/environment/tests/test_environment.py +593 -0
  134. hud/environment/tests/test_integrations.py +257 -0
  135. hud/environment/tests/test_local_connectors.py +242 -0
  136. hud/environment/tests/test_scenarios.py +1086 -0
  137. hud/environment/tests/test_tools.py +208 -0
  138. hud/environment/types.py +23 -0
  139. hud/environment/utils/__init__.py +35 -0
  140. hud/environment/utils/formats.py +215 -0
  141. hud/environment/utils/schema.py +171 -0
  142. hud/environment/utils/tool_wrappers.py +113 -0
  143. hud/eval/__init__.py +67 -0
  144. hud/eval/context.py +727 -0
  145. hud/eval/display.py +299 -0
  146. hud/eval/instrument.py +187 -0
  147. hud/eval/manager.py +533 -0
  148. hud/eval/parallel.py +268 -0
  149. hud/eval/task.py +372 -0
  150. hud/eval/tests/__init__.py +1 -0
  151. hud/eval/tests/test_context.py +178 -0
  152. hud/eval/tests/test_eval.py +210 -0
  153. hud/eval/tests/test_manager.py +152 -0
  154. hud/eval/tests/test_parallel.py +168 -0
  155. hud/eval/tests/test_task.py +291 -0
  156. hud/eval/types.py +65 -0
  157. hud/eval/utils.py +194 -0
  158. hud/patches/__init__.py +19 -0
  159. hud/patches/mcp_patches.py +308 -0
  160. hud/patches/warnings.py +54 -0
  161. hud/samples/browser.py +4 -4
  162. hud/server/__init__.py +2 -1
  163. hud/server/low_level.py +2 -1
  164. hud/server/router.py +164 -0
  165. hud/server/server.py +567 -80
  166. hud/server/tests/test_mcp_server_integration.py +11 -11
  167. hud/server/tests/test_mcp_server_more.py +1 -1
  168. hud/server/tests/test_server_extra.py +2 -0
  169. hud/settings.py +45 -3
  170. hud/shared/exceptions.py +36 -10
  171. hud/shared/hints.py +26 -1
  172. hud/shared/requests.py +15 -3
  173. hud/shared/tests/test_exceptions.py +40 -31
  174. hud/shared/tests/test_hints.py +167 -0
  175. hud/telemetry/__init__.py +20 -19
  176. hud/telemetry/exporter.py +201 -0
  177. hud/telemetry/instrument.py +165 -253
  178. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  179. hud/telemetry/tests/test_exporter.py +258 -0
  180. hud/telemetry/tests/test_instrument.py +401 -0
  181. hud/tools/__init__.py +18 -2
  182. hud/tools/agent.py +223 -0
  183. hud/tools/apply_patch.py +639 -0
  184. hud/tools/base.py +54 -4
  185. hud/tools/bash.py +2 -2
  186. hud/tools/computer/__init__.py +36 -3
  187. hud/tools/computer/anthropic.py +2 -2
  188. hud/tools/computer/gemini.py +385 -0
  189. hud/tools/computer/hud.py +23 -6
  190. hud/tools/computer/openai.py +20 -21
  191. hud/tools/computer/qwen.py +434 -0
  192. hud/tools/computer/settings.py +37 -0
  193. hud/tools/edit.py +3 -7
  194. hud/tools/executors/base.py +4 -2
  195. hud/tools/executors/pyautogui.py +1 -1
  196. hud/tools/grounding/grounded_tool.py +13 -18
  197. hud/tools/grounding/grounder.py +10 -31
  198. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  199. hud/tools/jupyter.py +330 -0
  200. hud/tools/playwright.py +18 -3
  201. hud/tools/shell.py +308 -0
  202. hud/tools/tests/test_agent_tool.py +355 -0
  203. hud/tools/tests/test_apply_patch.py +718 -0
  204. hud/tools/tests/test_computer.py +4 -9
  205. hud/tools/tests/test_computer_actions.py +24 -2
  206. hud/tools/tests/test_jupyter_tool.py +181 -0
  207. hud/tools/tests/test_shell.py +596 -0
  208. hud/tools/tests/test_submit.py +85 -0
  209. hud/tools/tests/test_types.py +193 -0
  210. hud/tools/types.py +21 -1
  211. hud/types.py +194 -56
  212. hud/utils/__init__.py +2 -0
  213. hud/utils/env.py +67 -0
  214. hud/utils/hud_console.py +89 -18
  215. hud/utils/mcp.py +15 -58
  216. hud/utils/strict_schema.py +162 -0
  217. hud/utils/tests/test_init.py +1 -2
  218. hud/utils/tests/test_mcp.py +1 -28
  219. hud/utils/tests/test_pretty_errors.py +186 -0
  220. hud/utils/tests/test_tool_shorthand.py +154 -0
  221. hud/utils/tests/test_version.py +1 -1
  222. hud/utils/types.py +20 -0
  223. hud/version.py +1 -1
  224. hud_python-0.5.13.dist-info/METADATA +264 -0
  225. hud_python-0.5.13.dist-info/RECORD +305 -0
  226. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
  227. hud/agents/langchain.py +0 -261
  228. hud/agents/lite_llm.py +0 -72
  229. hud/cli/rl/__init__.py +0 -180
  230. hud/cli/rl/config.py +0 -101
  231. hud/cli/rl/display.py +0 -133
  232. hud/cli/rl/gpu.py +0 -63
  233. hud/cli/rl/gpu_utils.py +0 -321
  234. hud/cli/rl/local_runner.py +0 -595
  235. hud/cli/rl/presets.py +0 -96
  236. hud/cli/rl/remote_runner.py +0 -463
  237. hud/cli/rl/rl_api.py +0 -150
  238. hud/cli/rl/vllm.py +0 -177
  239. hud/cli/rl/wait_utils.py +0 -89
  240. hud/datasets/parallel.py +0 -687
  241. hud/misc/__init__.py +0 -1
  242. hud/misc/claude_plays_pokemon.py +0 -292
  243. hud/otel/__init__.py +0 -35
  244. hud/otel/collector.py +0 -142
  245. hud/otel/config.py +0 -181
  246. hud/otel/context.py +0 -570
  247. hud/otel/exporters.py +0 -369
  248. hud/otel/instrumentation.py +0 -135
  249. hud/otel/processors.py +0 -121
  250. hud/otel/tests/__init__.py +0 -1
  251. hud/otel/tests/test_processors.py +0 -197
  252. hud/rl/README.md +0 -30
  253. hud/rl/__init__.py +0 -1
  254. hud/rl/actor.py +0 -176
  255. hud/rl/buffer.py +0 -405
  256. hud/rl/chat_template.jinja +0 -101
  257. hud/rl/config.py +0 -192
  258. hud/rl/distributed.py +0 -132
  259. hud/rl/learner.py +0 -637
  260. hud/rl/tests/__init__.py +0 -1
  261. hud/rl/tests/test_learner.py +0 -186
  262. hud/rl/train.py +0 -382
  263. hud/rl/types.py +0 -101
  264. hud/rl/utils/start_vllm_server.sh +0 -30
  265. hud/rl/utils.py +0 -524
  266. hud/rl/vllm_adapter.py +0 -143
  267. hud/telemetry/job.py +0 -352
  268. hud/telemetry/replay.py +0 -74
  269. hud/telemetry/tests/test_replay.py +0 -40
  270. hud/telemetry/tests/test_trace.py +0 -63
  271. hud/telemetry/trace.py +0 -158
  272. hud/utils/agent_factories.py +0 -86
  273. hud/utils/async_utils.py +0 -65
  274. hud/utils/group_eval.py +0 -223
  275. hud/utils/progress.py +0 -149
  276. hud/utils/tasks.py +0 -127
  277. hud/utils/tests/test_async_utils.py +0 -173
  278. hud/utils/tests/test_progress.py +0 -261
  279. hud_python-0.4.45.dist-info/METADATA +0 -552
  280. hud_python-0.4.45.dist-info/RECORD +0 -228
  281. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
  282. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
hud/eval/tests/test_parallel.py ADDED
@@ -0,0 +1,168 @@
1
+ """Tests for hud.eval.parallel module."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import ast
6
+
7
+ import pytest
8
+
9
+ from hud.eval.parallel import (
10
+ ASTExtractionError,
11
+ _extract_body,
12
+ _find_async_with,
13
+ _get_end_line,
14
+ expand_variants,
15
+ resolve_group_ids,
16
+ )
17
+
18
+
19
class TestExpandVariants:
    """Exercise the expand_variants helper."""

    def test_none_returns_empty_dict(self) -> None:
        """Passing None yields a one-element list holding an empty dict."""
        assert expand_variants(None) == [{}]

    def test_empty_dict_returns_empty_dict(self) -> None:
        """Passing an empty mapping yields a one-element list holding an empty dict."""
        assert expand_variants({}) == [{}]

    def test_single_value_stays_single(self) -> None:
        """A scalar value produces exactly one variant carrying that value."""
        expanded = expand_variants({"model": "gpt-4o"})
        assert expanded == [{"model": "gpt-4o"}]

    def test_list_expands_to_variants(self) -> None:
        """A list value produces one variant per element, in list order."""
        expanded = expand_variants({"model": ["gpt-4o", "claude"]})
        assert expanded == [{"model": "gpt-4o"}, {"model": "claude"}]

    def test_multiple_lists_create_combinations(self) -> None:
        """Two list-valued keys produce every pairing of their elements."""
        expanded = expand_variants({"model": ["a", "b"], "temp": [0.0, 1.0]})

        assert len(expanded) == 4
        # Membership (not position) is checked, so ordering is not pinned here.
        for model in ("a", "b"):
            for temp in (0.0, 1.0):
                assert {"model": model, "temp": temp} in expanded

    def test_mixed_single_and_list(self) -> None:
        """A scalar key is carried unchanged into each list-driven variant."""
        expanded = expand_variants({"model": ["gpt-4o", "claude"], "temp": 0.7})

        assert len(expanded) == 2
        for model in ("gpt-4o", "claude"):
            assert {"model": model, "temp": 0.7} in expanded
69
+
70
+
71
class TestResolveGroupIds:
    """Tests for resolve_group_ids helper."""

    def test_uses_provided_group_ids(self) -> None:
        """Uses provided group_ids when given."""
        result = resolve_group_ids(["a", "b", "c"], 3)
        assert result == ["a", "b", "c"]

    def test_generates_shared_group_id(self) -> None:
        """Generates one shared group_id, repeated per slot, when none is provided."""
        import uuid  # local import: only this test needs it

        result = resolve_group_ids(None, 3)
        assert len(result) == 3
        # All should be the same
        assert result[0] == result[1] == result[2]
        # Should be a valid UUID: the canonical hyphenated form is 36 chars...
        assert len(result[0]) == 36
        # ...and it must actually parse; uuid.UUID raises ValueError otherwise,
        # which a bare length check would not catch.
        uuid.UUID(result[0])

    def test_raises_on_length_mismatch(self) -> None:
        """Raises ValueError when group_ids length doesn't match."""
        with pytest.raises(ValueError, match="group_ids length"):
            resolve_group_ids(["a", "b"], 3)
92
+
93
+
94
class TestASTHelpers:
    """Exercise the private AST helpers used by hud.eval.parallel."""

    def test_find_async_with_finds_correct_node(self) -> None:
        """A line inside the ``async with`` body resolves to that AsyncWith node."""
        # The literal starts with a newline, so ``async def`` is line 2 and
        # ``do_stuff()`` is line 5.
        code = """
async def main():
    x = 1
    async with something as ctx:
        do_stuff()
        more_stuff()
    y = 2
"""
        found = _find_async_with(ast.parse(code), 5)

        assert found is not None
        assert isinstance(found, ast.AsyncWith)

    def test_find_async_with_returns_none_when_not_found(self) -> None:
        """A line outside every ``async with`` yields None."""
        code = """
async def main():
    x = 1
    async with something as ctx:
        do_stuff()
    y = 2
"""
        # Line 7 lies past the ``async with`` block.
        found = _find_async_with(ast.parse(code), 7)

        assert found is None

    def test_get_end_line(self) -> None:
        """_get_end_line reports a line at or beyond the block's body."""
        code = """
async with ctx:
    line1()
    line2()
    line3()
"""
        block = ast.parse(code).body[0]

        # Lower bound only: the result must reach at least line 4.
        assert _get_end_line(block) >= 4

    def test_extract_body(self) -> None:
        """_extract_body returns source text containing both body statements."""
        source = """async with ctx:
    do_thing()
    more_thing()
"""
        # Rebuild newline-terminated lines, as a file read line by line would give.
        lines = [piece + "\n" for piece in source.split("\n")]

        block = ast.parse(source).body[0]
        assert isinstance(block, ast.AsyncWith)

        extracted = _extract_body(lines, block)
        assert "do_thing()" in extracted
        assert "more_thing()" in extracted
159
+
160
+
161
class TestASTExtractionError:
    """Sanity checks for the ASTExtractionError exception type."""

    def test_is_exception(self) -> None:
        """The error is an Exception and stringifies to its message."""
        message = "test message"
        err = ASTExtractionError(message)

        assert isinstance(err, Exception)
        assert str(err) == "test message"
hud/eval/tests/test_task.py ADDED
@@ -0,0 +1,291 @@
1
+ """Tests for hud.eval.task module."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import pytest
6
+
7
+ from hud.eval.task import Task, TaskAgentConfig
8
+
9
+
10
class TestTaskSerialization:
    """Round-trip checks for Task serialization in both v5 and v4 shapes."""

    def test_v5_task_roundtrip(self) -> None:
        """A v5 Task dumps to the nested-env shape and re-dumps identically."""
        original = Task(
            env={"name": "browser", "include": ["navigate", "click"]},
            scenario="checkout",
            id="task-1",
            args={"user_id": "alice"},
        )
        dumped = original.model_dump(mode="json")

        # v5 layout keeps the environment nested under "env".
        assert "env" in dumped
        assert dumped["env"]["name"] == "browser"
        assert dumped["scenario"] == "checkout"
        assert dumped["id"] == "task-1"

        # Rebuilding from the dump and dumping again must change nothing.
        redumped = Task(**dumped).model_dump(mode="json")
        assert dumped == redumped

    def test_v4_task_roundtrip(self) -> None:
        """A v4-sourced Task dumps back to the flat v4 shape and re-dumps identically."""
        legacy = {
            "prompt": "Go to google.com and search for cats",
            "mcp_config": {
                "browser": {"url": "http://localhost:8080"},
            },
            "evaluate_tool": {"name": "check_url", "arguments": {"contains": "google"}},
            "setup_tool": {"name": "navigate", "arguments": {"url": "about:blank"}},
            "id": "v4-task-1",
            "agent_config": {"system_prompt": "You are a helpful assistant"},
            "metadata": {"category": "navigation"},
        }
        dumped = Task.from_v4(legacy).model_dump(mode="json")

        # v4 layout is flat: these keys sit at the top level, not under "env".
        for key in ("prompt", "mcp_config", "evaluate_tool"):
            assert key in dumped
        assert dumped["prompt"] == "Go to google.com and search for cats"
        assert dumped["id"] == "v4-task-1"

        # Rebuilding from the dump and dumping again must change nothing.
        redumped = Task(**dumped).model_dump(mode="json")
        assert dumped == redumped

    def test_v4_preserves_agent_config(self) -> None:
        """agent_config from a v4 dict survives dump and rebuild."""
        legacy = {
            "prompt": "Test prompt",
            "mcp_config": {"server": {"url": "http://localhost"}},
            "evaluate_tool": {"name": "check", "arguments": {}},
            "agent_config": {"system_prompt": "Custom system prompt"},
        }
        dumped = Task.from_v4(legacy).model_dump(mode="json")

        dumped_cfg = dumped.get("agent_config")
        assert dumped_cfg is not None
        assert dumped_cfg["system_prompt"] == "Custom system prompt"
        # With no allowed_tools given, the dump is expected to carry ["*"].
        assert dumped_cfg["allowed_tools"] == ["*"]
        # Both append flags are expected to dump as False when unset.
        assert dumped_cfg["append_setup_output"] is False
        assert dumped_cfg["append_setup_tool"] is False

        rebuilt = Task(**dumped)
        assert rebuilt.agent_config is not None
        assert isinstance(rebuilt.agent_config, TaskAgentConfig)
        assert rebuilt.agent_config.system_prompt == "Custom system prompt"
        # After the round trip the include filter lives on the environment,
        # and the ["*"] wildcard is represented as None.
        assert rebuilt.env is not None
        assert rebuilt.env._agent_include is None

    def test_v4_preserves_metadata(self) -> None:
        """metadata from a v4 dict survives dump and rebuild."""
        legacy = {
            "prompt": "Test prompt",
            "mcp_config": {"server": {"url": "http://localhost"}},
            "evaluate_tool": {"name": "check", "arguments": {}},
            "metadata": {"key1": "value1", "key2": 42},
        }
        dumped = Task.from_v4(legacy).model_dump(mode="json")

        assert dumped.get("metadata") == {"key1": "value1", "key2": 42}

        rebuilt = Task(**dumped)
        assert rebuilt.metadata == {"key1": "value1", "key2": 42}
124
+
125
+
126
class TestTaskValidation:
    """Validation behaviour of Task and the v4 validator."""

    def test_v5_allows_none_env(self) -> None:
        """A v5 Task may omit env entirely (blank eval)."""
        blank = Task(scenario="test")

        assert blank.env is None
        assert blank.scenario == "test"

    def test_v4_requires_evaluate_tool(self) -> None:
        """validate_v4_task rejects a v4 dict that lacks evaluate_tool."""
        from hud.eval.utils import validate_v4_task

        # evaluate_tool is deliberately left out.
        incomplete = {
            "prompt": "test",
            "mcp_config": {"server": {}},
        }
        with pytest.raises(ValueError, match="evaluate_tool"):
            validate_v4_task(incomplete)

    def test_agent_config_accepts_dict(self) -> None:
        """A plain dict passed as agent_config is converted to TaskAgentConfig."""
        built = Task(
            env={"name": "browser"},
            agent_config={"system_prompt": "Hello"},
        )
        cfg = built.agent_config

        assert isinstance(cfg, TaskAgentConfig)
        assert cfg.system_prompt == "Hello"
157
+
158
+
159
class TestV4AgentConfigToolFilters:
    """How v4 agent_config.allowed_tools / disallowed_tools map onto the Environment."""

    @staticmethod
    def _v4_payload(agent_config: dict[str, list[str]]) -> dict[str, object]:
        """Build the minimal v4 task dict shared by every test in this class."""
        return {
            "prompt": "Test prompt",
            "mcp_config": {"server": {"url": "http://localhost"}},
            "evaluate_tool": {"name": "check", "arguments": {}},
            "agent_config": agent_config,
        }

    def test_v4_extracts_allowed_tools(self) -> None:
        """allowed_tools ends up as the environment's include filter."""
        payload = self._v4_payload({"allowed_tools": ["browser_*", "file_read"]})

        built = Task.from_v4(payload)

        assert built.env is not None
        assert built.env._agent_include == ["browser_*", "file_read"]

    def test_v4_extracts_disallowed_tools(self) -> None:
        """disallowed_tools ends up as the environment's exclude filter."""
        payload = self._v4_payload(
            {"disallowed_tools": ["*setup*", "*evaluate*", "checkout_branch"]}
        )

        built = Task.from_v4(payload)

        assert built.env is not None
        assert built.env._agent_exclude == ["*setup*", "*evaluate*", "checkout_branch"]

    def test_v4_wildcard_star_allowed_converts_to_none(self) -> None:
        """allowed_tools=['*'] is stored as None, i.e. include everything."""
        payload = self._v4_payload({"allowed_tools": ["*"]})

        built = Task.from_v4(payload)

        assert built.env is not None
        assert built.env._agent_include is None

    def test_v4_both_allowed_and_disallowed(self) -> None:
        """Both filters may be supplied together and are stored independently."""
        payload = self._v4_payload(
            {
                "allowed_tools": ["*"],
                "disallowed_tools": ["*setup*", "*evaluate*"],
            }
        )

        built = Task.from_v4(payload)

        assert built.env is not None
        assert built.env._agent_include is None  # ["*"] -> None
        assert built.env._agent_exclude == ["*setup*", "*evaluate*"]

    @pytest.mark.asyncio
    async def test_v4_tool_filters_applied_in_as_tools(self) -> None:
        """The exclude filter hides matching tools from env.as_tools()."""
        payload = self._v4_payload(
            {
                "allowed_tools": ["*"],
                "disallowed_tools": ["*setup*"],
            }
        )

        env = Task.from_v4(payload).env
        assert env is not None

        # Register two local tools: one matching "*setup*", one not.
        @env.tool()
        def my_setup_tool() -> str:
            """Should be filtered out."""
            return "setup"

        @env.tool()
        def run_query() -> str:
            """Should be visible."""
            return "query"

        await env._build_routing()

        visible = [tool.name for tool in env.as_tools()]

        assert "my_setup_tool" not in visible
        assert "run_query" in visible

    def test_v4_tool_filters_preserved_in_serialization(self) -> None:
        """Both filters are present in the dump and survive a rebuild from it."""
        payload = self._v4_payload(
            {
                "allowed_tools": ["*"],
                "disallowed_tools": ["*setup*", "*evaluate*", "*grade*"],
            }
        )

        # The dump is what gets sent to remote execution.
        dumped = Task.from_v4(payload).model_dump(mode="json")

        assert "agent_config" in dumped
        assert dumped["agent_config"]["allowed_tools"] == ["*"]
        assert dumped["agent_config"]["disallowed_tools"] == ["*setup*", "*evaluate*", "*grade*"]

        # A remote worker would rebuild the task from the dumped dict.
        rebuilt = Task(**dumped)
        assert rebuilt.env is not None
        assert rebuilt.env._agent_include is None  # ["*"] -> None
        assert rebuilt.env._agent_exclude == ["*setup*", "*evaluate*", "*grade*"]
hud/eval/types.py ADDED
@@ -0,0 +1,65 @@
1
+ """Types and exceptions for the eval module.
2
+
3
+ Kept separate to avoid circular imports.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from typing import Any
9
+
10
+ from pydantic import BaseModel
11
+
12
+ # =============================================================================
13
+ # Exceptions
14
+ # =============================================================================
15
+
16
+
17
class ParallelEvalComplete(Exception):
    """Signal that a parallel eval has finished and the body must not run again.

    The summary context raises this so the ``with`` body is skipped after a
    parallel eval; the eval() context manager catches it to exit cleanly.
    The summary context, with its results, stays usable after the with block.
    """
23
+
24
+
25
+ # =============================================================================
26
+ # Payload Models
27
+ # =============================================================================
28
+
29
+
30
class EvalPayload(BaseModel):
    """Base payload for eval enter/exit."""

    # Every field is optional and defaults to None, so the model can be
    # built with no arguments at all.
    prompt: str | None = None
    code_snippet: str | None = None
    job_id: str | None = None
    group_id: str | None = None
    variants: dict[str, Any] | None = None
    task_version_id: str | None = None
    metadata: dict[str, Any] | None = None
40
+
41
+
42
class EvalExitPayload(EvalPayload):
    """Exit payload with result fields."""

    # Adds result fields on top of everything inherited from EvalPayload.
    reward: float | None = None
    # NOTE: success defaults to True, so a payload built without an explicit
    # value reports success.
    success: bool = True
    error_message: str | None = None
48
+
49
+
50
class JobEnterPayload(BaseModel):
    """Payload for job/{job_id}/enter - sent once at job start."""

    # Standalone model (not derived from EvalPayload); all fields are optional.
    name: str | None = None
    variants: dict[str, Any] | None = None  # Full variant config
    group: int | None = None
    taskset: str | None = None  # taskset slug to associate job with
    tasks: list[dict[str, Any]] | None = None  # task definitions to add to taskset
58
+
59
+
60
# Public names of this module: the three payload models and the control-flow
# exception defined above.
__all__ = [
    "EvalExitPayload",
    "EvalPayload",
    "JobEnterPayload",
    "ParallelEvalComplete",
]