hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (274) hide show
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +11 -5
  3. hud/agents/base.py +220 -500
  4. hud/agents/claude.py +200 -240
  5. hud/agents/gemini.py +275 -0
  6. hud/agents/gemini_cua.py +335 -0
  7. hud/agents/grounded_openai.py +98 -100
  8. hud/agents/misc/integration_test_agent.py +51 -20
  9. hud/agents/misc/response_agent.py +41 -36
  10. hud/agents/openai.py +291 -292
  11. hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
  12. hud/agents/operator.py +211 -0
  13. hud/agents/tests/conftest.py +133 -0
  14. hud/agents/tests/test_base.py +300 -622
  15. hud/agents/tests/test_base_runtime.py +233 -0
  16. hud/agents/tests/test_claude.py +379 -210
  17. hud/agents/tests/test_client.py +9 -10
  18. hud/agents/tests/test_gemini.py +369 -0
  19. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  20. hud/agents/tests/test_openai.py +376 -140
  21. hud/agents/tests/test_operator.py +362 -0
  22. hud/agents/tests/test_run_eval.py +179 -0
  23. hud/cli/__init__.py +461 -545
  24. hud/cli/analyze.py +43 -5
  25. hud/cli/build.py +664 -110
  26. hud/cli/debug.py +8 -5
  27. hud/cli/dev.py +882 -734
  28. hud/cli/eval.py +782 -668
  29. hud/cli/flows/dev.py +167 -0
  30. hud/cli/flows/init.py +191 -0
  31. hud/cli/flows/tasks.py +153 -56
  32. hud/cli/flows/templates.py +151 -0
  33. hud/cli/flows/tests/__init__.py +1 -0
  34. hud/cli/flows/tests/test_dev.py +126 -0
  35. hud/cli/init.py +60 -58
  36. hud/cli/push.py +29 -11
  37. hud/cli/rft.py +311 -0
  38. hud/cli/rft_status.py +145 -0
  39. hud/cli/tests/test_analyze.py +5 -5
  40. hud/cli/tests/test_analyze_metadata.py +3 -2
  41. hud/cli/tests/test_analyze_module.py +120 -0
  42. hud/cli/tests/test_build.py +108 -6
  43. hud/cli/tests/test_build_failure.py +41 -0
  44. hud/cli/tests/test_build_module.py +50 -0
  45. hud/cli/tests/test_cli_init.py +6 -1
  46. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  47. hud/cli/tests/test_cli_root.py +140 -0
  48. hud/cli/tests/test_convert.py +361 -0
  49. hud/cli/tests/test_debug.py +12 -10
  50. hud/cli/tests/test_dev.py +197 -0
  51. hud/cli/tests/test_eval.py +251 -0
  52. hud/cli/tests/test_eval_bedrock.py +51 -0
  53. hud/cli/tests/test_init.py +124 -0
  54. hud/cli/tests/test_main_module.py +11 -5
  55. hud/cli/tests/test_mcp_server.py +12 -100
  56. hud/cli/tests/test_push_happy.py +74 -0
  57. hud/cli/tests/test_push_wrapper.py +23 -0
  58. hud/cli/tests/test_registry.py +1 -1
  59. hud/cli/tests/test_utils.py +1 -1
  60. hud/cli/{rl → utils}/celebrate.py +14 -12
  61. hud/cli/utils/config.py +18 -1
  62. hud/cli/utils/docker.py +130 -4
  63. hud/cli/utils/env_check.py +9 -9
  64. hud/cli/utils/git.py +136 -0
  65. hud/cli/utils/interactive.py +39 -5
  66. hud/cli/utils/metadata.py +69 -0
  67. hud/cli/utils/runner.py +1 -1
  68. hud/cli/utils/server.py +2 -2
  69. hud/cli/utils/source_hash.py +3 -3
  70. hud/cli/utils/tasks.py +4 -1
  71. hud/cli/utils/tests/__init__.py +0 -0
  72. hud/cli/utils/tests/test_config.py +58 -0
  73. hud/cli/utils/tests/test_docker.py +93 -0
  74. hud/cli/utils/tests/test_docker_hints.py +71 -0
  75. hud/cli/utils/tests/test_env_check.py +74 -0
  76. hud/cli/utils/tests/test_environment.py +42 -0
  77. hud/cli/utils/tests/test_git.py +142 -0
  78. hud/cli/utils/tests/test_interactive_module.py +60 -0
  79. hud/cli/utils/tests/test_local_runner.py +50 -0
  80. hud/cli/utils/tests/test_logging_utils.py +23 -0
  81. hud/cli/utils/tests/test_metadata.py +49 -0
  82. hud/cli/utils/tests/test_package_runner.py +35 -0
  83. hud/cli/utils/tests/test_registry_utils.py +49 -0
  84. hud/cli/utils/tests/test_remote_runner.py +25 -0
  85. hud/cli/utils/tests/test_runner_modules.py +52 -0
  86. hud/cli/utils/tests/test_source_hash.py +36 -0
  87. hud/cli/utils/tests/test_tasks.py +80 -0
  88. hud/cli/utils/version_check.py +258 -0
  89. hud/cli/{rl → utils}/viewer.py +2 -2
  90. hud/clients/README.md +12 -11
  91. hud/clients/__init__.py +4 -3
  92. hud/clients/base.py +166 -26
  93. hud/clients/environment.py +51 -0
  94. hud/clients/fastmcp.py +13 -6
  95. hud/clients/mcp_use.py +40 -15
  96. hud/clients/tests/test_analyze_scenarios.py +206 -0
  97. hud/clients/tests/test_protocol.py +9 -3
  98. hud/datasets/__init__.py +23 -20
  99. hud/datasets/loader.py +327 -0
  100. hud/datasets/runner.py +192 -105
  101. hud/datasets/tests/__init__.py +0 -0
  102. hud/datasets/tests/test_loader.py +221 -0
  103. hud/datasets/tests/test_utils.py +315 -0
  104. hud/datasets/utils.py +270 -90
  105. hud/environment/__init__.py +50 -0
  106. hud/environment/connection.py +206 -0
  107. hud/environment/connectors/__init__.py +33 -0
  108. hud/environment/connectors/base.py +68 -0
  109. hud/environment/connectors/local.py +177 -0
  110. hud/environment/connectors/mcp_config.py +109 -0
  111. hud/environment/connectors/openai.py +101 -0
  112. hud/environment/connectors/remote.py +172 -0
  113. hud/environment/environment.py +694 -0
  114. hud/environment/integrations/__init__.py +45 -0
  115. hud/environment/integrations/adk.py +67 -0
  116. hud/environment/integrations/anthropic.py +196 -0
  117. hud/environment/integrations/gemini.py +92 -0
  118. hud/environment/integrations/langchain.py +82 -0
  119. hud/environment/integrations/llamaindex.py +68 -0
  120. hud/environment/integrations/openai.py +238 -0
  121. hud/environment/mock.py +306 -0
  122. hud/environment/router.py +112 -0
  123. hud/environment/scenarios.py +493 -0
  124. hud/environment/tests/__init__.py +1 -0
  125. hud/environment/tests/test_connection.py +317 -0
  126. hud/environment/tests/test_connectors.py +218 -0
  127. hud/environment/tests/test_environment.py +161 -0
  128. hud/environment/tests/test_integrations.py +257 -0
  129. hud/environment/tests/test_local_connectors.py +201 -0
  130. hud/environment/tests/test_scenarios.py +280 -0
  131. hud/environment/tests/test_tools.py +208 -0
  132. hud/environment/types.py +23 -0
  133. hud/environment/utils/__init__.py +35 -0
  134. hud/environment/utils/formats.py +215 -0
  135. hud/environment/utils/schema.py +171 -0
  136. hud/environment/utils/tool_wrappers.py +113 -0
  137. hud/eval/__init__.py +67 -0
  138. hud/eval/context.py +674 -0
  139. hud/eval/display.py +299 -0
  140. hud/eval/instrument.py +185 -0
  141. hud/eval/manager.py +466 -0
  142. hud/eval/parallel.py +268 -0
  143. hud/eval/task.py +340 -0
  144. hud/eval/tests/__init__.py +1 -0
  145. hud/eval/tests/test_context.py +178 -0
  146. hud/eval/tests/test_eval.py +210 -0
  147. hud/eval/tests/test_manager.py +152 -0
  148. hud/eval/tests/test_parallel.py +168 -0
  149. hud/eval/tests/test_task.py +145 -0
  150. hud/eval/types.py +63 -0
  151. hud/eval/utils.py +183 -0
  152. hud/patches/__init__.py +19 -0
  153. hud/patches/mcp_patches.py +151 -0
  154. hud/patches/warnings.py +54 -0
  155. hud/samples/browser.py +4 -4
  156. hud/server/__init__.py +2 -1
  157. hud/server/low_level.py +2 -1
  158. hud/server/router.py +164 -0
  159. hud/server/server.py +567 -80
  160. hud/server/tests/test_mcp_server_integration.py +11 -11
  161. hud/server/tests/test_mcp_server_more.py +1 -1
  162. hud/server/tests/test_server_extra.py +2 -0
  163. hud/settings.py +45 -3
  164. hud/shared/exceptions.py +36 -10
  165. hud/shared/hints.py +26 -1
  166. hud/shared/requests.py +15 -3
  167. hud/shared/tests/test_exceptions.py +40 -31
  168. hud/shared/tests/test_hints.py +167 -0
  169. hud/telemetry/__init__.py +20 -19
  170. hud/telemetry/exporter.py +201 -0
  171. hud/telemetry/instrument.py +158 -253
  172. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  173. hud/telemetry/tests/test_exporter.py +258 -0
  174. hud/telemetry/tests/test_instrument.py +401 -0
  175. hud/tools/__init__.py +16 -2
  176. hud/tools/apply_patch.py +639 -0
  177. hud/tools/base.py +54 -4
  178. hud/tools/bash.py +2 -2
  179. hud/tools/computer/__init__.py +4 -0
  180. hud/tools/computer/anthropic.py +2 -2
  181. hud/tools/computer/gemini.py +385 -0
  182. hud/tools/computer/hud.py +23 -6
  183. hud/tools/computer/openai.py +20 -21
  184. hud/tools/computer/qwen.py +434 -0
  185. hud/tools/computer/settings.py +37 -0
  186. hud/tools/edit.py +3 -7
  187. hud/tools/executors/base.py +4 -2
  188. hud/tools/executors/pyautogui.py +1 -1
  189. hud/tools/grounding/grounded_tool.py +13 -18
  190. hud/tools/grounding/grounder.py +10 -31
  191. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  192. hud/tools/jupyter.py +330 -0
  193. hud/tools/playwright.py +18 -3
  194. hud/tools/shell.py +308 -0
  195. hud/tools/tests/test_apply_patch.py +718 -0
  196. hud/tools/tests/test_computer.py +4 -9
  197. hud/tools/tests/test_computer_actions.py +24 -2
  198. hud/tools/tests/test_jupyter_tool.py +181 -0
  199. hud/tools/tests/test_shell.py +596 -0
  200. hud/tools/tests/test_submit.py +85 -0
  201. hud/tools/tests/test_types.py +193 -0
  202. hud/tools/types.py +21 -1
  203. hud/types.py +167 -57
  204. hud/utils/__init__.py +2 -0
  205. hud/utils/env.py +67 -0
  206. hud/utils/hud_console.py +61 -3
  207. hud/utils/mcp.py +15 -58
  208. hud/utils/strict_schema.py +162 -0
  209. hud/utils/tests/test_init.py +1 -2
  210. hud/utils/tests/test_mcp.py +1 -28
  211. hud/utils/tests/test_pretty_errors.py +186 -0
  212. hud/utils/tests/test_tool_shorthand.py +154 -0
  213. hud/utils/tests/test_version.py +1 -1
  214. hud/utils/types.py +20 -0
  215. hud/version.py +1 -1
  216. hud_python-0.5.1.dist-info/METADATA +264 -0
  217. hud_python-0.5.1.dist-info/RECORD +299 -0
  218. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
  219. hud/agents/langchain.py +0 -261
  220. hud/agents/lite_llm.py +0 -72
  221. hud/cli/rl/__init__.py +0 -180
  222. hud/cli/rl/config.py +0 -101
  223. hud/cli/rl/display.py +0 -133
  224. hud/cli/rl/gpu.py +0 -63
  225. hud/cli/rl/gpu_utils.py +0 -321
  226. hud/cli/rl/local_runner.py +0 -595
  227. hud/cli/rl/presets.py +0 -96
  228. hud/cli/rl/remote_runner.py +0 -463
  229. hud/cli/rl/rl_api.py +0 -150
  230. hud/cli/rl/vllm.py +0 -177
  231. hud/cli/rl/wait_utils.py +0 -89
  232. hud/datasets/parallel.py +0 -687
  233. hud/misc/__init__.py +0 -1
  234. hud/misc/claude_plays_pokemon.py +0 -292
  235. hud/otel/__init__.py +0 -35
  236. hud/otel/collector.py +0 -142
  237. hud/otel/config.py +0 -181
  238. hud/otel/context.py +0 -570
  239. hud/otel/exporters.py +0 -369
  240. hud/otel/instrumentation.py +0 -135
  241. hud/otel/processors.py +0 -121
  242. hud/otel/tests/__init__.py +0 -1
  243. hud/otel/tests/test_processors.py +0 -197
  244. hud/rl/README.md +0 -30
  245. hud/rl/__init__.py +0 -1
  246. hud/rl/actor.py +0 -176
  247. hud/rl/buffer.py +0 -405
  248. hud/rl/chat_template.jinja +0 -101
  249. hud/rl/config.py +0 -192
  250. hud/rl/distributed.py +0 -132
  251. hud/rl/learner.py +0 -637
  252. hud/rl/tests/__init__.py +0 -1
  253. hud/rl/tests/test_learner.py +0 -186
  254. hud/rl/train.py +0 -382
  255. hud/rl/types.py +0 -101
  256. hud/rl/utils/start_vllm_server.sh +0 -30
  257. hud/rl/utils.py +0 -524
  258. hud/rl/vllm_adapter.py +0 -143
  259. hud/telemetry/job.py +0 -352
  260. hud/telemetry/replay.py +0 -74
  261. hud/telemetry/tests/test_replay.py +0 -40
  262. hud/telemetry/tests/test_trace.py +0 -63
  263. hud/telemetry/trace.py +0 -158
  264. hud/utils/agent_factories.py +0 -86
  265. hud/utils/async_utils.py +0 -65
  266. hud/utils/group_eval.py +0 -223
  267. hud/utils/progress.py +0 -149
  268. hud/utils/tasks.py +0 -127
  269. hud/utils/tests/test_async_utils.py +0 -173
  270. hud/utils/tests/test_progress.py +0 -261
  271. hud_python-0.4.45.dist-info/METADATA +0 -552
  272. hud_python-0.4.45.dist-info/RECORD +0 -228
  273. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
  274. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,168 @@
1
+ """Tests for hud.eval.parallel module."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import ast
6
+
7
+ import pytest
8
+
9
+ from hud.eval.parallel import (
10
+ ASTExtractionError,
11
+ _extract_body,
12
+ _find_async_with,
13
+ _get_end_line,
14
+ expand_variants,
15
+ resolve_group_ids,
16
+ )
17
+
18
+
19
class TestExpandVariants:
    """Behavioural checks for the ``expand_variants`` helper."""

    def test_none_returns_empty_dict(self) -> None:
        """``None`` expands to a list holding one empty variant."""
        assert expand_variants(None) == [{}]

    def test_empty_dict_returns_empty_dict(self) -> None:
        """An empty mapping expands to a list holding one empty variant."""
        assert expand_variants({}) == [{}]

    def test_single_value_stays_single(self) -> None:
        """A scalar value produces exactly one variant carrying that value."""
        expanded = expand_variants({"model": "gpt-4o"})
        assert expanded == [{"model": "gpt-4o"}]

    def test_list_expands_to_variants(self) -> None:
        """A list value produces one variant per element, in order."""
        expected = [{"model": "gpt-4o"}, {"model": "claude"}]
        assert expand_variants({"model": ["gpt-4o", "claude"]}) == expected

    def test_multiple_lists_create_combinations(self) -> None:
        """Two list-valued keys produce every pairing of their elements."""
        combos = expand_variants({"model": ["a", "b"], "temp": [0.0, 1.0]})

        assert len(combos) == 4
        # Membership only: the order of the combinations is not pinned here.
        for model in ("a", "b"):
            for temp in (0.0, 1.0):
                assert {"model": model, "temp": temp} in combos

    def test_mixed_single_and_list(self) -> None:
        """A scalar key is repeated in each variant generated by a list key."""
        combos = expand_variants({"model": ["gpt-4o", "claude"], "temp": 0.7})

        assert len(combos) == 2
        for model in ("gpt-4o", "claude"):
            assert {"model": model, "temp": 0.7} in combos
69
+
70
+
71
class TestResolveGroupIds:
    """Behavioural checks for the ``resolve_group_ids`` helper."""

    def test_uses_provided_group_ids(self) -> None:
        """Explicit group ids of the right length are returned as given."""
        supplied = ["a", "b", "c"]
        assert resolve_group_ids(supplied, 3) == ["a", "b", "c"]

    def test_generates_shared_group_id(self) -> None:
        """Without explicit ids, one generated id is repeated for every slot."""
        ids = resolve_group_ids(None, 3)
        assert len(ids) == 3

        first, second, third = ids
        assert first == second == third
        # 36 characters is the length of a hyphenated UUID string; only the
        # length is checked, not the format.
        assert len(first) == 36

    def test_raises_on_length_mismatch(self) -> None:
        """A group id list whose length differs from the count is rejected."""
        with pytest.raises(ValueError, match="group_ids length"):
            resolve_group_ids(["a", "b"], 3)
92
+
93
+
94
+ class TestASTHelpers:
95
+ """Tests for AST helper functions."""
96
+
97
+ def test_find_async_with_finds_correct_node(self) -> None:
98
+ """_find_async_with finds the async with containing target line."""
99
+ source = """
100
+ async def main():
101
+ x = 1
102
+ async with something as ctx:
103
+ do_stuff()
104
+ more_stuff()
105
+ y = 2
106
+ """
107
+ tree = ast.parse(source)
108
+
109
+ # Line 5 is inside the async with
110
+ node = _find_async_with(tree, 5)
111
+ assert node is not None
112
+ assert isinstance(node, ast.AsyncWith)
113
+
114
+ def test_find_async_with_returns_none_when_not_found(self) -> None:
115
+ """_find_async_with returns None when line is outside async with."""
116
+ source = """
117
+ async def main():
118
+ x = 1
119
+ async with something as ctx:
120
+ do_stuff()
121
+ y = 2
122
+ """
123
+ tree = ast.parse(source)
124
+
125
+ # Line 7 is outside the async with
126
+ node = _find_async_with(tree, 7)
127
+ assert node is None
128
+
129
+ def test_get_end_line(self) -> None:
130
+ """_get_end_line returns last line of node."""
131
+ source = """
132
+ async with ctx:
133
+ line1()
134
+ line2()
135
+ line3()
136
+ """
137
+ tree = ast.parse(source)
138
+ async_with = tree.body[0]
139
+
140
+ end_line = _get_end_line(async_with)
141
+ assert end_line >= 4 # At least through line 4
142
+
143
+ def test_extract_body(self) -> None:
144
+ """_extract_body extracts the body source from async with."""
145
+ source = """async with ctx:
146
+ do_thing()
147
+ more_thing()
148
+ """
149
+ lines = source.split("\n")
150
+ lines = [line + "\n" for line in lines]
151
+
152
+ tree = ast.parse(source)
153
+ async_with = tree.body[0]
154
+ assert isinstance(async_with, ast.AsyncWith)
155
+
156
+ body = _extract_body(lines, async_with)
157
+ assert "do_thing()" in body
158
+ assert "more_thing()" in body
159
+
160
+
161
class TestASTExtractionError:
    """Checks that ``ASTExtractionError`` behaves like a plain exception."""

    def test_is_exception(self) -> None:
        """The error is an ``Exception`` and stringifies to its message."""
        message = "test message"
        err = ASTExtractionError(message)

        assert isinstance(err, Exception)
        assert str(err) == message
@@ -0,0 +1,145 @@
1
+ """Tests for hud.eval.task module."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import pytest
6
+
7
+ from hud.eval.task import Task, TaskAgentConfig
8
+
9
+
10
class TestTaskSerialization:
    """Round-trip checks for ``Task`` dump/load in both v5 and v4 shapes."""

    def test_v5_task_roundtrip(self) -> None:
        """A v5 Task survives dump -> construct -> dump unchanged."""
        original = Task(
            env={"name": "browser", "include": ["navigate", "click"]},
            scenario="checkout",
            id="task-1",
            args={"user_id": "alice"},
        )

        dumped = original.model_dump(mode="json")

        # The dump keeps the nested v5 layout.
        assert "env" in dumped
        assert dumped["env"]["name"] == "browser"
        assert dumped["scenario"] == "checkout"
        assert dumped["id"] == "task-1"

        # Rebuilding from the dump and dumping again must change nothing.
        redumped = Task(**dumped).model_dump(mode="json")
        assert dumped == redumped

    def test_v4_task_roundtrip(self) -> None:
        """A v4 Task dumps to the flat v4 layout and round-trips unchanged."""
        legacy_fields = {
            "prompt": "Go to google.com and search for cats",
            "mcp_config": {
                "browser": {"url": "http://localhost:8080"},
            },
            "evaluate_tool": {"name": "check_url", "arguments": {"contains": "google"}},
            "setup_tool": {"name": "navigate", "arguments": {"url": "about:blank"}},
            "id": "v4-task-1",
            "agent_config": {"system_prompt": "You are a helpful assistant"},
            "metadata": {"category": "navigation"},
        }

        dumped = Task.from_v4(legacy_fields).model_dump(mode="json")

        # Flat v4 keys are expected rather than a nested env.
        for key in ("prompt", "mcp_config", "evaluate_tool"):
            assert key in dumped
        assert dumped["prompt"] == "Go to google.com and search for cats"
        assert dumped["id"] == "v4-task-1"

        # Rebuilding from the dump and dumping again must change nothing.
        redumped = Task(**dumped).model_dump(mode="json")
        assert dumped == redumped

    def test_v4_preserves_agent_config(self) -> None:
        """The v4 ``agent_config`` is kept in the dump and after reload."""
        legacy_fields = {
            "prompt": "Test prompt",
            "mcp_config": {"server": {"url": "http://localhost"}},
            "evaluate_tool": {"name": "check", "arguments": {}},
            "agent_config": {"system_prompt": "Custom system prompt"},
        }

        dumped = Task.from_v4(legacy_fields).model_dump(mode="json")
        assert dumped.get("agent_config") == {"system_prompt": "Custom system prompt"}

        # Reloading turns the plain dict back into a TaskAgentConfig.
        restored = Task(**dumped)
        assert restored.agent_config is not None
        assert isinstance(restored.agent_config, TaskAgentConfig)
        assert restored.agent_config.system_prompt == "Custom system prompt"

    def test_v4_preserves_metadata(self) -> None:
        """The v4 ``metadata`` is kept in the dump and after reload."""
        legacy_fields = {
            "prompt": "Test prompt",
            "mcp_config": {"server": {"url": "http://localhost"}},
            "evaluate_tool": {"name": "check", "arguments": {}},
            "metadata": {"key1": "value1", "key2": 42},
        }

        dumped = Task.from_v4(legacy_fields).model_dump(mode="json")
        assert dumped.get("metadata") == {"key1": "value1", "key2": 42}

        restored = Task(**dumped)
        assert restored.metadata == {"key1": "value1", "key2": 42}
113
+
114
+
115
class TestTaskValidation:
    """Validation behaviour of ``Task`` and of the v4 task validator."""

    def test_v5_allows_none_env(self) -> None:
        """A v5 Task may be built without an env (used for blank evals)."""
        blank = Task(scenario="test")  # env is left at its default

        assert blank.env is None
        assert blank.scenario == "test"

    def test_v4_requires_evaluate_tool(self) -> None:
        """The v4 validator rejects a task that lacks ``evaluate_tool``."""
        from hud.eval.utils import validate_v4_task

        incomplete = {
            "prompt": "test",
            "mcp_config": {"server": {}},
            # evaluate_tool deliberately omitted
        }
        with pytest.raises(ValueError, match="evaluate_tool"):
            validate_v4_task(incomplete)

    def test_agent_config_accepts_dict(self) -> None:
        """A plain dict ``agent_config`` is converted to ``TaskAgentConfig``."""
        task = Task(
            env={"name": "browser"},
            agent_config={"system_prompt": "Hello"},
        )

        assert isinstance(task.agent_config, TaskAgentConfig)
        assert task.agent_config.system_prompt == "Hello"
hud/eval/types.py ADDED
@@ -0,0 +1,63 @@
1
+ """Types and exceptions for the eval module.
2
+
3
+ Kept separate to avoid circular imports.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from typing import Any
9
+
10
+ from pydantic import BaseModel
11
+
12
+ # =============================================================================
13
+ # Exceptions
14
+ # =============================================================================
15
+
16
+
17
class ParallelEvalComplete(Exception):
    """Signal that a parallel eval has finished and the body must not run again.

    The summary context raises this so the ``with`` body is skipped instead of
    being re-executed. The eval() context manager catches it to exit cleanly,
    and the summary context, with its results, stays usable after the with
    block.
    """
23
+
24
+
25
+ # =============================================================================
26
+ # Payload Models
27
+ # =============================================================================
28
+
29
+
30
class EvalPayload(BaseModel):
    """Base payload for eval enter/exit.

    Every field is optional and defaults to ``None``, so an empty payload is
    valid.
    """

    prompt: str | None = None
    code_snippet: str | None = None
    job_id: str | None = None
    group_id: str | None = None
    # NOTE(review): presumably the variant assignment for this single eval — confirm.
    variants: dict[str, Any] | None = None
    task_version_id: str | None = None
    metadata: dict[str, Any] | None = None
40
+
41
+
42
class EvalExitPayload(EvalPayload):
    """Exit payload with result fields.

    Inherits all ``EvalPayload`` fields and adds the outcome of the eval.
    """

    reward: float | None = None
    # Defaults to True: failure must be reported by explicitly setting False.
    success: bool = True
    error_message: str | None = None
48
+
49
+
50
class JobEnterPayload(BaseModel):
    """Payload for job/{job_id}/enter - sent once at job start.

    Every field is optional and defaults to ``None``.
    """

    name: str | None = None
    variants: dict[str, Any] | None = None  # Full variant config
    # NOTE(review): an int, unlike EvalPayload.group_id (str); presumably a
    # group size/count rather than an identifier — confirm against the API.
    group: int | None = None
56
+
57
+
58
+ __all__ = [
59
+ "EvalExitPayload",
60
+ "EvalPayload",
61
+ "JobEnterPayload",
62
+ "ParallelEvalComplete",
63
+ ]
hud/eval/utils.py ADDED
@@ -0,0 +1,183 @@
1
+ """Utility functions for the eval module."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import warnings
7
+ from typing import Any
8
+
9
+ __all__ = ["build_env_from_v4", "is_v4_format", "validate_v4_task"]
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
def is_v4_format(data: dict[str, Any]) -> bool:
    """Report whether ``data`` looks like a v4 LegacyTask dict.

    This is a cheap shape check meant for branching, not validation: the
    dict counts as v4 when both ``prompt`` and ``mcp_config`` are present
    with truthy values. Completeness (e.g. ``evaluate_tool``) is not checked.

    Args:
        data: Dict to check

    Returns:
        True if looks like v4 format, False otherwise (including when
        ``data`` is not a dict at all)
    """
    if isinstance(data, dict):
        # all() stops at the first falsy field, checking prompt before mcp_config.
        return all(bool(data.get(field)) for field in ("prompt", "mcp_config"))
    return False
31
+
32
+
33
def validate_v4_task(data: dict[str, Any]) -> None:
    """Ensure a v4 task dict carries every required field.

    The required fields, each of which must be present with a truthy value:
    - prompt: The task instruction
    - mcp_config: MCP server configuration
    - evaluate_tool: How to evaluate success

    Intended to follow is_v4_format() when completeness matters.

    Args:
        data: Dict to validate

    Raises:
        ValueError: If any required fields are missing; the message lists
            them in the order above
    """
    required_fields = ("prompt", "mcp_config", "evaluate_tool")
    absent = [field for field in required_fields if not data.get(field)]

    if absent:
        raise ValueError(f"v4 task missing required fields: {', '.join(absent)}")
59
+
60
+
61
def _as_list(value: Any) -> list[Any]:
    """Return ``value`` itself when it is a list, otherwise wrap it in a one-item list."""
    return value if isinstance(value, list) else [value]


def build_env_from_v4(source: dict[str, Any] | Any) -> dict[str, Any]:
    """Translate a v4 LegacyTask into keyword fields for the Task() constructor.

    Creates an Environment configured with the legacy task's MCP config,
    prompt, agent-level tool filters and setup/evaluate tool calls, and
    returns it inside a dict ready to be passed to Task().

    Args:
        source: dict or LegacyTask with v4 fields (prompt, mcp_config, etc.)

    Returns:
        Dict that always contains ``env``, ``id``, ``scenario`` (None) and
        ``args`` (empty dict). ``validation``, ``agent_config`` and
        ``metadata`` are added only when the legacy task provides
        ``integration_test_tool``, a system prompt, or metadata respectively.

    Raises:
        TypeError: If source is not a dict or LegacyTask
    """
    from hud.environment import Environment
    from hud.types import LegacyTask, MCPToolCall

    # Convert dict to LegacyTask if needed
    if isinstance(source, dict):
        # NOTE(review): presumably LegacyTask emits a DeprecationWarning when
        # constructed; it is silenced here for this internal conversion only.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=DeprecationWarning)
            legacy = LegacyTask(**source)
    elif isinstance(source, LegacyTask):
        legacy = source
    else:
        raise TypeError(f"Expected dict or LegacyTask, got {type(source).__name__}")

    # Warn if using local MCP configs (command without url)
    _warn_local_mcp(legacy.mcp_config)

    # Extract tool filters from agent_config (v4 style)
    # These are agent-level filters, not connection-level
    include_tools: list[str] | None = None
    exclude_tools: list[str] | None = None
    if legacy.agent_config:
        include_tools = legacy.agent_config.allowed_tools
        exclude_tools = legacy.agent_config.disallowed_tools

        # Convert ["*"] wildcard to None (meaning include all)
        if include_tools == ["*"]:
            include_tools = None

    # Create Environment - NO connections made here, just config stored
    env = Environment(legacy.id or "v4-legacy")
    env.connect_mcp_config(legacy.mcp_config)

    # Store agent-level tool filters on Environment (applied in as_tools())
    # This allows Environment to call setup/evaluate while hiding them from agent
    env._agent_include = include_tools
    env._agent_exclude = exclude_tools

    # Set the prompt
    env.prompt = legacy.prompt

    # Add setup_tool calls (stored, not executed); a single call and a list
    # of calls are both accepted.
    if legacy.setup_tool:
        for call in _as_list(legacy.setup_tool):
            env.setup_tool(call.name, **(call.arguments or {}))

    # Add evaluate_tool calls (stored, not executed)
    if legacy.evaluate_tool:
        for call in _as_list(legacy.evaluate_tool):
            env.evaluate_tool(call.name, **(call.arguments or {}))

    # Build Task fields dict
    result: dict[str, Any] = {
        "env": env,
        "id": legacy.id,
        "scenario": None,  # v4 uses prompt, not scenarios
        "args": {},
    }

    # Map integration_test_tool → validation (same concept: tool calls to verify)
    if legacy.integration_test_tool:
        # Convert to MCPToolCall if needed
        result["validation"] = [
            call if isinstance(call, MCPToolCall) else MCPToolCall(**call.model_dump())
            for call in _as_list(legacy.integration_test_tool)
        ]

    # Extract agent_config (just system_prompt for now)
    if legacy.agent_config and legacy.agent_config.system_prompt:
        result["agent_config"] = {"system_prompt": legacy.agent_config.system_prompt}

    # Preserve metadata
    if legacy.metadata:
        result["metadata"] = legacy.metadata

    return result
160
+
161
+
162
def _warn_local_mcp(mcp_config: dict[str, Any] | None) -> None:
    """Warn if mcp_config uses local MCP servers (command without url).

    Local MCP servers can cause port conflicts when running tasks concurrently.
    A server entry counts as local when it is a dict that has a ``command``
    key and no truthy ``url``. Entries that are not dicts are ignored. At
    most one warning is emitted per call.

    Args:
        mcp_config: Mapping of server name to server config; None or an
            empty mapping is a no-op.
    """
    if not mcp_config:
        return

    # The generator's filter already guarantees each server_cfg is a dict, so
    # the expression does not need to repeat the isinstance check.
    has_local = any(
        "command" in server_cfg and not server_cfg.get("url")
        for server_cfg in mcp_config.values()
        if isinstance(server_cfg, dict)
    )

    if has_local:
        warnings.warn(
            "Task uses local MCP configuration (command without url). "
            "This may cause port conflicts when running tasks concurrently. "
            "Consider using remote MCP servers for parallel execution.",
            UserWarning,
            # NOTE(review): a fixed stacklevel assumes a fixed call depth
            # between user code and this helper — re-check if call sites change.
            stacklevel=4,
        )
@@ -0,0 +1,19 @@
1
+ """
2
+ HUD runtime patches for third-party libraries.
3
+
4
+ This module applies monkey-patches to fix issues in dependencies
5
+ without requiring forked packages.
6
+ """
7
+
8
+ from hud.patches.mcp_patches import apply_all_patches, suppress_fastmcp_logging
9
+ from hud.patches.warnings import apply_default_warning_filters, suppress_mcp_use_import_warnings
10
+
11
+ # Apply patches on import
12
+ apply_all_patches()
13
+
14
+ __all__ = [
15
+ "apply_all_patches",
16
+ "apply_default_warning_filters",
17
+ "suppress_fastmcp_logging",
18
+ "suppress_mcp_use_import_warnings",
19
+ ]