hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (274)
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +11 -5
  3. hud/agents/base.py +220 -500
  4. hud/agents/claude.py +200 -240
  5. hud/agents/gemini.py +275 -0
  6. hud/agents/gemini_cua.py +335 -0
  7. hud/agents/grounded_openai.py +98 -100
  8. hud/agents/misc/integration_test_agent.py +51 -20
  9. hud/agents/misc/response_agent.py +41 -36
  10. hud/agents/openai.py +291 -292
  11. hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
  12. hud/agents/operator.py +211 -0
  13. hud/agents/tests/conftest.py +133 -0
  14. hud/agents/tests/test_base.py +300 -622
  15. hud/agents/tests/test_base_runtime.py +233 -0
  16. hud/agents/tests/test_claude.py +379 -210
  17. hud/agents/tests/test_client.py +9 -10
  18. hud/agents/tests/test_gemini.py +369 -0
  19. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  20. hud/agents/tests/test_openai.py +376 -140
  21. hud/agents/tests/test_operator.py +362 -0
  22. hud/agents/tests/test_run_eval.py +179 -0
  23. hud/cli/__init__.py +461 -545
  24. hud/cli/analyze.py +43 -5
  25. hud/cli/build.py +664 -110
  26. hud/cli/debug.py +8 -5
  27. hud/cli/dev.py +882 -734
  28. hud/cli/eval.py +782 -668
  29. hud/cli/flows/dev.py +167 -0
  30. hud/cli/flows/init.py +191 -0
  31. hud/cli/flows/tasks.py +153 -56
  32. hud/cli/flows/templates.py +151 -0
  33. hud/cli/flows/tests/__init__.py +1 -0
  34. hud/cli/flows/tests/test_dev.py +126 -0
  35. hud/cli/init.py +60 -58
  36. hud/cli/push.py +29 -11
  37. hud/cli/rft.py +311 -0
  38. hud/cli/rft_status.py +145 -0
  39. hud/cli/tests/test_analyze.py +5 -5
  40. hud/cli/tests/test_analyze_metadata.py +3 -2
  41. hud/cli/tests/test_analyze_module.py +120 -0
  42. hud/cli/tests/test_build.py +108 -6
  43. hud/cli/tests/test_build_failure.py +41 -0
  44. hud/cli/tests/test_build_module.py +50 -0
  45. hud/cli/tests/test_cli_init.py +6 -1
  46. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  47. hud/cli/tests/test_cli_root.py +140 -0
  48. hud/cli/tests/test_convert.py +361 -0
  49. hud/cli/tests/test_debug.py +12 -10
  50. hud/cli/tests/test_dev.py +197 -0
  51. hud/cli/tests/test_eval.py +251 -0
  52. hud/cli/tests/test_eval_bedrock.py +51 -0
  53. hud/cli/tests/test_init.py +124 -0
  54. hud/cli/tests/test_main_module.py +11 -5
  55. hud/cli/tests/test_mcp_server.py +12 -100
  56. hud/cli/tests/test_push_happy.py +74 -0
  57. hud/cli/tests/test_push_wrapper.py +23 -0
  58. hud/cli/tests/test_registry.py +1 -1
  59. hud/cli/tests/test_utils.py +1 -1
  60. hud/cli/{rl → utils}/celebrate.py +14 -12
  61. hud/cli/utils/config.py +18 -1
  62. hud/cli/utils/docker.py +130 -4
  63. hud/cli/utils/env_check.py +9 -9
  64. hud/cli/utils/git.py +136 -0
  65. hud/cli/utils/interactive.py +39 -5
  66. hud/cli/utils/metadata.py +69 -0
  67. hud/cli/utils/runner.py +1 -1
  68. hud/cli/utils/server.py +2 -2
  69. hud/cli/utils/source_hash.py +3 -3
  70. hud/cli/utils/tasks.py +4 -1
  71. hud/cli/utils/tests/__init__.py +0 -0
  72. hud/cli/utils/tests/test_config.py +58 -0
  73. hud/cli/utils/tests/test_docker.py +93 -0
  74. hud/cli/utils/tests/test_docker_hints.py +71 -0
  75. hud/cli/utils/tests/test_env_check.py +74 -0
  76. hud/cli/utils/tests/test_environment.py +42 -0
  77. hud/cli/utils/tests/test_git.py +142 -0
  78. hud/cli/utils/tests/test_interactive_module.py +60 -0
  79. hud/cli/utils/tests/test_local_runner.py +50 -0
  80. hud/cli/utils/tests/test_logging_utils.py +23 -0
  81. hud/cli/utils/tests/test_metadata.py +49 -0
  82. hud/cli/utils/tests/test_package_runner.py +35 -0
  83. hud/cli/utils/tests/test_registry_utils.py +49 -0
  84. hud/cli/utils/tests/test_remote_runner.py +25 -0
  85. hud/cli/utils/tests/test_runner_modules.py +52 -0
  86. hud/cli/utils/tests/test_source_hash.py +36 -0
  87. hud/cli/utils/tests/test_tasks.py +80 -0
  88. hud/cli/utils/version_check.py +258 -0
  89. hud/cli/{rl → utils}/viewer.py +2 -2
  90. hud/clients/README.md +12 -11
  91. hud/clients/__init__.py +4 -3
  92. hud/clients/base.py +166 -26
  93. hud/clients/environment.py +51 -0
  94. hud/clients/fastmcp.py +13 -6
  95. hud/clients/mcp_use.py +40 -15
  96. hud/clients/tests/test_analyze_scenarios.py +206 -0
  97. hud/clients/tests/test_protocol.py +9 -3
  98. hud/datasets/__init__.py +23 -20
  99. hud/datasets/loader.py +327 -0
  100. hud/datasets/runner.py +192 -105
  101. hud/datasets/tests/__init__.py +0 -0
  102. hud/datasets/tests/test_loader.py +221 -0
  103. hud/datasets/tests/test_utils.py +315 -0
  104. hud/datasets/utils.py +270 -90
  105. hud/environment/__init__.py +50 -0
  106. hud/environment/connection.py +206 -0
  107. hud/environment/connectors/__init__.py +33 -0
  108. hud/environment/connectors/base.py +68 -0
  109. hud/environment/connectors/local.py +177 -0
  110. hud/environment/connectors/mcp_config.py +109 -0
  111. hud/environment/connectors/openai.py +101 -0
  112. hud/environment/connectors/remote.py +172 -0
  113. hud/environment/environment.py +694 -0
  114. hud/environment/integrations/__init__.py +45 -0
  115. hud/environment/integrations/adk.py +67 -0
  116. hud/environment/integrations/anthropic.py +196 -0
  117. hud/environment/integrations/gemini.py +92 -0
  118. hud/environment/integrations/langchain.py +82 -0
  119. hud/environment/integrations/llamaindex.py +68 -0
  120. hud/environment/integrations/openai.py +238 -0
  121. hud/environment/mock.py +306 -0
  122. hud/environment/router.py +112 -0
  123. hud/environment/scenarios.py +493 -0
  124. hud/environment/tests/__init__.py +1 -0
  125. hud/environment/tests/test_connection.py +317 -0
  126. hud/environment/tests/test_connectors.py +218 -0
  127. hud/environment/tests/test_environment.py +161 -0
  128. hud/environment/tests/test_integrations.py +257 -0
  129. hud/environment/tests/test_local_connectors.py +201 -0
  130. hud/environment/tests/test_scenarios.py +280 -0
  131. hud/environment/tests/test_tools.py +208 -0
  132. hud/environment/types.py +23 -0
  133. hud/environment/utils/__init__.py +35 -0
  134. hud/environment/utils/formats.py +215 -0
  135. hud/environment/utils/schema.py +171 -0
  136. hud/environment/utils/tool_wrappers.py +113 -0
  137. hud/eval/__init__.py +67 -0
  138. hud/eval/context.py +674 -0
  139. hud/eval/display.py +299 -0
  140. hud/eval/instrument.py +185 -0
  141. hud/eval/manager.py +466 -0
  142. hud/eval/parallel.py +268 -0
  143. hud/eval/task.py +340 -0
  144. hud/eval/tests/__init__.py +1 -0
  145. hud/eval/tests/test_context.py +178 -0
  146. hud/eval/tests/test_eval.py +210 -0
  147. hud/eval/tests/test_manager.py +152 -0
  148. hud/eval/tests/test_parallel.py +168 -0
  149. hud/eval/tests/test_task.py +145 -0
  150. hud/eval/types.py +63 -0
  151. hud/eval/utils.py +183 -0
  152. hud/patches/__init__.py +19 -0
  153. hud/patches/mcp_patches.py +151 -0
  154. hud/patches/warnings.py +54 -0
  155. hud/samples/browser.py +4 -4
  156. hud/server/__init__.py +2 -1
  157. hud/server/low_level.py +2 -1
  158. hud/server/router.py +164 -0
  159. hud/server/server.py +567 -80
  160. hud/server/tests/test_mcp_server_integration.py +11 -11
  161. hud/server/tests/test_mcp_server_more.py +1 -1
  162. hud/server/tests/test_server_extra.py +2 -0
  163. hud/settings.py +45 -3
  164. hud/shared/exceptions.py +36 -10
  165. hud/shared/hints.py +26 -1
  166. hud/shared/requests.py +15 -3
  167. hud/shared/tests/test_exceptions.py +40 -31
  168. hud/shared/tests/test_hints.py +167 -0
  169. hud/telemetry/__init__.py +20 -19
  170. hud/telemetry/exporter.py +201 -0
  171. hud/telemetry/instrument.py +158 -253
  172. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  173. hud/telemetry/tests/test_exporter.py +258 -0
  174. hud/telemetry/tests/test_instrument.py +401 -0
  175. hud/tools/__init__.py +16 -2
  176. hud/tools/apply_patch.py +639 -0
  177. hud/tools/base.py +54 -4
  178. hud/tools/bash.py +2 -2
  179. hud/tools/computer/__init__.py +4 -0
  180. hud/tools/computer/anthropic.py +2 -2
  181. hud/tools/computer/gemini.py +385 -0
  182. hud/tools/computer/hud.py +23 -6
  183. hud/tools/computer/openai.py +20 -21
  184. hud/tools/computer/qwen.py +434 -0
  185. hud/tools/computer/settings.py +37 -0
  186. hud/tools/edit.py +3 -7
  187. hud/tools/executors/base.py +4 -2
  188. hud/tools/executors/pyautogui.py +1 -1
  189. hud/tools/grounding/grounded_tool.py +13 -18
  190. hud/tools/grounding/grounder.py +10 -31
  191. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  192. hud/tools/jupyter.py +330 -0
  193. hud/tools/playwright.py +18 -3
  194. hud/tools/shell.py +308 -0
  195. hud/tools/tests/test_apply_patch.py +718 -0
  196. hud/tools/tests/test_computer.py +4 -9
  197. hud/tools/tests/test_computer_actions.py +24 -2
  198. hud/tools/tests/test_jupyter_tool.py +181 -0
  199. hud/tools/tests/test_shell.py +596 -0
  200. hud/tools/tests/test_submit.py +85 -0
  201. hud/tools/tests/test_types.py +193 -0
  202. hud/tools/types.py +21 -1
  203. hud/types.py +167 -57
  204. hud/utils/__init__.py +2 -0
  205. hud/utils/env.py +67 -0
  206. hud/utils/hud_console.py +61 -3
  207. hud/utils/mcp.py +15 -58
  208. hud/utils/strict_schema.py +162 -0
  209. hud/utils/tests/test_init.py +1 -2
  210. hud/utils/tests/test_mcp.py +1 -28
  211. hud/utils/tests/test_pretty_errors.py +186 -0
  212. hud/utils/tests/test_tool_shorthand.py +154 -0
  213. hud/utils/tests/test_version.py +1 -1
  214. hud/utils/types.py +20 -0
  215. hud/version.py +1 -1
  216. hud_python-0.5.1.dist-info/METADATA +264 -0
  217. hud_python-0.5.1.dist-info/RECORD +299 -0
  218. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
  219. hud/agents/langchain.py +0 -261
  220. hud/agents/lite_llm.py +0 -72
  221. hud/cli/rl/__init__.py +0 -180
  222. hud/cli/rl/config.py +0 -101
  223. hud/cli/rl/display.py +0 -133
  224. hud/cli/rl/gpu.py +0 -63
  225. hud/cli/rl/gpu_utils.py +0 -321
  226. hud/cli/rl/local_runner.py +0 -595
  227. hud/cli/rl/presets.py +0 -96
  228. hud/cli/rl/remote_runner.py +0 -463
  229. hud/cli/rl/rl_api.py +0 -150
  230. hud/cli/rl/vllm.py +0 -177
  231. hud/cli/rl/wait_utils.py +0 -89
  232. hud/datasets/parallel.py +0 -687
  233. hud/misc/__init__.py +0 -1
  234. hud/misc/claude_plays_pokemon.py +0 -292
  235. hud/otel/__init__.py +0 -35
  236. hud/otel/collector.py +0 -142
  237. hud/otel/config.py +0 -181
  238. hud/otel/context.py +0 -570
  239. hud/otel/exporters.py +0 -369
  240. hud/otel/instrumentation.py +0 -135
  241. hud/otel/processors.py +0 -121
  242. hud/otel/tests/__init__.py +0 -1
  243. hud/otel/tests/test_processors.py +0 -197
  244. hud/rl/README.md +0 -30
  245. hud/rl/__init__.py +0 -1
  246. hud/rl/actor.py +0 -176
  247. hud/rl/buffer.py +0 -405
  248. hud/rl/chat_template.jinja +0 -101
  249. hud/rl/config.py +0 -192
  250. hud/rl/distributed.py +0 -132
  251. hud/rl/learner.py +0 -637
  252. hud/rl/tests/__init__.py +0 -1
  253. hud/rl/tests/test_learner.py +0 -186
  254. hud/rl/train.py +0 -382
  255. hud/rl/types.py +0 -101
  256. hud/rl/utils/start_vllm_server.sh +0 -30
  257. hud/rl/utils.py +0 -524
  258. hud/rl/vllm_adapter.py +0 -143
  259. hud/telemetry/job.py +0 -352
  260. hud/telemetry/replay.py +0 -74
  261. hud/telemetry/tests/test_replay.py +0 -40
  262. hud/telemetry/tests/test_trace.py +0 -63
  263. hud/telemetry/trace.py +0 -158
  264. hud/utils/agent_factories.py +0 -86
  265. hud/utils/async_utils.py +0 -65
  266. hud/utils/group_eval.py +0 -223
  267. hud/utils/progress.py +0 -149
  268. hud/utils/tasks.py +0 -127
  269. hud/utils/tests/test_async_utils.py +0 -173
  270. hud/utils/tests/test_progress.py +0 -261
  271. hud_python-0.4.45.dist-info/METADATA +0 -552
  272. hud_python-0.4.45.dist-info/RECORD +0 -228
  273. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
  274. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,178 @@
1
+ """Tests for hud.eval.context module."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from unittest.mock import AsyncMock, MagicMock, patch
6
+
7
+ import pytest
8
+
9
+ from hud.eval.context import (
10
+ EvalContext,
11
+ get_current_trace_headers,
12
+ )
13
+
14
+
15
class TestEvalContext:
    """Checks on EvalContext construction and its simple attributes."""

    def test_init_generates_trace_id(self) -> None:
        """A trace_id is filled in when the caller does not supply one."""
        generated = EvalContext(name="test-task", quiet=True).trace_id

        assert generated is not None
        # 36 characters is the length of a hyphenated UUID string.
        assert len(generated) == 36

    def test_init_uses_provided_trace_id(self) -> None:
        """An explicitly passed trace_id is stored unchanged."""
        eval_ctx = EvalContext(name="test-task", trace_id="custom-id", quiet=True)

        assert eval_ctx.trace_id == "custom-id"

    def test_headers_contains_trace_id(self) -> None:
        """headers is a dict holding only the Trace-Id entry."""
        eval_ctx = EvalContext(name="test-task", trace_id="test-123", quiet=True)
        expected_headers = {"Trace-Id": "test-123"}

        assert eval_ctx.headers == expected_headers

    def test_success_true_when_no_error(self) -> None:
        """success is True on a fresh context with no error assigned."""
        eval_ctx = EvalContext(name="test-task", quiet=True)

        assert eval_ctx.success is True

    def test_success_false_when_error(self) -> None:
        """success flips to False once error holds an exception."""
        eval_ctx = EvalContext(name="test-task", quiet=True)
        eval_ctx.error = ValueError("test error")

        assert eval_ctx.success is False

    def test_variants_empty_by_default(self) -> None:
        """variants defaults to an empty dict."""
        eval_ctx = EvalContext(name="test-task", quiet=True)

        assert eval_ctx.variants == {}

    def test_variants_set_from_init(self) -> None:
        """variants passed to the constructor are readable afterwards."""
        expected_variants = {"model": "gpt-4o", "temp": 0.7}
        eval_ctx = EvalContext(
            name="test-task",
            variants={"model": "gpt-4o", "temp": 0.7},
            quiet=True,
        )

        assert eval_ctx.variants == expected_variants

    @pytest.mark.asyncio
    async def test_context_manager_sets_headers(self) -> None:
        """Headers stored in the contextvar are visible until the token is reset."""
        from hud.eval.context import _current_trace_headers

        eval_ctx = EvalContext(name="test-task", trace_id="test-123", quiet=True)

        # NOTE(review): none of the four patched callables is invoked below; the
        # test drives the contextvar by hand rather than entering the context.
        stub_eval_enter = patch.object(eval_ctx, "_eval_enter", new_callable=AsyncMock)
        stub_eval_exit = patch.object(eval_ctx, "_eval_exit", new_callable=AsyncMock)
        stub_aenter = patch.object(EvalContext, "__aenter__", return_value=eval_ctx)
        stub_aexit = patch.object(EvalContext, "__aexit__", return_value=None)

        with stub_eval_enter, stub_eval_exit, stub_aenter, stub_aexit:
            assert get_current_trace_headers() is None

            # Install the headers manually and always restore the previous value.
            token = _current_trace_headers.set(eval_ctx.headers)
            try:
                active_headers = get_current_trace_headers()
                assert active_headers is not None
                assert active_headers["Trace-Id"] == "test-123"
            finally:
                _current_trace_headers.reset(token)

            assert get_current_trace_headers() is None

    def test_repr(self) -> None:
        """repr() mentions the trace id prefix, the name and the reward."""
        eval_ctx = EvalContext(
            name="test-task",
            trace_id="abc12345-6789-0000-0000-000000000000",
            quiet=True,
        )
        eval_ctx.reward = 0.95

        rendered = repr(eval_ctx)
        for fragment in ("abc12345", "test-task", "0.95"):
            assert fragment in rendered
106
+
107
+
108
class TestEvalContextPrompt:
    """Checks on the prompt attribute of EvalContext."""

    def test_prompt_can_be_set(self) -> None:
        """A value assigned to prompt is returned on the next read."""
        eval_ctx = EvalContext(name="test-task", quiet=True)

        eval_ctx.prompt = "Test prompt"

        assert eval_ctx.prompt == "Test prompt"

    def test_prompt_included_in_payload(self) -> None:
        """The object returned by _build_base_payload exposes the assigned prompt."""
        eval_ctx = EvalContext(name="test-task", quiet=True)
        eval_ctx.prompt = "Test prompt"

        assert eval_ctx._build_base_payload().prompt == "Test prompt"
125
+
126
+
127
class TestEvalContextFromEnvironment:
    """Checks on the EvalContext.from_environment factory."""

    def test_copies_connections(self) -> None:
        """Each parent connection is copied via its copy() method, not shared."""
        from hud.environment import Environment

        # A stand-in connection whose copy() hands back a distinct object.
        source_conn = MagicMock()
        duplicated_conn = MagicMock()
        source_conn.copy.return_value = duplicated_conn

        parent_env = Environment("parent-env")
        parent_env._connections["test-conn"] = source_conn

        derived = EvalContext.from_environment(parent_env, name="test-task")

        assert "test-conn" in derived._connections
        source_conn.copy.assert_called_once()
        assert derived._connections["test-conn"] is duplicated_conn

    def test_copies_prompt(self) -> None:
        """The parent's prompt is carried over to the new context."""
        from hud.environment import Environment

        parent_env = Environment("parent-env")
        parent_env.prompt = "Parent prompt"

        derived = EvalContext.from_environment(parent_env, name="test-task")

        assert derived.prompt == "Parent prompt"

    def test_sets_eval_properties(self) -> None:
        """Eval-specific keyword arguments end up on the matching attributes."""
        from hud.environment import Environment

        derived = EvalContext.from_environment(
            Environment("parent-env"),
            name="test-task",
            trace_id="custom-trace",
            variants={"model": "gpt-4o"},
            group_id="group-123",
            index=5,
        )

        # The name argument is exposed as eval_name; the rest keep their names.
        expected_attrs = {
            "eval_name": "test-task",
            "trace_id": "custom-trace",
            "variants": {"model": "gpt-4o"},
            "group_id": "group-123",
            "index": 5,
        }
        for attr_name, wanted in expected_attrs.items():
            assert getattr(derived, attr_name) == wanted
@@ -0,0 +1,210 @@
1
+ """Tests for hud.eval.task module (Task class)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import pytest
6
+
7
+ from hud.eval.task import Task
8
+
9
+
10
class TestTaskDataclass:
    """Checks on Task construction and copying."""

    def test_init_defaults(self) -> None:
        """A Task built with no arguments has no env, no scenario and empty args."""
        blank = Task()

        assert blank.env is None
        assert blank.scenario is None
        assert blank.args == {}

    def test_init_with_env_dict(self) -> None:
        """An env given as a dict comes back as an Environment instance."""
        from hud.environment import Environment

        env_spec = {"name": "browser", "include": ["navigate"]}
        built = Task(env=env_spec, scenario="checkout", args={"user_id": "alice"})

        assert isinstance(built.env, Environment)
        assert built.scenario == "checkout"
        assert built.args == {"user_id": "alice"}

    def test_copy_creates_new_instance(self) -> None:
        """copy() yields a distinct Task that shares env but owns its args."""
        source = Task(
            env={"name": "test"},
            scenario="checkout",
            args={"user_id": "alice"},
        )

        clone = source.copy()

        assert clone is not source
        # The environment object is shared between the two tasks on purpose.
        assert clone.env is source.env
        assert clone.scenario == source.scenario
        # args compare equal but must be a separate object.
        assert clone.args == source.args
        assert clone.args is not source.args
50
+
51
+
52
class TestEnvironmentCall:
    """Checks that calling an Environment instance produces a Task."""

    def test_call_returns_task(self) -> None:
        """Calling the environment with no arguments gives a Task."""
        from hud.environment import Environment

        produced = Environment("test-env")()

        assert isinstance(produced, Task)

    def test_call_with_scenario_sets_scenario(self) -> None:
        """The first positional argument becomes the task's scenario."""
        from hud.environment import Environment

        produced = Environment("test-env")("checkout")

        assert produced.scenario == "checkout"

    def test_call_with_args_sets_args(self) -> None:
        """Keyword arguments after the scenario are collected into args."""
        from hud.environment import Environment

        produced = Environment("test-env")("checkout", user_id="alice", amount=100)

        assert produced.args == {"user_id": "alice", "amount": 100}

    def test_call_returns_task_with_env(self) -> None:
        """The returned Task points back at the environment that created it."""
        from hud.environment import Environment

        plain_env = Environment("test-env")
        plain_task = plain_env()

        assert plain_task.env is plain_env

        # Same check for an environment configured through setup_tool (v4 legacy).
        legacy_env = Environment("test-env").setup_tool("navigate", url="https://example.com")
        legacy_task = legacy_env()

        assert legacy_task.env is legacy_env
        assert len(legacy_task.env._setup_calls) == 1
97
+
98
+
99
class TestTaskFromV4:
    """Checks on the Task.from_v4() migration helper."""

    def test_from_v4_with_legacy_task(self) -> None:
        """A LegacyTask instance is accepted and converted."""
        import warnings

        # Building a LegacyTask may emit a DeprecationWarning; silence it here.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=DeprecationWarning)
            from hud.types import LegacyTask

            legacy_task = LegacyTask(
                prompt="Navigate to google.com",
                mcp_config={"hud": {"url": "https://mcp.hud.ai"}},
                evaluate_tool={"name": "check", "arguments": {}},
            )

        converted = Task.from_v4(legacy_task)

        assert isinstance(converted, Task)
        assert converted.env is not None
        assert converted.env.prompt == "Navigate to google.com"
        # The legacy setup/evaluate tools do not map onto a scenario.
        assert converted.scenario is None

    def test_from_v4_with_dict(self) -> None:
        """A plain dict carrying LegacyTask fields is accepted."""
        legacy_fields = {
            "prompt": "Navigate to google.com",
            "mcp_config": {"hud": {"url": "https://mcp.hud.ai"}},
            "evaluate_tool": {"name": "check", "arguments": {}},
        }

        converted = Task.from_v4(legacy_fields)

        assert isinstance(converted, Task)
        assert converted.env is not None
        assert converted.env.prompt == "Navigate to google.com"

    def test_from_v4_with_json_string(self) -> None:
        """A JSON-encoded string of LegacyTask fields is accepted."""
        import json

        encoded = json.dumps(
            {
                "prompt": "Navigate to google.com",
                "mcp_config": {"hud": {"url": "https://mcp.hud.ai"}},
                "evaluate_tool": {"name": "check", "arguments": {}},
            }
        )

        converted = Task.from_v4(encoded)

        assert isinstance(converted, Task)
        assert converted.env is not None
        assert converted.env.prompt == "Navigate to google.com"

    def test_from_v4_with_setup_tool(self) -> None:
        """setup_tool is recorded as a single entry in env._setup_calls."""
        converted = Task.from_v4(
            {
                "prompt": "Check URL",
                "mcp_config": {"hud": {"url": "https://mcp.hud.ai"}},
                "setup_tool": {"name": "navigate", "arguments": {"url": "https://google.com"}},
                "evaluate_tool": {"name": "check", "arguments": {}},
            }
        )

        recorded = converted.env._setup_calls
        assert len(recorded) == 1
        assert recorded[0] == ("navigate", {"url": "https://google.com"})

    def test_from_v4_with_evaluate_tool(self) -> None:
        """evaluate_tool is recorded as a single entry in env._evaluate_calls."""
        converted = Task.from_v4(
            {
                "prompt": "Check URL",
                "mcp_config": {"hud": {"url": "https://mcp.hud.ai"}},
                "evaluate_tool": {"name": "check_url", "arguments": {"expected": "google"}},
            }
        )

        recorded = converted.env._evaluate_calls
        assert len(recorded) == 1
        assert recorded[0] == ("check_url", {"expected": "google"})

    def test_from_v4_with_invalid_type_raises(self) -> None:
        """An input of an unsupported type is rejected with TypeError."""
        with pytest.raises(TypeError):
            Task.from_v4(12345)  # type: ignore[arg-type]

    def test_from_v4_with_invalid_json_raises(self) -> None:
        """A string that is not valid JSON is rejected with JSONDecodeError."""
        from json import JSONDecodeError

        with pytest.raises(JSONDecodeError):
            Task.from_v4("not valid json")

    def test_from_v4_does_not_warn_on_use(self) -> None:
        """Converting through from_v4() emits no DeprecationWarning."""
        import warnings

        with warnings.catch_warnings(record=True) as captured:
            # Record every warning so nothing is hidden by default filters.
            warnings.simplefilter("always")
            Task.from_v4(
                {
                    "prompt": "test",
                    "mcp_config": {"hud": {}},
                    "evaluate_tool": {"name": "check", "arguments": {}},
                }
            )

            deprecations = [
                record for record in captured if issubclass(record.category, DeprecationWarning)
            ]
            assert len(deprecations) == 0
@@ -0,0 +1,152 @@
1
+ """Tests for hud.eval.manager module (hud.eval() function)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from unittest.mock import AsyncMock, patch
6
+
7
+ import pytest
8
+
9
+ from hud.eval.context import EvalContext, get_current_trace_headers
10
+ from hud.eval.manager import run_eval
11
+
12
+
13
class TestRunEvalNoArgs:
    """Checks on run_eval() when no task or eval arguments are supplied.

    Every test replaces EvalContext._eval_enter and EvalContext._eval_exit
    with AsyncMock objects for the duration of the eval block.
    """

    @pytest.mark.asyncio
    async def test_blank_eval_creates_context(self) -> None:
        """The yielded object is an EvalContext whose eval_name is "eval"."""
        enter_stub = patch.object(EvalContext, "_eval_enter", new_callable=AsyncMock)
        exit_stub = patch.object(EvalContext, "_eval_exit", new_callable=AsyncMock)

        with enter_stub, exit_stub:
            async with run_eval(quiet=True) as eval_ctx:
                assert isinstance(eval_ctx, EvalContext)
                assert eval_ctx.eval_name == "eval"

    @pytest.mark.asyncio
    async def test_blank_eval_generates_trace_id(self) -> None:
        """The yielded context already carries a 36-character trace_id."""
        enter_stub = patch.object(EvalContext, "_eval_enter", new_callable=AsyncMock)
        exit_stub = patch.object(EvalContext, "_eval_exit", new_callable=AsyncMock)

        with enter_stub, exit_stub:
            async with run_eval(quiet=True) as eval_ctx:
                assert eval_ctx.trace_id is not None
                # 36 characters is the length of a hyphenated UUID string.
                assert len(eval_ctx.trace_id) == 36

    @pytest.mark.asyncio
    async def test_blank_eval_sets_trace_headers(self) -> None:
        """Trace headers exist only while the eval block is active."""
        enter_stub = patch.object(EvalContext, "_eval_enter", new_callable=AsyncMock)
        exit_stub = patch.object(EvalContext, "_eval_exit", new_callable=AsyncMock)

        with enter_stub, exit_stub:
            # Nothing is set before the block is entered.
            assert get_current_trace_headers() is None

            async with run_eval(quiet=True) as eval_ctx:
                active_headers = get_current_trace_headers()
                assert active_headers is not None
                assert active_headers["Trace-Id"] == eval_ctx.trace_id

            # Leaving the block clears the headers again.
            assert get_current_trace_headers() is None

    @pytest.mark.asyncio
    async def test_blank_eval_reward_can_be_set(self) -> None:
        """reward starts as None and keeps the value assigned inside the block."""
        enter_stub = patch.object(EvalContext, "_eval_enter", new_callable=AsyncMock)
        exit_stub = patch.object(EvalContext, "_eval_exit", new_callable=AsyncMock)

        with enter_stub, exit_stub:
            async with run_eval(quiet=True) as eval_ctx:
                assert eval_ctx.reward is None
                eval_ctx.reward = 0.95

            assert eval_ctx.reward == 0.95

    @pytest.mark.asyncio
    async def test_blank_eval_reports_reward_on_exit(self) -> None:
        """_eval_exit is called exactly once with None after a clean block."""
        enter_stub = patch.object(EvalContext, "_eval_enter", new_callable=AsyncMock)
        exit_stub = patch.object(EvalContext, "_eval_exit", new_callable=AsyncMock)

        with enter_stub, exit_stub as mock_exit:
            async with run_eval(quiet=True) as eval_ctx:
                eval_ctx.reward = 0.85

            # None as the sole argument means no error was reported.
            mock_exit.assert_called_once_with(None)

    @pytest.mark.asyncio
    async def test_blank_eval_empty_variants(self) -> None:
        """variants is an empty dict when none were requested."""
        enter_stub = patch.object(EvalContext, "_eval_enter", new_callable=AsyncMock)
        exit_stub = patch.object(EvalContext, "_eval_exit", new_callable=AsyncMock)

        with enter_stub, exit_stub:
            async with run_eval(quiet=True) as eval_ctx:
                assert eval_ctx.variants == {}

    @pytest.mark.asyncio
    async def test_blank_eval_has_headers_property(self) -> None:
        """ctx.headers contains a Trace-Id entry equal to ctx.trace_id."""
        enter_stub = patch.object(EvalContext, "_eval_enter", new_callable=AsyncMock)
        exit_stub = patch.object(EvalContext, "_eval_exit", new_callable=AsyncMock)

        with enter_stub, exit_stub:
            async with run_eval(quiet=True) as eval_ctx:
                ctx_headers = eval_ctx.headers
                assert "Trace-Id" in ctx_headers
                assert ctx_headers["Trace-Id"] == eval_ctx.trace_id
104
+
105
+
106
class TestRunEvalWithApiKey:
    """Checks on run_eval() when an api_key is supplied."""

    @pytest.mark.asyncio
    async def test_api_key_passed_to_context(self) -> None:
        """The api_key argument is stored on the context as _eval_api_key."""
        enter_stub = patch.object(EvalContext, "_eval_enter", new_callable=AsyncMock)
        exit_stub = patch.object(EvalContext, "_eval_exit", new_callable=AsyncMock)

        with enter_stub, exit_stub:
            async with run_eval(api_key="test-key", quiet=True) as eval_ctx:
                assert eval_ctx._eval_api_key == "test-key"
118
+
119
+
120
class TestRunEvalWithJobId:
    """Checks on run_eval() when a job_id is supplied."""

    @pytest.mark.asyncio
    async def test_job_id_passed_to_context(self) -> None:
        """The job_id argument is readable from the yielded context."""
        enter_stub = patch.object(EvalContext, "_eval_enter", new_callable=AsyncMock)
        exit_stub = patch.object(EvalContext, "_eval_exit", new_callable=AsyncMock)

        with enter_stub, exit_stub:
            async with run_eval(job_id="job-123", quiet=True) as eval_ctx:
                assert eval_ctx.job_id == "job-123"
132
+
133
+
134
class TestRunEvalErrorHandling:
    """Tests for hud.eval() error handling."""

    @pytest.mark.asyncio
    async def test_error_tracked_on_exception(self) -> None:
        """An exception raised inside the eval block propagates and is reported.

        The ValueError raised in the body must escape ``run_eval``, and the
        patched ``_eval_exit`` must be called once with a message containing
        the exception text.
        """
        with (
            patch.object(EvalContext, "_eval_enter", new_callable=AsyncMock),
            patch.object(EvalContext, "_eval_exit", new_callable=AsyncMock) as mock_exit,
        ):
            # ``match`` ties the propagated error to the one raised below, so an
            # unrelated ValueError cannot satisfy the expectation by accident.
            with pytest.raises(ValueError, match="test error"):
                async with run_eval(quiet=True):
                    raise ValueError("test error")

            # _eval_exit should have been called with the error message
            mock_exit.assert_called_once()
            error_msg = mock_exit.call_args.args[0]
            assert error_msg is not None
            assert "test error" in error_msg