hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (282)
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +70 -5
  3. hud/agents/base.py +238 -500
  4. hud/agents/claude.py +236 -247
  5. hud/agents/gateway.py +42 -0
  6. hud/agents/gemini.py +264 -0
  7. hud/agents/gemini_cua.py +324 -0
  8. hud/agents/grounded_openai.py +98 -100
  9. hud/agents/misc/integration_test_agent.py +51 -20
  10. hud/agents/misc/response_agent.py +48 -36
  11. hud/agents/openai.py +282 -296
  12. hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
  13. hud/agents/operator.py +199 -0
  14. hud/agents/resolver.py +70 -0
  15. hud/agents/tests/conftest.py +133 -0
  16. hud/agents/tests/test_base.py +300 -622
  17. hud/agents/tests/test_base_runtime.py +233 -0
  18. hud/agents/tests/test_claude.py +381 -214
  19. hud/agents/tests/test_client.py +9 -10
  20. hud/agents/tests/test_gemini.py +369 -0
  21. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  22. hud/agents/tests/test_openai.py +377 -140
  23. hud/agents/tests/test_operator.py +362 -0
  24. hud/agents/tests/test_resolver.py +192 -0
  25. hud/agents/tests/test_run_eval.py +179 -0
  26. hud/agents/types.py +148 -0
  27. hud/cli/__init__.py +493 -546
  28. hud/cli/analyze.py +43 -5
  29. hud/cli/build.py +699 -113
  30. hud/cli/debug.py +8 -5
  31. hud/cli/dev.py +889 -732
  32. hud/cli/eval.py +793 -667
  33. hud/cli/flows/dev.py +167 -0
  34. hud/cli/flows/init.py +191 -0
  35. hud/cli/flows/tasks.py +153 -56
  36. hud/cli/flows/templates.py +151 -0
  37. hud/cli/flows/tests/__init__.py +1 -0
  38. hud/cli/flows/tests/test_dev.py +126 -0
  39. hud/cli/init.py +60 -58
  40. hud/cli/pull.py +1 -1
  41. hud/cli/push.py +38 -13
  42. hud/cli/rft.py +311 -0
  43. hud/cli/rft_status.py +145 -0
  44. hud/cli/tests/test_analyze.py +5 -5
  45. hud/cli/tests/test_analyze_metadata.py +3 -2
  46. hud/cli/tests/test_analyze_module.py +120 -0
  47. hud/cli/tests/test_build.py +110 -8
  48. hud/cli/tests/test_build_failure.py +41 -0
  49. hud/cli/tests/test_build_module.py +50 -0
  50. hud/cli/tests/test_cli_init.py +6 -1
  51. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  52. hud/cli/tests/test_cli_root.py +140 -0
  53. hud/cli/tests/test_convert.py +361 -0
  54. hud/cli/tests/test_debug.py +12 -10
  55. hud/cli/tests/test_dev.py +197 -0
  56. hud/cli/tests/test_eval.py +251 -0
  57. hud/cli/tests/test_eval_bedrock.py +51 -0
  58. hud/cli/tests/test_init.py +124 -0
  59. hud/cli/tests/test_main_module.py +11 -5
  60. hud/cli/tests/test_mcp_server.py +12 -100
  61. hud/cli/tests/test_push.py +1 -1
  62. hud/cli/tests/test_push_happy.py +74 -0
  63. hud/cli/tests/test_push_wrapper.py +23 -0
  64. hud/cli/tests/test_registry.py +1 -1
  65. hud/cli/tests/test_utils.py +1 -1
  66. hud/cli/{rl → utils}/celebrate.py +14 -12
  67. hud/cli/utils/config.py +18 -1
  68. hud/cli/utils/docker.py +130 -4
  69. hud/cli/utils/env_check.py +9 -9
  70. hud/cli/utils/git.py +136 -0
  71. hud/cli/utils/interactive.py +39 -5
  72. hud/cli/utils/metadata.py +70 -1
  73. hud/cli/utils/runner.py +1 -1
  74. hud/cli/utils/server.py +2 -2
  75. hud/cli/utils/source_hash.py +3 -3
  76. hud/cli/utils/tasks.py +4 -1
  77. hud/cli/utils/tests/__init__.py +0 -0
  78. hud/cli/utils/tests/test_config.py +58 -0
  79. hud/cli/utils/tests/test_docker.py +93 -0
  80. hud/cli/utils/tests/test_docker_hints.py +71 -0
  81. hud/cli/utils/tests/test_env_check.py +74 -0
  82. hud/cli/utils/tests/test_environment.py +42 -0
  83. hud/cli/utils/tests/test_git.py +142 -0
  84. hud/cli/utils/tests/test_interactive_module.py +60 -0
  85. hud/cli/utils/tests/test_local_runner.py +50 -0
  86. hud/cli/utils/tests/test_logging_utils.py +23 -0
  87. hud/cli/utils/tests/test_metadata.py +49 -0
  88. hud/cli/utils/tests/test_package_runner.py +35 -0
  89. hud/cli/utils/tests/test_registry_utils.py +49 -0
  90. hud/cli/utils/tests/test_remote_runner.py +25 -0
  91. hud/cli/utils/tests/test_runner_modules.py +52 -0
  92. hud/cli/utils/tests/test_source_hash.py +36 -0
  93. hud/cli/utils/tests/test_tasks.py +80 -0
  94. hud/cli/utils/version_check.py +258 -0
  95. hud/cli/{rl → utils}/viewer.py +2 -2
  96. hud/clients/README.md +12 -11
  97. hud/clients/__init__.py +4 -3
  98. hud/clients/base.py +166 -26
  99. hud/clients/environment.py +51 -0
  100. hud/clients/fastmcp.py +13 -6
  101. hud/clients/mcp_use.py +45 -15
  102. hud/clients/tests/test_analyze_scenarios.py +206 -0
  103. hud/clients/tests/test_protocol.py +9 -3
  104. hud/datasets/__init__.py +23 -20
  105. hud/datasets/loader.py +326 -0
  106. hud/datasets/runner.py +198 -105
  107. hud/datasets/tests/__init__.py +0 -0
  108. hud/datasets/tests/test_loader.py +221 -0
  109. hud/datasets/tests/test_utils.py +315 -0
  110. hud/datasets/utils.py +270 -90
  111. hud/environment/__init__.py +52 -0
  112. hud/environment/connection.py +258 -0
  113. hud/environment/connectors/__init__.py +33 -0
  114. hud/environment/connectors/base.py +68 -0
  115. hud/environment/connectors/local.py +177 -0
  116. hud/environment/connectors/mcp_config.py +137 -0
  117. hud/environment/connectors/openai.py +101 -0
  118. hud/environment/connectors/remote.py +172 -0
  119. hud/environment/environment.py +835 -0
  120. hud/environment/integrations/__init__.py +45 -0
  121. hud/environment/integrations/adk.py +67 -0
  122. hud/environment/integrations/anthropic.py +196 -0
  123. hud/environment/integrations/gemini.py +92 -0
  124. hud/environment/integrations/langchain.py +82 -0
  125. hud/environment/integrations/llamaindex.py +68 -0
  126. hud/environment/integrations/openai.py +238 -0
  127. hud/environment/mock.py +306 -0
  128. hud/environment/router.py +263 -0
  129. hud/environment/scenarios.py +620 -0
  130. hud/environment/tests/__init__.py +1 -0
  131. hud/environment/tests/test_connection.py +317 -0
  132. hud/environment/tests/test_connectors.py +205 -0
  133. hud/environment/tests/test_environment.py +593 -0
  134. hud/environment/tests/test_integrations.py +257 -0
  135. hud/environment/tests/test_local_connectors.py +242 -0
  136. hud/environment/tests/test_scenarios.py +1086 -0
  137. hud/environment/tests/test_tools.py +208 -0
  138. hud/environment/types.py +23 -0
  139. hud/environment/utils/__init__.py +35 -0
  140. hud/environment/utils/formats.py +215 -0
  141. hud/environment/utils/schema.py +171 -0
  142. hud/environment/utils/tool_wrappers.py +113 -0
  143. hud/eval/__init__.py +67 -0
  144. hud/eval/context.py +727 -0
  145. hud/eval/display.py +299 -0
  146. hud/eval/instrument.py +187 -0
  147. hud/eval/manager.py +533 -0
  148. hud/eval/parallel.py +268 -0
  149. hud/eval/task.py +372 -0
  150. hud/eval/tests/__init__.py +1 -0
  151. hud/eval/tests/test_context.py +178 -0
  152. hud/eval/tests/test_eval.py +210 -0
  153. hud/eval/tests/test_manager.py +152 -0
  154. hud/eval/tests/test_parallel.py +168 -0
  155. hud/eval/tests/test_task.py +291 -0
  156. hud/eval/types.py +65 -0
  157. hud/eval/utils.py +194 -0
  158. hud/patches/__init__.py +19 -0
  159. hud/patches/mcp_patches.py +308 -0
  160. hud/patches/warnings.py +54 -0
  161. hud/samples/browser.py +4 -4
  162. hud/server/__init__.py +2 -1
  163. hud/server/low_level.py +2 -1
  164. hud/server/router.py +164 -0
  165. hud/server/server.py +567 -80
  166. hud/server/tests/test_mcp_server_integration.py +11 -11
  167. hud/server/tests/test_mcp_server_more.py +1 -1
  168. hud/server/tests/test_server_extra.py +2 -0
  169. hud/settings.py +45 -3
  170. hud/shared/exceptions.py +36 -10
  171. hud/shared/hints.py +26 -1
  172. hud/shared/requests.py +15 -3
  173. hud/shared/tests/test_exceptions.py +40 -31
  174. hud/shared/tests/test_hints.py +167 -0
  175. hud/telemetry/__init__.py +20 -19
  176. hud/telemetry/exporter.py +201 -0
  177. hud/telemetry/instrument.py +165 -253
  178. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  179. hud/telemetry/tests/test_exporter.py +258 -0
  180. hud/telemetry/tests/test_instrument.py +401 -0
  181. hud/tools/__init__.py +18 -2
  182. hud/tools/agent.py +223 -0
  183. hud/tools/apply_patch.py +639 -0
  184. hud/tools/base.py +54 -4
  185. hud/tools/bash.py +2 -2
  186. hud/tools/computer/__init__.py +36 -3
  187. hud/tools/computer/anthropic.py +2 -2
  188. hud/tools/computer/gemini.py +385 -0
  189. hud/tools/computer/hud.py +23 -6
  190. hud/tools/computer/openai.py +20 -21
  191. hud/tools/computer/qwen.py +434 -0
  192. hud/tools/computer/settings.py +37 -0
  193. hud/tools/edit.py +3 -7
  194. hud/tools/executors/base.py +4 -2
  195. hud/tools/executors/pyautogui.py +1 -1
  196. hud/tools/grounding/grounded_tool.py +13 -18
  197. hud/tools/grounding/grounder.py +10 -31
  198. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  199. hud/tools/jupyter.py +330 -0
  200. hud/tools/playwright.py +18 -3
  201. hud/tools/shell.py +308 -0
  202. hud/tools/tests/test_agent_tool.py +355 -0
  203. hud/tools/tests/test_apply_patch.py +718 -0
  204. hud/tools/tests/test_computer.py +4 -9
  205. hud/tools/tests/test_computer_actions.py +24 -2
  206. hud/tools/tests/test_jupyter_tool.py +181 -0
  207. hud/tools/tests/test_shell.py +596 -0
  208. hud/tools/tests/test_submit.py +85 -0
  209. hud/tools/tests/test_types.py +193 -0
  210. hud/tools/types.py +21 -1
  211. hud/types.py +194 -56
  212. hud/utils/__init__.py +2 -0
  213. hud/utils/env.py +67 -0
  214. hud/utils/hud_console.py +89 -18
  215. hud/utils/mcp.py +15 -58
  216. hud/utils/strict_schema.py +162 -0
  217. hud/utils/tests/test_init.py +1 -2
  218. hud/utils/tests/test_mcp.py +1 -28
  219. hud/utils/tests/test_pretty_errors.py +186 -0
  220. hud/utils/tests/test_tool_shorthand.py +154 -0
  221. hud/utils/tests/test_version.py +1 -1
  222. hud/utils/types.py +20 -0
  223. hud/version.py +1 -1
  224. hud_python-0.5.13.dist-info/METADATA +264 -0
  225. hud_python-0.5.13.dist-info/RECORD +305 -0
  226. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
  227. hud/agents/langchain.py +0 -261
  228. hud/agents/lite_llm.py +0 -72
  229. hud/cli/rl/__init__.py +0 -180
  230. hud/cli/rl/config.py +0 -101
  231. hud/cli/rl/display.py +0 -133
  232. hud/cli/rl/gpu.py +0 -63
  233. hud/cli/rl/gpu_utils.py +0 -321
  234. hud/cli/rl/local_runner.py +0 -595
  235. hud/cli/rl/presets.py +0 -96
  236. hud/cli/rl/remote_runner.py +0 -463
  237. hud/cli/rl/rl_api.py +0 -150
  238. hud/cli/rl/vllm.py +0 -177
  239. hud/cli/rl/wait_utils.py +0 -89
  240. hud/datasets/parallel.py +0 -687
  241. hud/misc/__init__.py +0 -1
  242. hud/misc/claude_plays_pokemon.py +0 -292
  243. hud/otel/__init__.py +0 -35
  244. hud/otel/collector.py +0 -142
  245. hud/otel/config.py +0 -181
  246. hud/otel/context.py +0 -570
  247. hud/otel/exporters.py +0 -369
  248. hud/otel/instrumentation.py +0 -135
  249. hud/otel/processors.py +0 -121
  250. hud/otel/tests/__init__.py +0 -1
  251. hud/otel/tests/test_processors.py +0 -197
  252. hud/rl/README.md +0 -30
  253. hud/rl/__init__.py +0 -1
  254. hud/rl/actor.py +0 -176
  255. hud/rl/buffer.py +0 -405
  256. hud/rl/chat_template.jinja +0 -101
  257. hud/rl/config.py +0 -192
  258. hud/rl/distributed.py +0 -132
  259. hud/rl/learner.py +0 -637
  260. hud/rl/tests/__init__.py +0 -1
  261. hud/rl/tests/test_learner.py +0 -186
  262. hud/rl/train.py +0 -382
  263. hud/rl/types.py +0 -101
  264. hud/rl/utils/start_vllm_server.sh +0 -30
  265. hud/rl/utils.py +0 -524
  266. hud/rl/vllm_adapter.py +0 -143
  267. hud/telemetry/job.py +0 -352
  268. hud/telemetry/replay.py +0 -74
  269. hud/telemetry/tests/test_replay.py +0 -40
  270. hud/telemetry/tests/test_trace.py +0 -63
  271. hud/telemetry/trace.py +0 -158
  272. hud/utils/agent_factories.py +0 -86
  273. hud/utils/async_utils.py +0 -65
  274. hud/utils/group_eval.py +0 -223
  275. hud/utils/progress.py +0 -149
  276. hud/utils/tasks.py +0 -127
  277. hud/utils/tests/test_async_utils.py +0 -173
  278. hud/utils/tests/test_progress.py +0 -261
  279. hud_python-0.4.45.dist-info/METADATA +0 -552
  280. hud_python-0.4.45.dist-info/RECORD +0 -228
  281. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
  282. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,197 @@
1
+ """Tests for CLI dev module."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from unittest import mock
6
+
7
+ from hud.cli.dev import auto_detect_module, should_use_docker_mode
8
+
9
+
10
class TestShouldUseDockerMode:
    """Checks ``should_use_docker_mode`` against directories with and without a Dockerfile."""

    def test_docker_mode_with_dockerfile(self, tmp_path):
        """A directory containing a non-empty ``Dockerfile`` yields ``True``."""
        (tmp_path / "Dockerfile").write_text("FROM python:3.11")

        detected = should_use_docker_mode(tmp_path)

        assert detected is True

    def test_no_docker_mode_without_dockerfile(self, tmp_path):
        """A directory with no ``Dockerfile`` yields ``False``."""
        detected = should_use_docker_mode(tmp_path)

        assert detected is False

    def test_docker_mode_empty_dockerfile(self, tmp_path):
        """A zero-length ``Dockerfile`` still yields ``True``."""
        (tmp_path / "Dockerfile").write_text("")

        detected = should_use_docker_mode(tmp_path)

        assert detected is True
30
+
31
+
32
class TestAutoDetectModule:
    """Checks ``auto_detect_module`` against small package layouts built in ``tmp_path``.

    Every test changes the working directory to ``tmp_path`` first, because
    ``auto_detect_module()`` is called without arguments.
    """

    def test_detect_module_from_init_with_mcpserver(self, tmp_path, monkeypatch):
        """``__init__.py`` assigning ``mcp = MCPServer(...)`` is detected as the directory name."""
        monkeypatch.chdir(tmp_path)
        (tmp_path / "__init__.py").write_text(
            "\n"
            "from hud.server import MCPServer\n"
            "mcp = MCPServer(name='test')\n"
        )

        module, extra = auto_detect_module()

        assert module == tmp_path.name
        assert extra is None

    def test_detect_module_from_init_with_fastmcp(self, tmp_path, monkeypatch):
        """``__init__.py`` assigning ``mcp = FastMCP(...)`` is detected as the directory name."""
        monkeypatch.chdir(tmp_path)
        (tmp_path / "__init__.py").write_text(
            "\n"
            "from fastmcp import FastMCP\n"
            "mcp = FastMCP(name='test')\n"
        )

        module, extra = auto_detect_module()

        assert module == tmp_path.name
        assert extra is None

    def test_detect_module_from_main_py(self, tmp_path, monkeypatch):
        """``main.py`` with an MCPServer resolves to ``<dir>.main`` plus the parent directory."""
        monkeypatch.chdir(tmp_path)
        # The layout under test needs an (empty) __init__.py next to main.py.
        (tmp_path / "__init__.py").write_text("")
        (tmp_path / "main.py").write_text(
            "\n"
            "from hud.server import MCPServer\n"
            "mcp = MCPServer(name='test')\n"
        )

        module, extra = auto_detect_module()

        assert module == f"{tmp_path.name}.main"
        assert extra == tmp_path.parent

    def test_detect_module_from_init_with_environment(self, tmp_path, monkeypatch):
        """``__init__.py`` assigning ``env = Environment(...)`` is detected as the directory name."""
        monkeypatch.chdir(tmp_path)
        (tmp_path / "__init__.py").write_text(
            "\n"
            "from hud import Environment\n"
            "env = Environment(name='test')\n"
        )

        module, extra = auto_detect_module()

        assert module == tmp_path.name
        assert extra is None

    def test_detect_module_from_main_py_with_environment(self, tmp_path, monkeypatch):
        """``main.py`` with an Environment resolves to ``<dir>.main`` plus the parent directory."""
        monkeypatch.chdir(tmp_path)
        # The layout under test needs an (empty) __init__.py next to main.py.
        (tmp_path / "__init__.py").write_text("")
        (tmp_path / "main.py").write_text(
            "\n"
            "from hud import Environment\n"
            "env = Environment(name='test')\n"
        )

        module, extra = auto_detect_module()

        assert module == f"{tmp_path.name}.main"
        assert extra == tmp_path.parent

    def test_no_detection_without_mcp_or_env(self, tmp_path, monkeypatch):
        """An ``__init__.py`` holding only a comment produces ``(None, None)``."""
        monkeypatch.chdir(tmp_path)
        (tmp_path / "__init__.py").write_text("# Just a comment")

        module, extra = auto_detect_module()

        assert module is None
        assert extra is None

    def test_no_detection_empty_dir(self, tmp_path, monkeypatch):
        """A directory with no files at all produces ``(None, None)``."""
        monkeypatch.chdir(tmp_path)

        module, extra = auto_detect_module()

        assert module is None
        assert extra is None
138
+
139
+
140
class TestShowDevServerInfo:
    """Checks ``show_dev_server_info`` with ``hud.cli.dev.hud_console`` replaced by a mock."""

    @mock.patch("hud.cli.dev.hud_console")
    def test_show_dev_server_info_http(self, mock_console):
        """HTTP transport: the result is a ``cursor://`` link naming the server; console is used."""
        from hud.cli.dev import show_dev_server_info

        options = {
            "server_name": "test-server",
            "port": 8000,
            "transport": "http",
            "inspector": False,
            "interactive": False,
        }
        deeplink = show_dev_server_info(**options)

        # The return value is the Cursor deeplink.
        assert deeplink.startswith("cursor://")
        assert "test-server" in deeplink

        # Output went through the (mocked) console.
        assert mock_console.section_title.called
        assert mock_console.info.called

    @mock.patch("hud.cli.dev.hud_console")
    def test_show_dev_server_info_stdio(self, mock_console):
        """stdio transport: the result is still a ``cursor://`` link."""
        from hud.cli.dev import show_dev_server_info

        options = {
            "server_name": "test-server",
            "port": 8000,
            "transport": "stdio",
            "inspector": False,
            "interactive": False,
        }
        deeplink = show_dev_server_info(**options)

        # The return value is the Cursor deeplink.
        assert deeplink.startswith("cursor://")

    @mock.patch("hud.cli.dev.hud_console")
    def test_show_dev_server_info_with_telemetry(self, mock_console):
        """Passing a ``telemetry`` mapping of URLs still returns a ``cursor://`` link."""
        from hud.cli.dev import show_dev_server_info

        telemetry_urls = {
            "live_url": "https://hud.ai/trace/123",
            "vnc_url": "http://localhost:5900",
        }
        deeplink = show_dev_server_info(
            server_name="browser-env",
            port=8000,
            transport="http",
            inspector=False,
            interactive=False,
            telemetry=telemetry_urls,
        )

        assert deeplink.startswith("cursor://")
@@ -0,0 +1,251 @@
1
+ """Tests for hud.cli.eval module and run_dataset function."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+ from unittest.mock import AsyncMock, MagicMock, patch
7
+
8
+ import pytest
9
+ from mcp import types
10
+
11
+ from hud.environment.router import ToolRouter
12
+ from hud.eval.context import EvalContext
13
+ from hud.types import AgentType, MCPToolResult, Trace
14
+
15
+
16
class MockEvalContext(EvalContext):
    """Lightweight ``EvalContext`` stand-in used by the ``run_dataset`` tests.

    ``__init__`` assigns every attribute directly and does not call
    ``EvalContext.__init__``. The overridden methods return canned values
    instead of doing real work.
    """

    def __init__(
        self,
        prompt: str = "Test prompt",
        tools: list[types.Tool] | None = None,
    ) -> None:
        """Populate the attributes by hand.

        Args:
            prompt: Stored as ``self.prompt``.
            tools: Tools returned by ``as_tools`` / ``list_tools``; a falsy
                value is replaced by a new empty list.
        """
        # --- core state ---
        self.prompt = prompt
        self._tools = tools if tools else []
        self._submitted: str | None = None
        self.reward: float | None = None
        self.results: list[EvalContext] = []

        # --- environment-side state ---
        self._router = ToolRouter()
        self._agent_include: list[str] | None = None
        self._agent_exclude: list[str] | None = None

        # --- eval-context state ---
        self._task = None
        self.trace_id = "test-trace-id"
        self.eval_name = "test-eval"
        self.job_id: str | None = None
        self.group_id: str | None = None
        self.index = 0
        self.variants: dict[str, Any] = {}
        self.answer: str | None = None
        self.system_prompt: str | None = None
        self.error: BaseException | None = None
        self.metadata: dict[str, Any] = {}
        self._is_summary = False

    def as_tools(self) -> list[types.Tool]:
        """Return the tool list given at construction."""
        return self._tools

    @property
    def has_scenario(self) -> bool:
        """Always ``False`` for this mock."""
        return False

    async def list_tools(self) -> list[types.Tool]:
        """Return the tool list given at construction (async variant)."""
        return self._tools

    async def call_tool(self, call: Any, /, **kwargs: Any) -> MCPToolResult:
        """Ignore ``call`` and ``kwargs``; return a non-error result with the text ``"ok"``."""
        ok_content = types.TextContent(type="text", text="ok")
        return MCPToolResult(content=[ok_content], isError=False)

    async def submit(self, answer: str) -> None:
        """Record ``answer`` in ``self._submitted``."""
        self._submitted = answer
68
+
69
+
70
def _create_mock_agent_cls() -> tuple[MagicMock, MagicMock]:
    """Build a mocked agent class together with the instance it hands out.

    Returns:
        ``(agent_cls, agent)`` where ``agent_cls.create(...)`` returns ``agent``
        and awaiting ``agent.run(...)`` yields ``Trace(reward=1.0, done=True)``.
    """
    finished_trace = Trace(reward=1.0, done=True)
    agent = MagicMock(run=AsyncMock(return_value=finished_trace))

    agent_cls = MagicMock()
    agent_cls.configure_mock(**{"create.return_value": agent})
    return agent_cls, agent
77
+
78
+
79
class TestRunDataset:
    """Exercise ``run_dataset`` with ``hud.eval`` and the agent class replaced by mocks."""

    @staticmethod
    def _wire_eval(mock_eval: MagicMock, ctx: EvalContext) -> None:
        """Make the patched ``hud.eval(...)`` behave as an async context manager yielding ``ctx``."""
        manager = mock_eval.return_value
        manager.__aenter__ = AsyncMock(return_value=ctx)
        manager.__aexit__ = AsyncMock(return_value=None)

    @pytest.mark.asyncio
    async def test_run_dataset_with_task_list(self) -> None:
        """A task list triggers exactly one ``hud.eval`` call and one agent run."""
        from hud.eval.task import Task

        tasks = [
            Task(env={"name": "test"}, id=task_id, scenario="test")
            for task_id in ("task1", "task2")
        ]
        agent_cls, agent = _create_mock_agent_cls()
        ctx = MockEvalContext()

        with (
            patch("hud.datasets.runner.hud.eval") as mock_eval,
            patch("hud.agents.claude.ClaudeAgent", agent_cls),
        ):
            self._wire_eval(mock_eval, ctx)

            from hud.datasets.runner import run_dataset

            await run_dataset(tasks, agent_type="claude", max_steps=5)

        # Neither group_size nor max_concurrent was passed above; these are
        # the values hud.eval is expected to receive in that case.
        mock_eval.assert_called_once()
        eval_kwargs = mock_eval.call_args.kwargs
        assert eval_kwargs["group"] == 1
        assert eval_kwargs["max_concurrent"] == 30

        # The mocked agent was run once.
        agent.run.assert_called_once()

    @pytest.mark.asyncio
    async def test_run_dataset_with_string_source(self) -> None:
        """A string source is handed to the (patched) ``hud.datasets.loader.load_tasks``."""
        from hud.eval.task import Task

        loaded = [Task(env={"name": "test"}, id="loaded_task", scenario="loaded")]
        agent_cls, _ = _create_mock_agent_cls()
        ctx = MockEvalContext()

        with (
            patch("hud.datasets.loader.load_tasks", return_value=loaded) as mock_load,
            patch("hud.datasets.runner.hud.eval") as mock_eval,
            patch("hud.agents.OpenAIAgent", agent_cls),
        ):
            self._wire_eval(mock_eval, ctx)

            from hud.datasets.runner import run_dataset

            await run_dataset("my-tasks.json", agent_type="openai")

        # The loader received the source string unchanged.
        mock_load.assert_called_once_with("my-tasks.json")

    @pytest.mark.asyncio
    async def test_run_dataset_empty_tasks_raises(self) -> None:
        """An empty task list raises ``ValueError`` matching "No tasks to run"."""
        # NOTE(review): an explicit empty list is passed below, so this loader
        # patch does not look like it is exercised - confirm before relying on it.
        with patch("hud.datasets.loader.load_dataset", return_value=[]):
            from hud.datasets.runner import run_dataset

            with pytest.raises(ValueError, match="No tasks to run"):
                await run_dataset([], agent_type=AgentType.CLAUDE)

    @pytest.mark.asyncio
    async def test_run_dataset_with_group_size(self) -> None:
        """``group_size=3`` reaches ``hud.eval`` as ``group=3``."""
        from hud.eval.task import Task

        tasks = [Task(env={"name": "test"}, id="task1", scenario="test")]
        agent_cls, _ = _create_mock_agent_cls()
        ctx = MockEvalContext()

        with (
            patch("hud.datasets.runner.hud.eval") as mock_eval,
            patch("hud.agents.claude.ClaudeAgent", agent_cls),
        ):
            self._wire_eval(mock_eval, ctx)

            from hud.datasets.runner import run_dataset

            await run_dataset(tasks, agent_type="claude", group_size=3)

        eval_kwargs = mock_eval.call_args.kwargs
        assert eval_kwargs["group"] == 3

    @pytest.mark.asyncio
    async def test_run_dataset_with_max_concurrent(self) -> None:
        """``max_concurrent=10`` reaches ``hud.eval`` unchanged."""
        from hud.eval.task import Task

        tasks = [Task(env={"name": "test"}, id="task1", scenario="test")]
        agent_cls, _ = _create_mock_agent_cls()
        ctx = MockEvalContext()

        with (
            patch("hud.datasets.runner.hud.eval") as mock_eval,
            patch("hud.agents.claude.ClaudeAgent", agent_cls),
        ):
            self._wire_eval(mock_eval, ctx)

            from hud.datasets.runner import run_dataset

            await run_dataset(tasks, agent_type="claude", max_concurrent=10)

        eval_kwargs = mock_eval.call_args.kwargs
        assert eval_kwargs["max_concurrent"] == 10

    @pytest.mark.asyncio
    async def test_run_dataset_returns_results(self) -> None:
        """With an empty ``ctx.results``, the context itself is the single returned result."""
        from hud.eval.task import Task

        tasks = [Task(env={"name": "test"}, id="task1", scenario="test")]
        agent_cls, _ = _create_mock_agent_cls()
        ctx = MockEvalContext()

        with (
            patch("hud.datasets.runner.hud.eval") as mock_eval,
            patch("hud.agents.claude.ClaudeAgent", agent_cls),
        ):
            self._wire_eval(mock_eval, ctx)

            from hud.datasets.runner import run_dataset

            results = await run_dataset(tasks, agent_type="claude")

        # A one-element list holding the very context hud.eval yielded.
        assert len(results) == 1
        assert results[0] is ctx

    @pytest.mark.asyncio
    async def test_run_dataset_parallel_results(self) -> None:
        """With a populated ``ctx.results``, those sub-results are what gets returned."""
        from hud.eval.task import Task

        tasks = [Task(env={"name": "test"}, id="task1", scenario="test")]
        agent_cls, _ = _create_mock_agent_cls()

        # Two sub-contexts with distinct rewards stand in for parallel runs.
        first = MockEvalContext(prompt="result1")
        first.reward = 0.8
        second = MockEvalContext(prompt="result2")
        second.reward = 0.9

        ctx = MockEvalContext()
        ctx.results = [first, second]

        with (
            patch("hud.datasets.runner.hud.eval") as mock_eval,
            patch("hud.agents.claude.ClaudeAgent", agent_cls),
        ):
            self._wire_eval(mock_eval, ctx)

            from hud.datasets.runner import run_dataset

            results = await run_dataset(tasks, agent_type="claude")

        # The sub-results come back in order.
        assert len(results) == 2
        assert results[0].reward == 0.8
        assert results[1].reward == 0.9
@@ -0,0 +1,51 @@
1
+ """Tests for AWS Bedrock auto-detection in hud.cli.eval."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from unittest.mock import MagicMock, patch
6
+
7
+ import pytest
8
+ import typer
9
+
10
+ from hud.cli.eval import EvalConfig
11
+ from hud.types import AgentType
12
+
13
+
14
class TestBedrockAutoDetection:
    """Checks how ``EvalConfig.get_agent_kwargs`` handles a Bedrock ARN in the claude config."""

    VALID_ARN = "arn:aws:bedrock:us-east-1:123456789012:inference-profile/my-profile"

    def _config_with_arn(self) -> EvalConfig:
        """Build a Claude ``EvalConfig`` whose only model hint is the ARN in ``checkpoint_name``."""
        return EvalConfig(
            agent_type=AgentType.CLAUDE,
            model=None,  # no CLI --model
            agent_config={"claude": {"checkpoint_name": self.VALID_ARN}},
        )

    def test_get_agent_kwargs_detects_bedrock_arn_from_config_checkpoint_name(self) -> None:
        """Regression: an ARN in ``[claude].checkpoint_name`` must build a Bedrock client."""
        cfg = self._config_with_arn()

        with (
            patch("hud.settings.settings.aws_access_key_id", "AKIATEST"),
            patch("hud.settings.settings.aws_secret_access_key", "secret"),
            patch("hud.settings.settings.aws_region", "us-east-1"),
            patch("anthropic.AsyncAnthropicBedrock", return_value=MagicMock()) as mock_bedrock,
        ):
            agent_kwargs = cfg.get_agent_kwargs()

        # The ARN is passed through, and a client was constructed exactly once.
        assert agent_kwargs.get("checkpoint_name") == self.VALID_ARN
        assert "model_client" in agent_kwargs
        mock_bedrock.assert_called_once()

    def test_get_agent_kwargs_bedrock_arn_missing_aws_creds_exits(self) -> None:
        """With the ARN present but all AWS settings ``None``, ``typer.Exit`` is raised."""
        cfg = self._config_with_arn()

        with (
            patch("hud.settings.settings.aws_access_key_id", None),
            patch("hud.settings.settings.aws_secret_access_key", None),
            patch("hud.settings.settings.aws_region", None),
            pytest.raises(typer.Exit),
        ):
            cfg.get_agent_kwargs()
@@ -0,0 +1,124 @@
1
+ """Tests for CLI init module."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from hud.cli.init import _replace_placeholders
6
+
7
+
8
class TestReplacePlaceholders:
    """Checks ``_replace_placeholders`` on template trees built under ``tmp_path``.

    Returned paths are compared in forward-slash form so the assertions hold
    whichever path separator the platform uses.
    """

    @staticmethod
    def _forward_slashes(paths):
        """Return ``paths`` with every backslash replaced by a forward slash."""
        return [p.replace("\\", "/") for p in paths]

    def test_replace_in_pyproject(self, tmp_path):
        """``blank`` in ``server/pyproject.toml`` is replaced by the normalized name."""
        server_dir = tmp_path / "server"
        server_dir.mkdir()

        pyproject = server_dir / "pyproject.toml"
        pyproject.write_text(
            "\n"
            "[project]\n"
            'name = "blank"\n'
            'description = "blank environment"\n'
        )

        modified = _replace_placeholders(tmp_path, "my-cool-env")

        assert "server/pyproject.toml" in self._forward_slashes(modified)
        content = pyproject.read_text()
        assert "my_cool_env" in content
        assert "blank" not in content

    def test_replace_in_readme(self, tmp_path):
        """``blank`` in a top-level ``README.md`` is replaced by the normalized name."""
        readme = tmp_path / "README.md"
        readme.write_text("# blank\n\nThis is the blank environment.")

        modified = _replace_placeholders(tmp_path, "test-env")

        assert "README.md" in modified
        content = readme.read_text()
        assert "test_env" in content
        assert "blank" not in content

    def test_replace_in_tasks_json(self, tmp_path):
        """``blank`` in a top-level ``tasks.json`` is replaced by the normalized name."""
        tasks = tmp_path / "tasks.json"
        tasks.write_text('{"name": "blank", "tasks": []}')

        modified = _replace_placeholders(tmp_path, "my-tasks")

        assert "tasks.json" in modified
        content = tasks.read_text()
        assert "my_tasks" in content

    def test_no_replace_in_non_placeholder_files(self, tmp_path):
        """A file that is not one of the template files keeps its ``blank`` text."""
        other_file = tmp_path / "other.py"
        other_file.write_text("# blank comment")

        modified = _replace_placeholders(tmp_path, "test")

        assert "other.py" not in modified
        content = other_file.read_text()
        assert "blank" in content  # Should be unchanged

    def test_skip_pycache_directories(self, tmp_path):
        """Nothing under ``__pycache__`` is reported as modified."""
        pycache = tmp_path / "__pycache__"
        pycache.mkdir()

        cached_file = pycache / "module.pyc"
        cached_file.write_text("blank")

        modified = _replace_placeholders(tmp_path, "test")

        assert not any("__pycache__" in f for f in modified)

    def test_normalize_special_characters(self, tmp_path):
        """Spaces, dashes, dots and ``!`` in the name each become an underscore."""
        server_dir = tmp_path / "server"
        server_dir.mkdir()

        pyproject = server_dir / "pyproject.toml"
        pyproject.write_text('name = "blank"')

        _replace_placeholders(tmp_path, "my cool-env.v2!")

        content = pyproject.read_text()
        assert "my_cool_env_v2_" in content

    def test_no_changes_when_no_placeholder(self, tmp_path):
        """A template file without the placeholder is not reported as modified."""
        server_dir = tmp_path / "server"
        server_dir.mkdir()

        pyproject = server_dir / "pyproject.toml"
        pyproject.write_text('name = "other-name"')

        modified = _replace_placeholders(tmp_path, "test")

        # Compare in forward-slash form: against raw paths, a backslash-separated
        # "server\\pyproject.toml" entry would never match and this negative
        # assertion would pass vacuously.
        assert "server/pyproject.toml" not in self._forward_slashes(modified)

    def test_nested_directory_structure(self, tmp_path):
        """Template files in several sub-directories are all reported as modified."""
        server_dir = tmp_path / "server"
        server_dir.mkdir()
        (server_dir / "pyproject.toml").write_text('name = "blank"')

        env_dir = tmp_path / "environment"
        env_dir.mkdir()
        (env_dir / "pyproject.toml").write_text('name = "blank"')
        (env_dir / "README.md").write_text("# blank environment")

        modified = _replace_placeholders(tmp_path, "nested-test")

        normalized = self._forward_slashes(modified)
        assert "server/pyproject.toml" in normalized
        assert "environment/pyproject.toml" in normalized
        assert "environment/README.md" in normalized
@@ -2,6 +2,7 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ import os
5
6
  import subprocess
6
7
  import sys
7
8
 
@@ -20,11 +21,16 @@ class TestMainModule:
20
21
  def test_main_module_executes(self):
21
22
  """Test that running the module as main executes correctly."""
22
23
  # Use subprocess to run the module as __main__ and check it doesn't crash
23
- # We expect it to show help/error since we're not providing arguments
24
+ # Use --version flag for a quick, deterministic test that doesn't require user input
25
+ env = {**os.environ, "HUD_SKIP_VERSION_CHECK": "1"}
24
26
  result = subprocess.run(
25
- [sys.executable, "-m", "hud.cli"], capture_output=True, text=True, timeout=10
27
+ [sys.executable, "-m", "hud.cli", "--version"],
28
+ capture_output=True,
29
+ text=True,
30
+ timeout=30,
31
+ env=env,
26
32
  )
27
33
 
28
- # Should exit with an error code but not crash
29
- # (The actual main function will show help or error for missing args)
30
- assert result.returncode != 0 # CLI should exit with error for no args
34
+ # Should exit successfully with version info
35
+ assert result.returncode == 0
36
+ assert "version" in result.stdout.lower() or "hud" in result.stdout.lower()