hud_python-0.4.45-py3-none-any.whl → hud_python-0.5.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (274)
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +11 -5
  3. hud/agents/base.py +220 -500
  4. hud/agents/claude.py +200 -240
  5. hud/agents/gemini.py +275 -0
  6. hud/agents/gemini_cua.py +335 -0
  7. hud/agents/grounded_openai.py +98 -100
  8. hud/agents/misc/integration_test_agent.py +51 -20
  9. hud/agents/misc/response_agent.py +41 -36
  10. hud/agents/openai.py +291 -292
  11. hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
  12. hud/agents/operator.py +211 -0
  13. hud/agents/tests/conftest.py +133 -0
  14. hud/agents/tests/test_base.py +300 -622
  15. hud/agents/tests/test_base_runtime.py +233 -0
  16. hud/agents/tests/test_claude.py +379 -210
  17. hud/agents/tests/test_client.py +9 -10
  18. hud/agents/tests/test_gemini.py +369 -0
  19. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  20. hud/agents/tests/test_openai.py +376 -140
  21. hud/agents/tests/test_operator.py +362 -0
  22. hud/agents/tests/test_run_eval.py +179 -0
  23. hud/cli/__init__.py +461 -545
  24. hud/cli/analyze.py +43 -5
  25. hud/cli/build.py +664 -110
  26. hud/cli/debug.py +8 -5
  27. hud/cli/dev.py +882 -734
  28. hud/cli/eval.py +782 -668
  29. hud/cli/flows/dev.py +167 -0
  30. hud/cli/flows/init.py +191 -0
  31. hud/cli/flows/tasks.py +153 -56
  32. hud/cli/flows/templates.py +151 -0
  33. hud/cli/flows/tests/__init__.py +1 -0
  34. hud/cli/flows/tests/test_dev.py +126 -0
  35. hud/cli/init.py +60 -58
  36. hud/cli/push.py +29 -11
  37. hud/cli/rft.py +311 -0
  38. hud/cli/rft_status.py +145 -0
  39. hud/cli/tests/test_analyze.py +5 -5
  40. hud/cli/tests/test_analyze_metadata.py +3 -2
  41. hud/cli/tests/test_analyze_module.py +120 -0
  42. hud/cli/tests/test_build.py +108 -6
  43. hud/cli/tests/test_build_failure.py +41 -0
  44. hud/cli/tests/test_build_module.py +50 -0
  45. hud/cli/tests/test_cli_init.py +6 -1
  46. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  47. hud/cli/tests/test_cli_root.py +140 -0
  48. hud/cli/tests/test_convert.py +361 -0
  49. hud/cli/tests/test_debug.py +12 -10
  50. hud/cli/tests/test_dev.py +197 -0
  51. hud/cli/tests/test_eval.py +251 -0
  52. hud/cli/tests/test_eval_bedrock.py +51 -0
  53. hud/cli/tests/test_init.py +124 -0
  54. hud/cli/tests/test_main_module.py +11 -5
  55. hud/cli/tests/test_mcp_server.py +12 -100
  56. hud/cli/tests/test_push_happy.py +74 -0
  57. hud/cli/tests/test_push_wrapper.py +23 -0
  58. hud/cli/tests/test_registry.py +1 -1
  59. hud/cli/tests/test_utils.py +1 -1
  60. hud/cli/{rl → utils}/celebrate.py +14 -12
  61. hud/cli/utils/config.py +18 -1
  62. hud/cli/utils/docker.py +130 -4
  63. hud/cli/utils/env_check.py +9 -9
  64. hud/cli/utils/git.py +136 -0
  65. hud/cli/utils/interactive.py +39 -5
  66. hud/cli/utils/metadata.py +69 -0
  67. hud/cli/utils/runner.py +1 -1
  68. hud/cli/utils/server.py +2 -2
  69. hud/cli/utils/source_hash.py +3 -3
  70. hud/cli/utils/tasks.py +4 -1
  71. hud/cli/utils/tests/__init__.py +0 -0
  72. hud/cli/utils/tests/test_config.py +58 -0
  73. hud/cli/utils/tests/test_docker.py +93 -0
  74. hud/cli/utils/tests/test_docker_hints.py +71 -0
  75. hud/cli/utils/tests/test_env_check.py +74 -0
  76. hud/cli/utils/tests/test_environment.py +42 -0
  77. hud/cli/utils/tests/test_git.py +142 -0
  78. hud/cli/utils/tests/test_interactive_module.py +60 -0
  79. hud/cli/utils/tests/test_local_runner.py +50 -0
  80. hud/cli/utils/tests/test_logging_utils.py +23 -0
  81. hud/cli/utils/tests/test_metadata.py +49 -0
  82. hud/cli/utils/tests/test_package_runner.py +35 -0
  83. hud/cli/utils/tests/test_registry_utils.py +49 -0
  84. hud/cli/utils/tests/test_remote_runner.py +25 -0
  85. hud/cli/utils/tests/test_runner_modules.py +52 -0
  86. hud/cli/utils/tests/test_source_hash.py +36 -0
  87. hud/cli/utils/tests/test_tasks.py +80 -0
  88. hud/cli/utils/version_check.py +258 -0
  89. hud/cli/{rl → utils}/viewer.py +2 -2
  90. hud/clients/README.md +12 -11
  91. hud/clients/__init__.py +4 -3
  92. hud/clients/base.py +166 -26
  93. hud/clients/environment.py +51 -0
  94. hud/clients/fastmcp.py +13 -6
  95. hud/clients/mcp_use.py +40 -15
  96. hud/clients/tests/test_analyze_scenarios.py +206 -0
  97. hud/clients/tests/test_protocol.py +9 -3
  98. hud/datasets/__init__.py +23 -20
  99. hud/datasets/loader.py +327 -0
  100. hud/datasets/runner.py +192 -105
  101. hud/datasets/tests/__init__.py +0 -0
  102. hud/datasets/tests/test_loader.py +221 -0
  103. hud/datasets/tests/test_utils.py +315 -0
  104. hud/datasets/utils.py +270 -90
  105. hud/environment/__init__.py +50 -0
  106. hud/environment/connection.py +206 -0
  107. hud/environment/connectors/__init__.py +33 -0
  108. hud/environment/connectors/base.py +68 -0
  109. hud/environment/connectors/local.py +177 -0
  110. hud/environment/connectors/mcp_config.py +109 -0
  111. hud/environment/connectors/openai.py +101 -0
  112. hud/environment/connectors/remote.py +172 -0
  113. hud/environment/environment.py +694 -0
  114. hud/environment/integrations/__init__.py +45 -0
  115. hud/environment/integrations/adk.py +67 -0
  116. hud/environment/integrations/anthropic.py +196 -0
  117. hud/environment/integrations/gemini.py +92 -0
  118. hud/environment/integrations/langchain.py +82 -0
  119. hud/environment/integrations/llamaindex.py +68 -0
  120. hud/environment/integrations/openai.py +238 -0
  121. hud/environment/mock.py +306 -0
  122. hud/environment/router.py +112 -0
  123. hud/environment/scenarios.py +493 -0
  124. hud/environment/tests/__init__.py +1 -0
  125. hud/environment/tests/test_connection.py +317 -0
  126. hud/environment/tests/test_connectors.py +218 -0
  127. hud/environment/tests/test_environment.py +161 -0
  128. hud/environment/tests/test_integrations.py +257 -0
  129. hud/environment/tests/test_local_connectors.py +201 -0
  130. hud/environment/tests/test_scenarios.py +280 -0
  131. hud/environment/tests/test_tools.py +208 -0
  132. hud/environment/types.py +23 -0
  133. hud/environment/utils/__init__.py +35 -0
  134. hud/environment/utils/formats.py +215 -0
  135. hud/environment/utils/schema.py +171 -0
  136. hud/environment/utils/tool_wrappers.py +113 -0
  137. hud/eval/__init__.py +67 -0
  138. hud/eval/context.py +674 -0
  139. hud/eval/display.py +299 -0
  140. hud/eval/instrument.py +185 -0
  141. hud/eval/manager.py +466 -0
  142. hud/eval/parallel.py +268 -0
  143. hud/eval/task.py +340 -0
  144. hud/eval/tests/__init__.py +1 -0
  145. hud/eval/tests/test_context.py +178 -0
  146. hud/eval/tests/test_eval.py +210 -0
  147. hud/eval/tests/test_manager.py +152 -0
  148. hud/eval/tests/test_parallel.py +168 -0
  149. hud/eval/tests/test_task.py +145 -0
  150. hud/eval/types.py +63 -0
  151. hud/eval/utils.py +183 -0
  152. hud/patches/__init__.py +19 -0
  153. hud/patches/mcp_patches.py +151 -0
  154. hud/patches/warnings.py +54 -0
  155. hud/samples/browser.py +4 -4
  156. hud/server/__init__.py +2 -1
  157. hud/server/low_level.py +2 -1
  158. hud/server/router.py +164 -0
  159. hud/server/server.py +567 -80
  160. hud/server/tests/test_mcp_server_integration.py +11 -11
  161. hud/server/tests/test_mcp_server_more.py +1 -1
  162. hud/server/tests/test_server_extra.py +2 -0
  163. hud/settings.py +45 -3
  164. hud/shared/exceptions.py +36 -10
  165. hud/shared/hints.py +26 -1
  166. hud/shared/requests.py +15 -3
  167. hud/shared/tests/test_exceptions.py +40 -31
  168. hud/shared/tests/test_hints.py +167 -0
  169. hud/telemetry/__init__.py +20 -19
  170. hud/telemetry/exporter.py +201 -0
  171. hud/telemetry/instrument.py +158 -253
  172. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  173. hud/telemetry/tests/test_exporter.py +258 -0
  174. hud/telemetry/tests/test_instrument.py +401 -0
  175. hud/tools/__init__.py +16 -2
  176. hud/tools/apply_patch.py +639 -0
  177. hud/tools/base.py +54 -4
  178. hud/tools/bash.py +2 -2
  179. hud/tools/computer/__init__.py +4 -0
  180. hud/tools/computer/anthropic.py +2 -2
  181. hud/tools/computer/gemini.py +385 -0
  182. hud/tools/computer/hud.py +23 -6
  183. hud/tools/computer/openai.py +20 -21
  184. hud/tools/computer/qwen.py +434 -0
  185. hud/tools/computer/settings.py +37 -0
  186. hud/tools/edit.py +3 -7
  187. hud/tools/executors/base.py +4 -2
  188. hud/tools/executors/pyautogui.py +1 -1
  189. hud/tools/grounding/grounded_tool.py +13 -18
  190. hud/tools/grounding/grounder.py +10 -31
  191. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  192. hud/tools/jupyter.py +330 -0
  193. hud/tools/playwright.py +18 -3
  194. hud/tools/shell.py +308 -0
  195. hud/tools/tests/test_apply_patch.py +718 -0
  196. hud/tools/tests/test_computer.py +4 -9
  197. hud/tools/tests/test_computer_actions.py +24 -2
  198. hud/tools/tests/test_jupyter_tool.py +181 -0
  199. hud/tools/tests/test_shell.py +596 -0
  200. hud/tools/tests/test_submit.py +85 -0
  201. hud/tools/tests/test_types.py +193 -0
  202. hud/tools/types.py +21 -1
  203. hud/types.py +167 -57
  204. hud/utils/__init__.py +2 -0
  205. hud/utils/env.py +67 -0
  206. hud/utils/hud_console.py +61 -3
  207. hud/utils/mcp.py +15 -58
  208. hud/utils/strict_schema.py +162 -0
  209. hud/utils/tests/test_init.py +1 -2
  210. hud/utils/tests/test_mcp.py +1 -28
  211. hud/utils/tests/test_pretty_errors.py +186 -0
  212. hud/utils/tests/test_tool_shorthand.py +154 -0
  213. hud/utils/tests/test_version.py +1 -1
  214. hud/utils/types.py +20 -0
  215. hud/version.py +1 -1
  216. hud_python-0.5.1.dist-info/METADATA +264 -0
  217. hud_python-0.5.1.dist-info/RECORD +299 -0
  218. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
  219. hud/agents/langchain.py +0 -261
  220. hud/agents/lite_llm.py +0 -72
  221. hud/cli/rl/__init__.py +0 -180
  222. hud/cli/rl/config.py +0 -101
  223. hud/cli/rl/display.py +0 -133
  224. hud/cli/rl/gpu.py +0 -63
  225. hud/cli/rl/gpu_utils.py +0 -321
  226. hud/cli/rl/local_runner.py +0 -595
  227. hud/cli/rl/presets.py +0 -96
  228. hud/cli/rl/remote_runner.py +0 -463
  229. hud/cli/rl/rl_api.py +0 -150
  230. hud/cli/rl/vllm.py +0 -177
  231. hud/cli/rl/wait_utils.py +0 -89
  232. hud/datasets/parallel.py +0 -687
  233. hud/misc/__init__.py +0 -1
  234. hud/misc/claude_plays_pokemon.py +0 -292
  235. hud/otel/__init__.py +0 -35
  236. hud/otel/collector.py +0 -142
  237. hud/otel/config.py +0 -181
  238. hud/otel/context.py +0 -570
  239. hud/otel/exporters.py +0 -369
  240. hud/otel/instrumentation.py +0 -135
  241. hud/otel/processors.py +0 -121
  242. hud/otel/tests/__init__.py +0 -1
  243. hud/otel/tests/test_processors.py +0 -197
  244. hud/rl/README.md +0 -30
  245. hud/rl/__init__.py +0 -1
  246. hud/rl/actor.py +0 -176
  247. hud/rl/buffer.py +0 -405
  248. hud/rl/chat_template.jinja +0 -101
  249. hud/rl/config.py +0 -192
  250. hud/rl/distributed.py +0 -132
  251. hud/rl/learner.py +0 -637
  252. hud/rl/tests/__init__.py +0 -1
  253. hud/rl/tests/test_learner.py +0 -186
  254. hud/rl/train.py +0 -382
  255. hud/rl/types.py +0 -101
  256. hud/rl/utils/start_vllm_server.sh +0 -30
  257. hud/rl/utils.py +0 -524
  258. hud/rl/vllm_adapter.py +0 -143
  259. hud/telemetry/job.py +0 -352
  260. hud/telemetry/replay.py +0 -74
  261. hud/telemetry/tests/test_replay.py +0 -40
  262. hud/telemetry/tests/test_trace.py +0 -63
  263. hud/telemetry/trace.py +0 -158
  264. hud/utils/agent_factories.py +0 -86
  265. hud/utils/async_utils.py +0 -65
  266. hud/utils/group_eval.py +0 -223
  267. hud/utils/progress.py +0 -149
  268. hud/utils/tasks.py +0 -127
  269. hud/utils/tests/test_async_utils.py +0 -173
  270. hud/utils/tests/test_progress.py +0 -261
  271. hud_python-0.4.45.dist-info/METADATA +0 -552
  272. hud_python-0.4.45.dist-info/RECORD +0 -228
  273. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
  274. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
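For readers who want to reproduce a comparison like this locally, the sketch below diffs two downloaded wheels using only the Python standard library. It is a minimal illustration, not the tool that generated this page; the wheel file names are assumptions, and both files are expected to be in the working directory (fetched, for example, with pip download --no-deps hud-python==0.4.45 and pip download --no-deps hud-python==0.5.1).

import difflib
import zipfile

# Assumed local file names; adjust to wherever the wheels were downloaded.
OLD = "hud_python-0.4.45-py3-none-any.whl"
NEW = "hud_python-0.5.1-py3-none-any.whl"

with zipfile.ZipFile(OLD) as old_whl, zipfile.ZipFile(NEW) as new_whl:
    old_names, new_names = set(old_whl.namelist()), set(new_whl.namelist())

    # Added and removed files correspond to the +N -0 and +0 -N rows above.
    for name in sorted(new_names - old_names):
        print(f"added:   {name}")
    for name in sorted(old_names - new_names):
        print(f"removed: {name}")

    # For files present in both wheels, print a unified diff of the contents.
    for name in sorted(old_names & new_names):
        old_lines = old_whl.read(name).decode("utf-8", errors="replace").splitlines()
        new_lines = new_whl.read(name).decode("utf-8", errors="replace").splitlines()
        for line in difflib.unified_diff(
            old_lines, new_lines, f"a/{name}", f"b/{name}", lineterm=""
        ):
            print(line)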
--- /dev/null
+++ b/hud/agents/tests/test_operator.py
@@ -0,0 +1,362 @@
+"""Tests for OperatorAgent implementation."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, cast
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+from mcp import types
+from openai import AsyncOpenAI
+from openai.types.responses.response_computer_tool_call import PendingSafetyCheck
+
+from hud.agents.operator import OperatorAgent
+from hud.environment.router import ToolRouter
+from hud.eval.context import EvalContext
+from hud.types import MCPToolCall, MCPToolResult
+
+if TYPE_CHECKING:
+    from collections.abc import Generator
+
+
+class MockEvalContext(EvalContext):
+    """Mock EvalContext for testing."""
+
+    def __init__(self, tools: list[types.Tool] | None = None) -> None:
+        # Core attributes
+        self.prompt = "Test prompt"
+        self._tools = tools or []
+        self._submitted: str | None = None
+        self.reward: float | None = None
+
+        # Environment attributes
+        self._router = ToolRouter()
+        self._agent_include: list[str] | None = None
+        self._agent_exclude: list[str] | None = None
+
+        # EvalContext attributes
+        self._task = None
+        self.trace_id = "test-trace-id"
+        self.eval_name = "test-eval"
+        self.job_id: str | None = None
+        self.group_id: str | None = None
+        self.index = 0
+        self.variants: dict[str, Any] = {}
+        self.answer: str | None = None
+        self.system_prompt: str | None = None
+        self.error: BaseException | None = None
+        self.metadata: dict[str, Any] = {}
+        self.results: list[Any] = []
+        self._is_summary = False
+
+    def as_tools(self) -> list[types.Tool]:
+        return self._tools
+
+    @property
+    def has_scenario(self) -> bool:
+        return False
+
+    async def list_tools(self) -> list[types.Tool]:
+        return self._tools
+
+    async def call_tool(self, call: Any, /, **kwargs: Any) -> MCPToolResult:
+        return MCPToolResult(
+            content=[types.TextContent(type="text", text="ok")],
+            isError=False,
+        )
+
+    async def submit(self, answer: str) -> None:
+        self._submitted = answer
+
+
+class TestOperatorAgent:
+    """Test OperatorAgent class."""
+
+    @pytest.fixture
+    def mock_openai(self) -> Generator[AsyncOpenAI, None, None]:
+        """Create a mock OpenAI client."""
+        client = AsyncOpenAI(api_key="test", base_url="http://localhost")
+        client.responses.create = AsyncMock()
+        with patch("hud.agents.openai.AsyncOpenAI", return_value=client):
+            yield client
+
+    @pytest.fixture
+    def mock_eval_context_computer(self) -> MockEvalContext:
+        """Create a mock EvalContext with computer tool."""
+        return MockEvalContext(
+            tools=[
+                types.Tool(
+                    name="openai_computer",
+                    description="OpenAI computer use tool",
+                    inputSchema={},
+                )
+            ]
+        )
+
+    @pytest.mark.asyncio
+    async def test_init(self, mock_openai: AsyncOpenAI) -> None:
+        """Test agent initialization."""
+        agent = OperatorAgent.create(
+            model_client=mock_openai,
+            model="gpt-4",
+            validate_api_key=False,
+        )
+
+        assert agent.model_name == "Operator"
+        assert agent.config.model == "gpt-4"
+        assert agent.openai_client == mock_openai
+
+    @pytest.mark.asyncio
+    async def test_format_blocks(self, mock_openai: AsyncOpenAI) -> None:
+        """Test formatting content blocks."""
+        agent = OperatorAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
+
+        # Test with text blocks
+        blocks: list[types.ContentBlock] = [
+            types.TextContent(type="text", text="Hello, GPT!"),
+            types.TextContent(type="text", text="Another message"),
+        ]
+
+        messages = await agent.format_blocks(blocks)
+        assert len(messages) == 1
+        msg = cast("dict[str, Any]", messages[0])
+        assert msg["role"] == "user"
+        content = cast("list[dict[str, Any]]", msg["content"])
+        assert len(content) == 2
+        assert content[0] == {"type": "input_text", "text": "Hello, GPT!"}
+        assert content[1] == {"type": "input_text", "text": "Another message"}
+
+        # Test with mixed content
+        blocks = [
+            types.TextContent(type="text", text="Text content"),
+            types.ImageContent(type="image", data="base64data", mimeType="image/png"),
+        ]
+
+        messages = await agent.format_blocks(blocks)
+        assert len(messages) == 1
+        msg = cast("dict[str, Any]", messages[0])
+        assert msg["role"] == "user"
+        content = cast("list[dict[str, Any]]", msg["content"])
+        assert len(content) == 2
+        assert content[0] == {"type": "input_text", "text": "Text content"}
+        assert content[1] == {
+            "type": "input_image",
+            "image_url": "data:image/png;base64,base64data",
+            "detail": "auto",
+        }
+
+    @pytest.mark.asyncio
+    async def test_format_tool_results(self, mock_openai: AsyncOpenAI) -> None:
+        """Test formatting tool results."""
+        agent = OperatorAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
+
+        tool_calls = [
+            MCPToolCall(name="test_tool", arguments={}, id="call_123"),
+            MCPToolCall(name="screenshot", arguments={}, id="call_456"),
+        ]
+
+        tool_results = [
+            MCPToolResult(content=[types.TextContent(type="text", text="Success")], isError=False),
+            MCPToolResult(
+                content=[types.ImageContent(type="image", data="base64data", mimeType="image/png")],
+                isError=False,
+            ),
+        ]
+
+        messages = await agent.format_tool_results(tool_calls, tool_results)
+
+        # Should return both tool results as function_call_output
+        assert len(messages) == 2
+        # First result is text
+        msg0 = cast("dict[str, Any]", messages[0])
+        assert msg0["type"] == "function_call_output"
+        assert msg0["call_id"] == "call_123"
+        output0 = cast("list[dict[str, Any]]", msg0["output"])
+        assert output0[0]["type"] == "input_text"
+        assert output0[0]["text"] == "Success"
+        # Second result is image
+        msg1 = cast("dict[str, Any]", messages[1])
+        assert msg1["type"] == "function_call_output"
+        assert msg1["call_id"] == "call_456"
+        output1 = cast("list[dict[str, Any]]", msg1["output"])
+        assert output1[0]["type"] == "input_image"
+        assert output1[0]["image_url"] == "data:image/png;base64,base64data"
+
+    @pytest.mark.asyncio
+    async def test_format_tool_results_with_error(self, mock_openai: AsyncOpenAI) -> None:
+        """Test formatting tool results with errors."""
+        agent = OperatorAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
+
+        tool_calls = [
+            MCPToolCall(name="failing_tool", arguments={}, id="call_error"),
+        ]
+
+        tool_results = [
+            MCPToolResult(
+                content=[types.TextContent(type="text", text="Something went wrong")], isError=True
+            ),
+        ]
+
+        messages = await agent.format_tool_results(tool_calls, tool_results)
+
+        # Error results are returned with error flag and content
+        assert len(messages) == 1
+        msg = cast("dict[str, Any]", messages[0])
+        assert msg["type"] == "function_call_output"
+        assert msg["call_id"] == "call_error"
+        output = cast("list[dict[str, Any]]", msg["output"])
+        assert output[0]["type"] == "input_text"
+        assert output[0]["text"] == "[tool_error] true"
+        assert output[1]["type"] == "input_text"
+        assert output[1]["text"] == "Something went wrong"
+
+    @pytest.mark.asyncio
+    async def test_get_model_response(
+        self, mock_openai: AsyncOpenAI, mock_eval_context_computer: MockEvalContext
+    ) -> None:
+        """Test getting model response from OpenAI API."""
+        with patch("hud.settings.settings.telemetry_enabled", False):
+            agent = OperatorAgent.create(
+                model_client=mock_openai,
+                validate_api_key=False,
+            )
+
+            # Initialize with context
+            agent.ctx = mock_eval_context_computer
+            await agent._initialize_from_ctx(mock_eval_context_computer)
+
+            # Mock OpenAI API response for a successful computer use response
+            mock_response = MagicMock()
+            mock_response.id = "response_123"
+            mock_response.state = "completed"
+            # Mock the output message structure
+            mock_output_text = MagicMock()
+            mock_output_text.type = "output_text"
+            mock_output_text.text = "I can see the screen content."
+
+            mock_output_message = MagicMock()
+            mock_output_message.type = "message"
+            mock_output_message.content = [mock_output_text]
+
+            mock_response.output = [mock_output_message]
+
+            mock_openai.responses.create = AsyncMock(return_value=mock_response)
+
+            messages = [{"prompt": "What's on the screen?", "screenshot": None}]
+            response = await agent.get_response(messages)  # type: ignore[arg-type]
+
+            assert response.done is True
+            assert response.tool_calls == []
+
+    @pytest.mark.asyncio
+    async def test_handle_empty_response(
+        self, mock_openai: AsyncOpenAI, mock_eval_context_computer: MockEvalContext
+    ) -> None:
+        """Test handling empty response from API."""
+        agent = OperatorAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
+
+        # Initialize with context
+        agent.ctx = mock_eval_context_computer
+        await agent._initialize_from_ctx(mock_eval_context_computer)
+
+        # Mock empty response
+        mock_response = MagicMock()
+        mock_response.id = "response_empty"
+        mock_response.state = "completed"
+        mock_response.output = []  # Empty output
+
+        mock_openai.responses.create = AsyncMock(return_value=mock_response)
+
+        messages = [{"prompt": "Hi", "screenshot": None}]
+        response = await agent.get_response(messages)  # type: ignore[arg-type]
+
+        assert response.content == ""
+        assert response.tool_calls == []
+
+    @pytest.mark.asyncio
+    async def test_pending_safety_checks_initialization(self, mock_openai: AsyncOpenAI) -> None:
+        """Test that OperatorAgent initializes pending_call_id and pending_safety_checks."""
+        agent = OperatorAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
+
+        # Verify initial state
+        assert agent.pending_call_id is None
+        assert agent.pending_safety_checks == []
+
+        # Set some state
+        agent.pending_call_id = "call_id"
+        agent.pending_safety_checks = [
+            PendingSafetyCheck(id="safety_check_id", code="value", message="message")
+        ]
+
+        # Verify state was set
+        assert agent.pending_call_id == "call_id"
+        assert len(agent.pending_safety_checks) == 1
+        assert agent.pending_safety_checks[0].id == "safety_check_id"
+
+    @pytest.mark.asyncio
+    async def test_extract_tool_call_computer(self, mock_openai: AsyncOpenAI) -> None:
+        """Test that _extract_tool_call routes computer_call to openai_computer."""
+        agent = OperatorAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
+
+        # Create a mock computer_call item
+        mock_item = MagicMock()
+        mock_item.type = "computer_call"
+        mock_item.call_id = "call_123"
+        mock_item.pending_safety_checks = [
+            PendingSafetyCheck(id="check_1", code="code", message="msg")
+        ]
+        mock_item.action.to_dict.return_value = {"type": "screenshot"}
+
+        tool_call = agent._extract_tool_call(mock_item)
+
+        # Should route to openai_computer tool
+        assert tool_call is not None
+        assert tool_call.name == "openai_computer"
+        assert tool_call.id == "call_123"
+        assert tool_call.arguments == {"type": "screenshot"}
+        # Should update pending_safety_checks
+        assert agent.pending_safety_checks == mock_item.pending_safety_checks
+
+    @pytest.mark.asyncio
+    async def test_extract_tool_call_delegates_to_super(self, mock_openai: AsyncOpenAI) -> None:
+        """Test that _extract_tool_call delegates non-computer calls to parent."""
+        agent = OperatorAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
+
+        # Set up tool name map
+        agent._tool_name_map = {"test_tool": "mcp_test_tool"}
+
+        # Create a mock function_call item
+        mock_item = MagicMock()
+        mock_item.type = "function_call"
+        mock_item.call_id = "call_456"
+        mock_item.name = "test_tool"
+        mock_item.arguments = '{"arg": "value"}'
+
+        tool_call = agent._extract_tool_call(mock_item)
+
+        # Should delegate to parent and map the tool name
+        assert tool_call is not None
+        assert tool_call.name == "mcp_test_tool"
+        assert tool_call.id == "call_456"
+        assert tool_call.arguments == {"arg": "value"}
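This module (and test_run_eval.py below) marks its coroutines with @pytest.mark.asyncio, so an async plugin such as pytest-asyncio must be available. A minimal sketch for invoking just this file programmatically from a source checkout; the path comes from the file list above:

import sys

import pytest

# Run only the new OperatorAgent tests; -q keeps the report terse, and the
# pytest exit code becomes the process exit code.
sys.exit(pytest.main(["-q", "hud/agents/tests/test_operator.py"]))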
--- /dev/null
+++ b/hud/agents/tests/test_run_eval.py
@@ -0,0 +1,179 @@
+"""Tests for MCPAgent.run() with EvalContext."""
+
+from __future__ import annotations
+
+from typing import Any, ClassVar
+
+import pytest
+from mcp import types
+
+from hud.agents import MCPAgent
+from hud.agents.base import BaseCreateParams
+from hud.environment.router import ToolRouter
+from hud.eval.context import EvalContext
+from hud.types import AgentResponse, BaseAgentConfig, MCPToolCall, MCPToolResult
+
+
+class MockConfig(BaseAgentConfig):
+    model_name: str = "MockAgent"
+    model: str = "mock-model"
+
+
+class MockCreateParams(BaseCreateParams, MockConfig):
+    pass
+
+
+class MockMCPAgent(MCPAgent):
+    """Mock agent for testing run()."""
+
+    metadata: ClassVar[dict[str, Any] | None] = {}
+    config_cls: ClassVar[type[BaseAgentConfig]] = MockConfig
+
+    def __init__(self, **kwargs: Any) -> None:
+        params = MockCreateParams(**kwargs)
+        super().__init__(params)
+        self._response = AgentResponse(content="Test response", tool_calls=[], done=True)
+
+    def set_response(self, response: AgentResponse) -> None:
+        self._response = response
+
+    async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
+        return self._response
+
+    async def format_tool_results(
+        self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
+    ) -> list[dict[str, Any]]:
+        return [{"role": "tool", "content": str(r)} for r in tool_results]
+
+    async def get_system_messages(self) -> list[Any]:
+        return []
+
+    async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
+        return [{"type": "text", "text": getattr(b, "text")} for b in blocks if hasattr(b, "text")]
+
+
+class MockEvalContext(EvalContext):
+    """Mock EvalContext for testing - inherits from real EvalContext."""
+
+    def __init__(self, prompt: str = "Test prompt", tools: list[types.Tool] | None = None) -> None:
+        # Core attributes
+        self.prompt = prompt
+        self._tools = tools or [types.Tool(name="test_tool", description="Test", inputSchema={})]
+        self._submitted: str | None = None
+        self.reward: float | None = None
+        self._initialized = True
+
+        # Environment attributes
+        self._router = ToolRouter()
+        self._agent_include: list[str] | None = None
+        self._agent_exclude: list[str] | None = None
+
+        # EvalContext attributes
+        self._task = None
+        self.trace_id = "test-trace-id"
+        self.eval_name = "test-eval"
+        self.job_id: str | None = None
+        self.group_id: str | None = None
+        self.index = 0
+        self.variants: dict[str, Any] = {}
+        self.answer: str | None = None
+        self.system_prompt: str | None = None
+        self.error: BaseException | None = None
+        self.metadata: dict[str, Any] = {}
+        self.results: list[Any] = []
+        self._is_summary = False
+
+    def as_tools(self) -> list[types.Tool]:
+        return self._tools
+
+    @property
+    def has_scenario(self) -> bool:
+        return True
+
+    async def list_tools(self) -> list[types.Tool]:
+        return self._tools
+
+    async def call_tool(self, call: Any, /, **kwargs: Any) -> MCPToolResult:
+        # Handle tuple format (name, args)
+        if isinstance(call, tuple):
+            name = call[0]
+        elif hasattr(call, "name"):
+            name = call.name
+        else:
+            name = str(call)
+        return MCPToolResult(
+            content=[types.TextContent(type="text", text=f"Result from {name}")],
+            isError=False,
+        )
+
+    async def submit(self, answer: str) -> None:
+        self._submitted = answer
+
+
+class TestRun:
+    """Tests for MCPAgent.run() with EvalContext."""
+
+    @pytest.mark.asyncio
+    async def test_run_basic(self) -> None:
+        """Test basic run() flow."""
+        ctx = MockEvalContext(prompt="Do the task")
+        agent = MockMCPAgent()
+
+        result = await agent.run(ctx)
+
+        assert result.done
+        assert result.content == "Test response"
+        assert ctx._submitted == "Test response"
+
+    @pytest.mark.asyncio
+    async def test_run_no_prompt_raises(self) -> None:
+        """Test run() raises when prompt is not set."""
+        ctx = MockEvalContext(prompt="")
+        agent = MockMCPAgent()
+
+        with pytest.raises(ValueError, match="prompt is not set"):
+            await agent.run(ctx)
+
+    @pytest.mark.asyncio
+    async def test_run_wrong_type_raises(self) -> None:
+        """Test run() raises TypeError for non-EvalContext."""
+        agent = MockMCPAgent()
+
+        with pytest.raises(TypeError, match="must be EvalContext"):
+            await agent.run("not an eval context")  # type: ignore[arg-type]
+
+    @pytest.mark.asyncio
+    async def test_run_clears_ctx(self) -> None:
+        """Test run() clears ctx after completion."""
+        ctx = MockEvalContext(prompt="Do the task")
+        agent = MockMCPAgent()
+
+        await agent.run(ctx)
+        assert agent.ctx is None
+
+    @pytest.mark.asyncio
+    async def test_run_no_submit_on_empty_content(self) -> None:
+        """Test run() doesn't submit when content is empty."""
+        ctx = MockEvalContext(prompt="Do the task")
+        agent = MockMCPAgent()
+        agent.set_response(AgentResponse(content="", tool_calls=[], done=True))
+
+        await agent.run(ctx)
+        assert ctx._submitted is None
+
+    @pytest.mark.asyncio
+    async def test_run_initializes_tools(self) -> None:
+        """Test run() initializes tools from context."""
+        ctx = MockEvalContext(
+            prompt="Do the task",
+            tools=[
+                types.Tool(name="tool1", description="Tool 1", inputSchema={}),
+                types.Tool(name="tool2", description="Tool 2", inputSchema={}),
+            ],
+        )
+        agent = MockMCPAgent()
+
+        await agent.run(ctx)
+
+        assert agent._initialized
+        # After cleanup, ctx is None but tools were discovered
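Taken together, the assertions in TestRun outline the run() contract introduced in 0.5.1. The condensed sketch below is an inference from those assertions, not the hud.agents.base implementation; run_contract is a hypothetical name, and the real loop also handles message formatting, tool calls, and telemetry.

from hud.eval.context import EvalContext

async def run_contract(agent, ctx):
    # Reject anything that is not an EvalContext (test_run_wrong_type_raises).
    if not isinstance(ctx, EvalContext):
        raise TypeError("ctx must be EvalContext")
    # A prompt is required before the loop starts (test_run_no_prompt_raises).
    if not ctx.prompt:
        raise ValueError("prompt is not set")
    agent.ctx = ctx
    # Tool discovery happens via the context (test_run_initializes_tools).
    await ctx.list_tools()
    result = await agent.get_response([{"role": "user", "content": ctx.prompt}])
    # Only non-empty content is submitted (test_run_no_submit_on_empty_content).
    if result.content:
        await ctx.submit(result.content)
    # The context reference is cleared on completion (test_run_clears_ctx).
    agent.ctx = None
    return result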