hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (282)
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +70 -5
  3. hud/agents/base.py +238 -500
  4. hud/agents/claude.py +236 -247
  5. hud/agents/gateway.py +42 -0
  6. hud/agents/gemini.py +264 -0
  7. hud/agents/gemini_cua.py +324 -0
  8. hud/agents/grounded_openai.py +98 -100
  9. hud/agents/misc/integration_test_agent.py +51 -20
  10. hud/agents/misc/response_agent.py +48 -36
  11. hud/agents/openai.py +282 -296
  12. hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
  13. hud/agents/operator.py +199 -0
  14. hud/agents/resolver.py +70 -0
  15. hud/agents/tests/conftest.py +133 -0
  16. hud/agents/tests/test_base.py +300 -622
  17. hud/agents/tests/test_base_runtime.py +233 -0
  18. hud/agents/tests/test_claude.py +381 -214
  19. hud/agents/tests/test_client.py +9 -10
  20. hud/agents/tests/test_gemini.py +369 -0
  21. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  22. hud/agents/tests/test_openai.py +377 -140
  23. hud/agents/tests/test_operator.py +362 -0
  24. hud/agents/tests/test_resolver.py +192 -0
  25. hud/agents/tests/test_run_eval.py +179 -0
  26. hud/agents/types.py +148 -0
  27. hud/cli/__init__.py +493 -546
  28. hud/cli/analyze.py +43 -5
  29. hud/cli/build.py +699 -113
  30. hud/cli/debug.py +8 -5
  31. hud/cli/dev.py +889 -732
  32. hud/cli/eval.py +793 -667
  33. hud/cli/flows/dev.py +167 -0
  34. hud/cli/flows/init.py +191 -0
  35. hud/cli/flows/tasks.py +153 -56
  36. hud/cli/flows/templates.py +151 -0
  37. hud/cli/flows/tests/__init__.py +1 -0
  38. hud/cli/flows/tests/test_dev.py +126 -0
  39. hud/cli/init.py +60 -58
  40. hud/cli/pull.py +1 -1
  41. hud/cli/push.py +38 -13
  42. hud/cli/rft.py +311 -0
  43. hud/cli/rft_status.py +145 -0
  44. hud/cli/tests/test_analyze.py +5 -5
  45. hud/cli/tests/test_analyze_metadata.py +3 -2
  46. hud/cli/tests/test_analyze_module.py +120 -0
  47. hud/cli/tests/test_build.py +110 -8
  48. hud/cli/tests/test_build_failure.py +41 -0
  49. hud/cli/tests/test_build_module.py +50 -0
  50. hud/cli/tests/test_cli_init.py +6 -1
  51. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  52. hud/cli/tests/test_cli_root.py +140 -0
  53. hud/cli/tests/test_convert.py +361 -0
  54. hud/cli/tests/test_debug.py +12 -10
  55. hud/cli/tests/test_dev.py +197 -0
  56. hud/cli/tests/test_eval.py +251 -0
  57. hud/cli/tests/test_eval_bedrock.py +51 -0
  58. hud/cli/tests/test_init.py +124 -0
  59. hud/cli/tests/test_main_module.py +11 -5
  60. hud/cli/tests/test_mcp_server.py +12 -100
  61. hud/cli/tests/test_push.py +1 -1
  62. hud/cli/tests/test_push_happy.py +74 -0
  63. hud/cli/tests/test_push_wrapper.py +23 -0
  64. hud/cli/tests/test_registry.py +1 -1
  65. hud/cli/tests/test_utils.py +1 -1
  66. hud/cli/{rl → utils}/celebrate.py +14 -12
  67. hud/cli/utils/config.py +18 -1
  68. hud/cli/utils/docker.py +130 -4
  69. hud/cli/utils/env_check.py +9 -9
  70. hud/cli/utils/git.py +136 -0
  71. hud/cli/utils/interactive.py +39 -5
  72. hud/cli/utils/metadata.py +70 -1
  73. hud/cli/utils/runner.py +1 -1
  74. hud/cli/utils/server.py +2 -2
  75. hud/cli/utils/source_hash.py +3 -3
  76. hud/cli/utils/tasks.py +4 -1
  77. hud/cli/utils/tests/__init__.py +0 -0
  78. hud/cli/utils/tests/test_config.py +58 -0
  79. hud/cli/utils/tests/test_docker.py +93 -0
  80. hud/cli/utils/tests/test_docker_hints.py +71 -0
  81. hud/cli/utils/tests/test_env_check.py +74 -0
  82. hud/cli/utils/tests/test_environment.py +42 -0
  83. hud/cli/utils/tests/test_git.py +142 -0
  84. hud/cli/utils/tests/test_interactive_module.py +60 -0
  85. hud/cli/utils/tests/test_local_runner.py +50 -0
  86. hud/cli/utils/tests/test_logging_utils.py +23 -0
  87. hud/cli/utils/tests/test_metadata.py +49 -0
  88. hud/cli/utils/tests/test_package_runner.py +35 -0
  89. hud/cli/utils/tests/test_registry_utils.py +49 -0
  90. hud/cli/utils/tests/test_remote_runner.py +25 -0
  91. hud/cli/utils/tests/test_runner_modules.py +52 -0
  92. hud/cli/utils/tests/test_source_hash.py +36 -0
  93. hud/cli/utils/tests/test_tasks.py +80 -0
  94. hud/cli/utils/version_check.py +258 -0
  95. hud/cli/{rl → utils}/viewer.py +2 -2
  96. hud/clients/README.md +12 -11
  97. hud/clients/__init__.py +4 -3
  98. hud/clients/base.py +166 -26
  99. hud/clients/environment.py +51 -0
  100. hud/clients/fastmcp.py +13 -6
  101. hud/clients/mcp_use.py +45 -15
  102. hud/clients/tests/test_analyze_scenarios.py +206 -0
  103. hud/clients/tests/test_protocol.py +9 -3
  104. hud/datasets/__init__.py +23 -20
  105. hud/datasets/loader.py +326 -0
  106. hud/datasets/runner.py +198 -105
  107. hud/datasets/tests/__init__.py +0 -0
  108. hud/datasets/tests/test_loader.py +221 -0
  109. hud/datasets/tests/test_utils.py +315 -0
  110. hud/datasets/utils.py +270 -90
  111. hud/environment/__init__.py +52 -0
  112. hud/environment/connection.py +258 -0
  113. hud/environment/connectors/__init__.py +33 -0
  114. hud/environment/connectors/base.py +68 -0
  115. hud/environment/connectors/local.py +177 -0
  116. hud/environment/connectors/mcp_config.py +137 -0
  117. hud/environment/connectors/openai.py +101 -0
  118. hud/environment/connectors/remote.py +172 -0
  119. hud/environment/environment.py +835 -0
  120. hud/environment/integrations/__init__.py +45 -0
  121. hud/environment/integrations/adk.py +67 -0
  122. hud/environment/integrations/anthropic.py +196 -0
  123. hud/environment/integrations/gemini.py +92 -0
  124. hud/environment/integrations/langchain.py +82 -0
  125. hud/environment/integrations/llamaindex.py +68 -0
  126. hud/environment/integrations/openai.py +238 -0
  127. hud/environment/mock.py +306 -0
  128. hud/environment/router.py +263 -0
  129. hud/environment/scenarios.py +620 -0
  130. hud/environment/tests/__init__.py +1 -0
  131. hud/environment/tests/test_connection.py +317 -0
  132. hud/environment/tests/test_connectors.py +205 -0
  133. hud/environment/tests/test_environment.py +593 -0
  134. hud/environment/tests/test_integrations.py +257 -0
  135. hud/environment/tests/test_local_connectors.py +242 -0
  136. hud/environment/tests/test_scenarios.py +1086 -0
  137. hud/environment/tests/test_tools.py +208 -0
  138. hud/environment/types.py +23 -0
  139. hud/environment/utils/__init__.py +35 -0
  140. hud/environment/utils/formats.py +215 -0
  141. hud/environment/utils/schema.py +171 -0
  142. hud/environment/utils/tool_wrappers.py +113 -0
  143. hud/eval/__init__.py +67 -0
  144. hud/eval/context.py +727 -0
  145. hud/eval/display.py +299 -0
  146. hud/eval/instrument.py +187 -0
  147. hud/eval/manager.py +533 -0
  148. hud/eval/parallel.py +268 -0
  149. hud/eval/task.py +372 -0
  150. hud/eval/tests/__init__.py +1 -0
  151. hud/eval/tests/test_context.py +178 -0
  152. hud/eval/tests/test_eval.py +210 -0
  153. hud/eval/tests/test_manager.py +152 -0
  154. hud/eval/tests/test_parallel.py +168 -0
  155. hud/eval/tests/test_task.py +291 -0
  156. hud/eval/types.py +65 -0
  157. hud/eval/utils.py +194 -0
  158. hud/patches/__init__.py +19 -0
  159. hud/patches/mcp_patches.py +308 -0
  160. hud/patches/warnings.py +54 -0
  161. hud/samples/browser.py +4 -4
  162. hud/server/__init__.py +2 -1
  163. hud/server/low_level.py +2 -1
  164. hud/server/router.py +164 -0
  165. hud/server/server.py +567 -80
  166. hud/server/tests/test_mcp_server_integration.py +11 -11
  167. hud/server/tests/test_mcp_server_more.py +1 -1
  168. hud/server/tests/test_server_extra.py +2 -0
  169. hud/settings.py +45 -3
  170. hud/shared/exceptions.py +36 -10
  171. hud/shared/hints.py +26 -1
  172. hud/shared/requests.py +15 -3
  173. hud/shared/tests/test_exceptions.py +40 -31
  174. hud/shared/tests/test_hints.py +167 -0
  175. hud/telemetry/__init__.py +20 -19
  176. hud/telemetry/exporter.py +201 -0
  177. hud/telemetry/instrument.py +165 -253
  178. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  179. hud/telemetry/tests/test_exporter.py +258 -0
  180. hud/telemetry/tests/test_instrument.py +401 -0
  181. hud/tools/__init__.py +18 -2
  182. hud/tools/agent.py +223 -0
  183. hud/tools/apply_patch.py +639 -0
  184. hud/tools/base.py +54 -4
  185. hud/tools/bash.py +2 -2
  186. hud/tools/computer/__init__.py +36 -3
  187. hud/tools/computer/anthropic.py +2 -2
  188. hud/tools/computer/gemini.py +385 -0
  189. hud/tools/computer/hud.py +23 -6
  190. hud/tools/computer/openai.py +20 -21
  191. hud/tools/computer/qwen.py +434 -0
  192. hud/tools/computer/settings.py +37 -0
  193. hud/tools/edit.py +3 -7
  194. hud/tools/executors/base.py +4 -2
  195. hud/tools/executors/pyautogui.py +1 -1
  196. hud/tools/grounding/grounded_tool.py +13 -18
  197. hud/tools/grounding/grounder.py +10 -31
  198. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  199. hud/tools/jupyter.py +330 -0
  200. hud/tools/playwright.py +18 -3
  201. hud/tools/shell.py +308 -0
  202. hud/tools/tests/test_agent_tool.py +355 -0
  203. hud/tools/tests/test_apply_patch.py +718 -0
  204. hud/tools/tests/test_computer.py +4 -9
  205. hud/tools/tests/test_computer_actions.py +24 -2
  206. hud/tools/tests/test_jupyter_tool.py +181 -0
  207. hud/tools/tests/test_shell.py +596 -0
  208. hud/tools/tests/test_submit.py +85 -0
  209. hud/tools/tests/test_types.py +193 -0
  210. hud/tools/types.py +21 -1
  211. hud/types.py +194 -56
  212. hud/utils/__init__.py +2 -0
  213. hud/utils/env.py +67 -0
  214. hud/utils/hud_console.py +89 -18
  215. hud/utils/mcp.py +15 -58
  216. hud/utils/strict_schema.py +162 -0
  217. hud/utils/tests/test_init.py +1 -2
  218. hud/utils/tests/test_mcp.py +1 -28
  219. hud/utils/tests/test_pretty_errors.py +186 -0
  220. hud/utils/tests/test_tool_shorthand.py +154 -0
  221. hud/utils/tests/test_version.py +1 -1
  222. hud/utils/types.py +20 -0
  223. hud/version.py +1 -1
  224. hud_python-0.5.13.dist-info/METADATA +264 -0
  225. hud_python-0.5.13.dist-info/RECORD +305 -0
  226. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
  227. hud/agents/langchain.py +0 -261
  228. hud/agents/lite_llm.py +0 -72
  229. hud/cli/rl/__init__.py +0 -180
  230. hud/cli/rl/config.py +0 -101
  231. hud/cli/rl/display.py +0 -133
  232. hud/cli/rl/gpu.py +0 -63
  233. hud/cli/rl/gpu_utils.py +0 -321
  234. hud/cli/rl/local_runner.py +0 -595
  235. hud/cli/rl/presets.py +0 -96
  236. hud/cli/rl/remote_runner.py +0 -463
  237. hud/cli/rl/rl_api.py +0 -150
  238. hud/cli/rl/vllm.py +0 -177
  239. hud/cli/rl/wait_utils.py +0 -89
  240. hud/datasets/parallel.py +0 -687
  241. hud/misc/__init__.py +0 -1
  242. hud/misc/claude_plays_pokemon.py +0 -292
  243. hud/otel/__init__.py +0 -35
  244. hud/otel/collector.py +0 -142
  245. hud/otel/config.py +0 -181
  246. hud/otel/context.py +0 -570
  247. hud/otel/exporters.py +0 -369
  248. hud/otel/instrumentation.py +0 -135
  249. hud/otel/processors.py +0 -121
  250. hud/otel/tests/__init__.py +0 -1
  251. hud/otel/tests/test_processors.py +0 -197
  252. hud/rl/README.md +0 -30
  253. hud/rl/__init__.py +0 -1
  254. hud/rl/actor.py +0 -176
  255. hud/rl/buffer.py +0 -405
  256. hud/rl/chat_template.jinja +0 -101
  257. hud/rl/config.py +0 -192
  258. hud/rl/distributed.py +0 -132
  259. hud/rl/learner.py +0 -637
  260. hud/rl/tests/__init__.py +0 -1
  261. hud/rl/tests/test_learner.py +0 -186
  262. hud/rl/train.py +0 -382
  263. hud/rl/types.py +0 -101
  264. hud/rl/utils/start_vllm_server.sh +0 -30
  265. hud/rl/utils.py +0 -524
  266. hud/rl/vllm_adapter.py +0 -143
  267. hud/telemetry/job.py +0 -352
  268. hud/telemetry/replay.py +0 -74
  269. hud/telemetry/tests/test_replay.py +0 -40
  270. hud/telemetry/tests/test_trace.py +0 -63
  271. hud/telemetry/trace.py +0 -158
  272. hud/utils/agent_factories.py +0 -86
  273. hud/utils/async_utils.py +0 -65
  274. hud/utils/group_eval.py +0 -223
  275. hud/utils/progress.py +0 -149
  276. hud/utils/tasks.py +0 -127
  277. hud/utils/tests/test_async_utils.py +0 -173
  278. hud/utils/tests/test_progress.py +0 -261
  279. hud_python-0.4.45.dist-info/METADATA +0 -552
  280. hud_python-0.4.45.dist-info/RECORD +0 -228
  281. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
  282. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,362 @@
1
+ """Tests for OperatorAgent implementation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING, Any, cast
6
+ from unittest.mock import AsyncMock, MagicMock, patch
7
+
8
+ import pytest
9
+ from mcp import types
10
+ from openai import AsyncOpenAI
11
+ from openai.types.responses.response_computer_tool_call import PendingSafetyCheck
12
+
13
+ from hud.agents.operator import OperatorAgent
14
+ from hud.environment.router import ToolRouter
15
+ from hud.eval.context import EvalContext
16
+ from hud.types import MCPToolCall, MCPToolResult
17
+
18
+ if TYPE_CHECKING:
19
+ from collections.abc import Generator
20
+
21
+
22
class MockEvalContext(EvalContext):
    """Minimal EvalContext double used by the OperatorAgent tests.

    The constructor assigns every attribute directly and does not call
    ``EvalContext.__init__``. Tool listing returns the tools supplied at
    construction, tool calls always succeed with the text ``"ok"``, and
    submitted answers are recorded on ``_submitted``.
    """

    def __init__(self, tools: list[types.Tool] | None = None) -> None:
        """Populate the context state by hand.

        Args:
            tools: Tools exposed through ``as_tools`` and ``list_tools``.
                ``None`` or an empty list results in an empty tool list.
        """
        # --- core state ---
        self.prompt = "Test prompt"
        self._tools = tools if tools else []
        self._submitted: str | None = None
        self.reward: float | None = None

        # --- environment state ---
        self._router = ToolRouter()
        self._agent_include: list[str] | None = None
        self._agent_exclude: list[str] | None = None

        # --- eval bookkeeping ---
        self._task = None
        self.trace_id = "test-trace-id"
        self.eval_name = "test-eval"
        self.job_id: str | None = None
        self.group_id: str | None = None
        self.index = 0
        self.variants: dict[str, Any] = {}
        self.answer: str | None = None
        self.system_prompt: str | None = None
        self.error: BaseException | None = None
        self.metadata: dict[str, Any] = {}
        self.results: list[Any] = []
        self._is_summary = False

    def as_tools(self) -> list[types.Tool]:
        """Return the tools given at construction time."""
        return self._tools

    @property
    def has_scenario(self) -> bool:
        """Always ``False``: this double never carries a scenario."""
        return False

    async def list_tools(self) -> list[types.Tool]:
        """Return the tools given at construction time (async variant)."""
        return self._tools

    async def call_tool(self, call: Any, /, **kwargs: Any) -> MCPToolResult:
        """Ignore the call and return a successful result containing ``"ok"``."""
        ok_text = types.TextContent(type="text", text="ok")
        return MCPToolResult(content=[ok_text], isError=False)

    async def submit(self, answer: str) -> None:
        """Remember the submitted answer on ``_submitted``."""
        self._submitted = answer
70
+
71
+
72
class TestOperatorAgent:
    """Unit tests for ``OperatorAgent``."""

    @staticmethod
    def _make_agent(client: AsyncOpenAI, **overrides: Any) -> OperatorAgent:
        """Create an ``OperatorAgent`` on ``client`` with API-key validation off."""
        return OperatorAgent.create(model_client=client, validate_api_key=False, **overrides)

    @pytest.fixture
    def mock_openai(self) -> Generator[AsyncOpenAI, None, None]:
        """Yield an AsyncOpenAI client whose ``responses.create`` is an AsyncMock.

        While the fixture is active, ``hud.agents.openai.AsyncOpenAI`` is
        patched to return this same client.
        """
        stub_client = AsyncOpenAI(api_key="test", base_url="http://localhost")
        stub_client.responses.create = AsyncMock()
        constructor_patch = patch("hud.agents.openai.AsyncOpenAI", return_value=stub_client)
        with constructor_patch:
            yield stub_client

    @pytest.fixture
    def mock_eval_context_computer(self) -> MockEvalContext:
        """Provide a MockEvalContext exposing a single ``openai_computer`` tool."""
        computer_tool = types.Tool(
            name="openai_computer",
            description="OpenAI computer use tool",
            inputSchema={},
        )
        return MockEvalContext(tools=[computer_tool])

    @pytest.mark.asyncio
    async def test_init(self, mock_openai: AsyncOpenAI) -> None:
        """A freshly created agent reports the expected name, model and client."""
        agent = self._make_agent(mock_openai, model="gpt-4")

        assert agent.model_name == "Operator"
        assert agent.config.model == "gpt-4"
        assert agent.openai_client == mock_openai

    @pytest.mark.asyncio
    async def test_format_blocks(self, mock_openai: AsyncOpenAI) -> None:
        """Content blocks are folded into one user message with typed parts."""
        agent = self._make_agent(mock_openai)

        # Case 1: text-only blocks.
        text_blocks: list[types.ContentBlock] = [
            types.TextContent(type="text", text="Hello, GPT!"),
            types.TextContent(type="text", text="Another message"),
        ]

        formatted = await agent.format_blocks(text_blocks)
        assert len(formatted) == 1
        user_msg = cast("dict[str, Any]", formatted[0])
        assert user_msg["role"] == "user"
        parts = cast("list[dict[str, Any]]", user_msg["content"])
        assert len(parts) == 2
        assert parts[0] == {"type": "input_text", "text": "Hello, GPT!"}
        assert parts[1] == {"type": "input_text", "text": "Another message"}

        # Case 2: text followed by an image.
        mixed_blocks: list[types.ContentBlock] = [
            types.TextContent(type="text", text="Text content"),
            types.ImageContent(type="image", data="base64data", mimeType="image/png"),
        ]

        formatted = await agent.format_blocks(mixed_blocks)
        assert len(formatted) == 1
        user_msg = cast("dict[str, Any]", formatted[0])
        assert user_msg["role"] == "user"
        parts = cast("list[dict[str, Any]]", user_msg["content"])
        assert len(parts) == 2
        assert parts[0] == {"type": "input_text", "text": "Text content"}
        expected_image_part = {
            "type": "input_image",
            "image_url": "data:image/png;base64,base64data",
            "detail": "auto",
        }
        assert parts[1] == expected_image_part

    @pytest.mark.asyncio
    async def test_format_tool_results(self, mock_openai: AsyncOpenAI) -> None:
        """Each tool result becomes its own ``function_call_output`` message."""
        agent = self._make_agent(mock_openai)

        calls = [
            MCPToolCall(name="test_tool", arguments={}, id="call_123"),
            MCPToolCall(name="screenshot", arguments={}, id="call_456"),
        ]
        text_result = MCPToolResult(
            content=[types.TextContent(type="text", text="Success")], isError=False
        )
        image_result = MCPToolResult(
            content=[types.ImageContent(type="image", data="base64data", mimeType="image/png")],
            isError=False,
        )

        formatted = await agent.format_tool_results(calls, [text_result, image_result])

        # One output message per tool call.
        assert len(formatted) == 2

        # The first message carries the text result.
        first = cast("dict[str, Any]", formatted[0])
        assert first["type"] == "function_call_output"
        assert first["call_id"] == "call_123"
        first_output = cast("list[dict[str, Any]]", first["output"])
        assert first_output[0]["type"] == "input_text"
        assert first_output[0]["text"] == "Success"

        # The second message carries the image result.
        second = cast("dict[str, Any]", formatted[1])
        assert second["type"] == "function_call_output"
        assert second["call_id"] == "call_456"
        second_output = cast("list[dict[str, Any]]", second["output"])
        assert second_output[0]["type"] == "input_image"
        assert second_output[0]["image_url"] == "data:image/png;base64,base64data"

    @pytest.mark.asyncio
    async def test_format_tool_results_with_error(self, mock_openai: AsyncOpenAI) -> None:
        """An errored tool result is emitted as an error marker plus its text."""
        agent = self._make_agent(mock_openai)

        calls = [MCPToolCall(name="failing_tool", arguments={}, id="call_error")]
        failures = [
            MCPToolResult(
                content=[types.TextContent(type="text", text="Something went wrong")],
                isError=True,
            )
        ]

        formatted = await agent.format_tool_results(calls, failures)

        # The error marker comes first, followed by the original content.
        assert len(formatted) == 1
        error_msg = cast("dict[str, Any]", formatted[0])
        assert error_msg["type"] == "function_call_output"
        assert error_msg["call_id"] == "call_error"
        error_output = cast("list[dict[str, Any]]", error_msg["output"])
        assert error_output[0]["type"] == "input_text"
        assert error_output[0]["text"] == "[tool_error] true"
        assert error_output[1]["type"] == "input_text"
        assert error_output[1]["text"] == "Something went wrong"

    @pytest.mark.asyncio
    async def test_get_model_response(
        self, mock_openai: AsyncOpenAI, mock_eval_context_computer: MockEvalContext
    ) -> None:
        """A text-only API response yields a finished response with no tool calls."""
        # NOTE(review): the extent of this patch was reconstructed from a listing
        # with no indentation; the whole test body is kept under it. Confirm
        # against the upstream file.
        with patch("hud.settings.settings.telemetry_enabled", False):
            agent = self._make_agent(mock_openai)

            # Bind the agent to the evaluation context.
            agent.ctx = mock_eval_context_computer
            await agent._initialize_from_ctx(mock_eval_context_computer)

            # Build the fake API response from the inside out:
            # text part -> output message -> response.
            text_part = MagicMock(type="output_text", text="I can see the screen content.")
            output_message = MagicMock(type="message", content=[text_part])
            api_response = MagicMock(
                id="response_123",
                state="completed",
                output=[output_message],
            )

            mock_openai.responses.create = AsyncMock(return_value=api_response)

            messages = [{"prompt": "What's on the screen?", "screenshot": None}]
            response = await agent.get_response(messages)  # type: ignore[arg-type]

            assert response.done is True
            assert response.tool_calls == []

    @pytest.mark.asyncio
    async def test_handle_empty_response(
        self, mock_openai: AsyncOpenAI, mock_eval_context_computer: MockEvalContext
    ) -> None:
        """An API response with no output items yields empty content and no tool calls."""
        agent = self._make_agent(mock_openai)

        # Bind the agent to the evaluation context.
        agent.ctx = mock_eval_context_computer
        await agent._initialize_from_ctx(mock_eval_context_computer)

        # Response whose ``output`` list is empty.
        empty_response = MagicMock(id="response_empty", state="completed", output=[])
        mock_openai.responses.create = AsyncMock(return_value=empty_response)

        messages = [{"prompt": "Hi", "screenshot": None}]
        response = await agent.get_response(messages)  # type: ignore[arg-type]

        assert response.content == ""
        assert response.tool_calls == []

    @pytest.mark.asyncio
    async def test_pending_safety_checks_initialization(self, mock_openai: AsyncOpenAI) -> None:
        """``pending_call_id`` and ``pending_safety_checks`` start empty and are settable."""
        agent = self._make_agent(mock_openai)

        # Defaults right after creation.
        assert agent.pending_call_id is None
        assert agent.pending_safety_checks == []

        # Assign new values.
        agent.pending_call_id = "call_id"
        agent.pending_safety_checks = [
            PendingSafetyCheck(id="safety_check_id", code="value", message="message")
        ]

        # The assigned values are readable back.
        assert agent.pending_call_id == "call_id"
        assert len(agent.pending_safety_checks) == 1
        assert agent.pending_safety_checks[0].id == "safety_check_id"

    @pytest.mark.asyncio
    async def test_extract_tool_call_computer(self, mock_openai: AsyncOpenAI) -> None:
        """A ``computer_call`` item is mapped onto the ``openai_computer`` tool."""
        agent = self._make_agent(mock_openai)

        # Fake ``computer_call`` output item carrying one safety check.
        safety_checks = [PendingSafetyCheck(id="check_1", code="code", message="msg")]
        computer_item = MagicMock(
            type="computer_call",
            call_id="call_123",
            pending_safety_checks=safety_checks,
        )
        computer_item.action.to_dict.return_value = {"type": "screenshot"}

        extracted = agent._extract_tool_call(computer_item)

        # The call targets the openai_computer tool with the action as arguments.
        assert extracted is not None
        assert extracted.name == "openai_computer"
        assert extracted.id == "call_123"
        assert extracted.arguments == {"type": "screenshot"}
        # The item's safety checks are copied onto the agent.
        assert agent.pending_safety_checks == safety_checks

    @pytest.mark.asyncio
    async def test_extract_tool_call_delegates_to_super(self, mock_openai: AsyncOpenAI) -> None:
        """A ``function_call`` item is handled by the parent class and name-mapped."""
        agent = self._make_agent(mock_openai)

        # Map the model-facing tool name to the MCP tool name.
        agent._tool_name_map = {"test_tool": "mcp_test_tool"}

        # Fake ``function_call`` output item. ``name`` is assigned after construction
        # because ``MagicMock(name=...)`` names the mock instead of setting the attribute.
        function_item = MagicMock(
            type="function_call",
            call_id="call_456",
            arguments='{"arg": "value"}',
        )
        function_item.name = "test_tool"

        extracted = agent._extract_tool_call(function_item)

        # The mapped name and the parsed JSON arguments come back.
        assert extracted is not None
        assert extracted.name == "mcp_test_tool"
        assert extracted.id == "call_456"
        assert extracted.arguments == {"arg": "value"}
@@ -0,0 +1,192 @@
1
+ """Tests for model resolution and create_agent."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from unittest.mock import MagicMock, patch
6
+
7
+ import pytest
8
+
9
+ from hud.agents import create_agent
10
+ from hud.agents.resolver import resolve_cls
11
+
12
+
13
class TestResolveCls:
    """Checks for ``resolve_cls``: agent-type shortcuts and gateway lookups."""

    def test_resolves_known_agent_type(self) -> None:
        """The ``"claude"`` shortcut resolves to ClaudeAgent with no gateway info."""
        from hud.agents.claude import ClaudeAgent

        resolved, gateway_info = resolve_cls("claude")
        assert resolved == ClaudeAgent
        assert gateway_info is None

    def test_resolves_openai(self) -> None:
        """The ``"openai"`` shortcut resolves to OpenAIAgent."""
        from hud.agents import OpenAIAgent

        resolved, _gateway_info = resolve_cls("openai")
        assert resolved == OpenAIAgent

    def test_resolves_gemini(self) -> None:
        """The ``"gemini"`` shortcut resolves to GeminiAgent."""
        from hud.agents.gemini import GeminiAgent

        resolved, _gateway_info = resolve_cls("gemini")
        assert resolved == GeminiAgent

    def test_unknown_model_without_gateway_raises(self) -> None:
        """An unknown model raises ValueError when the gateway lists no models."""
        empty_gateway = patch("hud.agents.resolver._fetch_gateway_models", return_value=[])
        with empty_gateway, pytest.raises(ValueError, match="not found"):
            resolve_cls("unknown-model-xyz")

    def test_resolves_gateway_model(self) -> None:
        """A model listed by the gateway resolves and returns its gateway entry."""
        from hud.agents import OpenAIAgent

        gateway_listing = [
            {"id": "gpt-4o", "model": "gpt-4o", "provider": "openai"},
        ]

        with patch("hud.agents.resolver._fetch_gateway_models", return_value=gateway_listing):
            resolved, info = resolve_cls("gpt-4o")
            assert resolved == OpenAIAgent
            assert info is not None
            assert info["id"] == "gpt-4o"

    def test_resolves_anthropic_provider_to_claude(self) -> None:
        """A gateway model whose provider is ``anthropic`` resolves to ClaudeAgent."""
        from hud.agents.claude import ClaudeAgent

        gateway_listing = [
            {"id": "claude-sonnet", "model": "claude-3-sonnet", "provider": "anthropic"},
        ]

        with patch("hud.agents.resolver._fetch_gateway_models", return_value=gateway_listing):
            resolved, _info = resolve_cls("claude-sonnet")
            assert resolved == ClaudeAgent

    def test_resolves_unknown_provider_to_openai_compatible(self) -> None:
        """A gateway model with an unrecognised provider resolves to OpenAIChatAgent."""
        from hud.agents.openai_chat import OpenAIChatAgent

        gateway_listing = [
            {"id": "custom-model", "model": "custom", "provider": "custom-provider"},
        ]

        with patch("hud.agents.resolver._fetch_gateway_models", return_value=gateway_listing):
            resolved, _info = resolve_cls("custom-model")
            assert resolved == OpenAIChatAgent
83
+
84
+
85
class TestCreateAgent:
    """Checks for ``create_agent``, which routes through the gateway client."""

    def test_creates_with_gateway_client(self) -> None:
        """``create_agent`` forwards the model id and a model client to ``create``."""
        from hud.agents import OpenAIAgent

        gateway_listing = [
            {"id": "gpt-4o", "model": "gpt-4o", "provider": "openai"},
        ]

        with (
            patch("hud.agents.resolver._fetch_gateway_models", return_value=gateway_listing),
            patch.object(OpenAIAgent, "create") as create_spy,
            patch("hud.agents.gateway.build_gateway_client") as build_client_spy,
        ):
            build_client_spy.return_value = MagicMock()
            expected_agent = MagicMock()
            create_spy.return_value = expected_agent

            agent = create_agent("gpt-4o")

            # ``create`` receives the model id and some model client.
            forwarded = create_spy.call_args.kwargs
            assert forwarded["model"] == "gpt-4o"
            assert "model_client" in forwarded
            assert agent == expected_agent

    def test_passes_kwargs_to_create(self) -> None:
        """Additional keyword arguments reach ``create`` unchanged."""
        from hud.agents import OpenAIAgent

        gateway_listing = [
            {"id": "gpt-4o", "model": "gpt-4o", "provider": "openai"},
        ]

        with (
            patch("hud.agents.resolver._fetch_gateway_models", return_value=gateway_listing),
            patch.object(OpenAIAgent, "create") as create_spy,
            patch("hud.agents.gateway.build_gateway_client"),
        ):
            create_spy.return_value = MagicMock()

            create_agent("gpt-4o", temperature=0.5, max_tokens=1000)

            forwarded = create_spy.call_args.kwargs
            assert forwarded["temperature"] == 0.5
            assert forwarded["max_tokens"] == 1000

    def test_known_agent_type_also_uses_gateway(self) -> None:
        """The ``"claude"`` shortcut still builds a gateway client."""
        from hud.agents.claude import ClaudeAgent

        with (
            patch.object(ClaudeAgent, "create") as create_spy,
            patch("hud.agents.gateway.build_gateway_client") as build_client_spy,
        ):
            build_client_spy.return_value = MagicMock()
            create_spy.return_value = MagicMock()

            create_agent("claude")

            # The gateway client is built exactly once and handed to ``create``.
            build_client_spy.assert_called_once()
            forwarded = create_spy.call_args.kwargs
            assert "model_client" in forwarded
153
+
154
+
155
class TestBuildGatewayClient:
    """Checks for ``build_gateway_client``: which SDK client class gets constructed."""

    @staticmethod
    def _assert_client_built(provider: str, client_target: str) -> None:
        """Build a gateway client for ``provider`` and check ``client_target`` was called once.

        ``hud.settings.settings`` is replaced by a mock carrying a test API key
        and gateway URL for the duration of the call.
        """
        from hud.agents.gateway import build_gateway_client

        with (
            patch("hud.settings.settings") as fake_settings,
            patch(client_target) as client_cls,
        ):
            fake_settings.api_key = "test-key"
            fake_settings.hud_gateway_url = "https://gateway.hud.ai"

            build_gateway_client(provider)
            client_cls.assert_called_once()

    def test_builds_anthropic_client(self) -> None:
        """The ``anthropic`` provider constructs ``anthropic.AsyncAnthropic``."""
        self._assert_client_built("anthropic", "anthropic.AsyncAnthropic")

    def test_builds_openai_client_for_openai(self) -> None:
        """The ``openai`` provider constructs ``openai.AsyncOpenAI``."""
        self._assert_client_built("openai", "openai.AsyncOpenAI")

    def test_builds_openai_client_for_unknown(self) -> None:
        """An unrecognised provider (``together``) also constructs ``openai.AsyncOpenAI``."""
        self._assert_client_built("together", "openai.AsyncOpenAI")