hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (282)
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +70 -5
  3. hud/agents/base.py +238 -500
  4. hud/agents/claude.py +236 -247
  5. hud/agents/gateway.py +42 -0
  6. hud/agents/gemini.py +264 -0
  7. hud/agents/gemini_cua.py +324 -0
  8. hud/agents/grounded_openai.py +98 -100
  9. hud/agents/misc/integration_test_agent.py +51 -20
  10. hud/agents/misc/response_agent.py +48 -36
  11. hud/agents/openai.py +282 -296
  12. hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
  13. hud/agents/operator.py +199 -0
  14. hud/agents/resolver.py +70 -0
  15. hud/agents/tests/conftest.py +133 -0
  16. hud/agents/tests/test_base.py +300 -622
  17. hud/agents/tests/test_base_runtime.py +233 -0
  18. hud/agents/tests/test_claude.py +381 -214
  19. hud/agents/tests/test_client.py +9 -10
  20. hud/agents/tests/test_gemini.py +369 -0
  21. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  22. hud/agents/tests/test_openai.py +377 -140
  23. hud/agents/tests/test_operator.py +362 -0
  24. hud/agents/tests/test_resolver.py +192 -0
  25. hud/agents/tests/test_run_eval.py +179 -0
  26. hud/agents/types.py +148 -0
  27. hud/cli/__init__.py +493 -546
  28. hud/cli/analyze.py +43 -5
  29. hud/cli/build.py +699 -113
  30. hud/cli/debug.py +8 -5
  31. hud/cli/dev.py +889 -732
  32. hud/cli/eval.py +793 -667
  33. hud/cli/flows/dev.py +167 -0
  34. hud/cli/flows/init.py +191 -0
  35. hud/cli/flows/tasks.py +153 -56
  36. hud/cli/flows/templates.py +151 -0
  37. hud/cli/flows/tests/__init__.py +1 -0
  38. hud/cli/flows/tests/test_dev.py +126 -0
  39. hud/cli/init.py +60 -58
  40. hud/cli/pull.py +1 -1
  41. hud/cli/push.py +38 -13
  42. hud/cli/rft.py +311 -0
  43. hud/cli/rft_status.py +145 -0
  44. hud/cli/tests/test_analyze.py +5 -5
  45. hud/cli/tests/test_analyze_metadata.py +3 -2
  46. hud/cli/tests/test_analyze_module.py +120 -0
  47. hud/cli/tests/test_build.py +110 -8
  48. hud/cli/tests/test_build_failure.py +41 -0
  49. hud/cli/tests/test_build_module.py +50 -0
  50. hud/cli/tests/test_cli_init.py +6 -1
  51. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  52. hud/cli/tests/test_cli_root.py +140 -0
  53. hud/cli/tests/test_convert.py +361 -0
  54. hud/cli/tests/test_debug.py +12 -10
  55. hud/cli/tests/test_dev.py +197 -0
  56. hud/cli/tests/test_eval.py +251 -0
  57. hud/cli/tests/test_eval_bedrock.py +51 -0
  58. hud/cli/tests/test_init.py +124 -0
  59. hud/cli/tests/test_main_module.py +11 -5
  60. hud/cli/tests/test_mcp_server.py +12 -100
  61. hud/cli/tests/test_push.py +1 -1
  62. hud/cli/tests/test_push_happy.py +74 -0
  63. hud/cli/tests/test_push_wrapper.py +23 -0
  64. hud/cli/tests/test_registry.py +1 -1
  65. hud/cli/tests/test_utils.py +1 -1
  66. hud/cli/{rl → utils}/celebrate.py +14 -12
  67. hud/cli/utils/config.py +18 -1
  68. hud/cli/utils/docker.py +130 -4
  69. hud/cli/utils/env_check.py +9 -9
  70. hud/cli/utils/git.py +136 -0
  71. hud/cli/utils/interactive.py +39 -5
  72. hud/cli/utils/metadata.py +70 -1
  73. hud/cli/utils/runner.py +1 -1
  74. hud/cli/utils/server.py +2 -2
  75. hud/cli/utils/source_hash.py +3 -3
  76. hud/cli/utils/tasks.py +4 -1
  77. hud/cli/utils/tests/__init__.py +0 -0
  78. hud/cli/utils/tests/test_config.py +58 -0
  79. hud/cli/utils/tests/test_docker.py +93 -0
  80. hud/cli/utils/tests/test_docker_hints.py +71 -0
  81. hud/cli/utils/tests/test_env_check.py +74 -0
  82. hud/cli/utils/tests/test_environment.py +42 -0
  83. hud/cli/utils/tests/test_git.py +142 -0
  84. hud/cli/utils/tests/test_interactive_module.py +60 -0
  85. hud/cli/utils/tests/test_local_runner.py +50 -0
  86. hud/cli/utils/tests/test_logging_utils.py +23 -0
  87. hud/cli/utils/tests/test_metadata.py +49 -0
  88. hud/cli/utils/tests/test_package_runner.py +35 -0
  89. hud/cli/utils/tests/test_registry_utils.py +49 -0
  90. hud/cli/utils/tests/test_remote_runner.py +25 -0
  91. hud/cli/utils/tests/test_runner_modules.py +52 -0
  92. hud/cli/utils/tests/test_source_hash.py +36 -0
  93. hud/cli/utils/tests/test_tasks.py +80 -0
  94. hud/cli/utils/version_check.py +258 -0
  95. hud/cli/{rl → utils}/viewer.py +2 -2
  96. hud/clients/README.md +12 -11
  97. hud/clients/__init__.py +4 -3
  98. hud/clients/base.py +166 -26
  99. hud/clients/environment.py +51 -0
  100. hud/clients/fastmcp.py +13 -6
  101. hud/clients/mcp_use.py +45 -15
  102. hud/clients/tests/test_analyze_scenarios.py +206 -0
  103. hud/clients/tests/test_protocol.py +9 -3
  104. hud/datasets/__init__.py +23 -20
  105. hud/datasets/loader.py +326 -0
  106. hud/datasets/runner.py +198 -105
  107. hud/datasets/tests/__init__.py +0 -0
  108. hud/datasets/tests/test_loader.py +221 -0
  109. hud/datasets/tests/test_utils.py +315 -0
  110. hud/datasets/utils.py +270 -90
  111. hud/environment/__init__.py +52 -0
  112. hud/environment/connection.py +258 -0
  113. hud/environment/connectors/__init__.py +33 -0
  114. hud/environment/connectors/base.py +68 -0
  115. hud/environment/connectors/local.py +177 -0
  116. hud/environment/connectors/mcp_config.py +137 -0
  117. hud/environment/connectors/openai.py +101 -0
  118. hud/environment/connectors/remote.py +172 -0
  119. hud/environment/environment.py +835 -0
  120. hud/environment/integrations/__init__.py +45 -0
  121. hud/environment/integrations/adk.py +67 -0
  122. hud/environment/integrations/anthropic.py +196 -0
  123. hud/environment/integrations/gemini.py +92 -0
  124. hud/environment/integrations/langchain.py +82 -0
  125. hud/environment/integrations/llamaindex.py +68 -0
  126. hud/environment/integrations/openai.py +238 -0
  127. hud/environment/mock.py +306 -0
  128. hud/environment/router.py +263 -0
  129. hud/environment/scenarios.py +620 -0
  130. hud/environment/tests/__init__.py +1 -0
  131. hud/environment/tests/test_connection.py +317 -0
  132. hud/environment/tests/test_connectors.py +205 -0
  133. hud/environment/tests/test_environment.py +593 -0
  134. hud/environment/tests/test_integrations.py +257 -0
  135. hud/environment/tests/test_local_connectors.py +242 -0
  136. hud/environment/tests/test_scenarios.py +1086 -0
  137. hud/environment/tests/test_tools.py +208 -0
  138. hud/environment/types.py +23 -0
  139. hud/environment/utils/__init__.py +35 -0
  140. hud/environment/utils/formats.py +215 -0
  141. hud/environment/utils/schema.py +171 -0
  142. hud/environment/utils/tool_wrappers.py +113 -0
  143. hud/eval/__init__.py +67 -0
  144. hud/eval/context.py +727 -0
  145. hud/eval/display.py +299 -0
  146. hud/eval/instrument.py +187 -0
  147. hud/eval/manager.py +533 -0
  148. hud/eval/parallel.py +268 -0
  149. hud/eval/task.py +372 -0
  150. hud/eval/tests/__init__.py +1 -0
  151. hud/eval/tests/test_context.py +178 -0
  152. hud/eval/tests/test_eval.py +210 -0
  153. hud/eval/tests/test_manager.py +152 -0
  154. hud/eval/tests/test_parallel.py +168 -0
  155. hud/eval/tests/test_task.py +291 -0
  156. hud/eval/types.py +65 -0
  157. hud/eval/utils.py +194 -0
  158. hud/patches/__init__.py +19 -0
  159. hud/patches/mcp_patches.py +308 -0
  160. hud/patches/warnings.py +54 -0
  161. hud/samples/browser.py +4 -4
  162. hud/server/__init__.py +2 -1
  163. hud/server/low_level.py +2 -1
  164. hud/server/router.py +164 -0
  165. hud/server/server.py +567 -80
  166. hud/server/tests/test_mcp_server_integration.py +11 -11
  167. hud/server/tests/test_mcp_server_more.py +1 -1
  168. hud/server/tests/test_server_extra.py +2 -0
  169. hud/settings.py +45 -3
  170. hud/shared/exceptions.py +36 -10
  171. hud/shared/hints.py +26 -1
  172. hud/shared/requests.py +15 -3
  173. hud/shared/tests/test_exceptions.py +40 -31
  174. hud/shared/tests/test_hints.py +167 -0
  175. hud/telemetry/__init__.py +20 -19
  176. hud/telemetry/exporter.py +201 -0
  177. hud/telemetry/instrument.py +165 -253
  178. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  179. hud/telemetry/tests/test_exporter.py +258 -0
  180. hud/telemetry/tests/test_instrument.py +401 -0
  181. hud/tools/__init__.py +18 -2
  182. hud/tools/agent.py +223 -0
  183. hud/tools/apply_patch.py +639 -0
  184. hud/tools/base.py +54 -4
  185. hud/tools/bash.py +2 -2
  186. hud/tools/computer/__init__.py +36 -3
  187. hud/tools/computer/anthropic.py +2 -2
  188. hud/tools/computer/gemini.py +385 -0
  189. hud/tools/computer/hud.py +23 -6
  190. hud/tools/computer/openai.py +20 -21
  191. hud/tools/computer/qwen.py +434 -0
  192. hud/tools/computer/settings.py +37 -0
  193. hud/tools/edit.py +3 -7
  194. hud/tools/executors/base.py +4 -2
  195. hud/tools/executors/pyautogui.py +1 -1
  196. hud/tools/grounding/grounded_tool.py +13 -18
  197. hud/tools/grounding/grounder.py +10 -31
  198. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  199. hud/tools/jupyter.py +330 -0
  200. hud/tools/playwright.py +18 -3
  201. hud/tools/shell.py +308 -0
  202. hud/tools/tests/test_agent_tool.py +355 -0
  203. hud/tools/tests/test_apply_patch.py +718 -0
  204. hud/tools/tests/test_computer.py +4 -9
  205. hud/tools/tests/test_computer_actions.py +24 -2
  206. hud/tools/tests/test_jupyter_tool.py +181 -0
  207. hud/tools/tests/test_shell.py +596 -0
  208. hud/tools/tests/test_submit.py +85 -0
  209. hud/tools/tests/test_types.py +193 -0
  210. hud/tools/types.py +21 -1
  211. hud/types.py +194 -56
  212. hud/utils/__init__.py +2 -0
  213. hud/utils/env.py +67 -0
  214. hud/utils/hud_console.py +89 -18
  215. hud/utils/mcp.py +15 -58
  216. hud/utils/strict_schema.py +162 -0
  217. hud/utils/tests/test_init.py +1 -2
  218. hud/utils/tests/test_mcp.py +1 -28
  219. hud/utils/tests/test_pretty_errors.py +186 -0
  220. hud/utils/tests/test_tool_shorthand.py +154 -0
  221. hud/utils/tests/test_version.py +1 -1
  222. hud/utils/types.py +20 -0
  223. hud/version.py +1 -1
  224. hud_python-0.5.13.dist-info/METADATA +264 -0
  225. hud_python-0.5.13.dist-info/RECORD +305 -0
  226. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
  227. hud/agents/langchain.py +0 -261
  228. hud/agents/lite_llm.py +0 -72
  229. hud/cli/rl/__init__.py +0 -180
  230. hud/cli/rl/config.py +0 -101
  231. hud/cli/rl/display.py +0 -133
  232. hud/cli/rl/gpu.py +0 -63
  233. hud/cli/rl/gpu_utils.py +0 -321
  234. hud/cli/rl/local_runner.py +0 -595
  235. hud/cli/rl/presets.py +0 -96
  236. hud/cli/rl/remote_runner.py +0 -463
  237. hud/cli/rl/rl_api.py +0 -150
  238. hud/cli/rl/vllm.py +0 -177
  239. hud/cli/rl/wait_utils.py +0 -89
  240. hud/datasets/parallel.py +0 -687
  241. hud/misc/__init__.py +0 -1
  242. hud/misc/claude_plays_pokemon.py +0 -292
  243. hud/otel/__init__.py +0 -35
  244. hud/otel/collector.py +0 -142
  245. hud/otel/config.py +0 -181
  246. hud/otel/context.py +0 -570
  247. hud/otel/exporters.py +0 -369
  248. hud/otel/instrumentation.py +0 -135
  249. hud/otel/processors.py +0 -121
  250. hud/otel/tests/__init__.py +0 -1
  251. hud/otel/tests/test_processors.py +0 -197
  252. hud/rl/README.md +0 -30
  253. hud/rl/__init__.py +0 -1
  254. hud/rl/actor.py +0 -176
  255. hud/rl/buffer.py +0 -405
  256. hud/rl/chat_template.jinja +0 -101
  257. hud/rl/config.py +0 -192
  258. hud/rl/distributed.py +0 -132
  259. hud/rl/learner.py +0 -637
  260. hud/rl/tests/__init__.py +0 -1
  261. hud/rl/tests/test_learner.py +0 -186
  262. hud/rl/train.py +0 -382
  263. hud/rl/types.py +0 -101
  264. hud/rl/utils/start_vllm_server.sh +0 -30
  265. hud/rl/utils.py +0 -524
  266. hud/rl/vllm_adapter.py +0 -143
  267. hud/telemetry/job.py +0 -352
  268. hud/telemetry/replay.py +0 -74
  269. hud/telemetry/tests/test_replay.py +0 -40
  270. hud/telemetry/tests/test_trace.py +0 -63
  271. hud/telemetry/trace.py +0 -158
  272. hud/utils/agent_factories.py +0 -86
  273. hud/utils/async_utils.py +0 -65
  274. hud/utils/group_eval.py +0 -223
  275. hud/utils/progress.py +0 -149
  276. hud/utils/tasks.py +0 -127
  277. hud/utils/tests/test_async_utils.py +0 -173
  278. hud/utils/tests/test_progress.py +0 -261
  279. hud_python-0.4.45.dist-info/METADATA +0 -552
  280. hud_python-0.4.45.dist-info/RECORD +0 -228
  281. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
  282. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
@@ -15,7 +15,6 @@ from hud.types import MCPToolResult
15
15
  logger = logging.getLogger(__name__)
16
16
 
17
17
 
18
- @patch("hud.clients.base.setup_hud_telemetry")
19
18
  class TestMCPClient:
20
19
  """Test MCPClient class."""
21
20
 
@@ -34,7 +33,7 @@ class TestMCPClient:
34
33
  yield mock_instance
35
34
 
36
35
  @pytest.mark.asyncio
37
- async def test_connect_single_server(self, mock_telemetry, mock_mcp_use_client):
36
+ async def test_connect_single_server(self, mock_mcp_use_client):
38
37
  """Test connecting to a single server."""
39
38
  config = {"test_server": {"command": "python", "args": ["-m", "test_server"]}}
40
39
 
@@ -77,7 +76,7 @@ class TestMCPClient:
77
76
  assert names == {"tool1", "tool2"}
78
77
 
79
78
  @pytest.mark.asyncio
80
- async def test_connect_multiple_servers(self, mock_telemetry, mock_mcp_use_client):
79
+ async def test_connect_multiple_servers(self, mock_mcp_use_client):
81
80
  """Test connecting to multiple servers."""
82
81
  config = {
83
82
  "server1": {"command": "python", "args": ["-m", "server1"]},
@@ -129,7 +128,7 @@ class TestMCPClient:
129
128
  assert names == {"server1_tool1", "server2_tool2"}
130
129
 
131
130
  @pytest.mark.asyncio
132
- async def test_call_tool(self, mock_telemetry, mock_mcp_use_client):
131
+ async def test_call_tool(self, mock_mcp_use_client):
133
132
  """Test calling a tool."""
134
133
  config = {"test": {"command": "test"}}
135
134
  client = MCPClient(mcp_config=config)
@@ -180,7 +179,7 @@ class TestMCPClient:
180
179
  )
181
180
 
182
181
  @pytest.mark.asyncio
183
- async def test_call_tool_not_found(self, mock_telemetry, mock_mcp_use_client):
182
+ async def test_call_tool_not_found(self, mock_mcp_use_client):
184
183
  """Test calling a non-existent tool."""
185
184
  config = {"test": {"command": "test"}}
186
185
  client = MCPClient(mcp_config=config)
@@ -208,7 +207,7 @@ class TestMCPClient:
208
207
  assert "Tool 'nonexistent' not found" in text_content
209
208
 
210
209
  @pytest.mark.asyncio
211
- async def test_get_telemetry_data(self, mock_telemetry, mock_mcp_use_client):
210
+ async def test_get_telemetry_data(self, mock_mcp_use_client):
212
211
  """Test getting telemetry data."""
213
212
  config = {"test": {"command": "test"}}
214
213
  client = MCPClient(mcp_config=config)
@@ -245,7 +244,7 @@ class TestMCPClient:
245
244
  assert isinstance(telemetry_data, dict)
246
245
 
247
246
  @pytest.mark.asyncio
248
- async def test_close(self, mock_telemetry, mock_mcp_use_client):
247
+ async def test_close(self, mock_mcp_use_client):
249
248
  """Test closing client connections."""
250
249
  config = {"test": {"command": "test"}}
251
250
  client = MCPClient(mcp_config=config)
@@ -267,7 +266,7 @@ class TestMCPClient:
267
266
  mock_mcp_use_client.close_all_sessions.assert_called_once()
268
267
 
269
268
  @pytest.mark.asyncio
270
- async def test_context_manager(self, mock_telemetry, mock_mcp_use_client):
269
+ async def test_context_manager(self, mock_mcp_use_client):
271
270
  """Test using client as context manager."""
272
271
  mock_session = MagicMock()
273
272
  mock_session.connector = MagicMock()
@@ -291,7 +290,7 @@ class TestMCPClient:
291
290
  mock_mcp_use_client.close_all_sessions.assert_called_once()
292
291
 
293
292
  @pytest.mark.asyncio
294
- async def test_get_available_tools(self, mock_telemetry, mock_mcp_use_client):
293
+ async def test_get_available_tools(self, mock_mcp_use_client):
295
294
  """Test getting available tools."""
296
295
  config = {"test": {"command": "test"}}
297
296
  client = MCPClient(mcp_config=config)
@@ -319,7 +318,7 @@ class TestMCPClient:
319
318
  assert names == {"tool1", "tool2"}
320
319
 
321
320
  @pytest.mark.asyncio
322
- async def test_get_tool_map(self, mock_telemetry, mock_mcp_use_client):
321
+ async def test_get_tool_map(self, mock_mcp_use_client):
323
322
  """Test getting tool map."""
324
323
  config = {"test": {"command": "test"}}
325
324
  client = MCPClient(mcp_config=config)
@@ -0,0 +1,369 @@
1
+ """Tests for Gemini MCP Agent implementation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import base64
6
+ from typing import Any
7
+ from unittest.mock import AsyncMock, MagicMock, patch
8
+
9
+ import pytest
10
+ from google import genai
11
+ from google.genai import types as genai_types
12
+ from mcp import types
13
+
14
+ from hud.agents.gemini import GeminiAgent
15
+ from hud.environment.router import ToolRouter
16
+ from hud.eval.context import EvalContext
17
+ from hud.types import MCPToolCall, MCPToolResult
18
+
19
+
20
+ class MockEvalContext(EvalContext):
21
+ """Mock EvalContext for testing."""
22
+
23
+ def __init__(self, tools: list[types.Tool] | None = None) -> None:
24
+ # Core attributes
25
+ self.prompt = "Test prompt"
26
+ self._tools = tools or []
27
+ self._submitted: str | None = None
28
+ self.reward: float | None = None
29
+
30
+ # Environment attributes
31
+ self._router = ToolRouter()
32
+ self._agent_include: list[str] | None = None
33
+ self._agent_exclude: list[str] | None = None
34
+
35
+ # EvalContext attributes
36
+ self._task = None
37
+ self.trace_id = "test-trace-id"
38
+ self.eval_name = "test-eval"
39
+ self.job_id: str | None = None
40
+ self.group_id: str | None = None
41
+ self.index = 0
42
+ self.variants: dict[str, Any] = {}
43
+ self.answer: str | None = None
44
+ self.system_prompt: str | None = None
45
+ self.error: BaseException | None = None
46
+ self.metadata: dict[str, Any] = {}
47
+ self.results: list[Any] = []
48
+ self._is_summary = False
49
+
50
+ def as_tools(self) -> list[types.Tool]:
51
+ return self._tools
52
+
53
+ @property
54
+ def has_scenario(self) -> bool:
55
+ return False
56
+
57
+ async def list_tools(self) -> list[types.Tool]:
58
+ return self._tools
59
+
60
+ async def call_tool(self, call: Any, /, **kwargs: Any) -> MCPToolResult:
61
+ return MCPToolResult(
62
+ content=[types.TextContent(type="text", text="ok")],
63
+ isError=False,
64
+ )
65
+
66
+ async def submit(self, answer: str) -> None:
67
+ self._submitted = answer
68
+
69
+
70
+ class TestGeminiAgent:
71
+ """Test GeminiAgent base class."""
72
+
73
+ @pytest.fixture
74
+ def mock_gemini_client(self) -> MagicMock:
75
+ """Create a stub Gemini client."""
76
+ client = MagicMock(spec=genai.Client)
77
+ client.api_key = "test_key"
78
+ client.models = MagicMock()
79
+ client.models.list = MagicMock(return_value=iter([]))
80
+ client.models.generate_content = MagicMock()
81
+ # Set up async interface (aio.models.generate_content)
82
+ client.aio = MagicMock()
83
+ client.aio.models = MagicMock()
84
+ client.aio.models.generate_content = AsyncMock()
85
+ return client
86
+
87
+ @pytest.mark.asyncio
88
+ async def test_init(self, mock_gemini_client: MagicMock) -> None:
89
+ """Test agent initialization."""
90
+ agent = GeminiAgent.create(
91
+ model_client=mock_gemini_client,
92
+ model="gemini-2.5-flash",
93
+ validate_api_key=False,
94
+ )
95
+
96
+ assert agent.model_name == "Gemini"
97
+ assert agent.config.model == "gemini-2.5-flash"
98
+ assert agent.gemini_client == mock_gemini_client
99
+
100
+ @pytest.mark.asyncio
101
+ async def test_init_without_model_client(self) -> None:
102
+ """Test agent initialization without model client."""
103
+ with (
104
+ patch("hud.settings.settings.gemini_api_key", "test_key"),
105
+ patch("hud.agents.gemini.genai.Client") as mock_client_class,
106
+ ):
107
+ mock_client = MagicMock()
108
+ mock_client.api_key = "test_key"
109
+ mock_client.models = MagicMock()
110
+ mock_client.models.list = MagicMock(return_value=iter([]))
111
+ mock_client_class.return_value = mock_client
112
+
113
+ agent = GeminiAgent.create(
114
+ model="gemini-2.5-flash",
115
+ validate_api_key=False,
116
+ )
117
+
118
+ assert agent.gemini_client is not None
119
+
120
+ @pytest.mark.asyncio
121
+ async def test_format_blocks_text_only(self, mock_gemini_client: MagicMock) -> None:
122
+ """Test formatting text content blocks."""
123
+ agent = GeminiAgent.create(
124
+ model_client=mock_gemini_client,
125
+ validate_api_key=False,
126
+ )
127
+
128
+ blocks: list[types.ContentBlock] = [
129
+ types.TextContent(type="text", text="Hello, world!"),
130
+ types.TextContent(type="text", text="How are you?"),
131
+ ]
132
+
133
+ messages = await agent.format_blocks(blocks)
134
+ assert len(messages) == 1
135
+ assert messages[0].role == "user"
136
+ assert messages[0].parts is not None
137
+ assert len(messages[0].parts) == 2
138
+
139
+ @pytest.mark.asyncio
140
+ async def test_format_blocks_with_image(self, mock_gemini_client: MagicMock) -> None:
141
+ """Test formatting image content blocks."""
142
+ agent = GeminiAgent.create(
143
+ model_client=mock_gemini_client,
144
+ validate_api_key=False,
145
+ )
146
+
147
+ # Create a tiny valid base64 PNG
148
+ png_data = base64.b64encode(b"\x89PNG\r\n\x1a\n").decode()
149
+
150
+ blocks: list[types.ContentBlock] = [
151
+ types.TextContent(type="text", text="Look at this:"),
152
+ types.ImageContent(type="image", data=png_data, mimeType="image/png"),
153
+ ]
154
+
155
+ messages = await agent.format_blocks(blocks)
156
+ assert len(messages) == 1
157
+ assert messages[0].parts is not None
158
+ assert len(messages[0].parts) == 2
159
+
160
+ @pytest.mark.asyncio
161
+ async def test_format_tool_results(self, mock_gemini_client: MagicMock) -> None:
162
+ """Test formatting tool results."""
163
+ agent = GeminiAgent.create(
164
+ model_client=mock_gemini_client,
165
+ validate_api_key=False,
166
+ )
167
+
168
+ tool_calls = [MCPToolCall(id="call_123", name="test_tool", arguments={})]
169
+ tool_results = [
170
+ MCPToolResult(
171
+ content=[types.TextContent(type="text", text="Tool output")],
172
+ isError=False,
173
+ )
174
+ ]
175
+
176
+ messages = await agent.format_tool_results(tool_calls, tool_results)
177
+ assert len(messages) == 1
178
+ assert messages[0].role == "user"
179
+
180
+ @pytest.mark.asyncio
181
+ async def test_get_system_messages(self, mock_gemini_client: MagicMock) -> None:
182
+ """Test that system messages return empty (Gemini uses system_instruction)."""
183
+ agent = GeminiAgent.create(
184
+ model_client=mock_gemini_client,
185
+ system_prompt="You are a helpful assistant.",
186
+ validate_api_key=False,
187
+ )
188
+
189
+ messages = await agent.get_system_messages()
190
+ # Gemini doesn't use system messages in the message list
191
+ assert messages == []
192
+
193
+ @pytest.mark.asyncio
194
+ async def test_get_response_text_only(self, mock_gemini_client: MagicMock) -> None:
195
+ """Test getting text-only response."""
196
+ # Disable telemetry for this test
197
+ with patch("hud.settings.settings.telemetry_enabled", False):
198
+ agent = GeminiAgent.create(
199
+ model_client=mock_gemini_client,
200
+ validate_api_key=False,
201
+ )
202
+ # Set up agent as initialized (no tools needed for this test)
203
+ agent.gemini_tools = []
204
+ agent._initialized = True
205
+
206
+ # Mock the API response with text only
207
+ mock_response = MagicMock()
208
+ mock_candidate = MagicMock()
209
+
210
+ text_part = MagicMock()
211
+ text_part.text = "Task completed successfully"
212
+ text_part.function_call = None
213
+
214
+ mock_candidate.content = MagicMock()
215
+ mock_candidate.content.parts = [text_part]
216
+
217
+ mock_response.candidates = [mock_candidate]
218
+
219
+ mock_gemini_client.aio.models.generate_content = AsyncMock(return_value=mock_response)
220
+
221
+ messages = [
222
+ genai_types.Content(role="user", parts=[genai_types.Part.from_text(text="Status?")])
223
+ ]
224
+ response = await agent.get_response(messages)
225
+
226
+ assert response.content == "Task completed successfully"
227
+ assert response.tool_calls == []
228
+ assert response.done is True
229
+
230
+ @pytest.mark.asyncio
231
+ async def test_get_response_with_thinking(self, mock_gemini_client: MagicMock) -> None:
232
+ """Test getting response with thinking content."""
233
+ with patch("hud.settings.settings.telemetry_enabled", False):
234
+ agent = GeminiAgent.create(
235
+ model_client=mock_gemini_client,
236
+ validate_api_key=False,
237
+ )
238
+ # Set up agent as initialized (no tools needed for this test)
239
+ agent.gemini_tools = []
240
+ agent._initialized = True
241
+
242
+ mock_response = MagicMock()
243
+ mock_candidate = MagicMock()
244
+
245
+ thinking_part = MagicMock()
246
+ thinking_part.text = "Let me reason through this..."
247
+ thinking_part.function_call = None
248
+ thinking_part.thought = True
249
+
250
+ text_part = MagicMock()
251
+ text_part.text = "Here is my answer"
252
+ text_part.function_call = None
253
+ text_part.thought = False
254
+
255
+ mock_candidate.content = MagicMock()
256
+ mock_candidate.content.parts = [thinking_part, text_part]
257
+
258
+ mock_response.candidates = [mock_candidate]
259
+
260
+ mock_gemini_client.aio.models.generate_content = AsyncMock(return_value=mock_response)
261
+
262
+ messages = [
263
+ genai_types.Content(
264
+ role="user", parts=[genai_types.Part.from_text(text="Hard question")]
265
+ )
266
+ ]
267
+ response = await agent.get_response(messages)
268
+
269
+ assert response.content == "Here is my answer"
270
+ assert response.reasoning == "Let me reason through this..."
271
+
272
+ @pytest.mark.asyncio
273
+ async def test_convert_tools_for_gemini(self, mock_gemini_client: MagicMock) -> None:
274
+ """Test converting MCP tools to Gemini format."""
275
+ tools = [
276
+ types.Tool(
277
+ name="my_tool",
278
+ description="A test tool",
279
+ inputSchema={"type": "object", "properties": {"x": {"type": "string"}}},
280
+ )
281
+ ]
282
+ ctx = MockEvalContext(tools=tools)
283
+ agent = GeminiAgent.create(
284
+ model_client=mock_gemini_client,
285
+ validate_api_key=False,
286
+ )
287
+
288
+ agent.ctx = ctx
289
+ await agent._initialize_from_ctx(ctx)
290
+
291
+ # Check that tools were converted
292
+ assert len(agent.gemini_tools) == 1
293
+ # Gemini tools have function_declarations - cast to genai Tool type
294
+ gemini_tool = agent.gemini_tools[0]
295
+ assert isinstance(gemini_tool, genai_types.Tool)
296
+ assert gemini_tool.function_declarations is not None
297
+ assert gemini_tool.function_declarations[0].name == "my_tool"
298
+
299
+
300
+ class TestGeminiToolConversion:
301
+ """Tests for tool conversion to Gemini format."""
302
+
303
+ @pytest.fixture
304
+ def mock_gemini_client(self) -> MagicMock:
305
+ """Create a stub Gemini client."""
306
+ client = MagicMock(spec=genai.Client)
307
+ client.api_key = "test_key"
308
+ client.models = MagicMock()
309
+ client.models.list = MagicMock(return_value=iter([]))
310
+ # Set up async interface
311
+ client.aio = MagicMock()
312
+ client.aio.models = MagicMock()
313
+ client.aio.models.generate_content = AsyncMock()
314
+ return client
315
+
316
+ @pytest.mark.asyncio
317
+ async def test_tool_with_properties(self, mock_gemini_client: MagicMock) -> None:
318
+ """Test tool with input properties."""
319
+ tools = [
320
+ types.Tool(
321
+ name="search",
322
+ description="Search the web",
323
+ inputSchema={
324
+ "type": "object",
325
+ "properties": {
326
+ "query": {"type": "string", "description": "Search query"},
327
+ "limit": {"type": "integer", "description": "Max results"},
328
+ },
329
+ "required": ["query"],
330
+ },
331
+ )
332
+ ]
333
+ ctx = MockEvalContext(tools=tools)
334
+ agent = GeminiAgent.create(
335
+ model_client=mock_gemini_client,
336
+ validate_api_key=False,
337
+ )
338
+
339
+ agent.ctx = ctx
340
+ await agent._initialize_from_ctx(ctx)
341
+
342
+ assert len(agent.gemini_tools) == 1
343
+ gemini_tool = agent.gemini_tools[0]
344
+ # Gemini tools have function_declarations - cast to genai Tool type
345
+ assert isinstance(gemini_tool, genai_types.Tool)
346
+ assert gemini_tool.function_declarations is not None
347
+ assert gemini_tool.function_declarations[0].name == "search"
348
+ assert gemini_tool.function_declarations[0].parameters_json_schema is not None
349
+
350
+ @pytest.mark.asyncio
351
+ async def test_tool_without_schema(self, mock_gemini_client: MagicMock) -> None:
352
+ """Test tool without description raises error."""
353
+ # Create a tool with inputSchema but no description
354
+ tools = [
355
+ types.Tool(
356
+ name="incomplete",
357
+ description=None,
358
+ inputSchema={"type": "object"},
359
+ )
360
+ ]
361
+ ctx = MockEvalContext(tools=tools)
362
+ agent = GeminiAgent.create(
363
+ model_client=mock_gemini_client,
364
+ validate_api_key=False,
365
+ )
366
+
367
+ agent.ctx = ctx
368
+ with pytest.raises(ValueError, match="requires both a description"):
369
+ await agent._initialize_from_ctx(ctx)
@@ -1,60 +1,16 @@
1
1
  from __future__ import annotations
2
2
 
3
- import json
4
3
  from typing import Any
5
4
 
6
5
  import mcp.types as types
7
6
  import pytest
7
+ from openai import AsyncOpenAI
8
8
 
9
9
  from hud.agents.grounded_openai import GroundedOpenAIChatAgent
10
10
  from hud.tools.grounding import GrounderConfig
11
11
  from hud.types import MCPToolCall, MCPToolResult
12
12
 
13
13
 
14
- class DummyOpenAI:
15
- class chat: # type: ignore[no-redef]
16
- class completions:
17
- @staticmethod
18
- async def create(**kwargs: Any) -> Any:
19
- # Return a minimal object mimicking OpenAI response
20
- class Msg:
21
- def __init__(self) -> None:
22
- self.content = "Thinking..."
23
- self.tool_calls = [
24
- type(
25
- "ToolCall",
26
- (),
27
- {
28
- "id": "call_1",
29
- "function": type(
30
- "Fn",
31
- (),
32
- {
33
- "name": "computer",
34
- "arguments": json.dumps(
35
- {
36
- "action": "click",
37
- "element_description": "blue button",
38
- }
39
- ),
40
- },
41
- ),
42
- },
43
- )()
44
- ]
45
-
46
- class Choice:
47
- def __init__(self) -> None:
48
- self.message = Msg()
49
- self.finish_reason = "tool_calls"
50
-
51
- class Resp:
52
- def __init__(self) -> None:
53
- self.choices = [Choice()]
54
-
55
- return Resp()
56
-
57
-
58
14
  class FakeMCPClient:
59
15
  def __init__(self) -> None:
60
16
  self.tools: list[types.Tool] = [
@@ -62,6 +18,7 @@ class FakeMCPClient:
62
18
  types.Tool(name="setup", description="internal functions", inputSchema={}),
63
19
  ]
64
20
  self.called: list[MCPToolCall] = []
21
+ self._initialized = True
65
22
 
66
23
  async def initialize(self, mcp_config: dict[str, dict[str, Any]] | None = None) -> None:
67
24
  return None
@@ -77,6 +34,10 @@ class FakeMCPClient:
77
34
  def mcp_config(self) -> dict[str, dict[str, Any]]:
78
35
  return {"local": {"command": "echo", "args": ["ok"]}}
79
36
 
37
+ @property
38
+ def is_connected(self) -> bool:
39
+ return self._initialized
40
+
80
41
  async def shutdown(self) -> None:
81
42
  return None
82
43
 
@@ -109,19 +70,20 @@ class DummyGroundedTool:
109
70
 
110
71
  @pytest.mark.asyncio
111
72
  async def test_call_tools_injects_screenshot_and_delegates(monkeypatch: pytest.MonkeyPatch) -> None:
112
- # Agent with fake OpenAI client and fake MCP client
73
+ # Agent with fake OpenAI client
113
74
  grounder_cfg = GrounderConfig(api_base="http://example", model="qwen")
114
- agent = GroundedOpenAIChatAgent(
75
+ fake_openai = AsyncOpenAI(api_key="test")
76
+ agent = GroundedOpenAIChatAgent.create(
115
77
  grounder_config=grounder_cfg,
116
- openai_client=DummyOpenAI(),
117
- model_name="gpt-4o-mini",
118
- mcp_client=FakeMCPClient(),
78
+ openai_client=fake_openai,
79
+ model="gpt-4o-mini",
119
80
  initial_screenshot=False,
120
81
  )
121
82
 
122
83
  # Inject a dummy grounded tool to observe args without full initialization
123
84
  dummy_tool = DummyGroundedTool()
124
85
  agent.grounded_tool = dummy_tool # type: ignore
86
+ agent._initialized = True # Mark as initialized to skip context initialization
125
87
 
126
88
  # Seed conversation history with a user image
127
89
  png_b64 = (
@@ -153,3 +115,56 @@ async def test_call_tools_injects_screenshot_and_delegates(monkeypatch: pytest.M
153
115
  assert dummy_tool.last_args["element_description"] == "blue button"
154
116
  assert "screenshot_b64" in dummy_tool.last_args
155
117
  assert isinstance(dummy_tool.last_args["screenshot_b64"], str)
118
+
119
+
120
+ @pytest.mark.asyncio
121
+ async def test_get_response_with_reasoning() -> None:
122
+ """Test that reasoning content is extracted from the response."""
123
+ from unittest.mock import AsyncMock, MagicMock, patch
124
+
125
+ grounder_cfg = GrounderConfig(api_base="http://example", model="qwen")
126
+ fake_openai = AsyncOpenAI(api_key="test")
127
+
128
+ with patch("hud.settings.settings.telemetry_enabled", False):
129
+ agent = GroundedOpenAIChatAgent.create(
130
+ grounder_config=grounder_cfg,
131
+ openai_client=fake_openai,
132
+ model="gpt-4o-mini",
133
+ initial_screenshot=False,
134
+ )
135
+
136
+ mock_response = MagicMock()
137
+ mock_choice = MagicMock()
138
+ mock_message = MagicMock()
139
+
140
+ mock_message.content = "Here is my answer"
141
+ mock_message.reasoning_content = "Let me think step by step..."
142
+ mock_message.tool_calls = None
143
+
144
+ mock_choice.message = mock_message
145
+ mock_choice.finish_reason = "stop"
146
+
147
+ mock_response.choices = [mock_choice]
148
+
149
+ agent.oai.chat.completions.create = AsyncMock(return_value=mock_response)
150
+ agent._initialized = True # Mark as initialized to skip context initialization
151
+
152
+ # Include an image so get_response doesn't try to take a screenshot via ctx
153
+ png_b64 = (
154
+ "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR4nGMAAQAABQAB"
155
+ "J2n0mQAAAABJRU5ErkJggg=="
156
+ )
157
+ agent.conversation_history = [
158
+ {
159
+ "role": "user",
160
+ "content": [
161
+ {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{png_b64}"}},
162
+ {"type": "text", "text": "Hard question"},
163
+ ],
164
+ }
165
+ ]
166
+
167
+ response = await agent.get_response(agent.conversation_history)
168
+
169
+ assert response.content == "Here is my answer"
170
+ assert response.reasoning == "Let me think step by step..."