hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +11 -5
- hud/agents/base.py +220 -500
- hud/agents/claude.py +200 -240
- hud/agents/gemini.py +275 -0
- hud/agents/gemini_cua.py +335 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +41 -36
- hud/agents/openai.py +291 -292
- hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
- hud/agents/operator.py +211 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +379 -210
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +376 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/cli/__init__.py +461 -545
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +664 -110
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +882 -734
- hud/cli/eval.py +782 -668
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/push.py +29 -11
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +108 -6
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +69 -0
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +40 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +327 -0
- hud/datasets/runner.py +192 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +50 -0
- hud/environment/connection.py +206 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +109 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +694 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +112 -0
- hud/environment/scenarios.py +493 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +218 -0
- hud/environment/tests/test_environment.py +161 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +201 -0
- hud/environment/tests/test_scenarios.py +280 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +674 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +185 -0
- hud/eval/manager.py +466 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +340 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +145 -0
- hud/eval/types.py +63 -0
- hud/eval/utils.py +183 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +151 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +158 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +16 -2
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +4 -0
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +167 -57
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +61 -3
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.1.dist-info/METADATA +264 -0
- hud_python-0.5.1.dist-info/RECORD +299 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
hud/agents/tests/test_client.py
CHANGED
|
@@ -15,7 +15,6 @@ from hud.types import MCPToolResult
|
|
|
15
15
|
logger = logging.getLogger(__name__)
|
|
16
16
|
|
|
17
17
|
|
|
18
|
-
@patch("hud.clients.base.setup_hud_telemetry")
|
|
19
18
|
class TestMCPClient:
|
|
20
19
|
"""Test MCPClient class."""
|
|
21
20
|
|
|
@@ -34,7 +33,7 @@ class TestMCPClient:
|
|
|
34
33
|
yield mock_instance
|
|
35
34
|
|
|
36
35
|
@pytest.mark.asyncio
|
|
37
|
-
async def test_connect_single_server(self,
|
|
36
|
+
async def test_connect_single_server(self, mock_mcp_use_client):
|
|
38
37
|
"""Test connecting to a single server."""
|
|
39
38
|
config = {"test_server": {"command": "python", "args": ["-m", "test_server"]}}
|
|
40
39
|
|
|
@@ -77,7 +76,7 @@ class TestMCPClient:
|
|
|
77
76
|
assert names == {"tool1", "tool2"}
|
|
78
77
|
|
|
79
78
|
@pytest.mark.asyncio
|
|
80
|
-
async def test_connect_multiple_servers(self,
|
|
79
|
+
async def test_connect_multiple_servers(self, mock_mcp_use_client):
|
|
81
80
|
"""Test connecting to multiple servers."""
|
|
82
81
|
config = {
|
|
83
82
|
"server1": {"command": "python", "args": ["-m", "server1"]},
|
|
@@ -129,7 +128,7 @@ class TestMCPClient:
|
|
|
129
128
|
assert names == {"server1_tool1", "server2_tool2"}
|
|
130
129
|
|
|
131
130
|
@pytest.mark.asyncio
|
|
132
|
-
async def test_call_tool(self,
|
|
131
|
+
async def test_call_tool(self, mock_mcp_use_client):
|
|
133
132
|
"""Test calling a tool."""
|
|
134
133
|
config = {"test": {"command": "test"}}
|
|
135
134
|
client = MCPClient(mcp_config=config)
|
|
@@ -180,7 +179,7 @@ class TestMCPClient:
|
|
|
180
179
|
)
|
|
181
180
|
|
|
182
181
|
@pytest.mark.asyncio
|
|
183
|
-
async def test_call_tool_not_found(self,
|
|
182
|
+
async def test_call_tool_not_found(self, mock_mcp_use_client):
|
|
184
183
|
"""Test calling a non-existent tool."""
|
|
185
184
|
config = {"test": {"command": "test"}}
|
|
186
185
|
client = MCPClient(mcp_config=config)
|
|
@@ -208,7 +207,7 @@ class TestMCPClient:
|
|
|
208
207
|
assert "Tool 'nonexistent' not found" in text_content
|
|
209
208
|
|
|
210
209
|
@pytest.mark.asyncio
|
|
211
|
-
async def test_get_telemetry_data(self,
|
|
210
|
+
async def test_get_telemetry_data(self, mock_mcp_use_client):
|
|
212
211
|
"""Test getting telemetry data."""
|
|
213
212
|
config = {"test": {"command": "test"}}
|
|
214
213
|
client = MCPClient(mcp_config=config)
|
|
@@ -245,7 +244,7 @@ class TestMCPClient:
|
|
|
245
244
|
assert isinstance(telemetry_data, dict)
|
|
246
245
|
|
|
247
246
|
@pytest.mark.asyncio
|
|
248
|
-
async def test_close(self,
|
|
247
|
+
async def test_close(self, mock_mcp_use_client):
|
|
249
248
|
"""Test closing client connections."""
|
|
250
249
|
config = {"test": {"command": "test"}}
|
|
251
250
|
client = MCPClient(mcp_config=config)
|
|
@@ -267,7 +266,7 @@ class TestMCPClient:
|
|
|
267
266
|
mock_mcp_use_client.close_all_sessions.assert_called_once()
|
|
268
267
|
|
|
269
268
|
@pytest.mark.asyncio
|
|
270
|
-
async def test_context_manager(self,
|
|
269
|
+
async def test_context_manager(self, mock_mcp_use_client):
|
|
271
270
|
"""Test using client as context manager."""
|
|
272
271
|
mock_session = MagicMock()
|
|
273
272
|
mock_session.connector = MagicMock()
|
|
@@ -291,7 +290,7 @@ class TestMCPClient:
|
|
|
291
290
|
mock_mcp_use_client.close_all_sessions.assert_called_once()
|
|
292
291
|
|
|
293
292
|
@pytest.mark.asyncio
|
|
294
|
-
async def test_get_available_tools(self,
|
|
293
|
+
async def test_get_available_tools(self, mock_mcp_use_client):
|
|
295
294
|
"""Test getting available tools."""
|
|
296
295
|
config = {"test": {"command": "test"}}
|
|
297
296
|
client = MCPClient(mcp_config=config)
|
|
@@ -319,7 +318,7 @@ class TestMCPClient:
|
|
|
319
318
|
assert names == {"tool1", "tool2"}
|
|
320
319
|
|
|
321
320
|
@pytest.mark.asyncio
|
|
322
|
-
async def test_get_tool_map(self,
|
|
321
|
+
async def test_get_tool_map(self, mock_mcp_use_client):
|
|
323
322
|
"""Test getting tool map."""
|
|
324
323
|
config = {"test": {"command": "test"}}
|
|
325
324
|
client = MCPClient(mcp_config=config)
|
|
@@ -0,0 +1,369 @@
|
|
|
1
|
+
"""Tests for Gemini MCP Agent implementation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import base64
|
|
6
|
+
from typing import Any
|
|
7
|
+
from unittest.mock import AsyncMock, MagicMock, patch
|
|
8
|
+
|
|
9
|
+
import pytest
|
|
10
|
+
from google import genai
|
|
11
|
+
from google.genai import types as genai_types
|
|
12
|
+
from mcp import types
|
|
13
|
+
|
|
14
|
+
from hud.agents.gemini import GeminiAgent
|
|
15
|
+
from hud.environment.router import ToolRouter
|
|
16
|
+
from hud.eval.context import EvalContext
|
|
17
|
+
from hud.types import MCPToolCall, MCPToolResult
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class MockEvalContext(EvalContext):
|
|
21
|
+
"""Mock EvalContext for testing."""
|
|
22
|
+
|
|
23
|
+
def __init__(self, tools: list[types.Tool] | None = None) -> None:
|
|
24
|
+
# Core attributes
|
|
25
|
+
self.prompt = "Test prompt"
|
|
26
|
+
self._tools = tools or []
|
|
27
|
+
self._submitted: str | None = None
|
|
28
|
+
self.reward: float | None = None
|
|
29
|
+
|
|
30
|
+
# Environment attributes
|
|
31
|
+
self._router = ToolRouter()
|
|
32
|
+
self._agent_include: list[str] | None = None
|
|
33
|
+
self._agent_exclude: list[str] | None = None
|
|
34
|
+
|
|
35
|
+
# EvalContext attributes
|
|
36
|
+
self._task = None
|
|
37
|
+
self.trace_id = "test-trace-id"
|
|
38
|
+
self.eval_name = "test-eval"
|
|
39
|
+
self.job_id: str | None = None
|
|
40
|
+
self.group_id: str | None = None
|
|
41
|
+
self.index = 0
|
|
42
|
+
self.variants: dict[str, Any] = {}
|
|
43
|
+
self.answer: str | None = None
|
|
44
|
+
self.system_prompt: str | None = None
|
|
45
|
+
self.error: BaseException | None = None
|
|
46
|
+
self.metadata: dict[str, Any] = {}
|
|
47
|
+
self.results: list[Any] = []
|
|
48
|
+
self._is_summary = False
|
|
49
|
+
|
|
50
|
+
def as_tools(self) -> list[types.Tool]:
|
|
51
|
+
return self._tools
|
|
52
|
+
|
|
53
|
+
@property
|
|
54
|
+
def has_scenario(self) -> bool:
|
|
55
|
+
return False
|
|
56
|
+
|
|
57
|
+
async def list_tools(self) -> list[types.Tool]:
|
|
58
|
+
return self._tools
|
|
59
|
+
|
|
60
|
+
async def call_tool(self, call: Any, /, **kwargs: Any) -> MCPToolResult:
|
|
61
|
+
return MCPToolResult(
|
|
62
|
+
content=[types.TextContent(type="text", text="ok")],
|
|
63
|
+
isError=False,
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
async def submit(self, answer: str) -> None:
|
|
67
|
+
self._submitted = answer
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class TestGeminiAgent:
|
|
71
|
+
"""Test GeminiAgent base class."""
|
|
72
|
+
|
|
73
|
+
@pytest.fixture
|
|
74
|
+
def mock_gemini_client(self) -> MagicMock:
|
|
75
|
+
"""Create a stub Gemini client."""
|
|
76
|
+
client = MagicMock(spec=genai.Client)
|
|
77
|
+
client.api_key = "test_key"
|
|
78
|
+
client.models = MagicMock()
|
|
79
|
+
client.models.list = MagicMock(return_value=iter([]))
|
|
80
|
+
client.models.generate_content = MagicMock()
|
|
81
|
+
# Set up async interface (aio.models.generate_content)
|
|
82
|
+
client.aio = MagicMock()
|
|
83
|
+
client.aio.models = MagicMock()
|
|
84
|
+
client.aio.models.generate_content = AsyncMock()
|
|
85
|
+
return client
|
|
86
|
+
|
|
87
|
+
@pytest.mark.asyncio
|
|
88
|
+
async def test_init(self, mock_gemini_client: MagicMock) -> None:
|
|
89
|
+
"""Test agent initialization."""
|
|
90
|
+
agent = GeminiAgent.create(
|
|
91
|
+
model_client=mock_gemini_client,
|
|
92
|
+
model="gemini-2.5-flash",
|
|
93
|
+
validate_api_key=False,
|
|
94
|
+
)
|
|
95
|
+
|
|
96
|
+
assert agent.model_name == "Gemini"
|
|
97
|
+
assert agent.config.model == "gemini-2.5-flash"
|
|
98
|
+
assert agent.gemini_client == mock_gemini_client
|
|
99
|
+
|
|
100
|
+
@pytest.mark.asyncio
|
|
101
|
+
async def test_init_without_model_client(self) -> None:
|
|
102
|
+
"""Test agent initialization without model client."""
|
|
103
|
+
with (
|
|
104
|
+
patch("hud.settings.settings.gemini_api_key", "test_key"),
|
|
105
|
+
patch("hud.agents.gemini.genai.Client") as mock_client_class,
|
|
106
|
+
):
|
|
107
|
+
mock_client = MagicMock()
|
|
108
|
+
mock_client.api_key = "test_key"
|
|
109
|
+
mock_client.models = MagicMock()
|
|
110
|
+
mock_client.models.list = MagicMock(return_value=iter([]))
|
|
111
|
+
mock_client_class.return_value = mock_client
|
|
112
|
+
|
|
113
|
+
agent = GeminiAgent.create(
|
|
114
|
+
model="gemini-2.5-flash",
|
|
115
|
+
validate_api_key=False,
|
|
116
|
+
)
|
|
117
|
+
|
|
118
|
+
assert agent.gemini_client is not None
|
|
119
|
+
|
|
120
|
+
@pytest.mark.asyncio
|
|
121
|
+
async def test_format_blocks_text_only(self, mock_gemini_client: MagicMock) -> None:
|
|
122
|
+
"""Test formatting text content blocks."""
|
|
123
|
+
agent = GeminiAgent.create(
|
|
124
|
+
model_client=mock_gemini_client,
|
|
125
|
+
validate_api_key=False,
|
|
126
|
+
)
|
|
127
|
+
|
|
128
|
+
blocks: list[types.ContentBlock] = [
|
|
129
|
+
types.TextContent(type="text", text="Hello, world!"),
|
|
130
|
+
types.TextContent(type="text", text="How are you?"),
|
|
131
|
+
]
|
|
132
|
+
|
|
133
|
+
messages = await agent.format_blocks(blocks)
|
|
134
|
+
assert len(messages) == 1
|
|
135
|
+
assert messages[0].role == "user"
|
|
136
|
+
assert messages[0].parts is not None
|
|
137
|
+
assert len(messages[0].parts) == 2
|
|
138
|
+
|
|
139
|
+
@pytest.mark.asyncio
|
|
140
|
+
async def test_format_blocks_with_image(self, mock_gemini_client: MagicMock) -> None:
|
|
141
|
+
"""Test formatting image content blocks."""
|
|
142
|
+
agent = GeminiAgent.create(
|
|
143
|
+
model_client=mock_gemini_client,
|
|
144
|
+
validate_api_key=False,
|
|
145
|
+
)
|
|
146
|
+
|
|
147
|
+
# Create a tiny valid base64 PNG
|
|
148
|
+
png_data = base64.b64encode(b"\x89PNG\r\n\x1a\n").decode()
|
|
149
|
+
|
|
150
|
+
blocks: list[types.ContentBlock] = [
|
|
151
|
+
types.TextContent(type="text", text="Look at this:"),
|
|
152
|
+
types.ImageContent(type="image", data=png_data, mimeType="image/png"),
|
|
153
|
+
]
|
|
154
|
+
|
|
155
|
+
messages = await agent.format_blocks(blocks)
|
|
156
|
+
assert len(messages) == 1
|
|
157
|
+
assert messages[0].parts is not None
|
|
158
|
+
assert len(messages[0].parts) == 2
|
|
159
|
+
|
|
160
|
+
@pytest.mark.asyncio
|
|
161
|
+
async def test_format_tool_results(self, mock_gemini_client: MagicMock) -> None:
|
|
162
|
+
"""Test formatting tool results."""
|
|
163
|
+
agent = GeminiAgent.create(
|
|
164
|
+
model_client=mock_gemini_client,
|
|
165
|
+
validate_api_key=False,
|
|
166
|
+
)
|
|
167
|
+
|
|
168
|
+
tool_calls = [MCPToolCall(id="call_123", name="test_tool", arguments={})]
|
|
169
|
+
tool_results = [
|
|
170
|
+
MCPToolResult(
|
|
171
|
+
content=[types.TextContent(type="text", text="Tool output")],
|
|
172
|
+
isError=False,
|
|
173
|
+
)
|
|
174
|
+
]
|
|
175
|
+
|
|
176
|
+
messages = await agent.format_tool_results(tool_calls, tool_results)
|
|
177
|
+
assert len(messages) == 1
|
|
178
|
+
assert messages[0].role == "user"
|
|
179
|
+
|
|
180
|
+
@pytest.mark.asyncio
|
|
181
|
+
async def test_get_system_messages(self, mock_gemini_client: MagicMock) -> None:
|
|
182
|
+
"""Test that system messages return empty (Gemini uses system_instruction)."""
|
|
183
|
+
agent = GeminiAgent.create(
|
|
184
|
+
model_client=mock_gemini_client,
|
|
185
|
+
system_prompt="You are a helpful assistant.",
|
|
186
|
+
validate_api_key=False,
|
|
187
|
+
)
|
|
188
|
+
|
|
189
|
+
messages = await agent.get_system_messages()
|
|
190
|
+
# Gemini doesn't use system messages in the message list
|
|
191
|
+
assert messages == []
|
|
192
|
+
|
|
193
|
+
@pytest.mark.asyncio
|
|
194
|
+
async def test_get_response_text_only(self, mock_gemini_client: MagicMock) -> None:
|
|
195
|
+
"""Test getting text-only response."""
|
|
196
|
+
# Disable telemetry for this test
|
|
197
|
+
with patch("hud.settings.settings.telemetry_enabled", False):
|
|
198
|
+
agent = GeminiAgent.create(
|
|
199
|
+
model_client=mock_gemini_client,
|
|
200
|
+
validate_api_key=False,
|
|
201
|
+
)
|
|
202
|
+
# Set up agent as initialized (no tools needed for this test)
|
|
203
|
+
agent.gemini_tools = []
|
|
204
|
+
agent._initialized = True
|
|
205
|
+
|
|
206
|
+
# Mock the API response with text only
|
|
207
|
+
mock_response = MagicMock()
|
|
208
|
+
mock_candidate = MagicMock()
|
|
209
|
+
|
|
210
|
+
text_part = MagicMock()
|
|
211
|
+
text_part.text = "Task completed successfully"
|
|
212
|
+
text_part.function_call = None
|
|
213
|
+
|
|
214
|
+
mock_candidate.content = MagicMock()
|
|
215
|
+
mock_candidate.content.parts = [text_part]
|
|
216
|
+
|
|
217
|
+
mock_response.candidates = [mock_candidate]
|
|
218
|
+
|
|
219
|
+
mock_gemini_client.aio.models.generate_content = AsyncMock(return_value=mock_response)
|
|
220
|
+
|
|
221
|
+
messages = [
|
|
222
|
+
genai_types.Content(role="user", parts=[genai_types.Part.from_text(text="Status?")])
|
|
223
|
+
]
|
|
224
|
+
response = await agent.get_response(messages)
|
|
225
|
+
|
|
226
|
+
assert response.content == "Task completed successfully"
|
|
227
|
+
assert response.tool_calls == []
|
|
228
|
+
assert response.done is True
|
|
229
|
+
|
|
230
|
+
@pytest.mark.asyncio
|
|
231
|
+
async def test_get_response_with_thinking(self, mock_gemini_client: MagicMock) -> None:
|
|
232
|
+
"""Test getting response with thinking content."""
|
|
233
|
+
with patch("hud.settings.settings.telemetry_enabled", False):
|
|
234
|
+
agent = GeminiAgent.create(
|
|
235
|
+
model_client=mock_gemini_client,
|
|
236
|
+
validate_api_key=False,
|
|
237
|
+
)
|
|
238
|
+
# Set up agent as initialized (no tools needed for this test)
|
|
239
|
+
agent.gemini_tools = []
|
|
240
|
+
agent._initialized = True
|
|
241
|
+
|
|
242
|
+
mock_response = MagicMock()
|
|
243
|
+
mock_candidate = MagicMock()
|
|
244
|
+
|
|
245
|
+
thinking_part = MagicMock()
|
|
246
|
+
thinking_part.text = "Let me reason through this..."
|
|
247
|
+
thinking_part.function_call = None
|
|
248
|
+
thinking_part.thought = True
|
|
249
|
+
|
|
250
|
+
text_part = MagicMock()
|
|
251
|
+
text_part.text = "Here is my answer"
|
|
252
|
+
text_part.function_call = None
|
|
253
|
+
text_part.thought = False
|
|
254
|
+
|
|
255
|
+
mock_candidate.content = MagicMock()
|
|
256
|
+
mock_candidate.content.parts = [thinking_part, text_part]
|
|
257
|
+
|
|
258
|
+
mock_response.candidates = [mock_candidate]
|
|
259
|
+
|
|
260
|
+
mock_gemini_client.aio.models.generate_content = AsyncMock(return_value=mock_response)
|
|
261
|
+
|
|
262
|
+
messages = [
|
|
263
|
+
genai_types.Content(
|
|
264
|
+
role="user", parts=[genai_types.Part.from_text(text="Hard question")]
|
|
265
|
+
)
|
|
266
|
+
]
|
|
267
|
+
response = await agent.get_response(messages)
|
|
268
|
+
|
|
269
|
+
assert response.content == "Here is my answer"
|
|
270
|
+
assert response.reasoning == "Let me reason through this..."
|
|
271
|
+
|
|
272
|
+
@pytest.mark.asyncio
|
|
273
|
+
async def test_convert_tools_for_gemini(self, mock_gemini_client: MagicMock) -> None:
|
|
274
|
+
"""Test converting MCP tools to Gemini format."""
|
|
275
|
+
tools = [
|
|
276
|
+
types.Tool(
|
|
277
|
+
name="my_tool",
|
|
278
|
+
description="A test tool",
|
|
279
|
+
inputSchema={"type": "object", "properties": {"x": {"type": "string"}}},
|
|
280
|
+
)
|
|
281
|
+
]
|
|
282
|
+
ctx = MockEvalContext(tools=tools)
|
|
283
|
+
agent = GeminiAgent.create(
|
|
284
|
+
model_client=mock_gemini_client,
|
|
285
|
+
validate_api_key=False,
|
|
286
|
+
)
|
|
287
|
+
|
|
288
|
+
agent.ctx = ctx
|
|
289
|
+
await agent._initialize_from_ctx(ctx)
|
|
290
|
+
|
|
291
|
+
# Check that tools were converted
|
|
292
|
+
assert len(agent.gemini_tools) == 1
|
|
293
|
+
# Gemini tools have function_declarations - cast to genai Tool type
|
|
294
|
+
gemini_tool = agent.gemini_tools[0]
|
|
295
|
+
assert isinstance(gemini_tool, genai_types.Tool)
|
|
296
|
+
assert gemini_tool.function_declarations is not None
|
|
297
|
+
assert gemini_tool.function_declarations[0].name == "my_tool"
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
class TestGeminiToolConversion:
|
|
301
|
+
"""Tests for tool conversion to Gemini format."""
|
|
302
|
+
|
|
303
|
+
@pytest.fixture
|
|
304
|
+
def mock_gemini_client(self) -> MagicMock:
|
|
305
|
+
"""Create a stub Gemini client."""
|
|
306
|
+
client = MagicMock(spec=genai.Client)
|
|
307
|
+
client.api_key = "test_key"
|
|
308
|
+
client.models = MagicMock()
|
|
309
|
+
client.models.list = MagicMock(return_value=iter([]))
|
|
310
|
+
# Set up async interface
|
|
311
|
+
client.aio = MagicMock()
|
|
312
|
+
client.aio.models = MagicMock()
|
|
313
|
+
client.aio.models.generate_content = AsyncMock()
|
|
314
|
+
return client
|
|
315
|
+
|
|
316
|
+
@pytest.mark.asyncio
|
|
317
|
+
async def test_tool_with_properties(self, mock_gemini_client: MagicMock) -> None:
|
|
318
|
+
"""Test tool with input properties."""
|
|
319
|
+
tools = [
|
|
320
|
+
types.Tool(
|
|
321
|
+
name="search",
|
|
322
|
+
description="Search the web",
|
|
323
|
+
inputSchema={
|
|
324
|
+
"type": "object",
|
|
325
|
+
"properties": {
|
|
326
|
+
"query": {"type": "string", "description": "Search query"},
|
|
327
|
+
"limit": {"type": "integer", "description": "Max results"},
|
|
328
|
+
},
|
|
329
|
+
"required": ["query"],
|
|
330
|
+
},
|
|
331
|
+
)
|
|
332
|
+
]
|
|
333
|
+
ctx = MockEvalContext(tools=tools)
|
|
334
|
+
agent = GeminiAgent.create(
|
|
335
|
+
model_client=mock_gemini_client,
|
|
336
|
+
validate_api_key=False,
|
|
337
|
+
)
|
|
338
|
+
|
|
339
|
+
agent.ctx = ctx
|
|
340
|
+
await agent._initialize_from_ctx(ctx)
|
|
341
|
+
|
|
342
|
+
assert len(agent.gemini_tools) == 1
|
|
343
|
+
gemini_tool = agent.gemini_tools[0]
|
|
344
|
+
# Gemini tools have function_declarations - cast to genai Tool type
|
|
345
|
+
assert isinstance(gemini_tool, genai_types.Tool)
|
|
346
|
+
assert gemini_tool.function_declarations is not None
|
|
347
|
+
assert gemini_tool.function_declarations[0].name == "search"
|
|
348
|
+
assert gemini_tool.function_declarations[0].parameters_json_schema is not None
|
|
349
|
+
|
|
350
|
+
@pytest.mark.asyncio
|
|
351
|
+
async def test_tool_without_schema(self, mock_gemini_client: MagicMock) -> None:
|
|
352
|
+
"""Test tool without description raises error."""
|
|
353
|
+
# Create a tool with inputSchema but no description
|
|
354
|
+
tools = [
|
|
355
|
+
types.Tool(
|
|
356
|
+
name="incomplete",
|
|
357
|
+
description=None,
|
|
358
|
+
inputSchema={"type": "object"},
|
|
359
|
+
)
|
|
360
|
+
]
|
|
361
|
+
ctx = MockEvalContext(tools=tools)
|
|
362
|
+
agent = GeminiAgent.create(
|
|
363
|
+
model_client=mock_gemini_client,
|
|
364
|
+
validate_api_key=False,
|
|
365
|
+
)
|
|
366
|
+
|
|
367
|
+
agent.ctx = ctx
|
|
368
|
+
with pytest.raises(ValueError, match="requires both a description"):
|
|
369
|
+
await agent._initialize_from_ctx(ctx)
|
|
@@ -1,60 +1,16 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
import json
|
|
4
3
|
from typing import Any
|
|
5
4
|
|
|
6
5
|
import mcp.types as types
|
|
7
6
|
import pytest
|
|
7
|
+
from openai import AsyncOpenAI
|
|
8
8
|
|
|
9
9
|
from hud.agents.grounded_openai import GroundedOpenAIChatAgent
|
|
10
10
|
from hud.tools.grounding import GrounderConfig
|
|
11
11
|
from hud.types import MCPToolCall, MCPToolResult
|
|
12
12
|
|
|
13
13
|
|
|
14
|
-
class DummyOpenAI:
|
|
15
|
-
class chat: # type: ignore[no-redef]
|
|
16
|
-
class completions:
|
|
17
|
-
@staticmethod
|
|
18
|
-
async def create(**kwargs: Any) -> Any:
|
|
19
|
-
# Return a minimal object mimicking OpenAI response
|
|
20
|
-
class Msg:
|
|
21
|
-
def __init__(self) -> None:
|
|
22
|
-
self.content = "Thinking..."
|
|
23
|
-
self.tool_calls = [
|
|
24
|
-
type(
|
|
25
|
-
"ToolCall",
|
|
26
|
-
(),
|
|
27
|
-
{
|
|
28
|
-
"id": "call_1",
|
|
29
|
-
"function": type(
|
|
30
|
-
"Fn",
|
|
31
|
-
(),
|
|
32
|
-
{
|
|
33
|
-
"name": "computer",
|
|
34
|
-
"arguments": json.dumps(
|
|
35
|
-
{
|
|
36
|
-
"action": "click",
|
|
37
|
-
"element_description": "blue button",
|
|
38
|
-
}
|
|
39
|
-
),
|
|
40
|
-
},
|
|
41
|
-
),
|
|
42
|
-
},
|
|
43
|
-
)()
|
|
44
|
-
]
|
|
45
|
-
|
|
46
|
-
class Choice:
|
|
47
|
-
def __init__(self) -> None:
|
|
48
|
-
self.message = Msg()
|
|
49
|
-
self.finish_reason = "tool_calls"
|
|
50
|
-
|
|
51
|
-
class Resp:
|
|
52
|
-
def __init__(self) -> None:
|
|
53
|
-
self.choices = [Choice()]
|
|
54
|
-
|
|
55
|
-
return Resp()
|
|
56
|
-
|
|
57
|
-
|
|
58
14
|
class FakeMCPClient:
|
|
59
15
|
def __init__(self) -> None:
|
|
60
16
|
self.tools: list[types.Tool] = [
|
|
@@ -62,6 +18,7 @@ class FakeMCPClient:
|
|
|
62
18
|
types.Tool(name="setup", description="internal functions", inputSchema={}),
|
|
63
19
|
]
|
|
64
20
|
self.called: list[MCPToolCall] = []
|
|
21
|
+
self._initialized = True
|
|
65
22
|
|
|
66
23
|
async def initialize(self, mcp_config: dict[str, dict[str, Any]] | None = None) -> None:
|
|
67
24
|
return None
|
|
@@ -77,6 +34,10 @@ class FakeMCPClient:
|
|
|
77
34
|
def mcp_config(self) -> dict[str, dict[str, Any]]:
|
|
78
35
|
return {"local": {"command": "echo", "args": ["ok"]}}
|
|
79
36
|
|
|
37
|
+
@property
|
|
38
|
+
def is_connected(self) -> bool:
|
|
39
|
+
return self._initialized
|
|
40
|
+
|
|
80
41
|
async def shutdown(self) -> None:
|
|
81
42
|
return None
|
|
82
43
|
|
|
@@ -109,19 +70,20 @@ class DummyGroundedTool:
|
|
|
109
70
|
|
|
110
71
|
@pytest.mark.asyncio
|
|
111
72
|
async def test_call_tools_injects_screenshot_and_delegates(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
112
|
-
# Agent with fake OpenAI client
|
|
73
|
+
# Agent with fake OpenAI client
|
|
113
74
|
grounder_cfg = GrounderConfig(api_base="http://example", model="qwen")
|
|
114
|
-
|
|
75
|
+
fake_openai = AsyncOpenAI(api_key="test")
|
|
76
|
+
agent = GroundedOpenAIChatAgent.create(
|
|
115
77
|
grounder_config=grounder_cfg,
|
|
116
|
-
openai_client=
|
|
117
|
-
|
|
118
|
-
mcp_client=FakeMCPClient(),
|
|
78
|
+
openai_client=fake_openai,
|
|
79
|
+
model="gpt-4o-mini",
|
|
119
80
|
initial_screenshot=False,
|
|
120
81
|
)
|
|
121
82
|
|
|
122
83
|
# Inject a dummy grounded tool to observe args without full initialization
|
|
123
84
|
dummy_tool = DummyGroundedTool()
|
|
124
85
|
agent.grounded_tool = dummy_tool # type: ignore
|
|
86
|
+
agent._initialized = True # Mark as initialized to skip context initialization
|
|
125
87
|
|
|
126
88
|
# Seed conversation history with a user image
|
|
127
89
|
png_b64 = (
|
|
@@ -153,3 +115,56 @@ async def test_call_tools_injects_screenshot_and_delegates(monkeypatch: pytest.M
|
|
|
153
115
|
assert dummy_tool.last_args["element_description"] == "blue button"
|
|
154
116
|
assert "screenshot_b64" in dummy_tool.last_args
|
|
155
117
|
assert isinstance(dummy_tool.last_args["screenshot_b64"], str)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
@pytest.mark.asyncio
|
|
121
|
+
async def test_get_response_with_reasoning() -> None:
|
|
122
|
+
"""Test that reasoning content is extracted from the response."""
|
|
123
|
+
from unittest.mock import AsyncMock, MagicMock, patch
|
|
124
|
+
|
|
125
|
+
grounder_cfg = GrounderConfig(api_base="http://example", model="qwen")
|
|
126
|
+
fake_openai = AsyncOpenAI(api_key="test")
|
|
127
|
+
|
|
128
|
+
with patch("hud.settings.settings.telemetry_enabled", False):
|
|
129
|
+
agent = GroundedOpenAIChatAgent.create(
|
|
130
|
+
grounder_config=grounder_cfg,
|
|
131
|
+
openai_client=fake_openai,
|
|
132
|
+
model="gpt-4o-mini",
|
|
133
|
+
initial_screenshot=False,
|
|
134
|
+
)
|
|
135
|
+
|
|
136
|
+
mock_response = MagicMock()
|
|
137
|
+
mock_choice = MagicMock()
|
|
138
|
+
mock_message = MagicMock()
|
|
139
|
+
|
|
140
|
+
mock_message.content = "Here is my answer"
|
|
141
|
+
mock_message.reasoning_content = "Let me think step by step..."
|
|
142
|
+
mock_message.tool_calls = None
|
|
143
|
+
|
|
144
|
+
mock_choice.message = mock_message
|
|
145
|
+
mock_choice.finish_reason = "stop"
|
|
146
|
+
|
|
147
|
+
mock_response.choices = [mock_choice]
|
|
148
|
+
|
|
149
|
+
agent.oai.chat.completions.create = AsyncMock(return_value=mock_response)
|
|
150
|
+
agent._initialized = True # Mark as initialized to skip context initialization
|
|
151
|
+
|
|
152
|
+
# Include an image so get_response doesn't try to take a screenshot via ctx
|
|
153
|
+
png_b64 = (
|
|
154
|
+
"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR4nGMAAQAABQAB"
|
|
155
|
+
"J2n0mQAAAABJRU5ErkJggg=="
|
|
156
|
+
)
|
|
157
|
+
agent.conversation_history = [
|
|
158
|
+
{
|
|
159
|
+
"role": "user",
|
|
160
|
+
"content": [
|
|
161
|
+
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{png_b64}"}},
|
|
162
|
+
{"type": "text", "text": "Hard question"},
|
|
163
|
+
],
|
|
164
|
+
}
|
|
165
|
+
]
|
|
166
|
+
|
|
167
|
+
response = await agent.get_response(agent.conversation_history)
|
|
168
|
+
|
|
169
|
+
assert response.content == "Here is my answer"
|
|
170
|
+
assert response.reasoning == "Let me think step by step..."
|