hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl
- hud/__init__.py +27 -7
- hud/agents/__init__.py +70 -5
- hud/agents/base.py +238 -500
- hud/agents/claude.py +236 -247
- hud/agents/gateway.py +42 -0
- hud/agents/gemini.py +264 -0
- hud/agents/gemini_cua.py +324 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +48 -36
- hud/agents/openai.py +282 -296
- hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
- hud/agents/operator.py +199 -0
- hud/agents/resolver.py +70 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +381 -214
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +377 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_resolver.py +192 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/agents/types.py +148 -0
- hud/cli/__init__.py +493 -546
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +699 -113
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +889 -732
- hud/cli/eval.py +793 -667
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/pull.py +1 -1
- hud/cli/push.py +38 -13
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +110 -8
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push.py +1 -1
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +70 -1
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +45 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +326 -0
- hud/datasets/runner.py +198 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +52 -0
- hud/environment/connection.py +258 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +137 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +835 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +263 -0
- hud/environment/scenarios.py +620 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +205 -0
- hud/environment/tests/test_environment.py +593 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +242 -0
- hud/environment/tests/test_scenarios.py +1086 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +727 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +187 -0
- hud/eval/manager.py +533 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +372 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +291 -0
- hud/eval/types.py +65 -0
- hud/eval/utils.py +194 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +308 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +165 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +18 -2
- hud/tools/agent.py +223 -0
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +36 -3
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_agent_tool.py +355 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +194 -56
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +89 -18
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.13.dist-info/METADATA +264 -0
- hud_python-0.5.13.dist-info/RECORD +305 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,362 @@ hud/agents/tests/test_operator.py (new file)
+"""Tests for OperatorAgent implementation."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, cast
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+from mcp import types
+from openai import AsyncOpenAI
+from openai.types.responses.response_computer_tool_call import PendingSafetyCheck
+
+from hud.agents.operator import OperatorAgent
+from hud.environment.router import ToolRouter
+from hud.eval.context import EvalContext
+from hud.types import MCPToolCall, MCPToolResult
+
+if TYPE_CHECKING:
+    from collections.abc import Generator
+
+
+class MockEvalContext(EvalContext):
+    """Mock EvalContext for testing."""
+
+    def __init__(self, tools: list[types.Tool] | None = None) -> None:
+        # Core attributes
+        self.prompt = "Test prompt"
+        self._tools = tools or []
+        self._submitted: str | None = None
+        self.reward: float | None = None
+
+        # Environment attributes
+        self._router = ToolRouter()
+        self._agent_include: list[str] | None = None
+        self._agent_exclude: list[str] | None = None
+
+        # EvalContext attributes
+        self._task = None
+        self.trace_id = "test-trace-id"
+        self.eval_name = "test-eval"
+        self.job_id: str | None = None
+        self.group_id: str | None = None
+        self.index = 0
+        self.variants: dict[str, Any] = {}
+        self.answer: str | None = None
+        self.system_prompt: str | None = None
+        self.error: BaseException | None = None
+        self.metadata: dict[str, Any] = {}
+        self.results: list[Any] = []
+        self._is_summary = False
+
+    def as_tools(self) -> list[types.Tool]:
+        return self._tools
+
+    @property
+    def has_scenario(self) -> bool:
+        return False
+
+    async def list_tools(self) -> list[types.Tool]:
+        return self._tools
+
+    async def call_tool(self, call: Any, /, **kwargs: Any) -> MCPToolResult:
+        return MCPToolResult(
+            content=[types.TextContent(type="text", text="ok")],
+            isError=False,
+        )
+
+    async def submit(self, answer: str) -> None:
+        self._submitted = answer
+
+
+class TestOperatorAgent:
+    """Test OperatorAgent class."""
+
+    @pytest.fixture
+    def mock_openai(self) -> Generator[AsyncOpenAI, None, None]:
+        """Create a mock OpenAI client."""
+        client = AsyncOpenAI(api_key="test", base_url="http://localhost")
+        client.responses.create = AsyncMock()
+        with patch("hud.agents.openai.AsyncOpenAI", return_value=client):
+            yield client
+
+    @pytest.fixture
+    def mock_eval_context_computer(self) -> MockEvalContext:
+        """Create a mock EvalContext with computer tool."""
+        return MockEvalContext(
+            tools=[
+                types.Tool(
+                    name="openai_computer",
+                    description="OpenAI computer use tool",
+                    inputSchema={},
+                )
+            ]
+        )
+
+    @pytest.mark.asyncio
+    async def test_init(self, mock_openai: AsyncOpenAI) -> None:
+        """Test agent initialization."""
+        agent = OperatorAgent.create(
+            model_client=mock_openai,
+            model="gpt-4",
+            validate_api_key=False,
+        )
+
+        assert agent.model_name == "Operator"
+        assert agent.config.model == "gpt-4"
+        assert agent.openai_client == mock_openai
+
+    @pytest.mark.asyncio
+    async def test_format_blocks(self, mock_openai: AsyncOpenAI) -> None:
+        """Test formatting content blocks."""
+        agent = OperatorAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
+
+        # Test with text blocks
+        blocks: list[types.ContentBlock] = [
+            types.TextContent(type="text", text="Hello, GPT!"),
+            types.TextContent(type="text", text="Another message"),
+        ]
+
+        messages = await agent.format_blocks(blocks)
+        assert len(messages) == 1
+        msg = cast("dict[str, Any]", messages[0])
+        assert msg["role"] == "user"
+        content = cast("list[dict[str, Any]]", msg["content"])
+        assert len(content) == 2
+        assert content[0] == {"type": "input_text", "text": "Hello, GPT!"}
+        assert content[1] == {"type": "input_text", "text": "Another message"}
+
+        # Test with mixed content
+        blocks = [
+            types.TextContent(type="text", text="Text content"),
+            types.ImageContent(type="image", data="base64data", mimeType="image/png"),
+        ]
+
+        messages = await agent.format_blocks(blocks)
+        assert len(messages) == 1
+        msg = cast("dict[str, Any]", messages[0])
+        assert msg["role"] == "user"
+        content = cast("list[dict[str, Any]]", msg["content"])
+        assert len(content) == 2
+        assert content[0] == {"type": "input_text", "text": "Text content"}
+        assert content[1] == {
+            "type": "input_image",
+            "image_url": "data:image/png;base64,base64data",
+            "detail": "auto",
+        }
+
+    @pytest.mark.asyncio
+    async def test_format_tool_results(self, mock_openai: AsyncOpenAI) -> None:
+        """Test formatting tool results."""
+        agent = OperatorAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
+
+        tool_calls = [
+            MCPToolCall(name="test_tool", arguments={}, id="call_123"),
+            MCPToolCall(name="screenshot", arguments={}, id="call_456"),
+        ]
+
+        tool_results = [
+            MCPToolResult(content=[types.TextContent(type="text", text="Success")], isError=False),
+            MCPToolResult(
+                content=[types.ImageContent(type="image", data="base64data", mimeType="image/png")],
+                isError=False,
+            ),
+        ]
+
+        messages = await agent.format_tool_results(tool_calls, tool_results)
+
+        # Should return both tool results as function_call_output
+        assert len(messages) == 2
+        # First result is text
+        msg0 = cast("dict[str, Any]", messages[0])
+        assert msg0["type"] == "function_call_output"
+        assert msg0["call_id"] == "call_123"
+        output0 = cast("list[dict[str, Any]]", msg0["output"])
+        assert output0[0]["type"] == "input_text"
+        assert output0[0]["text"] == "Success"
+        # Second result is image
+        msg1 = cast("dict[str, Any]", messages[1])
+        assert msg1["type"] == "function_call_output"
+        assert msg1["call_id"] == "call_456"
+        output1 = cast("list[dict[str, Any]]", msg1["output"])
+        assert output1[0]["type"] == "input_image"
+        assert output1[0]["image_url"] == "data:image/png;base64,base64data"
+
+    @pytest.mark.asyncio
+    async def test_format_tool_results_with_error(self, mock_openai: AsyncOpenAI) -> None:
+        """Test formatting tool results with errors."""
+        agent = OperatorAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
+
+        tool_calls = [
+            MCPToolCall(name="failing_tool", arguments={}, id="call_error"),
+        ]
+
+        tool_results = [
+            MCPToolResult(
+                content=[types.TextContent(type="text", text="Something went wrong")], isError=True
+            ),
+        ]
+
+        messages = await agent.format_tool_results(tool_calls, tool_results)
+
+        # Error results are returned with error flag and content
+        assert len(messages) == 1
+        msg = cast("dict[str, Any]", messages[0])
+        assert msg["type"] == "function_call_output"
+        assert msg["call_id"] == "call_error"
+        output = cast("list[dict[str, Any]]", msg["output"])
+        assert output[0]["type"] == "input_text"
+        assert output[0]["text"] == "[tool_error] true"
+        assert output[1]["type"] == "input_text"
+        assert output[1]["text"] == "Something went wrong"
+
+    @pytest.mark.asyncio
+    async def test_get_model_response(
+        self, mock_openai: AsyncOpenAI, mock_eval_context_computer: MockEvalContext
+    ) -> None:
+        """Test getting model response from OpenAI API."""
+        with patch("hud.settings.settings.telemetry_enabled", False):
+            agent = OperatorAgent.create(
+                model_client=mock_openai,
+                validate_api_key=False,
+            )
+
+            # Initialize with context
+            agent.ctx = mock_eval_context_computer
+            await agent._initialize_from_ctx(mock_eval_context_computer)
+
+            # Mock OpenAI API response for a successful computer use response
+            mock_response = MagicMock()
+            mock_response.id = "response_123"
+            mock_response.state = "completed"
+            # Mock the output message structure
+            mock_output_text = MagicMock()
+            mock_output_text.type = "output_text"
+            mock_output_text.text = "I can see the screen content."
+
+            mock_output_message = MagicMock()
+            mock_output_message.type = "message"
+            mock_output_message.content = [mock_output_text]
+
+            mock_response.output = [mock_output_message]
+
+            mock_openai.responses.create = AsyncMock(return_value=mock_response)
+
+            messages = [{"prompt": "What's on the screen?", "screenshot": None}]
+            response = await agent.get_response(messages)  # type: ignore[arg-type]
+
+            assert response.done is True
+            assert response.tool_calls == []
+
+    @pytest.mark.asyncio
+    async def test_handle_empty_response(
+        self, mock_openai: AsyncOpenAI, mock_eval_context_computer: MockEvalContext
+    ) -> None:
+        """Test handling empty response from API."""
+        agent = OperatorAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
+
+        # Initialize with context
+        agent.ctx = mock_eval_context_computer
+        await agent._initialize_from_ctx(mock_eval_context_computer)
+
+        # Mock empty response
+        mock_response = MagicMock()
+        mock_response.id = "response_empty"
+        mock_response.state = "completed"
+        mock_response.output = []  # Empty output
+
+        mock_openai.responses.create = AsyncMock(return_value=mock_response)
+
+        messages = [{"prompt": "Hi", "screenshot": None}]
+        response = await agent.get_response(messages)  # type: ignore[arg-type]
+
+        assert response.content == ""
+        assert response.tool_calls == []
+
+    @pytest.mark.asyncio
+    async def test_pending_safety_checks_initialization(self, mock_openai: AsyncOpenAI) -> None:
+        """Test that OperatorAgent initializes pending_call_id and pending_safety_checks."""
+        agent = OperatorAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
+
+        # Verify initial state
+        assert agent.pending_call_id is None
+        assert agent.pending_safety_checks == []
+
+        # Set some state
+        agent.pending_call_id = "call_id"
+        agent.pending_safety_checks = [
+            PendingSafetyCheck(id="safety_check_id", code="value", message="message")
+        ]
+
+        # Verify state was set
+        assert agent.pending_call_id == "call_id"
+        assert len(agent.pending_safety_checks) == 1
+        assert agent.pending_safety_checks[0].id == "safety_check_id"
+
+    @pytest.mark.asyncio
+    async def test_extract_tool_call_computer(self, mock_openai: AsyncOpenAI) -> None:
+        """Test that _extract_tool_call routes computer_call to openai_computer."""
+        agent = OperatorAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
+
+        # Create a mock computer_call item
+        mock_item = MagicMock()
+        mock_item.type = "computer_call"
+        mock_item.call_id = "call_123"
+        mock_item.pending_safety_checks = [
+            PendingSafetyCheck(id="check_1", code="code", message="msg")
+        ]
+        mock_item.action.to_dict.return_value = {"type": "screenshot"}
+
+        tool_call = agent._extract_tool_call(mock_item)
+
+        # Should route to openai_computer tool
+        assert tool_call is not None
+        assert tool_call.name == "openai_computer"
+        assert tool_call.id == "call_123"
+        assert tool_call.arguments == {"type": "screenshot"}
+        # Should update pending_safety_checks
+        assert agent.pending_safety_checks == mock_item.pending_safety_checks
+
+    @pytest.mark.asyncio
+    async def test_extract_tool_call_delegates_to_super(self, mock_openai: AsyncOpenAI) -> None:
+        """Test that _extract_tool_call delegates non-computer calls to parent."""
+        agent = OperatorAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
+
+        # Set up tool name map
+        agent._tool_name_map = {"test_tool": "mcp_test_tool"}
+
+        # Create a mock function_call item
+        mock_item = MagicMock()
+        mock_item.type = "function_call"
+        mock_item.call_id = "call_456"
+        mock_item.name = "test_tool"
+        mock_item.arguments = '{"arg": "value"}'
+
+        tool_call = agent._extract_tool_call(mock_item)
+
+        # Should delegate to parent and map the tool name
+        assert tool_call is not None
+        assert tool_call.name == "mcp_test_tool"
+        assert tool_call.id == "call_456"
+        assert tool_call.arguments == {"arg": "value"}
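The last two tests pin down how `OperatorAgent._extract_tool_call` routes OpenAI Responses output items. A rough sketch of that contract, for orientation only: the function below is a hypothetical stand-in reconstructed from the assertions above, not the shipped implementation.

```python
# Sketch of the routing contract the two _extract_tool_call tests assert.
# `extract_tool_call_sketch` is a hypothetical helper, not hud's real code.
import json
from typing import Any

from hud.types import MCPToolCall


def extract_tool_call_sketch(agent: Any, item: Any) -> MCPToolCall:
    if item.type == "computer_call":
        # Computer actions route to the "openai_computer" MCP tool, and any
        # pending safety checks are stashed on the agent for the next turn.
        agent.pending_safety_checks = item.pending_safety_checks
        return MCPToolCall(
            name="openai_computer",
            id=item.call_id,
            arguments=item.action.to_dict(),
        )
    # Plain function calls are delegated to the parent: the OpenAI-facing
    # name is mapped back to its MCP name and the JSON arguments are decoded.
    return MCPToolCall(
        name=agent._tool_name_map.get(item.name, item.name),
        id=item.call_id,
        arguments=json.loads(item.arguments),
    )
```

In short: `computer_call` items always target the `openai_computer` tool and carry their safety checks forward, while ordinary `function_call` items are renamed through `_tool_name_map` and have their JSON-encoded arguments parsed.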
@@ -0,0 +1,192 @@ hud/agents/tests/test_resolver.py (new file)
+"""Tests for model resolution and create_agent."""
+
+from __future__ import annotations
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from hud.agents import create_agent
+from hud.agents.resolver import resolve_cls
+
+
+class TestResolveCls:
+    """Tests for resolve_cls function."""
+
+    def test_resolves_known_agent_type(self) -> None:
+        """Known AgentType strings resolve to their class."""
+        from hud.agents.claude import ClaudeAgent
+
+        cls, gateway_info = resolve_cls("claude")
+        assert cls == ClaudeAgent
+        assert gateway_info is None
+
+    def test_resolves_openai(self) -> None:
+        """Resolves 'openai' to OpenAIAgent."""
+        from hud.agents import OpenAIAgent
+
+        cls, _gateway_info = resolve_cls("openai")
+        assert cls == OpenAIAgent
+
+    def test_resolves_gemini(self) -> None:
+        """Resolves 'gemini' to GeminiAgent."""
+        from hud.agents.gemini import GeminiAgent
+
+        cls, _gateway_info = resolve_cls("gemini")
+        assert cls == GeminiAgent
+
+    def test_unknown_model_without_gateway_raises(self) -> None:
+        """Unknown model with no gateway models raises ValueError."""
+        with (
+            patch("hud.agents.resolver._fetch_gateway_models", return_value=[]),
+            pytest.raises(ValueError, match="not found"),
+        ):
+            resolve_cls("unknown-model-xyz")
+
+    def test_resolves_gateway_model(self) -> None:
+        """Resolves model found in gateway."""
+        from hud.agents import OpenAIAgent
+
+        mock_models = [
+            {"id": "gpt-4o", "model": "gpt-4o", "provider": "openai"},
+        ]
+
+        with patch("hud.agents.resolver._fetch_gateway_models", return_value=mock_models):
+            cls, info = resolve_cls("gpt-4o")
+        assert cls == OpenAIAgent
+        assert info is not None
+        assert info["id"] == "gpt-4o"
+
+    def test_resolves_anthropic_provider_to_claude(self) -> None:
+        """Provider 'anthropic' maps to ClaudeAgent."""
+        from hud.agents.claude import ClaudeAgent
+
+        mock_models = [
+            {"id": "claude-sonnet", "model": "claude-3-sonnet", "provider": "anthropic"},
+        ]
+
+        with patch("hud.agents.resolver._fetch_gateway_models", return_value=mock_models):
+            cls, _info = resolve_cls("claude-sonnet")
+        assert cls == ClaudeAgent
+
+    def test_resolves_unknown_provider_to_openai_compatible(self) -> None:
+        """Unknown provider maps to OpenAIChatAgent."""
+        from hud.agents.openai_chat import OpenAIChatAgent
+
+        mock_models = [
+            {"id": "custom-model", "model": "custom", "provider": "custom-provider"},
+        ]
+
+        with patch("hud.agents.resolver._fetch_gateway_models", return_value=mock_models):
+            cls, _info = resolve_cls("custom-model")
+        assert cls == OpenAIChatAgent
+
+
+class TestCreateAgent:
+    """Tests for create_agent function - gateway-only."""
+
+    def test_creates_with_gateway_client(self) -> None:
+        """create_agent always uses gateway routing."""
+        from hud.agents import OpenAIAgent
+
+        mock_models = [
+            {"id": "gpt-4o", "model": "gpt-4o", "provider": "openai"},
+        ]
+
+        with (
+            patch("hud.agents.resolver._fetch_gateway_models", return_value=mock_models),
+            patch.object(OpenAIAgent, "create") as mock_create,
+            patch("hud.agents.gateway.build_gateway_client") as mock_build_client,
+        ):
+            mock_client = MagicMock()
+            mock_build_client.return_value = mock_client
+            mock_agent = MagicMock()
+            mock_create.return_value = mock_agent
+
+            agent = create_agent("gpt-4o")
+
+            # Should have set model and model_client
+            call_kwargs = mock_create.call_args.kwargs
+            assert call_kwargs["model"] == "gpt-4o"
+            assert "model_client" in call_kwargs
+            assert agent == mock_agent
+
+    def test_passes_kwargs_to_create(self) -> None:
+        """Extra kwargs are passed to agent.create()."""
+        from hud.agents import OpenAIAgent
+
+        mock_models = [
+            {"id": "gpt-4o", "model": "gpt-4o", "provider": "openai"},
+        ]
+
+        with (
+            patch("hud.agents.resolver._fetch_gateway_models", return_value=mock_models),
+            patch.object(OpenAIAgent, "create") as mock_create,
+            patch("hud.agents.gateway.build_gateway_client"),
+        ):
+            mock_create.return_value = MagicMock()
+
+            create_agent("gpt-4o", temperature=0.5, max_tokens=1000)
+
+            call_kwargs = mock_create.call_args.kwargs
+            assert call_kwargs["temperature"] == 0.5
+            assert call_kwargs["max_tokens"] == 1000
+
+    def test_known_agent_type_also_uses_gateway(self) -> None:
+        """Even 'claude' string uses gateway (it's a gateway shortcut)."""
+        from hud.agents.claude import ClaudeAgent
+
+        with (
+            patch.object(ClaudeAgent, "create") as mock_create,
+            patch("hud.agents.gateway.build_gateway_client") as mock_build_client,
+        ):
+            mock_client = MagicMock()
+            mock_build_client.return_value = mock_client
+            mock_create.return_value = MagicMock()
+
+            create_agent("claude")
+
+            # Should still build gateway client
+            mock_build_client.assert_called_once()
+            call_kwargs = mock_create.call_args.kwargs
+            assert "model_client" in call_kwargs
+
+
+class TestBuildGatewayClient:
+    """Tests for build_gateway_client function."""
+
+    def test_builds_anthropic_client(self) -> None:
+        """Builds AsyncAnthropic for anthropic provider."""
+        from hud.agents.gateway import build_gateway_client
+
+        with patch("hud.settings.settings") as mock_settings:
+            mock_settings.api_key = "test-key"
+            mock_settings.hud_gateway_url = "https://gateway.hud.ai"
+
+            with patch("anthropic.AsyncAnthropic") as mock_client_cls:
+                build_gateway_client("anthropic")
+                mock_client_cls.assert_called_once()
+
+    def test_builds_openai_client_for_openai(self) -> None:
+        """Builds AsyncOpenAI for openai provider."""
+        from hud.agents.gateway import build_gateway_client
+
+        with patch("hud.settings.settings") as mock_settings:
+            mock_settings.api_key = "test-key"
+            mock_settings.hud_gateway_url = "https://gateway.hud.ai"
+
+            with patch("openai.AsyncOpenAI") as mock_client_cls:
+                build_gateway_client("openai")
+                mock_client_cls.assert_called_once()
+
+    def test_builds_openai_client_for_unknown(self) -> None:
+        """Builds AsyncOpenAI for unknown providers (openai-compatible)."""
+        from hud.agents.gateway import build_gateway_client
+
+        with patch("hud.settings.settings") as mock_settings:
+            mock_settings.api_key = "test-key"
+            mock_settings.hud_gateway_url = "https://gateway.hud.ai"
+
+            with patch("openai.AsyncOpenAI") as mock_client_cls:
+                build_gateway_client("together")
+                mock_client_cls.assert_called_once()