hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +11 -5
- hud/agents/base.py +220 -500
- hud/agents/claude.py +200 -240
- hud/agents/gemini.py +275 -0
- hud/agents/gemini_cua.py +335 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +41 -36
- hud/agents/openai.py +291 -292
- hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
- hud/agents/operator.py +211 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +379 -210
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +376 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/cli/__init__.py +461 -545
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +664 -110
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +882 -734
- hud/cli/eval.py +782 -668
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/push.py +29 -11
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +108 -6
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +69 -0
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +40 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +327 -0
- hud/datasets/runner.py +192 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +50 -0
- hud/environment/connection.py +206 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +109 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +694 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +112 -0
- hud/environment/scenarios.py +493 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +218 -0
- hud/environment/tests/test_environment.py +161 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +201 -0
- hud/environment/tests/test_scenarios.py +280 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +674 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +185 -0
- hud/eval/manager.py +466 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +340 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +145 -0
- hud/eval/types.py +63 -0
- hud/eval/utils.py +183 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +151 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +158 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +16 -2
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +4 -0
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +167 -57
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +61 -3
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.1.dist-info/METADATA +264 -0
- hud_python-0.5.1.dist-info/RECORD +299 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
hud/agents/tests/test_operator.py
@@ -0,0 +1,362 @@
+"""Tests for OperatorAgent implementation."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, cast
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+from mcp import types
+from openai import AsyncOpenAI
+from openai.types.responses.response_computer_tool_call import PendingSafetyCheck
+
+from hud.agents.operator import OperatorAgent
+from hud.environment.router import ToolRouter
+from hud.eval.context import EvalContext
+from hud.types import MCPToolCall, MCPToolResult
+
+if TYPE_CHECKING:
+    from collections.abc import Generator
+
+
+class MockEvalContext(EvalContext):
+    """Mock EvalContext for testing."""
+
+    def __init__(self, tools: list[types.Tool] | None = None) -> None:
+        # Core attributes
+        self.prompt = "Test prompt"
+        self._tools = tools or []
+        self._submitted: str | None = None
+        self.reward: float | None = None
+
+        # Environment attributes
+        self._router = ToolRouter()
+        self._agent_include: list[str] | None = None
+        self._agent_exclude: list[str] | None = None
+
+        # EvalContext attributes
+        self._task = None
+        self.trace_id = "test-trace-id"
+        self.eval_name = "test-eval"
+        self.job_id: str | None = None
+        self.group_id: str | None = None
+        self.index = 0
+        self.variants: dict[str, Any] = {}
+        self.answer: str | None = None
+        self.system_prompt: str | None = None
+        self.error: BaseException | None = None
+        self.metadata: dict[str, Any] = {}
+        self.results: list[Any] = []
+        self._is_summary = False
+
+    def as_tools(self) -> list[types.Tool]:
+        return self._tools
+
+    @property
+    def has_scenario(self) -> bool:
+        return False
+
+    async def list_tools(self) -> list[types.Tool]:
+        return self._tools
+
+    async def call_tool(self, call: Any, /, **kwargs: Any) -> MCPToolResult:
+        return MCPToolResult(
+            content=[types.TextContent(type="text", text="ok")],
+            isError=False,
+        )
+
+    async def submit(self, answer: str) -> None:
+        self._submitted = answer
+
+
+class TestOperatorAgent:
+    """Test OperatorAgent class."""
+
+    @pytest.fixture
+    def mock_openai(self) -> Generator[AsyncOpenAI, None, None]:
+        """Create a mock OpenAI client."""
+        client = AsyncOpenAI(api_key="test", base_url="http://localhost")
+        client.responses.create = AsyncMock()
+        with patch("hud.agents.openai.AsyncOpenAI", return_value=client):
+            yield client
+
+    @pytest.fixture
+    def mock_eval_context_computer(self) -> MockEvalContext:
+        """Create a mock EvalContext with computer tool."""
+        return MockEvalContext(
+            tools=[
+                types.Tool(
+                    name="openai_computer",
+                    description="OpenAI computer use tool",
+                    inputSchema={},
+                )
+            ]
+        )
+
+    @pytest.mark.asyncio
+    async def test_init(self, mock_openai: AsyncOpenAI) -> None:
+        """Test agent initialization."""
+        agent = OperatorAgent.create(
+            model_client=mock_openai,
+            model="gpt-4",
+            validate_api_key=False,
+        )
+
+        assert agent.model_name == "Operator"
+        assert agent.config.model == "gpt-4"
+        assert agent.openai_client == mock_openai
+
+    @pytest.mark.asyncio
+    async def test_format_blocks(self, mock_openai: AsyncOpenAI) -> None:
+        """Test formatting content blocks."""
+        agent = OperatorAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
+
+        # Test with text blocks
+        blocks: list[types.ContentBlock] = [
+            types.TextContent(type="text", text="Hello, GPT!"),
+            types.TextContent(type="text", text="Another message"),
+        ]
+
+        messages = await agent.format_blocks(blocks)
+        assert len(messages) == 1
+        msg = cast("dict[str, Any]", messages[0])
+        assert msg["role"] == "user"
+        content = cast("list[dict[str, Any]]", msg["content"])
+        assert len(content) == 2
+        assert content[0] == {"type": "input_text", "text": "Hello, GPT!"}
+        assert content[1] == {"type": "input_text", "text": "Another message"}
+
+        # Test with mixed content
+        blocks = [
+            types.TextContent(type="text", text="Text content"),
+            types.ImageContent(type="image", data="base64data", mimeType="image/png"),
+        ]
+
+        messages = await agent.format_blocks(blocks)
+        assert len(messages) == 1
+        msg = cast("dict[str, Any]", messages[0])
+        assert msg["role"] == "user"
+        content = cast("list[dict[str, Any]]", msg["content"])
+        assert len(content) == 2
+        assert content[0] == {"type": "input_text", "text": "Text content"}
+        assert content[1] == {
+            "type": "input_image",
+            "image_url": "data:image/png;base64,base64data",
+            "detail": "auto",
+        }
+
+    @pytest.mark.asyncio
+    async def test_format_tool_results(self, mock_openai: AsyncOpenAI) -> None:
+        """Test formatting tool results."""
+        agent = OperatorAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
+
+        tool_calls = [
+            MCPToolCall(name="test_tool", arguments={}, id="call_123"),
+            MCPToolCall(name="screenshot", arguments={}, id="call_456"),
+        ]
+
+        tool_results = [
+            MCPToolResult(content=[types.TextContent(type="text", text="Success")], isError=False),
+            MCPToolResult(
+                content=[types.ImageContent(type="image", data="base64data", mimeType="image/png")],
+                isError=False,
+            ),
+        ]
+
+        messages = await agent.format_tool_results(tool_calls, tool_results)
+
+        # Should return both tool results as function_call_output
+        assert len(messages) == 2
+        # First result is text
+        msg0 = cast("dict[str, Any]", messages[0])
+        assert msg0["type"] == "function_call_output"
+        assert msg0["call_id"] == "call_123"
+        output0 = cast("list[dict[str, Any]]", msg0["output"])
+        assert output0[0]["type"] == "input_text"
+        assert output0[0]["text"] == "Success"
+        # Second result is image
+        msg1 = cast("dict[str, Any]", messages[1])
+        assert msg1["type"] == "function_call_output"
+        assert msg1["call_id"] == "call_456"
+        output1 = cast("list[dict[str, Any]]", msg1["output"])
+        assert output1[0]["type"] == "input_image"
+        assert output1[0]["image_url"] == "data:image/png;base64,base64data"
+
+    @pytest.mark.asyncio
+    async def test_format_tool_results_with_error(self, mock_openai: AsyncOpenAI) -> None:
+        """Test formatting tool results with errors."""
+        agent = OperatorAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
+
+        tool_calls = [
+            MCPToolCall(name="failing_tool", arguments={}, id="call_error"),
+        ]
+
+        tool_results = [
+            MCPToolResult(
+                content=[types.TextContent(type="text", text="Something went wrong")], isError=True
+            ),
+        ]
+
+        messages = await agent.format_tool_results(tool_calls, tool_results)
+
+        # Error results are returned with error flag and content
+        assert len(messages) == 1
+        msg = cast("dict[str, Any]", messages[0])
+        assert msg["type"] == "function_call_output"
+        assert msg["call_id"] == "call_error"
+        output = cast("list[dict[str, Any]]", msg["output"])
+        assert output[0]["type"] == "input_text"
+        assert output[0]["text"] == "[tool_error] true"
+        assert output[1]["type"] == "input_text"
+        assert output[1]["text"] == "Something went wrong"
+
+    @pytest.mark.asyncio
+    async def test_get_model_response(
+        self, mock_openai: AsyncOpenAI, mock_eval_context_computer: MockEvalContext
+    ) -> None:
+        """Test getting model response from OpenAI API."""
+        with patch("hud.settings.settings.telemetry_enabled", False):
+            agent = OperatorAgent.create(
+                model_client=mock_openai,
+                validate_api_key=False,
+            )
+
+            # Initialize with context
+            agent.ctx = mock_eval_context_computer
+            await agent._initialize_from_ctx(mock_eval_context_computer)
+
+            # Mock OpenAI API response for a successful computer use response
+            mock_response = MagicMock()
+            mock_response.id = "response_123"
+            mock_response.state = "completed"
+            # Mock the output message structure
+            mock_output_text = MagicMock()
+            mock_output_text.type = "output_text"
+            mock_output_text.text = "I can see the screen content."
+
+            mock_output_message = MagicMock()
+            mock_output_message.type = "message"
+            mock_output_message.content = [mock_output_text]
+
+            mock_response.output = [mock_output_message]
+
+            mock_openai.responses.create = AsyncMock(return_value=mock_response)
+
+            messages = [{"prompt": "What's on the screen?", "screenshot": None}]
+            response = await agent.get_response(messages)  # type: ignore[arg-type]
+
+            assert response.done is True
+            assert response.tool_calls == []
+
+    @pytest.mark.asyncio
+    async def test_handle_empty_response(
+        self, mock_openai: AsyncOpenAI, mock_eval_context_computer: MockEvalContext
+    ) -> None:
+        """Test handling empty response from API."""
+        agent = OperatorAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
+
+        # Initialize with context
+        agent.ctx = mock_eval_context_computer
+        await agent._initialize_from_ctx(mock_eval_context_computer)
+
+        # Mock empty response
+        mock_response = MagicMock()
+        mock_response.id = "response_empty"
+        mock_response.state = "completed"
+        mock_response.output = []  # Empty output
+
+        mock_openai.responses.create = AsyncMock(return_value=mock_response)
+
+        messages = [{"prompt": "Hi", "screenshot": None}]
+        response = await agent.get_response(messages)  # type: ignore[arg-type]
+
+        assert response.content == ""
+        assert response.tool_calls == []
+
+    @pytest.mark.asyncio
+    async def test_pending_safety_checks_initialization(self, mock_openai: AsyncOpenAI) -> None:
+        """Test that OperatorAgent initializes pending_call_id and pending_safety_checks."""
+        agent = OperatorAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
+
+        # Verify initial state
+        assert agent.pending_call_id is None
+        assert agent.pending_safety_checks == []
+
+        # Set some state
+        agent.pending_call_id = "call_id"
+        agent.pending_safety_checks = [
+            PendingSafetyCheck(id="safety_check_id", code="value", message="message")
+        ]
+
+        # Verify state was set
+        assert agent.pending_call_id == "call_id"
+        assert len(agent.pending_safety_checks) == 1
+        assert agent.pending_safety_checks[0].id == "safety_check_id"
+
+    @pytest.mark.asyncio
+    async def test_extract_tool_call_computer(self, mock_openai: AsyncOpenAI) -> None:
+        """Test that _extract_tool_call routes computer_call to openai_computer."""
+        agent = OperatorAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
+
+        # Create a mock computer_call item
+        mock_item = MagicMock()
+        mock_item.type = "computer_call"
+        mock_item.call_id = "call_123"
+        mock_item.pending_safety_checks = [
+            PendingSafetyCheck(id="check_1", code="code", message="msg")
+        ]
+        mock_item.action.to_dict.return_value = {"type": "screenshot"}
+
+        tool_call = agent._extract_tool_call(mock_item)
+
+        # Should route to openai_computer tool
+        assert tool_call is not None
+        assert tool_call.name == "openai_computer"
+        assert tool_call.id == "call_123"
+        assert tool_call.arguments == {"type": "screenshot"}
+        # Should update pending_safety_checks
+        assert agent.pending_safety_checks == mock_item.pending_safety_checks
+
+    @pytest.mark.asyncio
+    async def test_extract_tool_call_delegates_to_super(self, mock_openai: AsyncOpenAI) -> None:
+        """Test that _extract_tool_call delegates non-computer calls to parent."""
+        agent = OperatorAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
+
+        # Set up tool name map
+        agent._tool_name_map = {"test_tool": "mcp_test_tool"}
+
+        # Create a mock function_call item
+        mock_item = MagicMock()
+        mock_item.type = "function_call"
+        mock_item.call_id = "call_456"
+        mock_item.name = "test_tool"
+        mock_item.arguments = '{"arg": "value"}'
+
+        tool_call = agent._extract_tool_call(mock_item)
+
+        # Should delegate to parent and map the tool name
+        assert tool_call is not None
+        assert tool_call.name == "mcp_test_tool"
+        assert tool_call.id == "call_456"
+        assert tool_call.arguments == {"arg": "value"}
hud/agents/tests/test_run_eval.py
@@ -0,0 +1,179 @@
+"""Tests for MCPAgent.run() with EvalContext."""
+
+from __future__ import annotations
+
+from typing import Any, ClassVar
+
+import pytest
+from mcp import types
+
+from hud.agents import MCPAgent
+from hud.agents.base import BaseCreateParams
+from hud.environment.router import ToolRouter
+from hud.eval.context import EvalContext
+from hud.types import AgentResponse, BaseAgentConfig, MCPToolCall, MCPToolResult
+
+
+class MockConfig(BaseAgentConfig):
+    model_name: str = "MockAgent"
+    model: str = "mock-model"
+
+
+class MockCreateParams(BaseCreateParams, MockConfig):
+    pass
+
+
+class MockMCPAgent(MCPAgent):
+    """Mock agent for testing run()."""
+
+    metadata: ClassVar[dict[str, Any] | None] = {}
+    config_cls: ClassVar[type[BaseAgentConfig]] = MockConfig
+
+    def __init__(self, **kwargs: Any) -> None:
+        params = MockCreateParams(**kwargs)
+        super().__init__(params)
+        self._response = AgentResponse(content="Test response", tool_calls=[], done=True)
+
+    def set_response(self, response: AgentResponse) -> None:
+        self._response = response
+
+    async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
+        return self._response
+
+    async def format_tool_results(
+        self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
+    ) -> list[dict[str, Any]]:
+        return [{"role": "tool", "content": str(r)} for r in tool_results]
+
+    async def get_system_messages(self) -> list[Any]:
+        return []
+
+    async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
+        return [{"type": "text", "text": getattr(b, "text")} for b in blocks if hasattr(b, "text")]
+
+
+class MockEvalContext(EvalContext):
+    """Mock EvalContext for testing - inherits from real EvalContext."""
+
+    def __init__(self, prompt: str = "Test prompt", tools: list[types.Tool] | None = None) -> None:
+        # Core attributes
+        self.prompt = prompt
+        self._tools = tools or [types.Tool(name="test_tool", description="Test", inputSchema={})]
+        self._submitted: str | None = None
+        self.reward: float | None = None
+        self._initialized = True
+
+        # Environment attributes
+        self._router = ToolRouter()
+        self._agent_include: list[str] | None = None
+        self._agent_exclude: list[str] | None = None
+
+        # EvalContext attributes
+        self._task = None
+        self.trace_id = "test-trace-id"
+        self.eval_name = "test-eval"
+        self.job_id: str | None = None
+        self.group_id: str | None = None
+        self.index = 0
+        self.variants: dict[str, Any] = {}
+        self.answer: str | None = None
+        self.system_prompt: str | None = None
+        self.error: BaseException | None = None
+        self.metadata: dict[str, Any] = {}
+        self.results: list[Any] = []
+        self._is_summary = False
+
+    def as_tools(self) -> list[types.Tool]:
+        return self._tools
+
+    @property
+    def has_scenario(self) -> bool:
+        return True
+
+    async def list_tools(self) -> list[types.Tool]:
+        return self._tools
+
+    async def call_tool(self, call: Any, /, **kwargs: Any) -> MCPToolResult:
+        # Handle tuple format (name, args)
+        if isinstance(call, tuple):
+            name = call[0]
+        elif hasattr(call, "name"):
+            name = call.name
+        else:
+            name = str(call)
+        return MCPToolResult(
+            content=[types.TextContent(type="text", text=f"Result from {name}")],
+            isError=False,
+        )
+
+    async def submit(self, answer: str) -> None:
+        self._submitted = answer
+
+
+class TestRun:
+    """Tests for MCPAgent.run() with EvalContext."""
+
+    @pytest.mark.asyncio
+    async def test_run_basic(self) -> None:
+        """Test basic run() flow."""
+        ctx = MockEvalContext(prompt="Do the task")
+        agent = MockMCPAgent()
+
+        result = await agent.run(ctx)
+
+        assert result.done
+        assert result.content == "Test response"
+        assert ctx._submitted == "Test response"
+
+    @pytest.mark.asyncio
+    async def test_run_no_prompt_raises(self) -> None:
+        """Test run() raises when prompt is not set."""
+        ctx = MockEvalContext(prompt="")
+        agent = MockMCPAgent()
+
+        with pytest.raises(ValueError, match="prompt is not set"):
+            await agent.run(ctx)
+
+    @pytest.mark.asyncio
+    async def test_run_wrong_type_raises(self) -> None:
+        """Test run() raises TypeError for non-EvalContext."""
+        agent = MockMCPAgent()
+
+        with pytest.raises(TypeError, match="must be EvalContext"):
+            await agent.run("not an eval context")  # type: ignore[arg-type]
+
+    @pytest.mark.asyncio
+    async def test_run_clears_ctx(self) -> None:
+        """Test run() clears ctx after completion."""
+        ctx = MockEvalContext(prompt="Do the task")
+        agent = MockMCPAgent()
+
+        await agent.run(ctx)
+        assert agent.ctx is None
+
+    @pytest.mark.asyncio
+    async def test_run_no_submit_on_empty_content(self) -> None:
+        """Test run() doesn't submit when content is empty."""
+        ctx = MockEvalContext(prompt="Do the task")
+        agent = MockMCPAgent()
+        agent.set_response(AgentResponse(content="", tool_calls=[], done=True))
+
+        await agent.run(ctx)
+        assert ctx._submitted is None
+
+    @pytest.mark.asyncio
+    async def test_run_initializes_tools(self) -> None:
+        """Test run() initializes tools from context."""
+        ctx = MockEvalContext(
+            prompt="Do the task",
+            tools=[
+                types.Tool(name="tool1", description="Tool 1", inputSchema={}),
+                types.Tool(name="tool2", description="Tool 2", inputSchema={}),
+            ],
+        )
+        agent = MockMCPAgent()
+
+        await agent.run(ctx)
+
+        assert agent._initialized
+        # After cleanup, ctx is None but tools were discovered