hud_python-0.4.45-py3-none-any.whl → hud_python-0.5.1-py3-none-any.whl
This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +11 -5
- hud/agents/base.py +220 -500
- hud/agents/claude.py +200 -240
- hud/agents/gemini.py +275 -0
- hud/agents/gemini_cua.py +335 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +41 -36
- hud/agents/openai.py +291 -292
- hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
- hud/agents/operator.py +211 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +379 -210
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +376 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/cli/__init__.py +461 -545
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +664 -110
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +882 -734
- hud/cli/eval.py +782 -668
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/push.py +29 -11
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +108 -6
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +69 -0
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +40 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +327 -0
- hud/datasets/runner.py +192 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +50 -0
- hud/environment/connection.py +206 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +109 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +694 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +112 -0
- hud/environment/scenarios.py +493 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +218 -0
- hud/environment/tests/test_environment.py +161 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +201 -0
- hud/environment/tests/test_scenarios.py +280 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +674 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +185 -0
- hud/eval/manager.py +466 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +340 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +145 -0
- hud/eval/types.py +63 -0
- hud/eval/utils.py +183 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +151 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +158 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +16 -2
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +4 -0
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +167 -57
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +61 -3
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.1.dist-info/METADATA +264 -0
- hud_python-0.5.1.dist-info/RECORD +299 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
hud/agents/tests/test_openai.py
CHANGED
@@ -2,212 +2,448 @@

 from __future__ import annotations

-from
+from typing import TYPE_CHECKING, Any, cast
+from unittest.mock import AsyncMock, patch

 import pytest
 from mcp import types
-
-from
+from openai import AsyncOpenAI
+from openai.types.responses import (
+    ResponseFunctionToolCall,
+    ResponseOutputMessage,
+    ResponseOutputText,
+    ResponseReasoningItem,
+)
+from openai.types.responses.response_reasoning_item import Summary
+
+from hud.agents.openai import OpenAIAgent
+from hud.environment.router import ToolRouter
+from hud.eval.context import EvalContext
 from hud.types import MCPToolCall, MCPToolResult

+if TYPE_CHECKING:
+    from collections.abc import Generator
+
+
+class MockEvalContext(EvalContext):
+    """Mock EvalContext for testing."""
+
+    def __init__(self, tools: list[types.Tool] | None = None) -> None:
+        # Core attributes
+        self.prompt = "Test prompt"
+        self._tools = tools or []
+        self._submitted: str | None = None
+        self.reward: float | None = None
+
+        # Environment attributes
+        self._router = ToolRouter()
+        self._agent_include: list[str] | None = None
+        self._agent_exclude: list[str] | None = None
+
+        # EvalContext attributes
+        self._task = None
+        self.trace_id = "test-trace-id"
+        self.eval_name = "test-eval"
+        self.job_id: str | None = None
+        self.group_id: str | None = None
+        self.index = 0
+        self.variants: dict[str, Any] = {}
+        self.answer: str | None = None
+        self.system_prompt: str | None = None
+        self.error: BaseException | None = None
+        self.metadata: dict[str, Any] = {}
+        self.results: list[Any] = []
+        self._is_summary = False
+
+    def as_tools(self) -> list[types.Tool]:
+        return self._tools
+
+    @property
+    def has_scenario(self) -> bool:
+        return False
+
+    async def list_tools(self) -> list[types.Tool]:
+        return self._tools
+
+    async def call_tool(self, call: Any, /, **kwargs: Any) -> MCPToolResult:
+        return MCPToolResult(
+            content=[types.TextContent(type="text", text="ok")],
+            isError=False,
+        )

-
-
+    async def submit(self, answer: str) -> None:
+        self._submitted = answer

-
-
-
-    mcp_client = AsyncMock()
-    # Set up the mcp_config attribute as a regular dict, not a coroutine
-    mcp_client.mcp_config = {"test_server": {"url": "http://test"}}
-    # Mock list_tools to return the required openai_computer tool
-    mcp_client.list_tools = AsyncMock(
-        return_value=[
-            types.Tool(
-                name="openai_computer", description="OpenAI computer use tool", inputSchema={}
-            )
-        ]
-    )
-    mcp_client.initialize = AsyncMock()
-    return mcp_client
+
+class TestOpenAIAgent:
+    """Test OpenAIAgent class."""

     @pytest.fixture
-    def mock_openai(self):
-        """Create a
-        with patch("hud.agents.openai.AsyncOpenAI") as
-            client =
-
-
+    def mock_openai(self) -> Generator[AsyncOpenAI, None, None]:  # type: ignore[misc]
+        """Create a stub OpenAI client."""
+        with patch("hud.agents.openai.AsyncOpenAI") as mock_class:
+            client = AsyncOpenAI(api_key="test", base_url="http://localhost")
+            client.chat.completions.create = AsyncMock()
+            client.responses.create = AsyncMock()
+            mock_class.return_value = client
+            yield client  # type: ignore[misc]

     @pytest.mark.asyncio
-    async def
-        """Test agent initialization."""
-
-
-
-
-            model="gpt-4",
-            validate_api_key=False,  # Skip validation in tests
+    async def test_init_with_client(self, mock_openai: AsyncOpenAI) -> None:
+        """Test agent initialization with provided client."""
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            model="gpt-4o",
+            validate_api_key=False,
         )

-        assert agent.model_name == "
-        assert agent.model == "gpt-
-        assert agent.
+        assert agent.model_name == "OpenAI"
+        assert agent.config.model == "gpt-4o"
+        assert agent.model == "gpt-4o"
+        assert agent.openai_client == mock_openai
+        assert agent.max_output_tokens is None
+        assert agent.temperature is None
+
+    @pytest.mark.asyncio
+    async def test_init_with_parameters(self, mock_openai: AsyncOpenAI) -> None:
+        """Test agent initialization with various parameters."""
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            model="gpt-4o",
+            max_output_tokens=2048,
+            temperature=0.7,
+            reasoning={"effort": "high"},
+            tool_choice="auto",
+            parallel_tool_calls=True,
+            validate_api_key=False,
+        )
+
+        assert agent.max_output_tokens == 2048
+        assert agent.temperature == 0.7
+        assert agent.reasoning == {"effort": "high"}
+        assert agent.tool_choice == "auto"
+        assert agent.parallel_tool_calls is True
+
+    @pytest.mark.asyncio
+    async def test_init_without_client_no_api_key(self) -> None:
+        """Test agent initialization fails without API key."""
+        with patch("hud.agents.openai.settings") as mock_settings:
+            mock_settings.openai_api_key = None
+            with pytest.raises(ValueError, match="OpenAI API key not found"):
+                OpenAIAgent.create()

     @pytest.mark.asyncio
-    async def
-        """Test formatting content blocks."""
-
-
-
-            model_client=mock_model_client,
-            validate_api_key=False,  # Skip validation in tests
+    async def test_format_blocks_text_only(self, mock_openai: AsyncOpenAI) -> None:
+        """Test formatting text content blocks."""
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
         )

-        # Test with text blocks
         blocks: list[types.ContentBlock] = [
-            types.TextContent(type="text", text="Hello,
-            types.TextContent(type="text", text="
+            types.TextContent(type="text", text="Hello, world!"),
+            types.TextContent(type="text", text="How are you?"),
         ]

         messages = await agent.format_blocks(blocks)
-        assert len(messages) ==
-        assert messages[0]
-        assert messages[
+        assert len(messages) == 1
+        assert messages[0]["role"] == "user"
+        assert len(messages[0]["content"]) == 2
+        assert messages[0]["content"][0]["type"] == "input_text"
+        assert messages[0]["content"][0]["text"] == "Hello, world!"

-
-
-
+    @pytest.mark.asyncio
+    async def test_format_blocks_with_image(self, mock_openai: AsyncOpenAI) -> None:
+        """Test formatting image content blocks."""
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
+
+        blocks: list[types.ContentBlock] = [
+            types.TextContent(type="text", text="Look at this:"),
             types.ImageContent(type="image", data="base64data", mimeType="image/png"),
         ]

         messages = await agent.format_blocks(blocks)
-        assert len(messages) ==
-        assert messages[0]
-        assert messages[1] ==
-
-
-
-
-
-
-        """Test formatting tool results."""
-        agent = OperatorAgent(
-            mcp_client=mock_mcp_client,
+        assert len(messages) == 1
+        assert len(messages[0]["content"]) == 2
+        assert messages[0]["content"][1]["type"] == "input_image"
+        assert messages[0]["content"][1]["image_url"] == "data:image/png;base64,base64data"  # type: ignore[typeddict-item]
+
+    @pytest.mark.asyncio
+    async def test_format_blocks_empty(self, mock_openai: AsyncOpenAI) -> None:
+        """Test formatting empty content blocks."""
+        agent = OpenAIAgent.create(
             model_client=mock_openai,
-            validate_api_key=False,
+            validate_api_key=False,
         )

-
-
-
-        ]
+        messages = await agent.format_blocks([])
+        assert len(messages) == 1
+        # Empty blocks produce a single empty text item
+        assert len(messages[0]["content"]) == 1
+        assert messages[0]["content"][0]["type"] == "input_text"
+        assert messages[0]["content"][0]["text"] == ""
+
+    @pytest.mark.asyncio
+    async def test_format_tool_results_text(self, mock_openai: AsyncOpenAI) -> None:
+        """Test formatting tool results with text content."""
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )

+        tool_calls = [MCPToolCall(id="call_123", name="test_tool", arguments={})]
         tool_results = [
-            MCPToolResult(content=[types.TextContent(type="text", text="Success")], isError=False),
             MCPToolResult(
-                content=[types.
+                content=[types.TextContent(type="text", text="Tool output")],
                 isError=False,
-            )
+            )
         ]

         messages = await agent.format_tool_results(tool_calls, tool_results)
-
-        # OpenAI's format_tool_results returns input_image with screenshot
         assert len(messages) == 1
-        assert messages[0]["type"] == "
-        assert "
-
+        assert messages[0]["type"] == "function_call_output"
+        assert messages[0]["call_id"] == "call_123"
+        # Output is a list of content items
+        assert len(messages[0]["output"]) == 1
+        assert messages[0]["output"][0]["text"] == "Tool output"  # type: ignore[index]

     @pytest.mark.asyncio
-    async def test_format_tool_results_with_error(self,
-        """Test formatting tool results with
-        agent =
-            mcp_client=mock_mcp_client,
+    async def test_format_tool_results_with_error(self, mock_openai: AsyncOpenAI) -> None:
+        """Test formatting tool results with error."""
+        agent = OpenAIAgent.create(
             model_client=mock_openai,
-            validate_api_key=False,
+            validate_api_key=False,
         )

-        tool_calls = [
-            MCPToolCall(name="failing_tool", arguments={}, id="call_error"),  # type: ignore
-        ]
-
+        tool_calls = [MCPToolCall(id="call_123", name="test_tool", arguments={})]
         tool_results = [
             MCPToolResult(
-                content=[types.TextContent(type="text", text="
-
+                content=[types.TextContent(type="text", text="Error message")],
+                isError=True,
+            )
         ]

         messages = await agent.format_tool_results(tool_calls, tool_results)
+        assert len(messages) == 1
+        # Output is a list; first item is error indicator, second is the message
+        msg = cast("dict[str, Any]", messages[0])
+        output = cast("list[dict[str, Any]]", msg["output"])
+        assert any(item.get("text") == "[tool_error] true" for item in output)
+        assert any(item.get("text") == "Error message" for item in output)
+
+    @pytest.mark.asyncio
+    async def test_get_system_messages(self, mock_openai: AsyncOpenAI) -> None:
+        """Test getting system messages - OpenAI uses instructions field instead."""
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            system_prompt="You are a helpful assistant.",
+            validate_api_key=False,
+        )

-        #
+        # OpenAI agent returns empty list - system prompt is passed via instructions
+        messages = await agent.get_system_messages()
         assert len(messages) == 0

     @pytest.mark.asyncio
-    async def
-        """Test
-
-
-
-
-
-            validate_api_key=False,  # Skip validation in tests
+    async def test_convert_tools_for_openai(self, mock_openai: AsyncOpenAI) -> None:
+        """Test converting MCP tools to OpenAI format."""
+        tools = [
+            types.Tool(
+                name="my_tool",
+                description="A test tool",
+                inputSchema={"type": "object", "properties": {"x": {"type": "string"}}},
             )
+        ]
+        ctx = MockEvalContext(tools=tools)
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )

-
-
-
-        ]
+        # Initialize with context to trigger tool conversion
+        agent.ctx = ctx
+        await agent._initialize_from_ctx(ctx)

-
-
-
-
-
-
-        mock_output_text.type = "output_text"
-        mock_output_text.text = "I can see the screen content."
+        # Check that tools were converted
+        assert len(agent._openai_tools) >= 1
+        # Find our tool
+        tool = next((t for t in agent._openai_tools if t.get("name") == "my_tool"), None)
+        assert tool is not None
+        assert tool["type"] == "function"

-
-
-
+    @pytest.mark.asyncio
+    async def test_convert_tools_raises_on_incomplete(self, mock_openai: AsyncOpenAI) -> None:
+        """Test that tools without description raise error."""
+        tools = [
+            types.Tool(
+                name="incomplete_tool",
+                description=None,  # Missing description
+                inputSchema={"type": "object"},
+            )
+        ]
+        ctx = MockEvalContext(tools=tools)
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )

-
+        agent.ctx = ctx
+        with pytest.raises(ValueError, match="requires both a description"):
+            await agent._initialize_from_ctx(ctx)

-
+    @pytest.mark.asyncio
+    async def test_get_response_with_text(self, mock_openai: AsyncOpenAI) -> None:
+        """Test getting response with text output."""
+        # Setup mock response
+        mock_response = AsyncMock()
+        mock_response.output = [
+            ResponseOutputMessage(
+                id="msg_123",
+                type="message",
+                role="assistant",
+                status="completed",
+                content=[ResponseOutputText(type="output_text", text="Hello!", annotations=[])],
+            )
+        ]
+        mock_openai.responses.create = AsyncMock(return_value=mock_response)

-
-
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
+        # Set empty tools to avoid needing initialization
+        agent._openai_tools = []
+        agent._initialized = True

-
-
-
-
+        response = await agent.get_response([])
+        assert response.content == "Hello!"
+        assert response.done is True
+        assert len(response.tool_calls) == 0

     @pytest.mark.asyncio
-    async def
-        """Test
-
-
+    async def test_get_response_with_tool_call(self, mock_openai: AsyncOpenAI) -> None:
+        """Test getting response with tool call."""
+        mock_response = AsyncMock()
+        # Tool calls come as separate output items, not inside message content
+        mock_response.output = [
+            ResponseFunctionToolCall(
+                id="call_123",
+                type="function_call",
+                call_id="call_123",
+                name="my_tool",
+                arguments='{"x": "value"}',
+            )
+        ]
+        mock_openai.responses.create = AsyncMock(return_value=mock_response)
+
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
+        agent._openai_tools = []
+        agent._tool_name_map = {"my_tool": "my_tool"}
+        agent._initialized = True
+
+        response = await agent.get_response([])
+        assert response.done is False
+        assert len(response.tool_calls) == 1
+        assert response.tool_calls[0].name == "my_tool"
+        assert response.tool_calls[0].arguments == {"x": "value"}
+
+    @pytest.mark.asyncio
+    async def test_get_response_with_reasoning(self, mock_openai: AsyncOpenAI) -> None:
+        """Test getting response with reasoning."""
+        mock_response = AsyncMock()
+        mock_response.output = [
+            ResponseReasoningItem(
+                id="reason_123",
+                type="reasoning",
+                summary=[Summary(type="summary_text", text="Thinking about it...")],
+            ),
+            ResponseOutputMessage(
+                id="msg_123",
+                type="message",
+                role="assistant",
+                status="completed",
+                content=[ResponseOutputText(type="output_text", text="Answer!", annotations=[])],
+            ),
+        ]
+        mock_openai.responses.create = AsyncMock(return_value=mock_response)
+
+        agent = OpenAIAgent.create(
             model_client=mock_openai,
-            validate_api_key=False,
+            validate_api_key=False,
         )
+        agent._openai_tools = []
+        agent._initialized = True
+
+        response = await agent.get_response([])
+        # Reasoning is stored separately from content
+        assert response.reasoning == "Thinking about it..."
+        assert response.content == "Answer!"
+

-
-
-
+class TestOpenAIToolConversion:
+    """Tests for tool conversion to OpenAI format."""
+
+    @pytest.fixture
+    def mock_openai(self) -> Generator[AsyncOpenAI, None, None]:  # type: ignore[misc]
+        """Create a stub OpenAI client."""
+        with patch("hud.agents.openai.AsyncOpenAI") as mock_class:
+            client = AsyncOpenAI(api_key="test", base_url="http://localhost")
+            client.responses.create = AsyncMock()
+            mock_class.return_value = client
+            yield client  # type: ignore[misc]
+
+    @pytest.mark.asyncio
+    async def test_shell_tool_conversion(self, mock_openai: AsyncOpenAI) -> None:
+        """Test that shell tool is converted to native format."""
+        tools = [
+            types.Tool(
+                name="shell",
+                description="Execute shell commands",
+                inputSchema={"type": "object"},
+            )
         ]
+        ctx = MockEvalContext(tools=tools)
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )

-
-
-        mock_response.id = "response_empty"
-        mock_response.state = "completed"
-        mock_response.output = []  # Empty output
+        agent.ctx = ctx
+        await agent._initialize_from_ctx(ctx)

-
+        # Check for native shell tool
+        shell_tool = next((t for t in agent._openai_tools if t.get("type") == "shell"), None)
+        assert shell_tool is not None
+
+    @pytest.mark.asyncio
+    async def test_computer_tool_conversion(self, mock_openai: AsyncOpenAI) -> None:
+        """Test that computer tool is converted to function format."""
+        tools = [
+            types.Tool(
+                name="computer",
+                description="Control computer",
+                inputSchema={"type": "object"},
+            )
+        ]
+        ctx = MockEvalContext(tools=tools)
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )

-
-
+        agent.ctx = ctx
+        await agent._initialize_from_ctx(ctx)

-
-
+        # Computer tool is converted to a regular function tool
+        computer_tool = next(
+            (t for t in agent._openai_tools if t.get("name") == "computer"),
+            None,
+        )
+        assert computer_tool is not None
+        assert computer_tool.get("type") == "function"