hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their public registries.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +70 -5
- hud/agents/base.py +238 -500
- hud/agents/claude.py +236 -247
- hud/agents/gateway.py +42 -0
- hud/agents/gemini.py +264 -0
- hud/agents/gemini_cua.py +324 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +48 -36
- hud/agents/openai.py +282 -296
- hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
- hud/agents/operator.py +199 -0
- hud/agents/resolver.py +70 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +381 -214
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +377 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_resolver.py +192 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/agents/types.py +148 -0
- hud/cli/__init__.py +493 -546
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +699 -113
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +889 -732
- hud/cli/eval.py +793 -667
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/pull.py +1 -1
- hud/cli/push.py +38 -13
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +110 -8
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push.py +1 -1
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +70 -1
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +45 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +326 -0
- hud/datasets/runner.py +198 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +52 -0
- hud/environment/connection.py +258 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +137 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +835 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +263 -0
- hud/environment/scenarios.py +620 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +205 -0
- hud/environment/tests/test_environment.py +593 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +242 -0
- hud/environment/tests/test_scenarios.py +1086 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +727 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +187 -0
- hud/eval/manager.py +533 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +372 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +291 -0
- hud/eval/types.py +65 -0
- hud/eval/utils.py +194 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +308 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +165 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +18 -2
- hud/tools/agent.py +223 -0
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +36 -3
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_agent_tool.py +355 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +194 -56
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +89 -18
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.13.dist-info/METADATA +264 -0
- hud_python-0.5.13.dist-info/RECORD +305 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
hud/agents/tests/test_openai.py
CHANGED

@@ -2,212 +2,449 @@
 
 from __future__ import annotations
 
-from
+from typing import TYPE_CHECKING, Any, cast
+from unittest.mock import AsyncMock, patch
 
 import pytest
 from mcp import types
-
-from
+from openai import AsyncOpenAI
+from openai.types.responses import (
+    ResponseFunctionToolCall,
+    ResponseOutputMessage,
+    ResponseOutputText,
+    ResponseReasoningItem,
+)
+from openai.types.responses.response_reasoning_item import Summary
+
+from hud.agents.openai import OpenAIAgent
+from hud.environment.router import ToolRouter
+from hud.eval.context import EvalContext
 from hud.types import MCPToolCall, MCPToolResult
 
+if TYPE_CHECKING:
+    from collections.abc import Generator
+
+
+class MockEvalContext(EvalContext):
+    """Mock EvalContext for testing."""
+
+    def __init__(self, tools: list[types.Tool] | None = None) -> None:
+        # Core attributes
+        self.prompt = "Test prompt"
+        self._tools = tools or []
+        self._submitted: str | None = None
+        self.reward: float | None = None
+
+        # Environment attributes
+        self._router = ToolRouter()
+        self._agent_include: list[str] | None = None
+        self._agent_exclude: list[str] | None = None
+
+        # EvalContext attributes
+        self._task = None
+        self.trace_id = "test-trace-id"
+        self.eval_name = "test-eval"
+        self.job_id: str | None = None
+        self.group_id: str | None = None
+        self.index = 0
+        self.variants: dict[str, Any] = {}
+        self.answer: str | None = None
+        self.system_prompt: str | None = None
+        self.error: BaseException | None = None
+        self.metadata: dict[str, Any] = {}
+        self.results: list[Any] = []
+        self._is_summary = False
+
+    def as_tools(self) -> list[types.Tool]:
+        return self._tools
+
+    @property
+    def has_scenario(self) -> bool:
+        return False
+
+    async def list_tools(self) -> list[types.Tool]:
+        return self._tools
+
+    async def call_tool(self, call: Any, /, **kwargs: Any) -> MCPToolResult:
+        return MCPToolResult(
+            content=[types.TextContent(type="text", text="ok")],
+            isError=False,
+        )
 
-
-
+    async def submit(self, answer: str) -> None:
+        self._submitted = answer
 
-
-
-
-    mcp_client = AsyncMock()
-    # Set up the mcp_config attribute as a regular dict, not a coroutine
-    mcp_client.mcp_config = {"test_server": {"url": "http://test"}}
-    # Mock list_tools to return the required openai_computer tool
-    mcp_client.list_tools = AsyncMock(
-        return_value=[
-            types.Tool(
-                name="openai_computer", description="OpenAI computer use tool", inputSchema={}
-            )
-        ]
-    )
-    mcp_client.initialize = AsyncMock()
-    return mcp_client
+
+class TestOpenAIAgent:
+    """Test OpenAIAgent class."""
 
     @pytest.fixture
-    def mock_openai(self):
-        """Create a
-        with patch("hud.agents.openai.AsyncOpenAI") as
-            client =
-
-
+    def mock_openai(self) -> Generator[AsyncOpenAI, None, None]:  # type: ignore[misc]
+        """Create a stub OpenAI client."""
+        with patch("hud.agents.openai.AsyncOpenAI") as mock_class:
+            client = AsyncOpenAI(api_key="test", base_url="http://localhost")
+            client.chat.completions.create = AsyncMock()
+            client.responses.create = AsyncMock()
+            mock_class.return_value = client
+            yield client  # type: ignore[misc]
 
     @pytest.mark.asyncio
-    async def
-        """Test agent initialization."""
-
-
-
-
-            model="gpt-4",
-            validate_api_key=False,  # Skip validation in tests
+    async def test_init_with_client(self, mock_openai: AsyncOpenAI) -> None:
+        """Test agent initialization with provided client."""
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            model="gpt-4o",
+            validate_api_key=False,
         )
 
-        assert agent.model_name == "
-        assert agent.model == "gpt-
-        assert agent.
+        assert agent.model_name == "OpenAI"
+        assert agent.config.model == "gpt-4o"
+        assert agent.model == "gpt-4o"
+        assert agent.openai_client == mock_openai
+        assert agent.max_output_tokens is None
+        assert agent.temperature is None
+
+    @pytest.mark.asyncio
+    async def test_init_with_parameters(self, mock_openai: AsyncOpenAI) -> None:
+        """Test agent initialization with various parameters."""
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            model="gpt-4o",
+            max_output_tokens=2048,
+            temperature=0.7,
+            reasoning={"effort": "high"},
+            tool_choice="auto",
+            parallel_tool_calls=True,
+            validate_api_key=False,
+        )
+
+        assert agent.max_output_tokens == 2048
+        assert agent.temperature == 0.7
+        assert agent.reasoning == {"effort": "high"}
+        assert agent.tool_choice == "auto"
+        assert agent.parallel_tool_calls is True
+
+    @pytest.mark.asyncio
+    async def test_init_without_client_no_api_key(self) -> None:
+        """Test agent initialization fails without API key."""
+        with patch("hud.agents.openai.settings") as mock_settings:
+            mock_settings.api_key = None
+            mock_settings.openai_api_key = None
+            with pytest.raises(ValueError, match="No API key found"):
+                OpenAIAgent.create()
 
     @pytest.mark.asyncio
-    async def
-        """Test formatting content blocks."""
-
-
-
-            model_client=mock_model_client,
-            validate_api_key=False,  # Skip validation in tests
+    async def test_format_blocks_text_only(self, mock_openai: AsyncOpenAI) -> None:
+        """Test formatting text content blocks."""
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
        )
 
-        # Test with text blocks
         blocks: list[types.ContentBlock] = [
-            types.TextContent(type="text", text="Hello,
-            types.TextContent(type="text", text="
+            types.TextContent(type="text", text="Hello, world!"),
+            types.TextContent(type="text", text="How are you?"),
         ]
 
         messages = await agent.format_blocks(blocks)
-        assert len(messages) ==
-        assert messages[0]
-        assert messages[
+        assert len(messages) == 1
+        assert messages[0]["role"] == "user"
+        assert len(messages[0]["content"]) == 2
+        assert messages[0]["content"][0]["type"] == "input_text"
+        assert messages[0]["content"][0]["text"] == "Hello, world!"
 
-
-
-
+    @pytest.mark.asyncio
+    async def test_format_blocks_with_image(self, mock_openai: AsyncOpenAI) -> None:
+        """Test formatting image content blocks."""
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
+
+        blocks: list[types.ContentBlock] = [
+            types.TextContent(type="text", text="Look at this:"),
             types.ImageContent(type="image", data="base64data", mimeType="image/png"),
         ]
 
         messages = await agent.format_blocks(blocks)
-        assert len(messages) ==
-        assert messages[0]
-        assert messages[1] ==
-
-
-
-
-
-
-        """Test formatting tool results."""
-        agent = OperatorAgent(
-            mcp_client=mock_mcp_client,
+        assert len(messages) == 1
+        assert len(messages[0]["content"]) == 2
+        assert messages[0]["content"][1]["type"] == "input_image"
+        assert messages[0]["content"][1]["image_url"] == "data:image/png;base64,base64data"  # type: ignore[typeddict-item]
+
+    @pytest.mark.asyncio
+    async def test_format_blocks_empty(self, mock_openai: AsyncOpenAI) -> None:
+        """Test formatting empty content blocks."""
+        agent = OpenAIAgent.create(
             model_client=mock_openai,
-            validate_api_key=False,
+            validate_api_key=False,
         )
 
-
-
-
-        ]
+        messages = await agent.format_blocks([])
+        assert len(messages) == 1
+        # Empty blocks produce a single empty text item
+        assert len(messages[0]["content"]) == 1
+        assert messages[0]["content"][0]["type"] == "input_text"
+        assert messages[0]["content"][0]["text"] == ""
+
+    @pytest.mark.asyncio
+    async def test_format_tool_results_text(self, mock_openai: AsyncOpenAI) -> None:
+        """Test formatting tool results with text content."""
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
 
+        tool_calls = [MCPToolCall(id="call_123", name="test_tool", arguments={})]
         tool_results = [
-            MCPToolResult(content=[types.TextContent(type="text", text="Success")], isError=False),
             MCPToolResult(
-                content=[types.
+                content=[types.TextContent(type="text", text="Tool output")],
                 isError=False,
-            )
+            )
         ]
 
         messages = await agent.format_tool_results(tool_calls, tool_results)
-
-        # OpenAI's format_tool_results returns input_image with screenshot
         assert len(messages) == 1
-        assert messages[0]["type"] == "
-        assert "
-
+        assert messages[0]["type"] == "function_call_output"
+        assert messages[0]["call_id"] == "call_123"
+        # Output is a list of content items
+        assert len(messages[0]["output"]) == 1
+        assert messages[0]["output"][0]["text"] == "Tool output"  # type: ignore[index]
 
     @pytest.mark.asyncio
-    async def test_format_tool_results_with_error(self,
-        """Test formatting tool results with
-        agent =
-            mcp_client=mock_mcp_client,
+    async def test_format_tool_results_with_error(self, mock_openai: AsyncOpenAI) -> None:
+        """Test formatting tool results with error."""
+        agent = OpenAIAgent.create(
             model_client=mock_openai,
-            validate_api_key=False,
+            validate_api_key=False,
         )
 
-        tool_calls = [
-            MCPToolCall(name="failing_tool", arguments={}, id="call_error"),  # type: ignore
-        ]
-
+        tool_calls = [MCPToolCall(id="call_123", name="test_tool", arguments={})]
         tool_results = [
             MCPToolResult(
-                content=[types.TextContent(type="text", text="
-
+                content=[types.TextContent(type="text", text="Error message")],
+                isError=True,
+            )
         ]
 
         messages = await agent.format_tool_results(tool_calls, tool_results)
+        assert len(messages) == 1
+        # Output is a list; first item is error indicator, second is the message
+        msg = cast("dict[str, Any]", messages[0])
+        output = cast("list[dict[str, Any]]", msg["output"])
+        assert any(item.get("text") == "[tool_error] true" for item in output)
+        assert any(item.get("text") == "Error message" for item in output)
+
+    @pytest.mark.asyncio
+    async def test_get_system_messages(self, mock_openai: AsyncOpenAI) -> None:
+        """Test getting system messages - OpenAI uses instructions field instead."""
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            system_prompt="You are a helpful assistant.",
+            validate_api_key=False,
+        )
 
-        #
+        # OpenAI agent returns empty list - system prompt is passed via instructions
+        messages = await agent.get_system_messages()
         assert len(messages) == 0
 
     @pytest.mark.asyncio
-    async def
-        """Test
-
-
-
-
-
-            validate_api_key=False,  # Skip validation in tests
+    async def test_convert_tools_for_openai(self, mock_openai: AsyncOpenAI) -> None:
+        """Test converting MCP tools to OpenAI format."""
+        tools = [
+            types.Tool(
+                name="my_tool",
+                description="A test tool",
+                inputSchema={"type": "object", "properties": {"x": {"type": "string"}}},
             )
+        ]
+        ctx = MockEvalContext(tools=tools)
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
 
-
-
-
-        ]
+        # Initialize with context to trigger tool conversion
+        agent.ctx = ctx
+        await agent._initialize_from_ctx(ctx)
 
-
-
-
-
-
-
-        mock_output_text.type = "output_text"
-        mock_output_text.text = "I can see the screen content."
+        # Check that tools were converted
+        assert len(agent._openai_tools) >= 1
+        # Find our tool
+        tool = next((t for t in agent._openai_tools if t.get("name") == "my_tool"), None)
+        assert tool is not None
+        assert tool["type"] == "function"
 
-
-
-
+    @pytest.mark.asyncio
+    async def test_convert_tools_raises_on_incomplete(self, mock_openai: AsyncOpenAI) -> None:
+        """Test that tools without description raise error."""
+        tools = [
+            types.Tool(
+                name="incomplete_tool",
+                description=None,  # Missing description
+                inputSchema={"type": "object"},
+            )
+        ]
+        ctx = MockEvalContext(tools=tools)
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
 
-
+        agent.ctx = ctx
+        with pytest.raises(ValueError, match="requires both a description"):
+            await agent._initialize_from_ctx(ctx)
 
-
+    @pytest.mark.asyncio
+    async def test_get_response_with_text(self, mock_openai: AsyncOpenAI) -> None:
+        """Test getting response with text output."""
+        # Setup mock response
+        mock_response = AsyncMock()
+        mock_response.output = [
+            ResponseOutputMessage(
+                id="msg_123",
+                type="message",
+                role="assistant",
+                status="completed",
+                content=[ResponseOutputText(type="output_text", text="Hello!", annotations=[])],
+            )
+        ]
+        mock_openai.responses.create = AsyncMock(return_value=mock_response)
 
-
-
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
+        # Set empty tools to avoid needing initialization
+        agent._openai_tools = []
+        agent._initialized = True
 
-
-
-
-
+        response = await agent.get_response([])
+        assert response.content == "Hello!"
+        assert response.done is True
+        assert len(response.tool_calls) == 0
 
     @pytest.mark.asyncio
-    async def
-        """Test
-
-
+    async def test_get_response_with_tool_call(self, mock_openai: AsyncOpenAI) -> None:
+        """Test getting response with tool call."""
+        mock_response = AsyncMock()
+        # Tool calls come as separate output items, not inside message content
+        mock_response.output = [
+            ResponseFunctionToolCall(
+                id="call_123",
+                type="function_call",
+                call_id="call_123",
+                name="my_tool",
+                arguments='{"x": "value"}',
+            )
+        ]
+        mock_openai.responses.create = AsyncMock(return_value=mock_response)
+
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
+        agent._openai_tools = []
+        agent._tool_name_map = {"my_tool": "my_tool"}
+        agent._initialized = True
+
+        response = await agent.get_response([])
+        assert response.done is False
+        assert len(response.tool_calls) == 1
+        assert response.tool_calls[0].name == "my_tool"
+        assert response.tool_calls[0].arguments == {"x": "value"}
+
+    @pytest.mark.asyncio
+    async def test_get_response_with_reasoning(self, mock_openai: AsyncOpenAI) -> None:
+        """Test getting response with reasoning."""
+        mock_response = AsyncMock()
+        mock_response.output = [
+            ResponseReasoningItem(
+                id="reason_123",
+                type="reasoning",
+                summary=[Summary(type="summary_text", text="Thinking about it...")],
+            ),
+            ResponseOutputMessage(
+                id="msg_123",
+                type="message",
+                role="assistant",
+                status="completed",
+                content=[ResponseOutputText(type="output_text", text="Answer!", annotations=[])],
+            ),
+        ]
+        mock_openai.responses.create = AsyncMock(return_value=mock_response)
+
+        agent = OpenAIAgent.create(
             model_client=mock_openai,
-            validate_api_key=False,
+            validate_api_key=False,
         )
+        agent._openai_tools = []
+        agent._initialized = True
+
+        response = await agent.get_response([])
+        # Reasoning is stored separately from content
+        assert response.reasoning == "Thinking about it..."
+        assert response.content == "Answer!"
+
 
-
-
-
+class TestOpenAIToolConversion:
+    """Tests for tool conversion to OpenAI format."""
+
+    @pytest.fixture
+    def mock_openai(self) -> Generator[AsyncOpenAI, None, None]:  # type: ignore[misc]
+        """Create a stub OpenAI client."""
+        with patch("hud.agents.openai.AsyncOpenAI") as mock_class:
+            client = AsyncOpenAI(api_key="test", base_url="http://localhost")
+            client.responses.create = AsyncMock()
+            mock_class.return_value = client
+            yield client  # type: ignore[misc]
+
+    @pytest.mark.asyncio
+    async def test_shell_tool_conversion(self, mock_openai: AsyncOpenAI) -> None:
+        """Test that shell tool is converted to native format."""
+        tools = [
+            types.Tool(
+                name="shell",
+                description="Execute shell commands",
+                inputSchema={"type": "object"},
+            )
         ]
+        ctx = MockEvalContext(tools=tools)
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
 
-
-
-        mock_response.id = "response_empty"
-        mock_response.state = "completed"
-        mock_response.output = []  # Empty output
+        agent.ctx = ctx
+        await agent._initialize_from_ctx(ctx)
 
-
+        # Check for native shell tool
+        shell_tool = next((t for t in agent._openai_tools if t.get("type") == "shell"), None)
+        assert shell_tool is not None
+
+    @pytest.mark.asyncio
+    async def test_computer_tool_conversion(self, mock_openai: AsyncOpenAI) -> None:
+        """Test that computer tool is converted to function format."""
+        tools = [
+            types.Tool(
+                name="computer",
+                description="Control computer",
+                inputSchema={"type": "object"},
+            )
+        ]
+        ctx = MockEvalContext(tools=tools)
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
 
-
-
+        agent.ctx = ctx
+        await agent._initialize_from_ctx(ctx)
 
-
-
+        # Computer tool is converted to a regular function tool
+        computer_tool = next(
+            (t for t in agent._openai_tools if t.get("name") == "computer"),
+            None,
+        )
+        assert computer_tool is not None
+        assert computer_tool.get("type") == "function"