hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (282)
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +70 -5
  3. hud/agents/base.py +238 -500
  4. hud/agents/claude.py +236 -247
  5. hud/agents/gateway.py +42 -0
  6. hud/agents/gemini.py +264 -0
  7. hud/agents/gemini_cua.py +324 -0
  8. hud/agents/grounded_openai.py +98 -100
  9. hud/agents/misc/integration_test_agent.py +51 -20
  10. hud/agents/misc/response_agent.py +48 -36
  11. hud/agents/openai.py +282 -296
  12. hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
  13. hud/agents/operator.py +199 -0
  14. hud/agents/resolver.py +70 -0
  15. hud/agents/tests/conftest.py +133 -0
  16. hud/agents/tests/test_base.py +300 -622
  17. hud/agents/tests/test_base_runtime.py +233 -0
  18. hud/agents/tests/test_claude.py +381 -214
  19. hud/agents/tests/test_client.py +9 -10
  20. hud/agents/tests/test_gemini.py +369 -0
  21. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  22. hud/agents/tests/test_openai.py +377 -140
  23. hud/agents/tests/test_operator.py +362 -0
  24. hud/agents/tests/test_resolver.py +192 -0
  25. hud/agents/tests/test_run_eval.py +179 -0
  26. hud/agents/types.py +148 -0
  27. hud/cli/__init__.py +493 -546
  28. hud/cli/analyze.py +43 -5
  29. hud/cli/build.py +699 -113
  30. hud/cli/debug.py +8 -5
  31. hud/cli/dev.py +889 -732
  32. hud/cli/eval.py +793 -667
  33. hud/cli/flows/dev.py +167 -0
  34. hud/cli/flows/init.py +191 -0
  35. hud/cli/flows/tasks.py +153 -56
  36. hud/cli/flows/templates.py +151 -0
  37. hud/cli/flows/tests/__init__.py +1 -0
  38. hud/cli/flows/tests/test_dev.py +126 -0
  39. hud/cli/init.py +60 -58
  40. hud/cli/pull.py +1 -1
  41. hud/cli/push.py +38 -13
  42. hud/cli/rft.py +311 -0
  43. hud/cli/rft_status.py +145 -0
  44. hud/cli/tests/test_analyze.py +5 -5
  45. hud/cli/tests/test_analyze_metadata.py +3 -2
  46. hud/cli/tests/test_analyze_module.py +120 -0
  47. hud/cli/tests/test_build.py +110 -8
  48. hud/cli/tests/test_build_failure.py +41 -0
  49. hud/cli/tests/test_build_module.py +50 -0
  50. hud/cli/tests/test_cli_init.py +6 -1
  51. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  52. hud/cli/tests/test_cli_root.py +140 -0
  53. hud/cli/tests/test_convert.py +361 -0
  54. hud/cli/tests/test_debug.py +12 -10
  55. hud/cli/tests/test_dev.py +197 -0
  56. hud/cli/tests/test_eval.py +251 -0
  57. hud/cli/tests/test_eval_bedrock.py +51 -0
  58. hud/cli/tests/test_init.py +124 -0
  59. hud/cli/tests/test_main_module.py +11 -5
  60. hud/cli/tests/test_mcp_server.py +12 -100
  61. hud/cli/tests/test_push.py +1 -1
  62. hud/cli/tests/test_push_happy.py +74 -0
  63. hud/cli/tests/test_push_wrapper.py +23 -0
  64. hud/cli/tests/test_registry.py +1 -1
  65. hud/cli/tests/test_utils.py +1 -1
  66. hud/cli/{rl → utils}/celebrate.py +14 -12
  67. hud/cli/utils/config.py +18 -1
  68. hud/cli/utils/docker.py +130 -4
  69. hud/cli/utils/env_check.py +9 -9
  70. hud/cli/utils/git.py +136 -0
  71. hud/cli/utils/interactive.py +39 -5
  72. hud/cli/utils/metadata.py +70 -1
  73. hud/cli/utils/runner.py +1 -1
  74. hud/cli/utils/server.py +2 -2
  75. hud/cli/utils/source_hash.py +3 -3
  76. hud/cli/utils/tasks.py +4 -1
  77. hud/cli/utils/tests/__init__.py +0 -0
  78. hud/cli/utils/tests/test_config.py +58 -0
  79. hud/cli/utils/tests/test_docker.py +93 -0
  80. hud/cli/utils/tests/test_docker_hints.py +71 -0
  81. hud/cli/utils/tests/test_env_check.py +74 -0
  82. hud/cli/utils/tests/test_environment.py +42 -0
  83. hud/cli/utils/tests/test_git.py +142 -0
  84. hud/cli/utils/tests/test_interactive_module.py +60 -0
  85. hud/cli/utils/tests/test_local_runner.py +50 -0
  86. hud/cli/utils/tests/test_logging_utils.py +23 -0
  87. hud/cli/utils/tests/test_metadata.py +49 -0
  88. hud/cli/utils/tests/test_package_runner.py +35 -0
  89. hud/cli/utils/tests/test_registry_utils.py +49 -0
  90. hud/cli/utils/tests/test_remote_runner.py +25 -0
  91. hud/cli/utils/tests/test_runner_modules.py +52 -0
  92. hud/cli/utils/tests/test_source_hash.py +36 -0
  93. hud/cli/utils/tests/test_tasks.py +80 -0
  94. hud/cli/utils/version_check.py +258 -0
  95. hud/cli/{rl → utils}/viewer.py +2 -2
  96. hud/clients/README.md +12 -11
  97. hud/clients/__init__.py +4 -3
  98. hud/clients/base.py +166 -26
  99. hud/clients/environment.py +51 -0
  100. hud/clients/fastmcp.py +13 -6
  101. hud/clients/mcp_use.py +45 -15
  102. hud/clients/tests/test_analyze_scenarios.py +206 -0
  103. hud/clients/tests/test_protocol.py +9 -3
  104. hud/datasets/__init__.py +23 -20
  105. hud/datasets/loader.py +326 -0
  106. hud/datasets/runner.py +198 -105
  107. hud/datasets/tests/__init__.py +0 -0
  108. hud/datasets/tests/test_loader.py +221 -0
  109. hud/datasets/tests/test_utils.py +315 -0
  110. hud/datasets/utils.py +270 -90
  111. hud/environment/__init__.py +52 -0
  112. hud/environment/connection.py +258 -0
  113. hud/environment/connectors/__init__.py +33 -0
  114. hud/environment/connectors/base.py +68 -0
  115. hud/environment/connectors/local.py +177 -0
  116. hud/environment/connectors/mcp_config.py +137 -0
  117. hud/environment/connectors/openai.py +101 -0
  118. hud/environment/connectors/remote.py +172 -0
  119. hud/environment/environment.py +835 -0
  120. hud/environment/integrations/__init__.py +45 -0
  121. hud/environment/integrations/adk.py +67 -0
  122. hud/environment/integrations/anthropic.py +196 -0
  123. hud/environment/integrations/gemini.py +92 -0
  124. hud/environment/integrations/langchain.py +82 -0
  125. hud/environment/integrations/llamaindex.py +68 -0
  126. hud/environment/integrations/openai.py +238 -0
  127. hud/environment/mock.py +306 -0
  128. hud/environment/router.py +263 -0
  129. hud/environment/scenarios.py +620 -0
  130. hud/environment/tests/__init__.py +1 -0
  131. hud/environment/tests/test_connection.py +317 -0
  132. hud/environment/tests/test_connectors.py +205 -0
  133. hud/environment/tests/test_environment.py +593 -0
  134. hud/environment/tests/test_integrations.py +257 -0
  135. hud/environment/tests/test_local_connectors.py +242 -0
  136. hud/environment/tests/test_scenarios.py +1086 -0
  137. hud/environment/tests/test_tools.py +208 -0
  138. hud/environment/types.py +23 -0
  139. hud/environment/utils/__init__.py +35 -0
  140. hud/environment/utils/formats.py +215 -0
  141. hud/environment/utils/schema.py +171 -0
  142. hud/environment/utils/tool_wrappers.py +113 -0
  143. hud/eval/__init__.py +67 -0
  144. hud/eval/context.py +727 -0
  145. hud/eval/display.py +299 -0
  146. hud/eval/instrument.py +187 -0
  147. hud/eval/manager.py +533 -0
  148. hud/eval/parallel.py +268 -0
  149. hud/eval/task.py +372 -0
  150. hud/eval/tests/__init__.py +1 -0
  151. hud/eval/tests/test_context.py +178 -0
  152. hud/eval/tests/test_eval.py +210 -0
  153. hud/eval/tests/test_manager.py +152 -0
  154. hud/eval/tests/test_parallel.py +168 -0
  155. hud/eval/tests/test_task.py +291 -0
  156. hud/eval/types.py +65 -0
  157. hud/eval/utils.py +194 -0
  158. hud/patches/__init__.py +19 -0
  159. hud/patches/mcp_patches.py +308 -0
  160. hud/patches/warnings.py +54 -0
  161. hud/samples/browser.py +4 -4
  162. hud/server/__init__.py +2 -1
  163. hud/server/low_level.py +2 -1
  164. hud/server/router.py +164 -0
  165. hud/server/server.py +567 -80
  166. hud/server/tests/test_mcp_server_integration.py +11 -11
  167. hud/server/tests/test_mcp_server_more.py +1 -1
  168. hud/server/tests/test_server_extra.py +2 -0
  169. hud/settings.py +45 -3
  170. hud/shared/exceptions.py +36 -10
  171. hud/shared/hints.py +26 -1
  172. hud/shared/requests.py +15 -3
  173. hud/shared/tests/test_exceptions.py +40 -31
  174. hud/shared/tests/test_hints.py +167 -0
  175. hud/telemetry/__init__.py +20 -19
  176. hud/telemetry/exporter.py +201 -0
  177. hud/telemetry/instrument.py +165 -253
  178. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  179. hud/telemetry/tests/test_exporter.py +258 -0
  180. hud/telemetry/tests/test_instrument.py +401 -0
  181. hud/tools/__init__.py +18 -2
  182. hud/tools/agent.py +223 -0
  183. hud/tools/apply_patch.py +639 -0
  184. hud/tools/base.py +54 -4
  185. hud/tools/bash.py +2 -2
  186. hud/tools/computer/__init__.py +36 -3
  187. hud/tools/computer/anthropic.py +2 -2
  188. hud/tools/computer/gemini.py +385 -0
  189. hud/tools/computer/hud.py +23 -6
  190. hud/tools/computer/openai.py +20 -21
  191. hud/tools/computer/qwen.py +434 -0
  192. hud/tools/computer/settings.py +37 -0
  193. hud/tools/edit.py +3 -7
  194. hud/tools/executors/base.py +4 -2
  195. hud/tools/executors/pyautogui.py +1 -1
  196. hud/tools/grounding/grounded_tool.py +13 -18
  197. hud/tools/grounding/grounder.py +10 -31
  198. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  199. hud/tools/jupyter.py +330 -0
  200. hud/tools/playwright.py +18 -3
  201. hud/tools/shell.py +308 -0
  202. hud/tools/tests/test_agent_tool.py +355 -0
  203. hud/tools/tests/test_apply_patch.py +718 -0
  204. hud/tools/tests/test_computer.py +4 -9
  205. hud/tools/tests/test_computer_actions.py +24 -2
  206. hud/tools/tests/test_jupyter_tool.py +181 -0
  207. hud/tools/tests/test_shell.py +596 -0
  208. hud/tools/tests/test_submit.py +85 -0
  209. hud/tools/tests/test_types.py +193 -0
  210. hud/tools/types.py +21 -1
  211. hud/types.py +194 -56
  212. hud/utils/__init__.py +2 -0
  213. hud/utils/env.py +67 -0
  214. hud/utils/hud_console.py +89 -18
  215. hud/utils/mcp.py +15 -58
  216. hud/utils/strict_schema.py +162 -0
  217. hud/utils/tests/test_init.py +1 -2
  218. hud/utils/tests/test_mcp.py +1 -28
  219. hud/utils/tests/test_pretty_errors.py +186 -0
  220. hud/utils/tests/test_tool_shorthand.py +154 -0
  221. hud/utils/tests/test_version.py +1 -1
  222. hud/utils/types.py +20 -0
  223. hud/version.py +1 -1
  224. hud_python-0.5.13.dist-info/METADATA +264 -0
  225. hud_python-0.5.13.dist-info/RECORD +305 -0
  226. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
  227. hud/agents/langchain.py +0 -261
  228. hud/agents/lite_llm.py +0 -72
  229. hud/cli/rl/__init__.py +0 -180
  230. hud/cli/rl/config.py +0 -101
  231. hud/cli/rl/display.py +0 -133
  232. hud/cli/rl/gpu.py +0 -63
  233. hud/cli/rl/gpu_utils.py +0 -321
  234. hud/cli/rl/local_runner.py +0 -595
  235. hud/cli/rl/presets.py +0 -96
  236. hud/cli/rl/remote_runner.py +0 -463
  237. hud/cli/rl/rl_api.py +0 -150
  238. hud/cli/rl/vllm.py +0 -177
  239. hud/cli/rl/wait_utils.py +0 -89
  240. hud/datasets/parallel.py +0 -687
  241. hud/misc/__init__.py +0 -1
  242. hud/misc/claude_plays_pokemon.py +0 -292
  243. hud/otel/__init__.py +0 -35
  244. hud/otel/collector.py +0 -142
  245. hud/otel/config.py +0 -181
  246. hud/otel/context.py +0 -570
  247. hud/otel/exporters.py +0 -369
  248. hud/otel/instrumentation.py +0 -135
  249. hud/otel/processors.py +0 -121
  250. hud/otel/tests/__init__.py +0 -1
  251. hud/otel/tests/test_processors.py +0 -197
  252. hud/rl/README.md +0 -30
  253. hud/rl/__init__.py +0 -1
  254. hud/rl/actor.py +0 -176
  255. hud/rl/buffer.py +0 -405
  256. hud/rl/chat_template.jinja +0 -101
  257. hud/rl/config.py +0 -192
  258. hud/rl/distributed.py +0 -132
  259. hud/rl/learner.py +0 -637
  260. hud/rl/tests/__init__.py +0 -1
  261. hud/rl/tests/test_learner.py +0 -186
  262. hud/rl/train.py +0 -382
  263. hud/rl/types.py +0 -101
  264. hud/rl/utils/start_vllm_server.sh +0 -30
  265. hud/rl/utils.py +0 -524
  266. hud/rl/vllm_adapter.py +0 -143
  267. hud/telemetry/job.py +0 -352
  268. hud/telemetry/replay.py +0 -74
  269. hud/telemetry/tests/test_replay.py +0 -40
  270. hud/telemetry/tests/test_trace.py +0 -63
  271. hud/telemetry/trace.py +0 -158
  272. hud/utils/agent_factories.py +0 -86
  273. hud/utils/async_utils.py +0 -65
  274. hud/utils/group_eval.py +0 -223
  275. hud/utils/progress.py +0 -149
  276. hud/utils/tasks.py +0 -127
  277. hud/utils/tests/test_async_utils.py +0 -173
  278. hud/utils/tests/test_progress.py +0 -261
  279. hud_python-0.4.45.dist-info/METADATA +0 -552
  280. hud_python-0.4.45.dist-info/RECORD +0 -228
  281. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
  282. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
@@ -2,212 +2,449 @@
 
 from __future__ import annotations
 
-from unittest.mock import AsyncMock, MagicMock, patch
+from typing import TYPE_CHECKING, Any, cast
+from unittest.mock import AsyncMock, patch
 
 import pytest
 from mcp import types
-
-from hud.agents.openai import OperatorAgent
+from openai import AsyncOpenAI
+from openai.types.responses import (
+    ResponseFunctionToolCall,
+    ResponseOutputMessage,
+    ResponseOutputText,
+    ResponseReasoningItem,
+)
+from openai.types.responses.response_reasoning_item import Summary
+
+from hud.agents.openai import OpenAIAgent
+from hud.environment.router import ToolRouter
+from hud.eval.context import EvalContext
 from hud.types import MCPToolCall, MCPToolResult
 
+if TYPE_CHECKING:
+    from collections.abc import Generator
+
+
+class MockEvalContext(EvalContext):
+    """Mock EvalContext for testing."""
+
+    def __init__(self, tools: list[types.Tool] | None = None) -> None:
+        # Core attributes
+        self.prompt = "Test prompt"
+        self._tools = tools or []
+        self._submitted: str | None = None
+        self.reward: float | None = None
+
+        # Environment attributes
+        self._router = ToolRouter()
+        self._agent_include: list[str] | None = None
+        self._agent_exclude: list[str] | None = None
+
+        # EvalContext attributes
+        self._task = None
+        self.trace_id = "test-trace-id"
+        self.eval_name = "test-eval"
+        self.job_id: str | None = None
+        self.group_id: str | None = None
+        self.index = 0
+        self.variants: dict[str, Any] = {}
+        self.answer: str | None = None
+        self.system_prompt: str | None = None
+        self.error: BaseException | None = None
+        self.metadata: dict[str, Any] = {}
+        self.results: list[Any] = []
+        self._is_summary = False
+
+    def as_tools(self) -> list[types.Tool]:
+        return self._tools
+
+    @property
+    def has_scenario(self) -> bool:
+        return False
+
+    async def list_tools(self) -> list[types.Tool]:
+        return self._tools
+
+    async def call_tool(self, call: Any, /, **kwargs: Any) -> MCPToolResult:
+        return MCPToolResult(
+            content=[types.TextContent(type="text", text="ok")],
+            isError=False,
+        )
 
-class TestOperatorAgent:
-    """Test OperatorAgent class."""
+    async def submit(self, answer: str) -> None:
+        self._submitted = answer
 
-    @pytest.fixture
-    def mock_mcp_client(self):
-        """Create a mock MCP client."""
-        mcp_client = AsyncMock()
-        # Set up the mcp_config attribute as a regular dict, not a coroutine
-        mcp_client.mcp_config = {"test_server": {"url": "http://test"}}
-        # Mock list_tools to return the required openai_computer tool
-        mcp_client.list_tools = AsyncMock(
-            return_value=[
-                types.Tool(
-                    name="openai_computer", description="OpenAI computer use tool", inputSchema={}
-                )
-            ]
-        )
-        mcp_client.initialize = AsyncMock()
-        return mcp_client
+
+class TestOpenAIAgent:
+    """Test OpenAIAgent class."""
 
     @pytest.fixture
-    def mock_openai(self):
-        """Create a mock OpenAI client."""
-        with patch("hud.agents.openai.AsyncOpenAI") as mock:
-            client = AsyncMock()
-            mock.return_value = client
-            yield client
+    def mock_openai(self) -> Generator[AsyncOpenAI, None, None]:  # type: ignore[misc]
+        """Create a stub OpenAI client."""
+        with patch("hud.agents.openai.AsyncOpenAI") as mock_class:
+            client = AsyncOpenAI(api_key="test", base_url="http://localhost")
+            client.chat.completions.create = AsyncMock()
+            client.responses.create = AsyncMock()
+            mock_class.return_value = client
+            yield client  # type: ignore[misc]
 
     @pytest.mark.asyncio
-    async def test_init(self, mock_mcp_client):
-        """Test agent initialization."""
-        mock_model_client = MagicMock()
-        agent = OperatorAgent(
-            mcp_client=mock_mcp_client,
-            model_client=mock_model_client,
-            model="gpt-4",
-            validate_api_key=False,  # Skip validation in tests
+    async def test_init_with_client(self, mock_openai: AsyncOpenAI) -> None:
+        """Test agent initialization with provided client."""
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            model="gpt-4o",
+            validate_api_key=False,
         )
 
-        assert agent.model_name == "openai-gpt-4"
-        assert agent.model == "gpt-4"
-        assert agent.openai_client == mock_model_client
+        assert agent.model_name == "OpenAI"
+        assert agent.config.model == "gpt-4o"
+        assert agent.model == "gpt-4o"
+        assert agent.openai_client == mock_openai
+        assert agent.max_output_tokens is None
+        assert agent.temperature is None
+
+    @pytest.mark.asyncio
+    async def test_init_with_parameters(self, mock_openai: AsyncOpenAI) -> None:
+        """Test agent initialization with various parameters."""
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            model="gpt-4o",
+            max_output_tokens=2048,
+            temperature=0.7,
+            reasoning={"effort": "high"},
+            tool_choice="auto",
+            parallel_tool_calls=True,
+            validate_api_key=False,
+        )
+
+        assert agent.max_output_tokens == 2048
+        assert agent.temperature == 0.7
+        assert agent.reasoning == {"effort": "high"}
+        assert agent.tool_choice == "auto"
+        assert agent.parallel_tool_calls is True
+
+    @pytest.mark.asyncio
+    async def test_init_without_client_no_api_key(self) -> None:
+        """Test agent initialization fails without API key."""
+        with patch("hud.agents.openai.settings") as mock_settings:
+            mock_settings.api_key = None
+            mock_settings.openai_api_key = None
+            with pytest.raises(ValueError, match="No API key found"):
+                OpenAIAgent.create()
 
     @pytest.mark.asyncio
-    async def test_format_blocks(self, mock_mcp_client):
-        """Test formatting content blocks."""
-        mock_model_client = MagicMock()
-        agent = OperatorAgent(
-            mcp_client=mock_mcp_client,
-            model_client=mock_model_client,
-            validate_api_key=False,  # Skip validation in tests
+    async def test_format_blocks_text_only(self, mock_openai: AsyncOpenAI) -> None:
+        """Test formatting text content blocks."""
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
         )
 
-        # Test with text blocks
         blocks: list[types.ContentBlock] = [
-            types.TextContent(type="text", text="Hello, GPT!"),
-            types.TextContent(type="text", text="Another message"),
+            types.TextContent(type="text", text="Hello, world!"),
+            types.TextContent(type="text", text="How are you?"),
         ]
 
         messages = await agent.format_blocks(blocks)
-        assert len(messages) == 2
-        assert messages[0] == {"type": "input_text", "text": "Hello, GPT!"}
-        assert messages[1] == {"type": "input_text", "text": "Another message"}
+        assert len(messages) == 1
+        assert messages[0]["role"] == "user"
+        assert len(messages[0]["content"]) == 2
+        assert messages[0]["content"][0]["type"] == "input_text"
+        assert messages[0]["content"][0]["text"] == "Hello, world!"
 
-        # Test with mixed content
-        blocks = [
-            types.TextContent(type="text", text="Text content"),
+    @pytest.mark.asyncio
+    async def test_format_blocks_with_image(self, mock_openai: AsyncOpenAI) -> None:
+        """Test formatting image content blocks."""
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
+
+        blocks: list[types.ContentBlock] = [
+            types.TextContent(type="text", text="Look at this:"),
             types.ImageContent(type="image", data="base64data", mimeType="image/png"),
         ]
 
         messages = await agent.format_blocks(blocks)
-        assert len(messages) == 2
-        assert messages[0] == {"type": "input_text", "text": "Text content"}
-        assert messages[1] == {
-            "type": "input_image",
-            "image_url": "data:image/png;base64,base64data",
-        }
-
-    @pytest.mark.asyncio
-    async def test_format_tool_results(self, mock_mcp_client, mock_openai):
-        """Test formatting tool results."""
-        agent = OperatorAgent(
-            mcp_client=mock_mcp_client,
+        assert len(messages) == 1
+        assert len(messages[0]["content"]) == 2
+        assert messages[0]["content"][1]["type"] == "input_image"
+        assert messages[0]["content"][1]["image_url"] == "data:image/png;base64,base64data"  # type: ignore[typeddict-item]
+
+    @pytest.mark.asyncio
+    async def test_format_blocks_empty(self, mock_openai: AsyncOpenAI) -> None:
+        """Test formatting empty content blocks."""
+        agent = OpenAIAgent.create(
            model_client=mock_openai,
-            validate_api_key=False,  # Skip validation in tests
+            validate_api_key=False,
        )
 
-        tool_calls = [
-            MCPToolCall(name="test_tool", arguments={}, id="call_123"),  # type: ignore
-            MCPToolCall(name="screenshot", arguments={}, id="call_456"),  # type: ignore
-        ]
+        messages = await agent.format_blocks([])
+        assert len(messages) == 1
+        # Empty blocks produce a single empty text item
+        assert len(messages[0]["content"]) == 1
+        assert messages[0]["content"][0]["type"] == "input_text"
+        assert messages[0]["content"][0]["text"] == ""
+
+    @pytest.mark.asyncio
+    async def test_format_tool_results_text(self, mock_openai: AsyncOpenAI) -> None:
+        """Test formatting tool results with text content."""
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
 
+        tool_calls = [MCPToolCall(id="call_123", name="test_tool", arguments={})]
         tool_results = [
-            MCPToolResult(content=[types.TextContent(type="text", text="Success")], isError=False),
             MCPToolResult(
-                content=[types.ImageContent(type="image", data="base64data", mimeType="image/png")],
+                content=[types.TextContent(type="text", text="Tool output")],
                isError=False,
-            ),
+            )
        ]
 
        messages = await agent.format_tool_results(tool_calls, tool_results)
-
-        # OpenAI's format_tool_results returns input_image with screenshot
        assert len(messages) == 1
-        assert messages[0]["type"] == "input_image"
-        assert "image_url" in messages[0]
-        assert messages[0]["image_url"] == "data:image/png;base64,base64data"
+        assert messages[0]["type"] == "function_call_output"
+        assert messages[0]["call_id"] == "call_123"
+        # Output is a list of content items
+        assert len(messages[0]["output"]) == 1
+        assert messages[0]["output"][0]["text"] == "Tool output"  # type: ignore[index]
 
     @pytest.mark.asyncio
-    async def test_format_tool_results_with_error(self, mock_mcp_client, mock_openai):
-        """Test formatting tool results with errors."""
-        agent = OperatorAgent(
-            mcp_client=mock_mcp_client,
+    async def test_format_tool_results_with_error(self, mock_openai: AsyncOpenAI) -> None:
+        """Test formatting tool results with error."""
+        agent = OpenAIAgent.create(
            model_client=mock_openai,
-            validate_api_key=False,  # Skip validation in tests
+            validate_api_key=False,
        )
 
-        tool_calls = [
-            MCPToolCall(name="failing_tool", arguments={}, id="call_error"),  # type: ignore
-        ]
-
+        tool_calls = [MCPToolCall(id="call_123", name="test_tool", arguments={})]
        tool_results = [
            MCPToolResult(
-                content=[types.TextContent(type="text", text="Something went wrong")], isError=True
-            ),
+                content=[types.TextContent(type="text", text="Error message")],
+                isError=True,
+            )
        ]
 
        messages = await agent.format_tool_results(tool_calls, tool_results)
+        assert len(messages) == 1
+        # Output is a list; first item is error indicator, second is the message
+        msg = cast("dict[str, Any]", messages[0])
+        output = cast("list[dict[str, Any]]", msg["output"])
+        assert any(item.get("text") == "[tool_error] true" for item in output)
+        assert any(item.get("text") == "Error message" for item in output)
+
+    @pytest.mark.asyncio
+    async def test_get_system_messages(self, mock_openai: AsyncOpenAI) -> None:
+        """Test getting system messages - OpenAI uses instructions field instead."""
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            system_prompt="You are a helpful assistant.",
+            validate_api_key=False,
+        )
 
-        # Since the result has isError=True and no screenshot, returns empty list
+        # OpenAI agent returns empty list - system prompt is passed via instructions
+        messages = await agent.get_system_messages()
        assert len(messages) == 0
 
     @pytest.mark.asyncio
-    async def test_get_model_response(self, mock_mcp_client, mock_openai):
-        """Test getting model response from OpenAI API."""
-        # Disable telemetry for this test to avoid backend configuration issues
-        with patch("hud.settings.settings.telemetry_enabled", False):
-            agent = OperatorAgent(
-                mcp_client=mock_mcp_client,
-                model_client=mock_openai,
-                validate_api_key=False,  # Skip validation in tests
+    async def test_convert_tools_for_openai(self, mock_openai: AsyncOpenAI) -> None:
+        """Test converting MCP tools to OpenAI format."""
+        tools = [
+            types.Tool(
+                name="my_tool",
+                description="A test tool",
+                inputSchema={"type": "object", "properties": {"x": {"type": "string"}}},
            )
+        ]
+        ctx = MockEvalContext(tools=tools)
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
 
-            # Set up available tools so agent doesn't return "No computer use tools available"
-            agent._available_tools = [
-                types.Tool(name="computer_openai", description="Computer tool", inputSchema={})
-            ]
+        # Initialize with context to trigger tool conversion
+        agent.ctx = ctx
+        await agent._initialize_from_ctx(ctx)
 
-            # Mock OpenAI API response for a successful computer use response
-            mock_response = MagicMock()
-            mock_response.id = "response_123"
-            mock_response.state = "completed"
-            # Mock the output message structure
-            mock_output_text = MagicMock()
-            mock_output_text.type = "output_text"
-            mock_output_text.text = "I can see the screen content."
+        # Check that tools were converted
+        assert len(agent._openai_tools) >= 1
+        # Find our tool
+        tool = next((t for t in agent._openai_tools if t.get("name") == "my_tool"), None)
+        assert tool is not None
+        assert tool["type"] == "function"
 
-            mock_output_message = MagicMock()
-            mock_output_message.type = "message"
-            mock_output_message.content = [mock_output_text]
+    @pytest.mark.asyncio
+    async def test_convert_tools_raises_on_incomplete(self, mock_openai: AsyncOpenAI) -> None:
+        """Test that tools without description raise error."""
+        tools = [
+            types.Tool(
+                name="incomplete_tool",
+                description=None,  # Missing description
+                inputSchema={"type": "object"},
+            )
+        ]
+        ctx = MockEvalContext(tools=tools)
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
 
-            mock_response.output = [mock_output_message]
+        agent.ctx = ctx
+        with pytest.raises(ValueError, match="requires both a description"):
+            await agent._initialize_from_ctx(ctx)
 
-            mock_openai.responses.create = AsyncMock(return_value=mock_response)
+    @pytest.mark.asyncio
+    async def test_get_response_with_text(self, mock_openai: AsyncOpenAI) -> None:
+        """Test getting response with text output."""
+        # Setup mock response
+        mock_response = AsyncMock()
+        mock_response.output = [
+            ResponseOutputMessage(
+                id="msg_123",
+                type="message",
+                role="assistant",
+                status="completed",
+                content=[ResponseOutputText(type="output_text", text="Hello!", annotations=[])],
+            )
+        ]
+        mock_openai.responses.create = AsyncMock(return_value=mock_response)
 
-            messages = [{"prompt": "What's on the screen?", "screenshot": None}]
-            response = await agent.get_response(messages)
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
+        # Set empty tools to avoid needing initialization
+        agent._openai_tools = []
+        agent._initialized = True
 
-            # The test should verify that the response is processed correctly
-            # Since the isinstance checks will fail, content will be empty, but done should be True
-            assert response.done is True
-            assert response.tool_calls == []
+        response = await agent.get_response([])
+        assert response.content == "Hello!"
+        assert response.done is True
+        assert len(response.tool_calls) == 0
 
     @pytest.mark.asyncio
-    async def test_handle_empty_response(self, mock_mcp_client, mock_openai):
-        """Test handling empty response from API."""
-        agent = OperatorAgent(
-            mcp_client=mock_mcp_client,
+    async def test_get_response_with_tool_call(self, mock_openai: AsyncOpenAI) -> None:
+        """Test getting response with tool call."""
+        mock_response = AsyncMock()
+        # Tool calls come as separate output items, not inside message content
+        mock_response.output = [
+            ResponseFunctionToolCall(
+                id="call_123",
+                type="function_call",
+                call_id="call_123",
+                name="my_tool",
+                arguments='{"x": "value"}',
+            )
+        ]
+        mock_openai.responses.create = AsyncMock(return_value=mock_response)
+
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
+        agent._openai_tools = []
+        agent._tool_name_map = {"my_tool": "my_tool"}
+        agent._initialized = True
+
+        response = await agent.get_response([])
+        assert response.done is False
+        assert len(response.tool_calls) == 1
+        assert response.tool_calls[0].name == "my_tool"
+        assert response.tool_calls[0].arguments == {"x": "value"}
+
+    @pytest.mark.asyncio
+    async def test_get_response_with_reasoning(self, mock_openai: AsyncOpenAI) -> None:
+        """Test getting response with reasoning."""
+        mock_response = AsyncMock()
+        mock_response.output = [
+            ResponseReasoningItem(
+                id="reason_123",
+                type="reasoning",
+                summary=[Summary(type="summary_text", text="Thinking about it...")],
+            ),
+            ResponseOutputMessage(
+                id="msg_123",
+                type="message",
+                role="assistant",
+                status="completed",
+                content=[ResponseOutputText(type="output_text", text="Answer!", annotations=[])],
+            ),
+        ]
+        mock_openai.responses.create = AsyncMock(return_value=mock_response)
+
+        agent = OpenAIAgent.create(
            model_client=mock_openai,
-            validate_api_key=False,  # Skip validation in tests
+            validate_api_key=False,
        )
+        agent._openai_tools = []
+        agent._initialized = True
+
+        response = await agent.get_response([])
+        # Reasoning is stored separately from content
+        assert response.reasoning == "Thinking about it..."
+        assert response.content == "Answer!"
+
 
-        # Set up available tools
-        agent._available_tools = [
-            types.Tool(name="openai_computer", description="Computer tool", inputSchema={})
+class TestOpenAIToolConversion:
+    """Tests for tool conversion to OpenAI format."""
+
+    @pytest.fixture
+    def mock_openai(self) -> Generator[AsyncOpenAI, None, None]:  # type: ignore[misc]
+        """Create a stub OpenAI client."""
+        with patch("hud.agents.openai.AsyncOpenAI") as mock_class:
+            client = AsyncOpenAI(api_key="test", base_url="http://localhost")
+            client.responses.create = AsyncMock()
+            mock_class.return_value = client
+            yield client  # type: ignore[misc]
+
+    @pytest.mark.asyncio
+    async def test_shell_tool_conversion(self, mock_openai: AsyncOpenAI) -> None:
+        """Test that shell tool is converted to native format."""
+        tools = [
+            types.Tool(
+                name="shell",
+                description="Execute shell commands",
+                inputSchema={"type": "object"},
+            )
        ]
+        ctx = MockEvalContext(tools=tools)
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
 
-        # Mock empty response
-        mock_response = MagicMock()
-        mock_response.id = "response_empty"
-        mock_response.state = "completed"
-        mock_response.output = []  # Empty output
+        agent.ctx = ctx
+        await agent._initialize_from_ctx(ctx)
 
-        mock_openai.responses.create = AsyncMock(return_value=mock_response)
+        # Check for native shell tool
+        shell_tool = next((t for t in agent._openai_tools if t.get("type") == "shell"), None)
+        assert shell_tool is not None
+
+    @pytest.mark.asyncio
+    async def test_computer_tool_conversion(self, mock_openai: AsyncOpenAI) -> None:
+        """Test that computer tool is converted to function format."""
+        tools = [
+            types.Tool(
+                name="computer",
+                description="Control computer",
+                inputSchema={"type": "object"},
+            )
+        ]
+        ctx = MockEvalContext(tools=tools)
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
 
-        messages = [{"prompt": "Hi", "screenshot": None}]
-        response = await agent.get_response(messages)
+        agent.ctx = ctx
+        await agent._initialize_from_ctx(ctx)
 
-        assert response.content == ""
-        assert response.tool_calls == []
+        # Computer tool is converted to a regular function tool
+        computer_tool = next(
+            (t for t in agent._openai_tools if t.get("name") == "computer"),
+            None,
+        )
+        assert computer_tool is not None
+        assert computer_tool.get("type") == "function"
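The hunk above appears to come from hud/agents/tests/test_openai.py (entry 22 in the file list). It shows the 0.5.x tests exercising OpenAIAgent.create with an injected AsyncOpenAI client instead of constructing OperatorAgent around a mock MCP client. A minimal sketch of that construction path, lifted from the fixture and test code above; the api_key, base_url, and model values are test stand-ins, not library defaults:

    from unittest.mock import AsyncMock

    from openai import AsyncOpenAI

    from hud.agents.openai import OpenAIAgent

    # Stubbed client, as in the fixture above; real usage would pass a live AsyncOpenAI client.
    client = AsyncOpenAI(api_key="test", base_url="http://localhost")
    client.responses.create = AsyncMock()

    agent = OpenAIAgent.create(
        model_client=client,
        model="gpt-4o",
        validate_api_key=False,  # skipped here because the client is a stub
    )

In the tests, tool wiring then happens through an EvalContext (mocked above as MockEvalContext): the agent is assigned the context and agent._initialize_from_ctx(ctx) converts the MCP tool definitions into OpenAI tool schemas.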