hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
Files changed (274)
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +11 -5
  3. hud/agents/base.py +220 -500
  4. hud/agents/claude.py +200 -240
  5. hud/agents/gemini.py +275 -0
  6. hud/agents/gemini_cua.py +335 -0
  7. hud/agents/grounded_openai.py +98 -100
  8. hud/agents/misc/integration_test_agent.py +51 -20
  9. hud/agents/misc/response_agent.py +41 -36
  10. hud/agents/openai.py +291 -292
  11. hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
  12. hud/agents/operator.py +211 -0
  13. hud/agents/tests/conftest.py +133 -0
  14. hud/agents/tests/test_base.py +300 -622
  15. hud/agents/tests/test_base_runtime.py +233 -0
  16. hud/agents/tests/test_claude.py +379 -210
  17. hud/agents/tests/test_client.py +9 -10
  18. hud/agents/tests/test_gemini.py +369 -0
  19. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  20. hud/agents/tests/test_openai.py +376 -140
  21. hud/agents/tests/test_operator.py +362 -0
  22. hud/agents/tests/test_run_eval.py +179 -0
  23. hud/cli/__init__.py +461 -545
  24. hud/cli/analyze.py +43 -5
  25. hud/cli/build.py +664 -110
  26. hud/cli/debug.py +8 -5
  27. hud/cli/dev.py +882 -734
  28. hud/cli/eval.py +782 -668
  29. hud/cli/flows/dev.py +167 -0
  30. hud/cli/flows/init.py +191 -0
  31. hud/cli/flows/tasks.py +153 -56
  32. hud/cli/flows/templates.py +151 -0
  33. hud/cli/flows/tests/__init__.py +1 -0
  34. hud/cli/flows/tests/test_dev.py +126 -0
  35. hud/cli/init.py +60 -58
  36. hud/cli/push.py +29 -11
  37. hud/cli/rft.py +311 -0
  38. hud/cli/rft_status.py +145 -0
  39. hud/cli/tests/test_analyze.py +5 -5
  40. hud/cli/tests/test_analyze_metadata.py +3 -2
  41. hud/cli/tests/test_analyze_module.py +120 -0
  42. hud/cli/tests/test_build.py +108 -6
  43. hud/cli/tests/test_build_failure.py +41 -0
  44. hud/cli/tests/test_build_module.py +50 -0
  45. hud/cli/tests/test_cli_init.py +6 -1
  46. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  47. hud/cli/tests/test_cli_root.py +140 -0
  48. hud/cli/tests/test_convert.py +361 -0
  49. hud/cli/tests/test_debug.py +12 -10
  50. hud/cli/tests/test_dev.py +197 -0
  51. hud/cli/tests/test_eval.py +251 -0
  52. hud/cli/tests/test_eval_bedrock.py +51 -0
  53. hud/cli/tests/test_init.py +124 -0
  54. hud/cli/tests/test_main_module.py +11 -5
  55. hud/cli/tests/test_mcp_server.py +12 -100
  56. hud/cli/tests/test_push_happy.py +74 -0
  57. hud/cli/tests/test_push_wrapper.py +23 -0
  58. hud/cli/tests/test_registry.py +1 -1
  59. hud/cli/tests/test_utils.py +1 -1
  60. hud/cli/{rl → utils}/celebrate.py +14 -12
  61. hud/cli/utils/config.py +18 -1
  62. hud/cli/utils/docker.py +130 -4
  63. hud/cli/utils/env_check.py +9 -9
  64. hud/cli/utils/git.py +136 -0
  65. hud/cli/utils/interactive.py +39 -5
  66. hud/cli/utils/metadata.py +69 -0
  67. hud/cli/utils/runner.py +1 -1
  68. hud/cli/utils/server.py +2 -2
  69. hud/cli/utils/source_hash.py +3 -3
  70. hud/cli/utils/tasks.py +4 -1
  71. hud/cli/utils/tests/__init__.py +0 -0
  72. hud/cli/utils/tests/test_config.py +58 -0
  73. hud/cli/utils/tests/test_docker.py +93 -0
  74. hud/cli/utils/tests/test_docker_hints.py +71 -0
  75. hud/cli/utils/tests/test_env_check.py +74 -0
  76. hud/cli/utils/tests/test_environment.py +42 -0
  77. hud/cli/utils/tests/test_git.py +142 -0
  78. hud/cli/utils/tests/test_interactive_module.py +60 -0
  79. hud/cli/utils/tests/test_local_runner.py +50 -0
  80. hud/cli/utils/tests/test_logging_utils.py +23 -0
  81. hud/cli/utils/tests/test_metadata.py +49 -0
  82. hud/cli/utils/tests/test_package_runner.py +35 -0
  83. hud/cli/utils/tests/test_registry_utils.py +49 -0
  84. hud/cli/utils/tests/test_remote_runner.py +25 -0
  85. hud/cli/utils/tests/test_runner_modules.py +52 -0
  86. hud/cli/utils/tests/test_source_hash.py +36 -0
  87. hud/cli/utils/tests/test_tasks.py +80 -0
  88. hud/cli/utils/version_check.py +258 -0
  89. hud/cli/{rl → utils}/viewer.py +2 -2
  90. hud/clients/README.md +12 -11
  91. hud/clients/__init__.py +4 -3
  92. hud/clients/base.py +166 -26
  93. hud/clients/environment.py +51 -0
  94. hud/clients/fastmcp.py +13 -6
  95. hud/clients/mcp_use.py +40 -15
  96. hud/clients/tests/test_analyze_scenarios.py +206 -0
  97. hud/clients/tests/test_protocol.py +9 -3
  98. hud/datasets/__init__.py +23 -20
  99. hud/datasets/loader.py +327 -0
  100. hud/datasets/runner.py +192 -105
  101. hud/datasets/tests/__init__.py +0 -0
  102. hud/datasets/tests/test_loader.py +221 -0
  103. hud/datasets/tests/test_utils.py +315 -0
  104. hud/datasets/utils.py +270 -90
  105. hud/environment/__init__.py +50 -0
  106. hud/environment/connection.py +206 -0
  107. hud/environment/connectors/__init__.py +33 -0
  108. hud/environment/connectors/base.py +68 -0
  109. hud/environment/connectors/local.py +177 -0
  110. hud/environment/connectors/mcp_config.py +109 -0
  111. hud/environment/connectors/openai.py +101 -0
  112. hud/environment/connectors/remote.py +172 -0
  113. hud/environment/environment.py +694 -0
  114. hud/environment/integrations/__init__.py +45 -0
  115. hud/environment/integrations/adk.py +67 -0
  116. hud/environment/integrations/anthropic.py +196 -0
  117. hud/environment/integrations/gemini.py +92 -0
  118. hud/environment/integrations/langchain.py +82 -0
  119. hud/environment/integrations/llamaindex.py +68 -0
  120. hud/environment/integrations/openai.py +238 -0
  121. hud/environment/mock.py +306 -0
  122. hud/environment/router.py +112 -0
  123. hud/environment/scenarios.py +493 -0
  124. hud/environment/tests/__init__.py +1 -0
  125. hud/environment/tests/test_connection.py +317 -0
  126. hud/environment/tests/test_connectors.py +218 -0
  127. hud/environment/tests/test_environment.py +161 -0
  128. hud/environment/tests/test_integrations.py +257 -0
  129. hud/environment/tests/test_local_connectors.py +201 -0
  130. hud/environment/tests/test_scenarios.py +280 -0
  131. hud/environment/tests/test_tools.py +208 -0
  132. hud/environment/types.py +23 -0
  133. hud/environment/utils/__init__.py +35 -0
  134. hud/environment/utils/formats.py +215 -0
  135. hud/environment/utils/schema.py +171 -0
  136. hud/environment/utils/tool_wrappers.py +113 -0
  137. hud/eval/__init__.py +67 -0
  138. hud/eval/context.py +674 -0
  139. hud/eval/display.py +299 -0
  140. hud/eval/instrument.py +185 -0
  141. hud/eval/manager.py +466 -0
  142. hud/eval/parallel.py +268 -0
  143. hud/eval/task.py +340 -0
  144. hud/eval/tests/__init__.py +1 -0
  145. hud/eval/tests/test_context.py +178 -0
  146. hud/eval/tests/test_eval.py +210 -0
  147. hud/eval/tests/test_manager.py +152 -0
  148. hud/eval/tests/test_parallel.py +168 -0
  149. hud/eval/tests/test_task.py +145 -0
  150. hud/eval/types.py +63 -0
  151. hud/eval/utils.py +183 -0
  152. hud/patches/__init__.py +19 -0
  153. hud/patches/mcp_patches.py +151 -0
  154. hud/patches/warnings.py +54 -0
  155. hud/samples/browser.py +4 -4
  156. hud/server/__init__.py +2 -1
  157. hud/server/low_level.py +2 -1
  158. hud/server/router.py +164 -0
  159. hud/server/server.py +567 -80
  160. hud/server/tests/test_mcp_server_integration.py +11 -11
  161. hud/server/tests/test_mcp_server_more.py +1 -1
  162. hud/server/tests/test_server_extra.py +2 -0
  163. hud/settings.py +45 -3
  164. hud/shared/exceptions.py +36 -10
  165. hud/shared/hints.py +26 -1
  166. hud/shared/requests.py +15 -3
  167. hud/shared/tests/test_exceptions.py +40 -31
  168. hud/shared/tests/test_hints.py +167 -0
  169. hud/telemetry/__init__.py +20 -19
  170. hud/telemetry/exporter.py +201 -0
  171. hud/telemetry/instrument.py +158 -253
  172. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  173. hud/telemetry/tests/test_exporter.py +258 -0
  174. hud/telemetry/tests/test_instrument.py +401 -0
  175. hud/tools/__init__.py +16 -2
  176. hud/tools/apply_patch.py +639 -0
  177. hud/tools/base.py +54 -4
  178. hud/tools/bash.py +2 -2
  179. hud/tools/computer/__init__.py +4 -0
  180. hud/tools/computer/anthropic.py +2 -2
  181. hud/tools/computer/gemini.py +385 -0
  182. hud/tools/computer/hud.py +23 -6
  183. hud/tools/computer/openai.py +20 -21
  184. hud/tools/computer/qwen.py +434 -0
  185. hud/tools/computer/settings.py +37 -0
  186. hud/tools/edit.py +3 -7
  187. hud/tools/executors/base.py +4 -2
  188. hud/tools/executors/pyautogui.py +1 -1
  189. hud/tools/grounding/grounded_tool.py +13 -18
  190. hud/tools/grounding/grounder.py +10 -31
  191. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  192. hud/tools/jupyter.py +330 -0
  193. hud/tools/playwright.py +18 -3
  194. hud/tools/shell.py +308 -0
  195. hud/tools/tests/test_apply_patch.py +718 -0
  196. hud/tools/tests/test_computer.py +4 -9
  197. hud/tools/tests/test_computer_actions.py +24 -2
  198. hud/tools/tests/test_jupyter_tool.py +181 -0
  199. hud/tools/tests/test_shell.py +596 -0
  200. hud/tools/tests/test_submit.py +85 -0
  201. hud/tools/tests/test_types.py +193 -0
  202. hud/tools/types.py +21 -1
  203. hud/types.py +167 -57
  204. hud/utils/__init__.py +2 -0
  205. hud/utils/env.py +67 -0
  206. hud/utils/hud_console.py +61 -3
  207. hud/utils/mcp.py +15 -58
  208. hud/utils/strict_schema.py +162 -0
  209. hud/utils/tests/test_init.py +1 -2
  210. hud/utils/tests/test_mcp.py +1 -28
  211. hud/utils/tests/test_pretty_errors.py +186 -0
  212. hud/utils/tests/test_tool_shorthand.py +154 -0
  213. hud/utils/tests/test_version.py +1 -1
  214. hud/utils/types.py +20 -0
  215. hud/version.py +1 -1
  216. hud_python-0.5.1.dist-info/METADATA +264 -0
  217. hud_python-0.5.1.dist-info/RECORD +299 -0
  218. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
  219. hud/agents/langchain.py +0 -261
  220. hud/agents/lite_llm.py +0 -72
  221. hud/cli/rl/__init__.py +0 -180
  222. hud/cli/rl/config.py +0 -101
  223. hud/cli/rl/display.py +0 -133
  224. hud/cli/rl/gpu.py +0 -63
  225. hud/cli/rl/gpu_utils.py +0 -321
  226. hud/cli/rl/local_runner.py +0 -595
  227. hud/cli/rl/presets.py +0 -96
  228. hud/cli/rl/remote_runner.py +0 -463
  229. hud/cli/rl/rl_api.py +0 -150
  230. hud/cli/rl/vllm.py +0 -177
  231. hud/cli/rl/wait_utils.py +0 -89
  232. hud/datasets/parallel.py +0 -687
  233. hud/misc/__init__.py +0 -1
  234. hud/misc/claude_plays_pokemon.py +0 -292
  235. hud/otel/__init__.py +0 -35
  236. hud/otel/collector.py +0 -142
  237. hud/otel/config.py +0 -181
  238. hud/otel/context.py +0 -570
  239. hud/otel/exporters.py +0 -369
  240. hud/otel/instrumentation.py +0 -135
  241. hud/otel/processors.py +0 -121
  242. hud/otel/tests/__init__.py +0 -1
  243. hud/otel/tests/test_processors.py +0 -197
  244. hud/rl/README.md +0 -30
  245. hud/rl/__init__.py +0 -1
  246. hud/rl/actor.py +0 -176
  247. hud/rl/buffer.py +0 -405
  248. hud/rl/chat_template.jinja +0 -101
  249. hud/rl/config.py +0 -192
  250. hud/rl/distributed.py +0 -132
  251. hud/rl/learner.py +0 -637
  252. hud/rl/tests/__init__.py +0 -1
  253. hud/rl/tests/test_learner.py +0 -186
  254. hud/rl/train.py +0 -382
  255. hud/rl/types.py +0 -101
  256. hud/rl/utils/start_vllm_server.sh +0 -30
  257. hud/rl/utils.py +0 -524
  258. hud/rl/vllm_adapter.py +0 -143
  259. hud/telemetry/job.py +0 -352
  260. hud/telemetry/replay.py +0 -74
  261. hud/telemetry/tests/test_replay.py +0 -40
  262. hud/telemetry/tests/test_trace.py +0 -63
  263. hud/telemetry/trace.py +0 -158
  264. hud/utils/agent_factories.py +0 -86
  265. hud/utils/async_utils.py +0 -65
  266. hud/utils/group_eval.py +0 -223
  267. hud/utils/progress.py +0 -149
  268. hud/utils/tasks.py +0 -127
  269. hud/utils/tests/test_async_utils.py +0 -173
  270. hud/utils/tests/test_progress.py +0 -261
  271. hud_python-0.4.45.dist-info/METADATA +0 -552
  272. hud_python-0.4.45.dist-info/RECORD +0 -228
  273. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
  274. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
@@ -2,212 +2,448 @@
 
 from __future__ import annotations
 
-from unittest.mock import AsyncMock, MagicMock, patch
+from typing import TYPE_CHECKING, Any, cast
+from unittest.mock import AsyncMock, patch
 
 import pytest
 from mcp import types
-
-from hud.agents.openai import OperatorAgent
+from openai import AsyncOpenAI
+from openai.types.responses import (
+    ResponseFunctionToolCall,
+    ResponseOutputMessage,
+    ResponseOutputText,
+    ResponseReasoningItem,
+)
+from openai.types.responses.response_reasoning_item import Summary
+
+from hud.agents.openai import OpenAIAgent
+from hud.environment.router import ToolRouter
+from hud.eval.context import EvalContext
 from hud.types import MCPToolCall, MCPToolResult
 
+if TYPE_CHECKING:
+    from collections.abc import Generator
+
+
+class MockEvalContext(EvalContext):
+    """Mock EvalContext for testing."""
+
+    def __init__(self, tools: list[types.Tool] | None = None) -> None:
+        # Core attributes
+        self.prompt = "Test prompt"
+        self._tools = tools or []
+        self._submitted: str | None = None
+        self.reward: float | None = None
+
+        # Environment attributes
+        self._router = ToolRouter()
+        self._agent_include: list[str] | None = None
+        self._agent_exclude: list[str] | None = None
+
+        # EvalContext attributes
+        self._task = None
+        self.trace_id = "test-trace-id"
+        self.eval_name = "test-eval"
+        self.job_id: str | None = None
+        self.group_id: str | None = None
+        self.index = 0
+        self.variants: dict[str, Any] = {}
+        self.answer: str | None = None
+        self.system_prompt: str | None = None
+        self.error: BaseException | None = None
+        self.metadata: dict[str, Any] = {}
+        self.results: list[Any] = []
+        self._is_summary = False
+
+    def as_tools(self) -> list[types.Tool]:
+        return self._tools
+
+    @property
+    def has_scenario(self) -> bool:
+        return False
+
+    async def list_tools(self) -> list[types.Tool]:
+        return self._tools
+
+    async def call_tool(self, call: Any, /, **kwargs: Any) -> MCPToolResult:
+        return MCPToolResult(
+            content=[types.TextContent(type="text", text="ok")],
+            isError=False,
+        )
 
-class TestOperatorAgent:
-    """Test OperatorAgent class."""
+    async def submit(self, answer: str) -> None:
+        self._submitted = answer
 
-    @pytest.fixture
-    def mock_mcp_client(self):
-        """Create a mock MCP client."""
-        mcp_client = AsyncMock()
-        # Set up the mcp_config attribute as a regular dict, not a coroutine
-        mcp_client.mcp_config = {"test_server": {"url": "http://test"}}
-        # Mock list_tools to return the required openai_computer tool
-        mcp_client.list_tools = AsyncMock(
-            return_value=[
-                types.Tool(
-                    name="openai_computer", description="OpenAI computer use tool", inputSchema={}
-                )
-            ]
-        )
-        mcp_client.initialize = AsyncMock()
-        return mcp_client
+
+class TestOpenAIAgent:
+    """Test OpenAIAgent class."""
 
     @pytest.fixture
-    def mock_openai(self):
-        """Create a mock OpenAI client."""
-        with patch("hud.agents.openai.AsyncOpenAI") as mock:
-            client = AsyncMock()
-            mock.return_value = client
-            yield client
+    def mock_openai(self) -> Generator[AsyncOpenAI, None, None]:  # type: ignore[misc]
+        """Create a stub OpenAI client."""
+        with patch("hud.agents.openai.AsyncOpenAI") as mock_class:
+            client = AsyncOpenAI(api_key="test", base_url="http://localhost")
+            client.chat.completions.create = AsyncMock()
+            client.responses.create = AsyncMock()
+            mock_class.return_value = client
+            yield client  # type: ignore[misc]
 
     @pytest.mark.asyncio
-    async def test_init(self, mock_mcp_client):
-        """Test agent initialization."""
-        mock_model_client = MagicMock()
-        agent = OperatorAgent(
-            mcp_client=mock_mcp_client,
-            model_client=mock_model_client,
-            model="gpt-4",
-            validate_api_key=False,  # Skip validation in tests
+    async def test_init_with_client(self, mock_openai: AsyncOpenAI) -> None:
+        """Test agent initialization with provided client."""
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            model="gpt-4o",
+            validate_api_key=False,
         )
 
-        assert agent.model_name == "openai-gpt-4"
-        assert agent.model == "gpt-4"
-        assert agent.openai_client == mock_model_client
+        assert agent.model_name == "OpenAI"
+        assert agent.config.model == "gpt-4o"
+        assert agent.model == "gpt-4o"
+        assert agent.openai_client == mock_openai
+        assert agent.max_output_tokens is None
+        assert agent.temperature is None
+
+    @pytest.mark.asyncio
+    async def test_init_with_parameters(self, mock_openai: AsyncOpenAI) -> None:
+        """Test agent initialization with various parameters."""
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            model="gpt-4o",
+            max_output_tokens=2048,
+            temperature=0.7,
+            reasoning={"effort": "high"},
+            tool_choice="auto",
+            parallel_tool_calls=True,
+            validate_api_key=False,
+        )
+
+        assert agent.max_output_tokens == 2048
+        assert agent.temperature == 0.7
+        assert agent.reasoning == {"effort": "high"}
+        assert agent.tool_choice == "auto"
+        assert agent.parallel_tool_calls is True
+
+    @pytest.mark.asyncio
+    async def test_init_without_client_no_api_key(self) -> None:
+        """Test agent initialization fails without API key."""
+        with patch("hud.agents.openai.settings") as mock_settings:
+            mock_settings.openai_api_key = None
+            with pytest.raises(ValueError, match="OpenAI API key not found"):
+                OpenAIAgent.create()
 
     @pytest.mark.asyncio
-    async def test_format_blocks(self, mock_mcp_client):
-        """Test formatting content blocks."""
-        mock_model_client = MagicMock()
-        agent = OperatorAgent(
-            mcp_client=mock_mcp_client,
-            model_client=mock_model_client,
-            validate_api_key=False,  # Skip validation in tests
+    async def test_format_blocks_text_only(self, mock_openai: AsyncOpenAI) -> None:
+        """Test formatting text content blocks."""
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
         )
 
-        # Test with text blocks
         blocks: list[types.ContentBlock] = [
-            types.TextContent(type="text", text="Hello, GPT!"),
-            types.TextContent(type="text", text="Another message"),
+            types.TextContent(type="text", text="Hello, world!"),
+            types.TextContent(type="text", text="How are you?"),
         ]
 
         messages = await agent.format_blocks(blocks)
-        assert len(messages) == 2
-        assert messages[0] == {"type": "input_text", "text": "Hello, GPT!"}
-        assert messages[1] == {"type": "input_text", "text": "Another message"}
+        assert len(messages) == 1
+        assert messages[0]["role"] == "user"
+        assert len(messages[0]["content"]) == 2
+        assert messages[0]["content"][0]["type"] == "input_text"
+        assert messages[0]["content"][0]["text"] == "Hello, world!"
 
-        # Test with mixed content
-        blocks = [
-            types.TextContent(type="text", text="Text content"),
+    @pytest.mark.asyncio
+    async def test_format_blocks_with_image(self, mock_openai: AsyncOpenAI) -> None:
+        """Test formatting image content blocks."""
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
+
+        blocks: list[types.ContentBlock] = [
+            types.TextContent(type="text", text="Look at this:"),
             types.ImageContent(type="image", data="base64data", mimeType="image/png"),
         ]
 
         messages = await agent.format_blocks(blocks)
-        assert len(messages) == 2
-        assert messages[0] == {"type": "input_text", "text": "Text content"}
-        assert messages[1] == {
-            "type": "input_image",
-            "image_url": "data:image/png;base64,base64data",
-        }
-
-    @pytest.mark.asyncio
-    async def test_format_tool_results(self, mock_mcp_client, mock_openai):
-        """Test formatting tool results."""
-        agent = OperatorAgent(
-            mcp_client=mock_mcp_client,
+        assert len(messages) == 1
+        assert len(messages[0]["content"]) == 2
+        assert messages[0]["content"][1]["type"] == "input_image"
+        assert messages[0]["content"][1]["image_url"] == "data:image/png;base64,base64data"  # type: ignore[typeddict-item]
+
+    @pytest.mark.asyncio
+    async def test_format_blocks_empty(self, mock_openai: AsyncOpenAI) -> None:
+        """Test formatting empty content blocks."""
+        agent = OpenAIAgent.create(
             model_client=mock_openai,
-            validate_api_key=False,  # Skip validation in tests
+            validate_api_key=False,
         )
 
-        tool_calls = [
-            MCPToolCall(name="test_tool", arguments={}, id="call_123"),  # type: ignore
-            MCPToolCall(name="screenshot", arguments={}, id="call_456"),  # type: ignore
-        ]
+        messages = await agent.format_blocks([])
+        assert len(messages) == 1
+        # Empty blocks produce a single empty text item
+        assert len(messages[0]["content"]) == 1
+        assert messages[0]["content"][0]["type"] == "input_text"
+        assert messages[0]["content"][0]["text"] == ""
+
+    @pytest.mark.asyncio
+    async def test_format_tool_results_text(self, mock_openai: AsyncOpenAI) -> None:
+        """Test formatting tool results with text content."""
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
 
+        tool_calls = [MCPToolCall(id="call_123", name="test_tool", arguments={})]
         tool_results = [
-            MCPToolResult(content=[types.TextContent(type="text", text="Success")], isError=False),
             MCPToolResult(
-                content=[types.ImageContent(type="image", data="base64data", mimeType="image/png")],
+                content=[types.TextContent(type="text", text="Tool output")],
                 isError=False,
-            ),
+            )
         ]
 
         messages = await agent.format_tool_results(tool_calls, tool_results)
-
-        # OpenAI's format_tool_results returns input_image with screenshot
         assert len(messages) == 1
-        assert messages[0]["type"] == "input_image"
-        assert "image_url" in messages[0]
-        assert messages[0]["image_url"] == "data:image/png;base64,base64data"
+        assert messages[0]["type"] == "function_call_output"
+        assert messages[0]["call_id"] == "call_123"
+        # Output is a list of content items
+        assert len(messages[0]["output"]) == 1
+        assert messages[0]["output"][0]["text"] == "Tool output"  # type: ignore[index]
 
     @pytest.mark.asyncio
-    async def test_format_tool_results_with_error(self, mock_mcp_client, mock_openai):
-        """Test formatting tool results with errors."""
-        agent = OperatorAgent(
-            mcp_client=mock_mcp_client,
+    async def test_format_tool_results_with_error(self, mock_openai: AsyncOpenAI) -> None:
+        """Test formatting tool results with error."""
+        agent = OpenAIAgent.create(
             model_client=mock_openai,
-            validate_api_key=False,  # Skip validation in tests
+            validate_api_key=False,
         )
 
-        tool_calls = [
-            MCPToolCall(name="failing_tool", arguments={}, id="call_error"),  # type: ignore
-        ]
-
+        tool_calls = [MCPToolCall(id="call_123", name="test_tool", arguments={})]
         tool_results = [
             MCPToolResult(
-                content=[types.TextContent(type="text", text="Something went wrong")], isError=True
-            ),
+                content=[types.TextContent(type="text", text="Error message")],
+                isError=True,
+            )
         ]
 
         messages = await agent.format_tool_results(tool_calls, tool_results)
+        assert len(messages) == 1
+        # Output is a list; first item is error indicator, second is the message
+        msg = cast("dict[str, Any]", messages[0])
+        output = cast("list[dict[str, Any]]", msg["output"])
+        assert any(item.get("text") == "[tool_error] true" for item in output)
+        assert any(item.get("text") == "Error message" for item in output)
+
+    @pytest.mark.asyncio
+    async def test_get_system_messages(self, mock_openai: AsyncOpenAI) -> None:
+        """Test getting system messages - OpenAI uses instructions field instead."""
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            system_prompt="You are a helpful assistant.",
+            validate_api_key=False,
+        )
 
-        # Since the result has isError=True and no screenshot, returns empty list
+        # OpenAI agent returns empty list - system prompt is passed via instructions
+        messages = await agent.get_system_messages()
         assert len(messages) == 0
 
     @pytest.mark.asyncio
-    async def test_get_model_response(self, mock_mcp_client, mock_openai):
-        """Test getting model response from OpenAI API."""
-        # Disable telemetry for this test to avoid backend configuration issues
-        with patch("hud.settings.settings.telemetry_enabled", False):
-            agent = OperatorAgent(
-                mcp_client=mock_mcp_client,
-                model_client=mock_openai,
-                validate_api_key=False,  # Skip validation in tests
+    async def test_convert_tools_for_openai(self, mock_openai: AsyncOpenAI) -> None:
+        """Test converting MCP tools to OpenAI format."""
+        tools = [
+            types.Tool(
+                name="my_tool",
+                description="A test tool",
+                inputSchema={"type": "object", "properties": {"x": {"type": "string"}}},
             )
+        ]
+        ctx = MockEvalContext(tools=tools)
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
 
-            # Set up available tools so agent doesn't return "No computer use tools available"
-            agent._available_tools = [
-                types.Tool(name="computer_openai", description="Computer tool", inputSchema={})
-            ]
+        # Initialize with context to trigger tool conversion
+        agent.ctx = ctx
+        await agent._initialize_from_ctx(ctx)
 
-            # Mock OpenAI API response for a successful computer use response
-            mock_response = MagicMock()
-            mock_response.id = "response_123"
-            mock_response.state = "completed"
-            # Mock the output message structure
-            mock_output_text = MagicMock()
-            mock_output_text.type = "output_text"
-            mock_output_text.text = "I can see the screen content."
+        # Check that tools were converted
+        assert len(agent._openai_tools) >= 1
+        # Find our tool
+        tool = next((t for t in agent._openai_tools if t.get("name") == "my_tool"), None)
+        assert tool is not None
+        assert tool["type"] == "function"
 
-            mock_output_message = MagicMock()
-            mock_output_message.type = "message"
-            mock_output_message.content = [mock_output_text]
+    @pytest.mark.asyncio
+    async def test_convert_tools_raises_on_incomplete(self, mock_openai: AsyncOpenAI) -> None:
+        """Test that tools without description raise error."""
+        tools = [
+            types.Tool(
+                name="incomplete_tool",
+                description=None,  # Missing description
+                inputSchema={"type": "object"},
+            )
+        ]
+        ctx = MockEvalContext(tools=tools)
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
 
-            mock_response.output = [mock_output_message]
+        agent.ctx = ctx
+        with pytest.raises(ValueError, match="requires both a description"):
+            await agent._initialize_from_ctx(ctx)
 
-            mock_openai.responses.create = AsyncMock(return_value=mock_response)
+    @pytest.mark.asyncio
+    async def test_get_response_with_text(self, mock_openai: AsyncOpenAI) -> None:
+        """Test getting response with text output."""
+        # Setup mock response
+        mock_response = AsyncMock()
+        mock_response.output = [
+            ResponseOutputMessage(
+                id="msg_123",
+                type="message",
+                role="assistant",
+                status="completed",
+                content=[ResponseOutputText(type="output_text", text="Hello!", annotations=[])],
+            )
+        ]
+        mock_openai.responses.create = AsyncMock(return_value=mock_response)
 
-            messages = [{"prompt": "What's on the screen?", "screenshot": None}]
-            response = await agent.get_response(messages)
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
+        # Set empty tools to avoid needing initialization
+        agent._openai_tools = []
+        agent._initialized = True
 
-            # The test should verify that the response is processed correctly
-            # Since the isinstance checks will fail, content will be empty, but done should be True
-            assert response.done is True
-            assert response.tool_calls == []
+        response = await agent.get_response([])
+        assert response.content == "Hello!"
+        assert response.done is True
+        assert len(response.tool_calls) == 0
 
     @pytest.mark.asyncio
-    async def test_handle_empty_response(self, mock_mcp_client, mock_openai):
-        """Test handling empty response from API."""
-        agent = OperatorAgent(
-            mcp_client=mock_mcp_client,
+    async def test_get_response_with_tool_call(self, mock_openai: AsyncOpenAI) -> None:
+        """Test getting response with tool call."""
+        mock_response = AsyncMock()
+        # Tool calls come as separate output items, not inside message content
+        mock_response.output = [
+            ResponseFunctionToolCall(
+                id="call_123",
+                type="function_call",
+                call_id="call_123",
+                name="my_tool",
+                arguments='{"x": "value"}',
+            )
+        ]
+        mock_openai.responses.create = AsyncMock(return_value=mock_response)
+
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
+        agent._openai_tools = []
+        agent._tool_name_map = {"my_tool": "my_tool"}
+        agent._initialized = True
+
+        response = await agent.get_response([])
+        assert response.done is False
+        assert len(response.tool_calls) == 1
+        assert response.tool_calls[0].name == "my_tool"
+        assert response.tool_calls[0].arguments == {"x": "value"}
+
+    @pytest.mark.asyncio
+    async def test_get_response_with_reasoning(self, mock_openai: AsyncOpenAI) -> None:
+        """Test getting response with reasoning."""
+        mock_response = AsyncMock()
+        mock_response.output = [
+            ResponseReasoningItem(
+                id="reason_123",
+                type="reasoning",
+                summary=[Summary(type="summary_text", text="Thinking about it...")],
+            ),
+            ResponseOutputMessage(
+                id="msg_123",
+                type="message",
+                role="assistant",
+                status="completed",
+                content=[ResponseOutputText(type="output_text", text="Answer!", annotations=[])],
+            ),
+        ]
+        mock_openai.responses.create = AsyncMock(return_value=mock_response)
+
+        agent = OpenAIAgent.create(
             model_client=mock_openai,
-            validate_api_key=False,  # Skip validation in tests
+            validate_api_key=False,
         )
+        agent._openai_tools = []
+        agent._initialized = True
+
+        response = await agent.get_response([])
+        # Reasoning is stored separately from content
+        assert response.reasoning == "Thinking about it..."
+        assert response.content == "Answer!"
+
 
-        # Set up available tools
-        agent._available_tools = [
-            types.Tool(name="openai_computer", description="Computer tool", inputSchema={})
+class TestOpenAIToolConversion:
+    """Tests for tool conversion to OpenAI format."""
+
+    @pytest.fixture
+    def mock_openai(self) -> Generator[AsyncOpenAI, None, None]:  # type: ignore[misc]
+        """Create a stub OpenAI client."""
+        with patch("hud.agents.openai.AsyncOpenAI") as mock_class:
+            client = AsyncOpenAI(api_key="test", base_url="http://localhost")
+            client.responses.create = AsyncMock()
+            mock_class.return_value = client
+            yield client  # type: ignore[misc]
+
+    @pytest.mark.asyncio
+    async def test_shell_tool_conversion(self, mock_openai: AsyncOpenAI) -> None:
+        """Test that shell tool is converted to native format."""
+        tools = [
+            types.Tool(
+                name="shell",
+                description="Execute shell commands",
+                inputSchema={"type": "object"},
+            )
         ]
+        ctx = MockEvalContext(tools=tools)
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
 
-        # Mock empty response
-        mock_response = MagicMock()
-        mock_response.id = "response_empty"
-        mock_response.state = "completed"
-        mock_response.output = []  # Empty output
+        agent.ctx = ctx
+        await agent._initialize_from_ctx(ctx)
 
-        mock_openai.responses.create = AsyncMock(return_value=mock_response)
+        # Check for native shell tool
+        shell_tool = next((t for t in agent._openai_tools if t.get("type") == "shell"), None)
+        assert shell_tool is not None
+
+    @pytest.mark.asyncio
+    async def test_computer_tool_conversion(self, mock_openai: AsyncOpenAI) -> None:
+        """Test that computer tool is converted to function format."""
+        tools = [
+            types.Tool(
+                name="computer",
+                description="Control computer",
+                inputSchema={"type": "object"},
+            )
+        ]
+        ctx = MockEvalContext(tools=tools)
+        agent = OpenAIAgent.create(
+            model_client=mock_openai,
+            validate_api_key=False,
+        )
 
-        messages = [{"prompt": "Hi", "screenshot": None}]
-        response = await agent.get_response(messages)
+        agent.ctx = ctx
+        await agent._initialize_from_ctx(ctx)
 
-        assert response.content == ""
-        assert response.tool_calls == []
+        # Computer tool is converted to a regular function tool
+        computer_tool = next(
+            (t for t in agent._openai_tools if t.get("name") == "computer"),
+            None,
+        )
+        assert computer_tool is not None
+        assert computer_tool.get("type") == "function"
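
The new tests above repeat one stubbing pattern: patch hud.agents.openai.AsyncOpenAI, hand the agent a real AsyncOpenAI client whose responses.create is an AsyncMock returning typed Responses items, and drive OpenAIAgent.get_response directly. Below is a minimal standalone sketch of that pattern, assuming the 0.5.1 names shown in the diff (OpenAIAgent.create, _openai_tools, _initialized) behave as these tests imply; it is an illustration, not part of the package.

from unittest.mock import AsyncMock, patch

import pytest
from openai import AsyncOpenAI
from openai.types.responses import ResponseOutputMessage, ResponseOutputText

from hud.agents.openai import OpenAIAgent


@pytest.mark.asyncio
async def test_text_response_sketch() -> None:
    # Build one typed assistant message like the Responses API would return.
    reply = AsyncMock()
    reply.output = [
        ResponseOutputMessage(
            id="msg_1",
            type="message",
            role="assistant",
            status="completed",
            content=[ResponseOutputText(type="output_text", text="Hello!", annotations=[])],
        )
    ]

    # Stub the OpenAI SDK client so no network traffic occurs.
    with patch("hud.agents.openai.AsyncOpenAI") as mock_class:
        client = AsyncOpenAI(api_key="test", base_url="http://localhost")
        client.responses.create = AsyncMock(return_value=reply)
        mock_class.return_value = client

        # Names below are taken from the diff; whether they are the stable
        # public surface of 0.5.1 is an assumption.
        agent = OpenAIAgent.create(model_client=client, validate_api_key=False)
        agent._openai_tools = []   # skip MCP tool conversion
        agent._initialized = True  # skip EvalContext initialization

        response = await agent.get_response([])
        assert response.content == "Hello!"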