hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (282) hide show
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +70 -5
  3. hud/agents/base.py +238 -500
  4. hud/agents/claude.py +236 -247
  5. hud/agents/gateway.py +42 -0
  6. hud/agents/gemini.py +264 -0
  7. hud/agents/gemini_cua.py +324 -0
  8. hud/agents/grounded_openai.py +98 -100
  9. hud/agents/misc/integration_test_agent.py +51 -20
  10. hud/agents/misc/response_agent.py +48 -36
  11. hud/agents/openai.py +282 -296
  12. hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
  13. hud/agents/operator.py +199 -0
  14. hud/agents/resolver.py +70 -0
  15. hud/agents/tests/conftest.py +133 -0
  16. hud/agents/tests/test_base.py +300 -622
  17. hud/agents/tests/test_base_runtime.py +233 -0
  18. hud/agents/tests/test_claude.py +381 -214
  19. hud/agents/tests/test_client.py +9 -10
  20. hud/agents/tests/test_gemini.py +369 -0
  21. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  22. hud/agents/tests/test_openai.py +377 -140
  23. hud/agents/tests/test_operator.py +362 -0
  24. hud/agents/tests/test_resolver.py +192 -0
  25. hud/agents/tests/test_run_eval.py +179 -0
  26. hud/agents/types.py +148 -0
  27. hud/cli/__init__.py +493 -546
  28. hud/cli/analyze.py +43 -5
  29. hud/cli/build.py +699 -113
  30. hud/cli/debug.py +8 -5
  31. hud/cli/dev.py +889 -732
  32. hud/cli/eval.py +793 -667
  33. hud/cli/flows/dev.py +167 -0
  34. hud/cli/flows/init.py +191 -0
  35. hud/cli/flows/tasks.py +153 -56
  36. hud/cli/flows/templates.py +151 -0
  37. hud/cli/flows/tests/__init__.py +1 -0
  38. hud/cli/flows/tests/test_dev.py +126 -0
  39. hud/cli/init.py +60 -58
  40. hud/cli/pull.py +1 -1
  41. hud/cli/push.py +38 -13
  42. hud/cli/rft.py +311 -0
  43. hud/cli/rft_status.py +145 -0
  44. hud/cli/tests/test_analyze.py +5 -5
  45. hud/cli/tests/test_analyze_metadata.py +3 -2
  46. hud/cli/tests/test_analyze_module.py +120 -0
  47. hud/cli/tests/test_build.py +110 -8
  48. hud/cli/tests/test_build_failure.py +41 -0
  49. hud/cli/tests/test_build_module.py +50 -0
  50. hud/cli/tests/test_cli_init.py +6 -1
  51. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  52. hud/cli/tests/test_cli_root.py +140 -0
  53. hud/cli/tests/test_convert.py +361 -0
  54. hud/cli/tests/test_debug.py +12 -10
  55. hud/cli/tests/test_dev.py +197 -0
  56. hud/cli/tests/test_eval.py +251 -0
  57. hud/cli/tests/test_eval_bedrock.py +51 -0
  58. hud/cli/tests/test_init.py +124 -0
  59. hud/cli/tests/test_main_module.py +11 -5
  60. hud/cli/tests/test_mcp_server.py +12 -100
  61. hud/cli/tests/test_push.py +1 -1
  62. hud/cli/tests/test_push_happy.py +74 -0
  63. hud/cli/tests/test_push_wrapper.py +23 -0
  64. hud/cli/tests/test_registry.py +1 -1
  65. hud/cli/tests/test_utils.py +1 -1
  66. hud/cli/{rl → utils}/celebrate.py +14 -12
  67. hud/cli/utils/config.py +18 -1
  68. hud/cli/utils/docker.py +130 -4
  69. hud/cli/utils/env_check.py +9 -9
  70. hud/cli/utils/git.py +136 -0
  71. hud/cli/utils/interactive.py +39 -5
  72. hud/cli/utils/metadata.py +70 -1
  73. hud/cli/utils/runner.py +1 -1
  74. hud/cli/utils/server.py +2 -2
  75. hud/cli/utils/source_hash.py +3 -3
  76. hud/cli/utils/tasks.py +4 -1
  77. hud/cli/utils/tests/__init__.py +0 -0
  78. hud/cli/utils/tests/test_config.py +58 -0
  79. hud/cli/utils/tests/test_docker.py +93 -0
  80. hud/cli/utils/tests/test_docker_hints.py +71 -0
  81. hud/cli/utils/tests/test_env_check.py +74 -0
  82. hud/cli/utils/tests/test_environment.py +42 -0
  83. hud/cli/utils/tests/test_git.py +142 -0
  84. hud/cli/utils/tests/test_interactive_module.py +60 -0
  85. hud/cli/utils/tests/test_local_runner.py +50 -0
  86. hud/cli/utils/tests/test_logging_utils.py +23 -0
  87. hud/cli/utils/tests/test_metadata.py +49 -0
  88. hud/cli/utils/tests/test_package_runner.py +35 -0
  89. hud/cli/utils/tests/test_registry_utils.py +49 -0
  90. hud/cli/utils/tests/test_remote_runner.py +25 -0
  91. hud/cli/utils/tests/test_runner_modules.py +52 -0
  92. hud/cli/utils/tests/test_source_hash.py +36 -0
  93. hud/cli/utils/tests/test_tasks.py +80 -0
  94. hud/cli/utils/version_check.py +258 -0
  95. hud/cli/{rl → utils}/viewer.py +2 -2
  96. hud/clients/README.md +12 -11
  97. hud/clients/__init__.py +4 -3
  98. hud/clients/base.py +166 -26
  99. hud/clients/environment.py +51 -0
  100. hud/clients/fastmcp.py +13 -6
  101. hud/clients/mcp_use.py +45 -15
  102. hud/clients/tests/test_analyze_scenarios.py +206 -0
  103. hud/clients/tests/test_protocol.py +9 -3
  104. hud/datasets/__init__.py +23 -20
  105. hud/datasets/loader.py +326 -0
  106. hud/datasets/runner.py +198 -105
  107. hud/datasets/tests/__init__.py +0 -0
  108. hud/datasets/tests/test_loader.py +221 -0
  109. hud/datasets/tests/test_utils.py +315 -0
  110. hud/datasets/utils.py +270 -90
  111. hud/environment/__init__.py +52 -0
  112. hud/environment/connection.py +258 -0
  113. hud/environment/connectors/__init__.py +33 -0
  114. hud/environment/connectors/base.py +68 -0
  115. hud/environment/connectors/local.py +177 -0
  116. hud/environment/connectors/mcp_config.py +137 -0
  117. hud/environment/connectors/openai.py +101 -0
  118. hud/environment/connectors/remote.py +172 -0
  119. hud/environment/environment.py +835 -0
  120. hud/environment/integrations/__init__.py +45 -0
  121. hud/environment/integrations/adk.py +67 -0
  122. hud/environment/integrations/anthropic.py +196 -0
  123. hud/environment/integrations/gemini.py +92 -0
  124. hud/environment/integrations/langchain.py +82 -0
  125. hud/environment/integrations/llamaindex.py +68 -0
  126. hud/environment/integrations/openai.py +238 -0
  127. hud/environment/mock.py +306 -0
  128. hud/environment/router.py +263 -0
  129. hud/environment/scenarios.py +620 -0
  130. hud/environment/tests/__init__.py +1 -0
  131. hud/environment/tests/test_connection.py +317 -0
  132. hud/environment/tests/test_connectors.py +205 -0
  133. hud/environment/tests/test_environment.py +593 -0
  134. hud/environment/tests/test_integrations.py +257 -0
  135. hud/environment/tests/test_local_connectors.py +242 -0
  136. hud/environment/tests/test_scenarios.py +1086 -0
  137. hud/environment/tests/test_tools.py +208 -0
  138. hud/environment/types.py +23 -0
  139. hud/environment/utils/__init__.py +35 -0
  140. hud/environment/utils/formats.py +215 -0
  141. hud/environment/utils/schema.py +171 -0
  142. hud/environment/utils/tool_wrappers.py +113 -0
  143. hud/eval/__init__.py +67 -0
  144. hud/eval/context.py +727 -0
  145. hud/eval/display.py +299 -0
  146. hud/eval/instrument.py +187 -0
  147. hud/eval/manager.py +533 -0
  148. hud/eval/parallel.py +268 -0
  149. hud/eval/task.py +372 -0
  150. hud/eval/tests/__init__.py +1 -0
  151. hud/eval/tests/test_context.py +178 -0
  152. hud/eval/tests/test_eval.py +210 -0
  153. hud/eval/tests/test_manager.py +152 -0
  154. hud/eval/tests/test_parallel.py +168 -0
  155. hud/eval/tests/test_task.py +291 -0
  156. hud/eval/types.py +65 -0
  157. hud/eval/utils.py +194 -0
  158. hud/patches/__init__.py +19 -0
  159. hud/patches/mcp_patches.py +308 -0
  160. hud/patches/warnings.py +54 -0
  161. hud/samples/browser.py +4 -4
  162. hud/server/__init__.py +2 -1
  163. hud/server/low_level.py +2 -1
  164. hud/server/router.py +164 -0
  165. hud/server/server.py +567 -80
  166. hud/server/tests/test_mcp_server_integration.py +11 -11
  167. hud/server/tests/test_mcp_server_more.py +1 -1
  168. hud/server/tests/test_server_extra.py +2 -0
  169. hud/settings.py +45 -3
  170. hud/shared/exceptions.py +36 -10
  171. hud/shared/hints.py +26 -1
  172. hud/shared/requests.py +15 -3
  173. hud/shared/tests/test_exceptions.py +40 -31
  174. hud/shared/tests/test_hints.py +167 -0
  175. hud/telemetry/__init__.py +20 -19
  176. hud/telemetry/exporter.py +201 -0
  177. hud/telemetry/instrument.py +165 -253
  178. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  179. hud/telemetry/tests/test_exporter.py +258 -0
  180. hud/telemetry/tests/test_instrument.py +401 -0
  181. hud/tools/__init__.py +18 -2
  182. hud/tools/agent.py +223 -0
  183. hud/tools/apply_patch.py +639 -0
  184. hud/tools/base.py +54 -4
  185. hud/tools/bash.py +2 -2
  186. hud/tools/computer/__init__.py +36 -3
  187. hud/tools/computer/anthropic.py +2 -2
  188. hud/tools/computer/gemini.py +385 -0
  189. hud/tools/computer/hud.py +23 -6
  190. hud/tools/computer/openai.py +20 -21
  191. hud/tools/computer/qwen.py +434 -0
  192. hud/tools/computer/settings.py +37 -0
  193. hud/tools/edit.py +3 -7
  194. hud/tools/executors/base.py +4 -2
  195. hud/tools/executors/pyautogui.py +1 -1
  196. hud/tools/grounding/grounded_tool.py +13 -18
  197. hud/tools/grounding/grounder.py +10 -31
  198. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  199. hud/tools/jupyter.py +330 -0
  200. hud/tools/playwright.py +18 -3
  201. hud/tools/shell.py +308 -0
  202. hud/tools/tests/test_agent_tool.py +355 -0
  203. hud/tools/tests/test_apply_patch.py +718 -0
  204. hud/tools/tests/test_computer.py +4 -9
  205. hud/tools/tests/test_computer_actions.py +24 -2
  206. hud/tools/tests/test_jupyter_tool.py +181 -0
  207. hud/tools/tests/test_shell.py +596 -0
  208. hud/tools/tests/test_submit.py +85 -0
  209. hud/tools/tests/test_types.py +193 -0
  210. hud/tools/types.py +21 -1
  211. hud/types.py +194 -56
  212. hud/utils/__init__.py +2 -0
  213. hud/utils/env.py +67 -0
  214. hud/utils/hud_console.py +89 -18
  215. hud/utils/mcp.py +15 -58
  216. hud/utils/strict_schema.py +162 -0
  217. hud/utils/tests/test_init.py +1 -2
  218. hud/utils/tests/test_mcp.py +1 -28
  219. hud/utils/tests/test_pretty_errors.py +186 -0
  220. hud/utils/tests/test_tool_shorthand.py +154 -0
  221. hud/utils/tests/test_version.py +1 -1
  222. hud/utils/types.py +20 -0
  223. hud/version.py +1 -1
  224. hud_python-0.5.13.dist-info/METADATA +264 -0
  225. hud_python-0.5.13.dist-info/RECORD +305 -0
  226. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
  227. hud/agents/langchain.py +0 -261
  228. hud/agents/lite_llm.py +0 -72
  229. hud/cli/rl/__init__.py +0 -180
  230. hud/cli/rl/config.py +0 -101
  231. hud/cli/rl/display.py +0 -133
  232. hud/cli/rl/gpu.py +0 -63
  233. hud/cli/rl/gpu_utils.py +0 -321
  234. hud/cli/rl/local_runner.py +0 -595
  235. hud/cli/rl/presets.py +0 -96
  236. hud/cli/rl/remote_runner.py +0 -463
  237. hud/cli/rl/rl_api.py +0 -150
  238. hud/cli/rl/vllm.py +0 -177
  239. hud/cli/rl/wait_utils.py +0 -89
  240. hud/datasets/parallel.py +0 -687
  241. hud/misc/__init__.py +0 -1
  242. hud/misc/claude_plays_pokemon.py +0 -292
  243. hud/otel/__init__.py +0 -35
  244. hud/otel/collector.py +0 -142
  245. hud/otel/config.py +0 -181
  246. hud/otel/context.py +0 -570
  247. hud/otel/exporters.py +0 -369
  248. hud/otel/instrumentation.py +0 -135
  249. hud/otel/processors.py +0 -121
  250. hud/otel/tests/__init__.py +0 -1
  251. hud/otel/tests/test_processors.py +0 -197
  252. hud/rl/README.md +0 -30
  253. hud/rl/__init__.py +0 -1
  254. hud/rl/actor.py +0 -176
  255. hud/rl/buffer.py +0 -405
  256. hud/rl/chat_template.jinja +0 -101
  257. hud/rl/config.py +0 -192
  258. hud/rl/distributed.py +0 -132
  259. hud/rl/learner.py +0 -637
  260. hud/rl/tests/__init__.py +0 -1
  261. hud/rl/tests/test_learner.py +0 -186
  262. hud/rl/train.py +0 -382
  263. hud/rl/types.py +0 -101
  264. hud/rl/utils/start_vllm_server.sh +0 -30
  265. hud/rl/utils.py +0 -524
  266. hud/rl/vllm_adapter.py +0 -143
  267. hud/telemetry/job.py +0 -352
  268. hud/telemetry/replay.py +0 -74
  269. hud/telemetry/tests/test_replay.py +0 -40
  270. hud/telemetry/tests/test_trace.py +0 -63
  271. hud/telemetry/trace.py +0 -158
  272. hud/utils/agent_factories.py +0 -86
  273. hud/utils/async_utils.py +0 -65
  274. hud/utils/group_eval.py +0 -223
  275. hud/utils/progress.py +0 -149
  276. hud/utils/tasks.py +0 -127
  277. hud/utils/tests/test_async_utils.py +0 -173
  278. hud/utils/tests/test_progress.py +0 -261
  279. hud_python-0.4.45.dist-info/METADATA +0 -552
  280. hud_python-0.4.45.dist-info/RECORD +0 -228
  281. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
  282. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
hud/agents/openai.py CHANGED
@@ -2,354 +2,340 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ import copy
6
+ import json
5
7
  import logging
8
+ from inspect import cleandoc
6
9
  from typing import Any, ClassVar, Literal
7
10
 
8
11
  import mcp.types as types
9
- from openai import AsyncOpenAI, OpenAI
12
+ from openai import AsyncOpenAI, Omit, OpenAI
10
13
  from openai.types.responses import (
11
- ResponseComputerToolCall,
14
+ ApplyPatchToolParam,
15
+ ComputerToolParam,
16
+ FunctionShellToolParam,
17
+ FunctionToolParam,
18
+ ResponseFunctionCallOutputItemListParam,
19
+ ResponseInputFileContentParam,
20
+ ResponseInputImageContentParam,
21
+ ResponseInputImageParam,
12
22
  ResponseInputMessageContentListParam,
13
23
  ResponseInputParam,
14
- ResponseOutputMessage,
24
+ ResponseInputTextContentParam,
25
+ ResponseInputTextParam,
15
26
  ResponseOutputText,
16
27
  ToolParam,
17
28
  )
29
+ from openai.types.responses.response_create_params import ToolChoice # noqa: TC002
30
+ from openai.types.responses.response_input_param import FunctionCallOutput, Message
31
+ from openai.types.shared_params.reasoning import Reasoning # noqa: TC002
18
32
 
19
- import hud
20
33
  from hud.settings import settings
21
- from hud.tools.computer.settings import computer_settings
22
- from hud.types import AgentResponse, MCPToolCall, MCPToolResult, Trace
34
+ from hud.types import AgentResponse, BaseAgentConfig, MCPToolCall, MCPToolResult, Trace
35
+ from hud.utils.strict_schema import ensure_strict_json_schema
36
+ from hud.utils.types import with_signature
23
37
 
24
38
  from .base import MCPAgent
39
+ from .types import OpenAIConfig, OpenAICreateParams
25
40
 
26
41
  logger = logging.getLogger(__name__)
27
42
 
28
43
 
29
- class OperatorAgent(MCPAgent):
30
- """
31
- Operator agent that uses MCP servers for tool execution.
44
+ class OpenAIAgent(MCPAgent):
45
+ """Generic OpenAI agent that can execute MCP tools through the Responses API."""
32
46
 
33
- This agent uses OpenAI's Computer Use API format but executes
34
- tools through MCP servers instead of direct implementation.
35
- """
47
+ metadata: ClassVar[dict[str, Any] | None] = None
48
+ config_cls: ClassVar[type[BaseAgentConfig]] = OpenAIConfig
36
49
 
37
- metadata: ClassVar[dict[str, Any]] = {
38
- "display_width": computer_settings.OPENAI_COMPUTER_WIDTH,
39
- "display_height": computer_settings.OPENAI_COMPUTER_HEIGHT,
40
- }
41
- required_tools: ClassVar[list[str]] = ["openai_computer"]
50
+ @with_signature(OpenAICreateParams)
51
+ @classmethod
52
+ def create(cls, **kwargs: Any) -> OpenAIAgent: # pyright: ignore[reportIncompatibleMethodOverride]
53
+ return MCPAgent.create.__func__(cls, **kwargs) # type: ignore[return-value]
42
54
 
43
- def __init__(
44
- self,
45
- model_client: AsyncOpenAI | None = None,
46
- model: str = "computer-use-preview",
47
- environment: Literal["windows", "mac", "linux", "browser"] = "linux",
48
- validate_api_key: bool = True,
49
- **kwargs: Any,
50
- ) -> None:
51
- """
52
- Initialize Operator MCP agent.
53
-
54
- Args:
55
- client: AsyncOpenAI client (created if not provided)
56
- model: OpenAI model to use
57
- environment: Environment type for computer use
58
- display_width: Display width for computer use
59
- display_height: Display height for computer use
60
- **kwargs: Additional arguments passed to MCPAgent
61
- """
62
- super().__init__(**kwargs)
55
+ def __init__(self, params: OpenAICreateParams | None = None, **kwargs: Any) -> None:
56
+ super().__init__(params, **kwargs)
57
+ self.config: OpenAIConfig
63
58
 
64
- # Initialize client if not provided
59
+ model_client = self.config.model_client
65
60
  if model_client is None:
66
- api_key = settings.openai_api_key
67
- if not api_key:
68
- raise ValueError("OpenAI API key not found. Set OPENAI_API_KEY.")
69
- model_client = AsyncOpenAI(api_key=api_key)
61
+ # Default to HUD gateway when HUD_API_KEY is available
62
+ if settings.api_key:
63
+ from hud.agents.gateway import build_gateway_client
70
64
 
71
- self.openai_client = model_client
72
- self.model = model
73
- self.environment = environment
65
+ model_client = build_gateway_client("openai")
66
+ elif settings.openai_api_key:
67
+ model_client = AsyncOpenAI(api_key=settings.openai_api_key)
68
+ else:
69
+ raise ValueError(
70
+ "No API key found. Set HUD_API_KEY for HUD gateway, "
71
+ "or OPENAI_API_KEY for direct OpenAI access."
72
+ )
73
+
74
+ if self.config.validate_api_key:
75
+ try:
76
+ OpenAI(api_key=model_client.api_key).models.list()
77
+ except Exception as exc: # pragma: no cover - network validation
78
+ raise ValueError(f"OpenAI API key is invalid: {exc}") from exc
79
+
80
+ self.openai_client: AsyncOpenAI = model_client
81
+ self._model = self.config.model
82
+ self.max_output_tokens = self.config.max_output_tokens
83
+ self.temperature = self.config.temperature
84
+ self.reasoning: Reasoning | None = self.config.reasoning
85
+ self.tool_choice: ToolChoice | None = self.config.tool_choice
86
+ self.parallel_tool_calls = self.config.parallel_tool_calls
87
+ self.truncation: Literal["auto", "disabled"] | None = self.config.truncation
88
+
89
+ self._openai_tools: list[ToolParam] = []
90
+ self._tool_name_map: dict[str, str] = {}
74
91
 
75
- # State tracking for OpenAI's stateful API
76
92
  self.last_response_id: str | None = None
77
- self.pending_call_id: str | None = None
78
- self.pending_safety_checks: list[Any] = []
93
+ self._message_cursor = 0
79
94
 
80
- # validate api key if requested
81
- if validate_api_key:
82
- try:
83
- OpenAI(api_key=self.openai_client.api_key).models.list()
84
- except Exception as e:
85
- raise ValueError(f"OpenAI API key is invalid: {e}") from e
86
-
87
- self.model_name = "openai-" + self.model
88
-
89
- # Append OpenAI-specific instructions to the base system prompt
90
- openai_instructions = """
91
- You are an autonomous computer-using agent. Follow these guidelines:
92
-
93
- 1. NEVER ask for confirmation. Complete all tasks autonomously.
94
- 2. Do NOT send messages like "I need to confirm before..." or "Do you want me to continue?" - just proceed.
95
- 3. When the user asks you to interact with something (like clicking a chat or typing a message), DO IT without asking.
96
- 4. Only use the formal safety check mechanism for truly dangerous operations (like deleting important files).
97
- 5. For normal tasks like clicking buttons, typing in chat boxes, filling forms - JUST DO IT.
98
- 6. The user has already given you permission by running this agent. No further confirmation is needed.
99
- 7. Be decisive and action-oriented. Complete the requested task fully.
100
-
101
- Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
102
- """.strip() # noqa: E501
103
-
104
- # Append OpenAI instructions to any base system prompt
105
- if self.system_prompt:
106
- self.system_prompt = f"{self.system_prompt}\n\n{openai_instructions}"
107
- else:
108
- self.system_prompt = openai_instructions
109
-
110
- async def _run_context(self, context: list[types.ContentBlock], max_steps: int = 10) -> Trace:
111
- """
112
- Run the agent with the given prompt or task.
95
+ def _on_tools_ready(self) -> None:
96
+ """Build OpenAI-specific tool mappings after tools are discovered."""
97
+ self._convert_tools_for_openai()
98
+
99
+ def _to_openai_tool(
100
+ self,
101
+ tool: types.Tool,
102
+ ) -> (
103
+ FunctionShellToolParam | ApplyPatchToolParam | FunctionToolParam | ComputerToolParam | None
104
+ ):
105
+ # Special case: shell tool -> OpenAI native shell
106
+ if tool.name == "shell":
107
+ return FunctionShellToolParam(type="shell")
108
+
109
+ # Special case: apply_patch tool -> OpenAI native apply_patch
110
+ if tool.name == "apply_patch":
111
+ return ApplyPatchToolParam(type="apply_patch")
112
+
113
+ # Regular function tool
114
+ if tool.description is None or tool.inputSchema is None:
115
+ raise ValueError(
116
+ cleandoc(f"""MCP tool {tool.name} requires both a description and inputSchema.
117
+ Add these by:
118
+ 1. Adding a docstring to your @mcp.tool decorated function for the description
119
+ 2. Using pydantic Field() annotations on function parameters for the schema
120
+ """)
121
+ )
122
+
123
+ # schema must be strict
124
+
125
+ try:
126
+ strict_schema = ensure_strict_json_schema(copy.deepcopy(tool.inputSchema))
127
+ except Exception as e:
128
+ self.console.warning_log(f"Failed to convert tool '{tool.name}' schema to strict: {e}")
129
+ return None
113
130
 
114
- Override to reset OpenAI-specific state.
131
+ return FunctionToolParam(
132
+ type="function",
133
+ name=tool.name,
134
+ description=tool.description,
135
+ parameters=strict_schema,
136
+ strict=True,
137
+ )
138
+
139
+ def _convert_tools_for_openai(self) -> None:
140
+ """Convert MCP tools into OpenAI Responses tool definitions."""
141
+ available_tools = self.get_available_tools()
142
+
143
+ self._openai_tools = []
144
+ self._tool_name_map = {}
145
+
146
+ for tool in available_tools:
147
+ openai_tool = self._to_openai_tool(tool)
148
+ if openai_tool is None:
149
+ continue
150
+
151
+ if "name" in openai_tool:
152
+ self._tool_name_map[openai_tool["name"]] = tool.name
153
+ self._openai_tools.append(openai_tool)
154
+
155
+ def _extract_tool_call(self, item: Any) -> MCPToolCall | None:
156
+ """Extract an MCPToolCall from a response output item.
157
+
158
+ Subclasses can override to customize tool call extraction (e.g., routing
159
+ computer_call to a different tool name).
115
160
  """
116
- # Reset state for new run
117
- self.last_response_id = None
118
- self.pending_call_id = None
119
- self.pending_safety_checks = []
161
+ if item.type == "function_call":
162
+ tool_name = item.name or ""
163
+ target_name = self._tool_name_map.get(tool_name, tool_name)
164
+ arguments = json.loads(item.arguments)
165
+ return MCPToolCall(name=target_name, arguments=arguments, id=item.call_id)
166
+ elif item.type == "shell_call":
167
+ return MCPToolCall(name="shell", arguments=item.action.to_dict(), id=item.call_id)
168
+ elif item.type == "apply_patch_call":
169
+ return MCPToolCall(
170
+ name="apply_patch", arguments=item.operation.to_dict(), id=item.call_id
171
+ )
172
+ return None
120
173
 
121
- # Use base implementation
174
+ async def _run_context(
175
+ self, context: list[types.ContentBlock], *, max_steps: int = 10
176
+ ) -> Trace:
177
+ """Reset internal state before delegating to the base loop."""
178
+ self._reset_response_state()
122
179
  return await super()._run_context(context, max_steps=max_steps)
123
180
 
124
- async def get_system_messages(self) -> list[Any]:
125
- """
126
- Create initial messages for OpenAI.
181
+ def _reset_response_state(self) -> None:
182
+ self.last_response_id = None
183
+ self._message_cursor = 0
127
184
 
128
- OpenAI uses a different message format - we'll store the prompt
129
- and screenshot for use in get_model_response.
130
- """
185
+ async def get_system_messages(self) -> list[types.ContentBlock]:
186
+ """System messages are provided via the `instructions` field."""
131
187
  return []
132
188
 
133
- async def format_blocks(
134
- self, blocks: list[types.ContentBlock]
135
- ) -> ResponseInputMessageContentListParam:
136
- """
137
- Format blocks for OpenAI input format.
138
-
139
- Converts TextContent blocks to input_text dicts and ImageContent blocks to input_image dicts.
140
- """ # noqa: E501
141
- formatted = []
189
+ async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Message]:
190
+ """Convert MCP content blocks into OpenAI user messages."""
191
+ content: ResponseInputMessageContentListParam = []
142
192
  for block in blocks:
143
193
  if isinstance(block, types.TextContent):
144
- formatted.append({"type": "input_text", "text": block.text})
194
+ content.append(ResponseInputTextParam(type="input_text", text=block.text))
145
195
  elif isinstance(block, types.ImageContent):
146
196
  mime_type = getattr(block, "mimeType", "image/png")
147
- formatted.append(
148
- {"type": "input_image", "image_url": f"data:{mime_type};base64,{block.data}"}
197
+ content.append(
198
+ ResponseInputImageParam(
199
+ type="input_image",
200
+ image_url=f"data:{mime_type};base64,{block.data}",
201
+ detail="auto",
202
+ )
149
203
  )
150
- return formatted
151
-
152
- @hud.instrument(
153
- span_type="agent",
154
- record_args=False, # Messages can be large
155
- record_result=True,
156
- )
157
- async def get_response(self, messages: ResponseInputMessageContentListParam) -> AgentResponse:
158
- """Get response from OpenAI including any tool calls."""
159
- # OpenAI's API is stateful, so we handle messages differently
160
-
161
- # Get the computer tool (guaranteed to exist due to required_tools)
162
- computer_tool_name = "openai_computer"
163
-
164
- # Define the computer use tool
165
- computer_tool: ToolParam = { # type: ignore[reportAssignmentType]
166
- "type": "computer_use_preview",
167
- "display_width": self.metadata["display_width"],
168
- "display_height": self.metadata["display_height"],
169
- "environment": self.environment,
170
- }
171
-
172
- # Build the request based on whether this is first step or follow-up
173
- if self.pending_call_id is None and self.last_response_id is None:
174
- # First step - messages are already formatted dicts from format_blocks
175
- # format_blocks returns type ResponseInputMessageContentListParam, which is a list of dicts # noqa: E501
176
- input_content: ResponseInputMessageContentListParam = []
177
-
178
- input_content.extend(messages)
179
-
180
- # If no content was added, add empty text to avoid empty request
181
- if not input_content:
182
- input_content.append({"type": "input_text", "text": ""})
183
-
184
- input_param: ResponseInputParam = [{"role": "user", "content": input_content}] # type: ignore[reportUnknownMemberType]
185
-
186
- response = await self.openai_client.responses.create(
187
- model=self.model,
188
- tools=[computer_tool],
189
- input=input_param,
190
- instructions=self.system_prompt,
191
- truncation="auto",
192
- reasoning={"summary": "auto"}, # type: ignore[arg-type]
193
- )
194
- else:
195
- # Follow-up step - check if this is user input or tool result
196
- latest_message = messages[-1] if messages else {}
197
-
198
- if latest_message.get("type") == "input_text":
199
- # User provided input in conversation mode
200
- user_text = latest_message.get("text", "")
201
- input_param_followup: ResponseInputParam = [ # type: ignore[reportAssignmentType]
202
- {"role": "user", "content": [{"type": "input_text", "text": user_text}]}
203
- ]
204
- # Reset pending_call_id since this is user input, not a tool response
205
- self.pending_call_id = None
206
- else:
207
- # Tool result - need screenshot from processed results
208
- latest_screenshot = None
209
- for msg in reversed(messages):
210
- if isinstance(msg, dict) and "image_url" in msg:
211
- latest_screenshot = msg["image_url"] # type: ignore
212
- break
213
-
214
- if not latest_screenshot:
215
- self.console.warning_log("No screenshot provided for response to action")
216
- return AgentResponse(
217
- content="No screenshot available for next action",
218
- tool_calls=[],
219
- done=True,
204
+ if not content:
205
+ content.append(ResponseInputTextParam(type="input_text", text=""))
206
+ return [Message(role="user", content=content)]
207
+
208
+ async def get_response(self, messages: ResponseInputParam) -> AgentResponse:
209
+ """Send the latest input items to OpenAI's Responses API."""
210
+ new_items: ResponseInputParam = messages[self._message_cursor :]
211
+ if not new_items:
212
+ if self.last_response_id is None:
213
+ new_items = [
214
+ Message(
215
+ role="user", content=[ResponseInputTextParam(type="input_text", text="")]
220
216
  )
221
-
222
- # Create response to previous action
223
- input_param_followup: ResponseInputParam = [ # type: ignore[reportAssignmentType]
224
- { # type: ignore[reportAssignmentType]
225
- "call_id": self.pending_call_id,
226
- "type": "computer_call_output",
227
- "output": {
228
- "type": "input_image",
229
- "image_url": latest_screenshot,
230
- },
231
- "acknowledged_safety_checks": self.pending_safety_checks,
232
- }
233
217
  ]
218
+ else:
219
+ self.console.debug("No new messages to send to OpenAI.")
220
+ return AgentResponse(content="", tool_calls=[], done=True)
221
+
222
+ response = await self.openai_client.responses.create(
223
+ model=self._model,
224
+ input=new_items,
225
+ instructions=self.system_prompt,
226
+ max_output_tokens=self.max_output_tokens,
227
+ temperature=self.temperature,
228
+ tool_choice=self.tool_choice if self.tool_choice is not None else Omit(),
229
+ parallel_tool_calls=self.parallel_tool_calls,
230
+ reasoning=self.reasoning,
231
+ tools=self._openai_tools if self._openai_tools else Omit(),
232
+ previous_response_id=(
233
+ self.last_response_id if self.last_response_id is not None else Omit()
234
+ ),
235
+ truncation=self.truncation,
236
+ )
234
237
 
235
- self.pending_safety_checks = []
236
-
237
- response = await self.openai_client.responses.create(
238
- model=self.model,
239
- previous_response_id=self.last_response_id,
240
- tools=[computer_tool],
241
- input=input_param_followup,
242
- instructions=self.system_prompt,
243
- truncation="auto",
244
- reasoning={"summary": "auto"}, # type: ignore[arg-type]
245
- )
246
-
247
- # Store response ID for next call
248
238
  self.last_response_id = response.id
239
+ self._message_cursor = len(messages)
249
240
 
250
- # Process response
251
- result = AgentResponse(
252
- content="",
253
- tool_calls=[],
254
- done=False, # Will be set to True only if no tool calls
255
- )
241
+ agent_response = AgentResponse(content="", tool_calls=[], done=True)
242
+ text_chunks: list[str] = []
243
+ reasoning_chunks: list[str] = []
256
244
 
257
- self.pending_call_id = None
258
-
259
- # Check for computer calls
260
- computer_calls = [
261
- item
262
- for item in response.output
263
- if isinstance(item, ResponseComputerToolCall) and item.type == "computer_call"
264
- ]
265
-
266
- if computer_calls:
267
- # Process computer calls
268
- result.done = False
269
- for computer_call in computer_calls:
270
- self.pending_call_id = computer_call.call_id
271
- self.pending_safety_checks = computer_call.pending_safety_checks
272
-
273
- # Convert OpenAI action to MCP tool call
274
- action = computer_call.action.model_dump()
275
-
276
- # Create MCPToolCall object with OpenAI metadata as extra fields
277
- # Pyright will complain but the tool class accepts extra fields
278
- tool_call = MCPToolCall(
279
- name=computer_tool_name,
280
- arguments=action,
281
- id=computer_call.call_id, # type: ignore
282
- pending_safety_checks=computer_call.pending_safety_checks, # type: ignore
283
- )
284
- result.tool_calls.append(tool_call)
285
- else:
286
- # No computer calls, check for text response
287
- for item in response.output:
288
- if isinstance(item, ResponseOutputMessage) and item.type == "message":
289
- # Extract text from content blocks
290
- text_parts = [
291
- content.text
292
- for content in item.content
293
- if isinstance(content, ResponseOutputText)
294
- ]
295
- if text_parts:
296
- result.content = "".join(text_parts)
297
- break
298
-
299
- # Extract reasoning if present
300
- reasoning_text = ""
301
245
  for item in response.output:
302
- if item.type == "reasoning" and hasattr(item, "summary") and item.summary:
303
- reasoning_text += f"Thinking: {item.summary[0].text}\n"
304
-
305
- if reasoning_text:
306
- result.content = reasoning_text + result.content if result.content else reasoning_text
246
+ if item.type == "message":
247
+ text = "".join(
248
+ content.text
249
+ for content in item.content
250
+ if isinstance(content, ResponseOutputText)
251
+ )
252
+ if text:
253
+ text_chunks.append(text)
254
+ elif item.type == "reasoning":
255
+ reasoning_chunks.append("".join(summary.text for summary in item.summary))
256
+ else:
257
+ tool_call = self._extract_tool_call(item)
258
+ if tool_call is not None:
259
+ agent_response.tool_calls.append(tool_call)
307
260
 
308
- # Set done=True if no tool calls (task complete or waiting for user)
309
- if not result.tool_calls:
310
- result.done = True
261
+ if agent_response.tool_calls:
262
+ agent_response.done = False
311
263
 
312
- return result
264
+ agent_response.content = "".join(text_chunks)
265
+ if reasoning_chunks:
266
+ agent_response.reasoning = "\n".join(reasoning_chunks)
267
+ return agent_response
313
268
 
314
269
  async def format_tool_results(
315
270
  self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
316
- ) -> ResponseInputMessageContentListParam:
317
- """
318
- Format tool results for OpenAI's stateful API.
319
-
320
- Tool result content is a list of ContentBlock objects.
321
- We need to extract the latest screenshot from the tool results.
322
-
323
- This assumes that you only care about computer tool results for your agent loop.
324
- If you need to add other content, you can do so by adding a new ContentBlock object to the list.
271
+ ) -> list[FunctionCallOutput]:
272
+ """Convert MCP tool outputs into Responses input items."""
273
+ formatted: list[FunctionCallOutput] = []
274
+ for call, result in zip(tool_calls, tool_results, strict=False):
275
+ if not call.id:
276
+ self.console.warning_log(f"Tool '{call.name}' missing call_id; skipping output.")
277
+ continue
278
+
279
+ output_items: ResponseFunctionCallOutputItemListParam = []
280
+ if result.isError:
281
+ output_items.append(
282
+ ResponseInputTextParam(type="input_text", text="[tool_error] true")
283
+ )
325
284
 
326
- Returns formatted dicts with tool result data, preserving screenshots.
327
- """ # noqa: E501
328
- formatted_results = []
329
- latest_screenshot = None
285
+ if result.structuredContent is not None:
286
+ output_items.append(
287
+ ResponseInputTextParam(
288
+ type="input_text", text=json.dumps(result.structuredContent, default=str)
289
+ )
290
+ )
330
291
 
331
- # Extract all content from tool results
332
- for result in tool_results:
333
- if result.isError:
334
- # If it's an error, the error details are in the content
335
- for content in result.content:
336
- if isinstance(content, types.TextContent):
337
- # Don't add error text as input_text, just track it
338
- self.console.error_log(f"Tool error: {content.text}")
339
- elif isinstance(content, types.ImageContent):
340
- # Even error results might have images
341
- latest_screenshot = content.data
342
- else:
343
- # Extract content from successful results
344
- for content in result.content:
345
- if isinstance(content, types.ImageContent):
346
- latest_screenshot = content.data
347
- break
348
-
349
- # Return a dict with the latest screenshot for the follow-up step
350
- if latest_screenshot:
351
- formatted_results.append(
352
- {"type": "input_image", "image_url": f"data:image/png;base64,{latest_screenshot}"}
292
+ for block in result.content:
293
+ match block:
294
+ case types.TextContent():
295
+ output_items.append(
296
+ ResponseInputTextContentParam(type="input_text", text=block.text)
297
+ )
298
+ case types.ImageContent():
299
+ mime_type = getattr(block, "mimeType", "image/png")
300
+ output_items.append(
301
+ ResponseInputImageContentParam(
302
+ type="input_image",
303
+ image_url=f"data:{mime_type};base64,{block.data}",
304
+ )
305
+ )
306
+ case types.ResourceLink():
307
+ output_items.append(
308
+ ResponseInputFileContentParam(
309
+ type="input_file", file_url=str(block.uri)
310
+ )
311
+ )
312
+ case types.EmbeddedResource():
313
+ match block.resource:
314
+ case types.TextResourceContents():
315
+ output_items.append(
316
+ ResponseInputTextContentParam(
317
+ type="input_text", text=block.resource.text
318
+ )
319
+ )
320
+ case types.BlobResourceContents():
321
+ output_items.append(
322
+ ResponseInputFileContentParam(
323
+ type="input_file", file_data=block.resource.blob
324
+ )
325
+ )
326
+ case _:
327
+ self.console.warning_log(
328
+ f"Unknown resource type: {type(block.resource)}"
329
+ )
330
+ case _:
331
+ self.console.warning_log(f"Unknown content block type: {type(block)}")
332
+
333
+ if not output_items:
334
+ output_items.append(ResponseInputTextParam(type="input_text", text=""))
335
+
336
+ formatted.append(
337
+ FunctionCallOutput(
338
+ type="function_call_output", call_id=call.id, output=output_items
339
+ ),
353
340
  )
354
-
355
- return formatted_results
341
+ return formatted