hud-python 0.2.10__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of hud-python might be problematic.
- hud/__init__.py +20 -8
- hud/adapters/common/adapter.py +14 -3
- hud/adapters/common/tests/test_adapter.py +16 -4
- hud/datasets.py +188 -0
- hud/env/docker_client.py +15 -3
- hud/env/environment.py +10 -7
- hud/env/local_docker_client.py +29 -7
- hud/env/remote_client.py +1 -1
- hud/env/remote_docker_client.py +2 -2
- hud/exceptions.py +2 -1
- hud/gym.py +0 -9
- hud/mcp/__init__.py +17 -0
- hud/mcp/base.py +631 -0
- hud/mcp/claude.py +321 -0
- hud/mcp/client.py +312 -0
- hud/mcp/langchain.py +250 -0
- hud/mcp/openai.py +334 -0
- hud/mcp/tests/__init__.py +1 -0
- hud/mcp/tests/test_base.py +512 -0
- hud/mcp/tests/test_claude.py +294 -0
- hud/mcp/tests/test_client.py +324 -0
- hud/mcp/tests/test_openai.py +238 -0
- hud/settings.py +20 -2
- hud/task.py +5 -88
- hud/taskset.py +2 -23
- hud/telemetry/__init__.py +16 -7
- hud/telemetry/_trace.py +246 -72
- hud/telemetry/context.py +88 -27
- hud/telemetry/exporter.py +171 -11
- hud/telemetry/instrumentation/mcp.py +174 -410
- hud/telemetry/job.py +141 -0
- hud/telemetry/mcp_models.py +13 -74
- hud/telemetry/tests/test_context.py +9 -6
- hud/telemetry/tests/test_trace.py +120 -78
- hud/tools/__init__.py +34 -0
- hud/tools/base.py +65 -0
- hud/tools/bash.py +137 -0
- hud/tools/computer/__init__.py +13 -0
- hud/tools/computer/anthropic.py +411 -0
- hud/tools/computer/hud.py +315 -0
- hud/tools/computer/openai.py +283 -0
- hud/tools/edit.py +290 -0
- hud/tools/executors/__init__.py +30 -0
- hud/tools/executors/base.py +331 -0
- hud/tools/executors/pyautogui.py +619 -0
- hud/tools/executors/tests/__init__.py +1 -0
- hud/tools/executors/tests/test_base_executor.py +338 -0
- hud/tools/executors/tests/test_pyautogui_executor.py +165 -0
- hud/tools/executors/xdo.py +503 -0
- hud/tools/helper/README.md +56 -0
- hud/tools/helper/__init__.py +9 -0
- hud/tools/helper/mcp_server.py +78 -0
- hud/tools/helper/server_initialization.py +115 -0
- hud/tools/helper/utils.py +58 -0
- hud/tools/playwright_tool.py +379 -0
- hud/tools/tests/__init__.py +3 -0
- hud/tools/tests/test_bash.py +152 -0
- hud/tools/tests/test_computer.py +52 -0
- hud/tools/tests/test_computer_actions.py +34 -0
- hud/tools/tests/test_edit.py +240 -0
- hud/tools/tests/test_init.py +27 -0
- hud/tools/tests/test_playwright_tool.py +183 -0
- hud/tools/tests/test_tools.py +157 -0
- hud/tools/tests/test_utils.py +156 -0
- hud/tools/utils.py +50 -0
- hud/trajectory.py +5 -1
- hud/types.py +10 -1
- hud/utils/tests/test_init.py +21 -0
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.2.10.dist-info → hud_python-0.3.1.dist-info}/METADATA +27 -18
- hud_python-0.3.1.dist-info/RECORD +119 -0
- hud/evaluators/__init__.py +0 -9
- hud/evaluators/base.py +0 -32
- hud/evaluators/inspect.py +0 -24
- hud/evaluators/judge.py +0 -189
- hud/evaluators/match.py +0 -156
- hud/evaluators/remote.py +0 -65
- hud/evaluators/tests/__init__.py +0 -0
- hud/evaluators/tests/test_inspect.py +0 -12
- hud/evaluators/tests/test_judge.py +0 -231
- hud/evaluators/tests/test_match.py +0 -115
- hud/evaluators/tests/test_remote.py +0 -98
- hud_python-0.2.10.dist-info/RECORD +0 -85
- {hud_python-0.2.10.dist-info → hud_python-0.3.1.dist-info}/WHEEL +0 -0
- {hud_python-0.2.10.dist-info → hud_python-0.3.1.dist-info}/licenses/LICENSE +0 -0
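The headline change is the removal of hud.evaluators and the addition of a hud.mcp agent layer (base, Claude, OpenAI, LangChain, shared client) plus a new hud.tools package. The two largest new agent files are reproduced below; as orientation, here is a minimal sketch of the per-step contract they implement. It uses only the method names and ModelResponse fields visible in those files; BaseMCPAgent's constructor arguments and any additional abstract methods live in hud/mcp/base.py, which is not reproduced here, so this subclass is hypothetical.

from typing import Any

from mcp.types import CallToolRequestParams as MCPToolCall
from mcp.types import CallToolResult as MCPToolResult

from hud.mcp.base import BaseMCPAgent, ModelResponse


class EchoAgent(BaseMCPAgent):
    """Toy agent that never calls tools; it only illustrates the per-step hooks."""

    async def create_initial_messages(self, prompt: str, screenshot: str | None) -> list[Any]:
        # Each concrete agent chooses its own message representation.
        return [{"prompt": prompt, "screenshot": screenshot}]

    async def get_model_response(self, messages: list[Any]) -> ModelResponse:
        # done=True with no tool_calls ends the run immediately.
        return ModelResponse(content=str(messages[-1]), tool_calls=[], done=True)

    async def format_tool_results(
        self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
    ) -> list[Any]:
        # Never reached here, since get_model_response returns no tool calls.
        return [{"type": "tool_result"}]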
hud/mcp/langchain.py
ADDED
@@ -0,0 +1,250 @@
"""LangChain MCP Agent implementation."""

from __future__ import annotations

import logging
from typing import TYPE_CHECKING, Any

import mcp.types as types
from langchain.agents import AgentExecutor, create_tool_calling_agent
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.schema import AIMessage, BaseMessage, HumanMessage, SystemMessage
from mcp.types import CallToolRequestParams as MCPToolCall
from mcp.types import CallToolResult as MCPToolResult
from mcp_use.adapters.langchain_adapter import LangChainAdapter

if TYPE_CHECKING:
    from langchain.schema.language_model import BaseLanguageModel
    from langchain_core.tools import BaseTool
from .base import BaseMCPAgent, ModelResponse

logger = logging.getLogger(__name__)


class LangChainMCPAgent(BaseMCPAgent):
    """
    LangChain agent that uses MCP servers for tool execution.

    This agent wraps any LangChain-compatible LLM and provides
    access to MCP tools through LangChain's tool-calling interface.
    """

    def __init__(
        self,
        llm: BaseLanguageModel,
        **kwargs: Any,
    ) -> None:
        """
        Initialize LangChain MCP agent.

        Args:
            llm: Any LangChain-compatible language model
            **kwargs: Additional arguments passed to BaseMCPAgent
        """
        super().__init__(**kwargs)

        self.llm = llm
        self.adapter = LangChainAdapter(disallowed_tools=self.disallowed_tools)
        self._langchain_tools: list[BaseTool] | None = None

        self.model_name = (
            "langchain-" + self.llm.model_name  # type: ignore
            if hasattr(self.llm, "model_name")
            else "unknown"
        )

    def _get_langchain_tools(self) -> list[BaseTool]:
        """Get or create LangChain tools from MCP tools."""
        if self._langchain_tools is not None:
            return self._langchain_tools

        # Create LangChain tools from MCP tools using the adapter
        self._langchain_tools = []

        # Get tools grouped by connector
        tools_by_connector = self.get_tools_by_connector()

        # Convert tools using the adapter
        for connector, tools in tools_by_connector.items():
            langchain_tools = self.adapter._convert_tools(tools, connector)  # type: ignore[reportAttributeAccessIssue]
            self._langchain_tools.extend(langchain_tools)

        logger.info("Created %s LangChain tools from MCP tools", len(self._langchain_tools))
        return self._langchain_tools

    async def create_initial_messages(
        self, prompt: str, screenshot: str | None
    ) -> list[BaseMessage]:
        """Create initial messages for LangChain."""
        messages = []

        # Add system message
        system_prompt = self.get_system_prompt()
        messages.append(SystemMessage(content=system_prompt))

        # Add user message with prompt and optional screenshot
        if screenshot:
            # For multimodal models, include the image
            content = [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{screenshot}"}},
            ]
            messages.append(HumanMessage(content=content))
        else:
            messages.append(HumanMessage(content=prompt))

        return messages

    async def get_model_response(self, messages: list[BaseMessage]) -> ModelResponse:
        """Get response from LangChain model including any tool calls."""
        # Get LangChain tools (created lazily)
        langchain_tools = self._get_langchain_tools()

        # Create a prompt template from current messages
        # Extract system message if present
        system_content = "You are a helpful assistant"
        non_system_messages = []

        for msg in messages:
            if isinstance(msg, SystemMessage):
                system_content = str(msg.content)
            else:
                non_system_messages.append(msg)

        # Create prompt with placeholders
        prompt = ChatPromptTemplate.from_messages(
            [
                ("system", system_content),
                MessagesPlaceholder(variable_name="chat_history"),
                MessagesPlaceholder(variable_name="agent_scratchpad"),
            ]
        )

        # Create agent with tools
        agent = create_tool_calling_agent(
            llm=self.llm,
            tools=langchain_tools,
            prompt=prompt,
        )

        # Create executor
        executor = AgentExecutor(
            agent=agent,
            tools=langchain_tools,
            verbose=False,
        )

        # Format the last user message as input
        last_user_msg = None
        for msg in reversed(non_system_messages):
            if isinstance(msg, HumanMessage):
                last_user_msg = msg
                break

        if not last_user_msg:
            return ModelResponse(content="No user message found", tool_calls=[], done=True)

        # Extract text from message content
        input_text = ""
        if isinstance(last_user_msg.content, str):
            input_text = last_user_msg.content
        elif isinstance(last_user_msg.content, list):
            # Extract text from multimodal content
            for item in last_user_msg.content:
                if isinstance(item, dict) and item.get("type") == "text":
                    input_text = item.get("text", "")
                    break

        # Build chat history (exclude last user message and system)
        chat_history = []
        for _, msg in enumerate(non_system_messages[:-1]):
            if isinstance(msg, HumanMessage | AIMessage):
                chat_history.append(msg)

        # Execute the agent
        try:
            result = await executor.ainvoke(
                {
                    "input": input_text,
                    "chat_history": chat_history,
                }
            )

            # Process the result
            output = result.get("output", "")

            # Check if tools were called
            if result.get("intermediate_steps"):
                # Tools were called
                tool_calls = []
                for action, _ in result["intermediate_steps"]:
                    if hasattr(action, "tool") and hasattr(action, "tool_input"):
                        tool_calls.append(
                            MCPToolCall(
                                name=action.tool,
                                arguments=action.tool_input,
                            )
                        )

                return ModelResponse(content=output, tool_calls=tool_calls, done=False)
            else:
                # No tools called, just text response
                return ModelResponse(content=output, tool_calls=[], done=True)

        except Exception as e:
            logger.error("Agent execution failed: %s", e)
            return ModelResponse(content=f"Error: {e!s}", tool_calls=[], done=True)

    async def format_tool_results(
        self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
    ) -> list[BaseMessage]:
        """Format tool results into LangChain messages."""
        # Create an AI message with the tool calls and results
        messages = []

        # First add an AI message indicating tools were called
        tool_names = [tc.name for tc in tool_calls]
        ai_content = f"I'll use the following tools: {', '.join(tool_names)}"
        messages.append(AIMessage(content=ai_content))

        # Build result text from tool results
        text_parts = []
        latest_screenshot = None

        for tool_call, result in zip(tool_calls, tool_results, strict=False):
            if result.isError:
                error_text = "Tool execution failed"
                for content in result.content:
                    if isinstance(content, types.TextContent):
                        error_text = content.text
                        break
                text_parts.append(f"Error - {tool_call.name}: {error_text}")
            else:
                # Process success content
                tool_output = []
                for content in result.content:
                    if isinstance(content, types.TextContent):
                        tool_output.append(content.text)
                    elif isinstance(content, types.ImageContent):
                        latest_screenshot = content.data

                if tool_output:
                    text_parts.append(f"{tool_call.name}: " + " ".join(tool_output))

        result_text = "\n".join(text_parts) if text_parts else "No output from tools"

        # Then add a human message with the tool results
        if latest_screenshot:
            # Include screenshot in multimodal format
            content = [
                {"type": "text", "text": f"Tool results:\n{result_text}"},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{latest_screenshot}"},
                },
            ]
            messages.append(HumanMessage(content=content))
        else:
            messages.append(HumanMessage(content=f"Tool results:\n{result_text}"))

        return messages
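For context, a hypothetical way to drive the agent above. The ChatOpenAI model choice and the assumption that BaseMCPAgent needs no further constructor arguments for MCP wiring are not taken from this diff; run() and its max_steps parameter are inferred from the OpenAI agent below, which delegates to the shared base implementation.

import asyncio

from langchain_openai import ChatOpenAI  # assumed provider; any LangChain-compatible chat model should work

from hud.mcp.langchain import LangChainMCPAgent


async def main() -> None:
    # MCP server/connector configuration comes via BaseMCPAgent kwargs (not shown in this diff).
    agent = LangChainMCPAgent(llm=ChatOpenAI(model="gpt-4o"))
    result = await agent.run("Describe the current screen", max_steps=5)
    print(result)


asyncio.run(main())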
hud/mcp/openai.py
ADDED
@@ -0,0 +1,334 @@
"""OpenAI MCP Agent implementation."""

from __future__ import annotations

import logging
from typing import TYPE_CHECKING, Any, Literal

import mcp.types as types
from mcp.types import CallToolRequestParams as MCPToolCall
from mcp.types import CallToolResult as MCPToolResult
from openai import AsyncOpenAI
from openai.types.responses import (
    ResponseComputerToolCall,
    ResponseInputParam,
    ResponseOutputMessage,
    ResponseOutputText,
    ToolParam,
)

from hud.settings import settings

from .base import AgentResult, BaseMCPAgent, ModelResponse

if TYPE_CHECKING:
    from hud.datasets import TaskConfig

logger = logging.getLogger(__name__)


class OpenAIMCPAgent(BaseMCPAgent):
    """
    OpenAI agent that uses MCP servers for tool execution.

    This agent uses OpenAI's Computer Use API format but executes
    tools through MCP servers instead of direct implementation.
    """

    def __init__(
        self,
        model_client: AsyncOpenAI | None = None,
        model: str = "computer-use-preview",
        environment: Literal["windows", "mac", "linux", "browser"] = "linux",
        display_width: int = 1024,
        display_height: int = 768,
        **kwargs: Any,
    ) -> None:
        """
        Initialize OpenAI MCP agent.

        Args:
            model_client: AsyncOpenAI client (created if not provided)
            model: OpenAI model to use
            environment: Environment type for computer use
            display_width: Display width for computer use
            display_height: Display height for computer use
            **kwargs: Additional arguments passed to BaseMCPAgent
        """
        super().__init__(**kwargs)

        # Initialize client if not provided
        if model_client is None:
            api_key = settings.openai_api_key
            if not api_key:
                raise ValueError("OpenAI API key not found. Set OPENAI_API_KEY.")
            model_client = AsyncOpenAI(api_key=api_key)

        self.openai_client = model_client
        self.model = model
        self.environment = environment
        self.display_width = display_width
        self.display_height = display_height

        # State tracking for OpenAI's stateful API
        self.last_response_id: str | None = None
        self.pending_call_id: str | None = None
        self.pending_safety_checks: list[Any] = []

        self.model_name = "openai-" + self.model

        # Base system prompt for autonomous operation
        self.base_system_prompt = """
You are an autonomous computer-using agent. Follow these guidelines:

1. NEVER ask for confirmation. Complete all tasks autonomously.
2. Do NOT send messages like "I need to confirm before..." or "Do you want me to continue?" - just proceed.
3. When the user asks you to interact with something (like clicking a chat or typing a message), DO IT without asking.
4. Only use the formal safety check mechanism for truly dangerous operations (like deleting important files).
5. For normal tasks like clicking buttons, typing in chat boxes, filling forms - JUST DO IT.
6. The user has already given you permission by running this agent. No further confirmation is needed.
7. Be decisive and action-oriented. Complete the requested task fully.

Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
"""  # noqa: E501

    async def run(self, prompt_or_task: str | TaskConfig, max_steps: int = 10) -> AgentResult:
        """
        Run the agent with the given prompt or task.

        Override to reset OpenAI-specific state.
        """
        # Reset state for new run
        self.last_response_id = None
        self.pending_call_id = None
        self.pending_safety_checks = []

        # Use base implementation
        return await super().run(prompt_or_task, max_steps)

    async def create_initial_messages(
        self, prompt: str, screenshot: str | None = None
    ) -> list[Any]:
        """
        Create initial messages for OpenAI.

        OpenAI uses a different message format - we'll store the prompt
        and screenshot for use in get_model_response.
        """
        # For OpenAI, we don't create messages upfront, we build them in get_model_response
        # Just return a list with the prompt and screenshot
        return [{"prompt": prompt, "screenshot": screenshot}]

    async def get_model_response(self, messages: list[Any]) -> ModelResponse:
        """Get response from OpenAI including any tool calls."""
        # OpenAI's API is stateful, so we handle messages differently

        # Check if we have computer tools available
        computer_tool_name = None
        for tool in self._available_tools:
            if tool.name in ["computer_openai", "computer"]:
                computer_tool_name = tool.name
                break

        if not computer_tool_name:
            # No computer tools available, just return a text response
            return ModelResponse(
                content="No computer use tools available",
                tool_calls=[],
                done=True,
            )

        # Define the computer use tool
        computer_tool: ToolParam = {  # type: ignore[reportAssignmentType]
            "type": "computer_use_preview",
            "display_width": self.display_width,
            "display_height": self.display_height,
            "environment": self.environment,
        }

        # Build the request based on whether this is first step or follow-up
        if self.pending_call_id is None and self.last_response_id is None:
            # First step - extract prompt and screenshot from messages
            initial_data = messages[0]  # Our custom format from create_initial_messages
            prompt_text = initial_data.get("prompt", "")
            screenshot = initial_data.get("screenshot")

            # Create the initial request
            input_content: list[dict[str, Any]] = [{"type": "input_text", "text": prompt_text}]

            if screenshot:
                input_content.append(
                    {
                        "type": "input_image",
                        "image_url": f"data:image/png;base64,{screenshot}",
                    }
                )

            input_param: ResponseInputParam = [{"role": "user", "content": input_content}]  # type: ignore[reportUnknownMemberType]

            # Combine base system prompt with any custom system prompt
            full_instructions = self.base_system_prompt
            if self.custom_system_prompt:
                full_instructions = f"{self.custom_system_prompt}\n\n{full_instructions}"

            response = await self.openai_client.responses.create(
                model=self.model,
                tools=[computer_tool],
                input=input_param,
                instructions=full_instructions,
                truncation="auto",
                reasoning={"summary": "auto"},
            )
        else:
            # Follow-up step - check if this is user input or tool result
            latest_message = messages[-1] if messages else {}

            if latest_message.get("type") == "user_input":
                # User provided input in conversation mode
                user_text = latest_message.get("text", "")
                input_param_followup: ResponseInputParam = [
                    {"role": "user", "content": [{"type": "input_text", "text": user_text}]}
                ]
                # Reset pending_call_id since this is user input, not a tool response
                self.pending_call_id = None
            else:
                # Tool result - need screenshot from processed results
                latest_screenshot = None
                for msg in reversed(messages):
                    if isinstance(msg, dict) and "screenshot" in msg:
                        latest_screenshot = msg["screenshot"]
                        break

                if not latest_screenshot:
                    logger.warning("No screenshot provided for response to action")
                    return ModelResponse(
                        content="No screenshot available for next action",
                        tool_calls=[],
                        done=True,
                    )

                # Create response to previous action
                input_param_followup: ResponseInputParam = [  # type: ignore[reportAssignmentType]
                    {  # type: ignore[reportAssignmentType]
                        "call_id": self.pending_call_id,
                        "type": "computer_call_output",
                        "output": {
                            "type": "input_image",
                            "image_url": f"data:image/png;base64,{latest_screenshot}",
                        },
                        "acknowledged_safety_checks": self.pending_safety_checks,
                    }
                ]

                self.pending_safety_checks = []

            response = await self.openai_client.responses.create(
                model=self.model,
                previous_response_id=self.last_response_id,
                tools=[computer_tool],
                input=input_param_followup,
                truncation="auto",
            )

        # Store response ID for next call
        self.last_response_id = response.id

        # Process response
        result = ModelResponse(
            content="",
            tool_calls=[],
            done=False,  # Will be set to True only if no tool calls
        )

        self.pending_call_id = None

        # Check for computer calls
        computer_calls = [
            item
            for item in response.output
            if isinstance(item, ResponseComputerToolCall) and item.type == "computer_call"
        ]

        if computer_calls:
            # Process computer calls
            result.done = False
            for computer_call in computer_calls:
                self.pending_call_id = computer_call.call_id
                self.pending_safety_checks = computer_call.pending_safety_checks

                # Convert OpenAI action to MCP tool call
                action = computer_call.action.model_dump()

                # Create MCPToolCall object with OpenAI metadata as extra fields
                # Pyright will complain but the tool class accepts extra fields
                tool_call = MCPToolCall(
                    name=computer_tool_name,
                    arguments=action,
                    call_id=computer_call.call_id,  # type: ignore
                    pending_safety_checks=computer_call.pending_safety_checks,  # type: ignore
                )
                result.tool_calls.append(tool_call)
        else:
            # No computer calls, check for text response
            for item in response.output:
                if isinstance(item, ResponseOutputMessage) and item.type == "message":
                    # Extract text from content blocks
                    text_parts = [
                        content.text
                        for content in item.content
                        if isinstance(content, ResponseOutputText)
                    ]
                    if text_parts:
                        result.content = "".join(text_parts)
                    break

        # Extract reasoning if present
        reasoning_text = ""
        for item in response.output:
            if item.type == "reasoning" and hasattr(item, "summary") and item.summary:
                reasoning_text += f"Thinking: {item.summary[0].text}\n"

        if reasoning_text:
            result.content = reasoning_text + result.content if result.content else reasoning_text

        # Set done=True if no tool calls (task complete or waiting for user)
        if not result.tool_calls:
            result.done = True

        return result

    async def format_tool_results(
        self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
    ) -> list[Any]:
        """
        Format tool results for OpenAI's stateful API.

        OpenAI doesn't use a traditional message format - we just need to
        preserve the screenshot for the next step.
        """
        # Extract latest screenshot from results
        latest_screenshot = None
        for result in tool_results:
            if not result.isError:
                for content in result.content:
                    if isinstance(content, types.ImageContent):
                        latest_screenshot = content.data

        # Return a simple dict that get_model_response can use
        return [
            {
                "type": "tool_result",
                "screenshot": latest_screenshot,
            }
        ]

    async def create_user_message(self, text: str) -> dict[str, Any]:
        """
        Create a user message for OpenAI's stateful API.

        Since OpenAI maintains conversation state server-side,
        we just need to track that we're expecting user input.
        """
        # For OpenAI, we'll handle this in get_model_response
        # by including the user's text in the next input
        return {"type": "user_input", "text": text}
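A matching hypothetical sketch of constructing the OpenAI agent with an explicit client. The constructor parameters are taken from __init__ above, while any BaseMCPAgent keyword arguments required to reach the MCP servers are omitted because they are not visible in this diff.

import asyncio

from openai import AsyncOpenAI

from hud.mcp.openai import OpenAIMCPAgent


async def main() -> None:
    # Passing a client skips the OPENAI_API_KEY lookup in hud.settings.
    agent = OpenAIMCPAgent(
        model_client=AsyncOpenAI(),
        model="computer-use-preview",
        environment="browser",
        display_width=1280,
        display_height=800,
    )
    result = await agent.run("Open example.com and read the page title", max_steps=8)
    print(result)


asyncio.run(main())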
hud/mcp/tests/__init__.py
ADDED
@@ -0,0 +1 @@
"""Tests for MCP Agent module."""