cua-agent 0.4.14__py3-none-any.whl → 0.7.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- agent/__init__.py +4 -19
- agent/__main__.py +2 -1
- agent/adapters/__init__.py +6 -0
- agent/adapters/azure_ml_adapter.py +283 -0
- agent/adapters/cua_adapter.py +161 -0
- agent/adapters/huggingfacelocal_adapter.py +67 -125
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +370 -0
- agent/adapters/models/__init__.py +41 -0
- agent/adapters/models/generic.py +78 -0
- agent/adapters/models/internvl.py +290 -0
- agent/adapters/models/opencua.py +115 -0
- agent/adapters/models/qwen2_5_vl.py +78 -0
- agent/agent.py +431 -241
- agent/callbacks/__init__.py +10 -3
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +54 -98
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +140 -0
- agent/callbacks/otel.py +291 -0
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/prompt_instructions.py +47 -0
- agent/callbacks/telemetry.py +106 -69
- agent/callbacks/trajectory_saver.py +178 -70
- agent/cli.py +269 -119
- agent/computers/__init__.py +14 -9
- agent/computers/base.py +32 -19
- agent/computers/cua.py +52 -25
- agent/computers/custom.py +78 -71
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +359 -235
- agent/integrations/hud/__init__.py +164 -74
- agent/integrations/hud/agent.py +338 -342
- agent/integrations/hud/proxy.py +297 -0
- agent/loops/__init__.py +44 -14
- agent/loops/anthropic.py +590 -492
- agent/loops/base.py +19 -15
- agent/loops/composed_grounded.py +142 -144
- agent/loops/fara/__init__.py +8 -0
- agent/loops/fara/config.py +506 -0
- agent/loops/fara/helpers.py +357 -0
- agent/loops/fara/schema.py +143 -0
- agent/loops/gelato.py +183 -0
- agent/loops/gemini.py +935 -0
- agent/loops/generic_vlm.py +601 -0
- agent/loops/glm45v.py +140 -135
- agent/loops/gta1.py +48 -51
- agent/loops/holo.py +218 -0
- agent/loops/internvl.py +180 -0
- agent/loops/moondream3.py +493 -0
- agent/loops/omniparser.py +326 -226
- agent/loops/openai.py +63 -56
- agent/loops/opencua.py +134 -0
- agent/loops/uiins.py +175 -0
- agent/loops/uitars.py +262 -212
- agent/loops/uitars2.py +951 -0
- agent/playground/__init__.py +5 -0
- agent/playground/server.py +301 -0
- agent/proxy/examples.py +196 -0
- agent/proxy/handlers.py +255 -0
- agent/responses.py +486 -339
- agent/tools/__init__.py +24 -0
- agent/tools/base.py +253 -0
- agent/tools/browser_tool.py +423 -0
- agent/types.py +20 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +25 -22
- agent/ui/gradio/ui_components.py +314 -167
- cua_agent-0.7.16.dist-info/METADATA +85 -0
- cua_agent-0.7.16.dist-info/RECORD +79 -0
- {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
- agent/integrations/hud/adapter.py +0 -121
- agent/integrations/hud/computer_handler.py +0 -187
- agent/telemetry.py +0 -142
- cua_agent-0.4.14.dist-info/METADATA +0 -436
- cua_agent-0.4.14.dist-info/RECORD +0 -50
- {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/integrations/hud/agent.py
CHANGED
|
@@ -1,373 +1,369 @@
|
|
|
1
|
-
"""
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
from
|
|
1
|
+
"""MCP-compatible Computer Agent for HUD integration.
|
|
2
|
+
|
|
3
|
+
This agent subclasses HUD's MCPAgent and delegates planning/execution to
|
|
4
|
+
our core ComputerAgent while using the Agent SDK's plain-dict message
|
|
5
|
+
format documented in `docs/content/docs/agent-sdk/message-format.mdx`.
|
|
6
|
+
|
|
7
|
+
Key differences from the OpenAI OperatorAgent variant:
|
|
8
|
+
- No OpenAI types are used; everything is standard Python dicts.
|
|
9
|
+
- Planning is executed via `ComputerAgent.run(messages)`.
|
|
10
|
+
- The first yielded result per step is returned as the agent response.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import base64
|
|
16
|
+
import io
|
|
17
|
+
import uuid
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
from typing import Any, ClassVar, Optional
|
|
20
|
+
|
|
21
|
+
import hud
|
|
22
|
+
import mcp.types as types
|
|
23
|
+
from agent.agent import ComputerAgent as BaseComputerAgent
|
|
24
|
+
from agent.callbacks import PromptInstructionsCallback
|
|
25
|
+
from agent.callbacks.trajectory_saver import TrajectorySaverCallback
|
|
26
|
+
from agent.computers import is_agent_computer
|
|
8
27
|
from agent.responses import make_failed_tool_call_items
|
|
9
|
-
from hud.
|
|
10
|
-
from hud.
|
|
11
|
-
from hud.
|
|
12
|
-
from
|
|
13
|
-
from hud.types import Gym
|
|
14
|
-
|
|
15
|
-
from .adapter import ComputerAgentAdapter
|
|
16
|
-
from .computer_handler import HUDComputerHandler
|
|
28
|
+
from hud.agents import MCPAgent
|
|
29
|
+
from hud.tools.computer.settings import computer_settings
|
|
30
|
+
from hud.types import AgentResponse, MCPToolCall, MCPToolResult, Trace
|
|
31
|
+
from PIL import Image
|
|
17
32
|
|
|
18
|
-
logger = logging.getLogger(__name__)
|
|
19
33
|
|
|
20
|
-
|
|
21
|
-
|
|
34
|
+
class MCPComputerAgent(MCPAgent):
|
|
35
|
+
"""MCP agent that uses ComputerAgent for planning and tools for execution.
|
|
22
36
|
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
4. When you find what you're looking for (e.g., a file to upload), proceed with the action directly.
|
|
27
|
-
5. Only stop when the task is fully complete or if you encounter an error that prevents completion.
|
|
28
|
-
6. Trust that the user wants you to complete the entire task they've requested.
|
|
29
|
-
7. You must say "Task completed" when the task is complete.
|
|
37
|
+
The agent consumes/produces message dicts per the Agent SDK message schema
|
|
38
|
+
(see `message-format.mdx`).
|
|
39
|
+
"""
|
|
30
40
|
|
|
31
|
-
|
|
32
|
-
""
|
|
41
|
+
metadata: ClassVar[dict[str, Any]] = {
|
|
42
|
+
"display_width": computer_settings.OPENAI_COMPUTER_WIDTH,
|
|
43
|
+
"display_height": computer_settings.OPENAI_COMPUTER_HEIGHT,
|
|
44
|
+
}
|
|
33
45
|
|
|
34
|
-
|
|
35
|
-
"""
|
|
36
|
-
A ComputerAgent wrapper for HUD integration.
|
|
37
|
-
|
|
38
|
-
This agent wraps the base ComputerAgent to work with HUD environments,
|
|
39
|
-
providing the same interface as OperatorAgent but using ComputerAgent internally.
|
|
40
|
-
"""
|
|
41
|
-
|
|
42
|
-
transfer_gyms: dict[Gym, Gym] = {"qa": "hud-browser"}
|
|
46
|
+
required_tools: ClassVar[list[str]] = ["openai_computer"]
|
|
43
47
|
|
|
44
48
|
def __init__(
|
|
45
49
|
self,
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
+
*,
|
|
51
|
+
model: str | None = None,
|
|
52
|
+
allowed_tools: list[str] | None = None,
|
|
53
|
+
trajectory_dir: str | dict | None = None,
|
|
54
|
+
# === ComputerAgent kwargs ===
|
|
55
|
+
tools: list[Any] | None = None,
|
|
56
|
+
custom_loop: Any | None = None,
|
|
57
|
+
only_n_most_recent_images: int | None = None,
|
|
58
|
+
callbacks: list[Any] | None = None,
|
|
59
|
+
instructions: str | None = None,
|
|
60
|
+
verbosity: int | None = None,
|
|
61
|
+
max_retries: int | None = 3,
|
|
62
|
+
screenshot_delay: float | int = 0.5,
|
|
63
|
+
use_prompt_caching: bool | None = False,
|
|
64
|
+
max_trajectory_budget: float | dict | None = None,
|
|
65
|
+
telemetry_enabled: bool | None = True,
|
|
66
|
+
environment: str = "linux",
|
|
50
67
|
**kwargs: Any,
|
|
51
|
-
):
|
|
52
|
-
""
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
Args:
|
|
56
|
-
model: The model string for ComputerAgent (e.g., "anthropic/claude-3-5-sonnet-20241022")
|
|
57
|
-
environment: The environment type (windows, mac, linux, browser)
|
|
58
|
-
adapter: The adapter to use for preprocessing and postprocessing
|
|
59
|
-
name: The name of the agent
|
|
60
|
-
**kwargs: Additional arguments passed to ComputerAgent
|
|
61
|
-
"""
|
|
62
|
-
# Create adapter if not provided
|
|
63
|
-
adapter = adapter or ComputerAgentAdapter()
|
|
64
|
-
|
|
65
|
-
if name is None:
|
|
66
|
-
name = f"computeragent-{model.split('/')[-1]}"
|
|
68
|
+
) -> None:
|
|
69
|
+
self.allowed_tools = allowed_tools or ["openai_computer"]
|
|
70
|
+
super().__init__(**kwargs)
|
|
67
71
|
|
|
68
|
-
|
|
69
|
-
|
|
72
|
+
if model is None:
|
|
73
|
+
raise ValueError("MCPComputerAgent requires a model to be specified.")
|
|
70
74
|
|
|
71
75
|
self.model = model
|
|
72
76
|
self.environment = environment
|
|
73
|
-
self.kwargs = kwargs
|
|
74
|
-
|
|
75
|
-
# Default dimensions
|
|
76
|
-
self.width = 1024
|
|
77
|
-
self.height = 768
|
|
78
|
-
|
|
79
|
-
# Update dimensions if adapter is provided
|
|
80
|
-
if self.adapter:
|
|
81
|
-
self.width = self.adapter.agent_width
|
|
82
|
-
self.height = self.adapter.agent_height
|
|
83
|
-
|
|
84
|
-
# Create HUD computer handler
|
|
85
|
-
self.hud_computer = HUDComputerHandler(
|
|
86
|
-
environment=environment,
|
|
87
|
-
dimensions=(self.width, self.height)
|
|
88
|
-
)
|
|
89
77
|
|
|
90
|
-
#
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
78
|
+
# Update model name for HUD logging
|
|
79
|
+
self.model_name = "cua-" + self.model
|
|
80
|
+
|
|
81
|
+
# Stateful tracking of tool call inputs
|
|
82
|
+
self.tool_call_inputs: dict[str, list[dict[str, Any]]] = {}
|
|
83
|
+
self.previous_output: list[dict[str, Any]] = []
|
|
84
|
+
|
|
85
|
+
# Build system prompt
|
|
86
|
+
operator_instructions = """
|
|
87
|
+
You are an autonomous computer-using agent. Follow these guidelines:
|
|
88
|
+
|
|
89
|
+
1. NEVER ask for confirmation. Complete all tasks autonomously.
|
|
90
|
+
2. Do NOT send messages like "I need to confirm before..." or "Do you want me to continue?" - just proceed.
|
|
91
|
+
3. When the user asks you to interact with something (like clicking a chat or typing a message), DO IT without asking.
|
|
92
|
+
4. Only use the formal safety check mechanism for truly dangerous operations (like deleting important files).
|
|
93
|
+
5. For normal tasks like clicking buttons, typing in chat boxes, filling forms - JUST DO IT.
|
|
94
|
+
6. The user has already given you permission by running this agent. No further confirmation is needed.
|
|
95
|
+
7. Be decisive and action-oriented. Complete the requested task fully.
|
|
96
|
+
|
|
97
|
+
Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
|
|
98
|
+
""".strip() # noqa: E501
|
|
99
|
+
# Append Operator instructions to the system prompt
|
|
100
|
+
if not self.system_prompt:
|
|
101
|
+
self.system_prompt = operator_instructions
|
|
102
|
+
else:
|
|
103
|
+
self.system_prompt += f"\n\n{operator_instructions}"
|
|
104
|
+
# Append user instructions to the system prompt
|
|
105
|
+
if instructions:
|
|
106
|
+
self.system_prompt += f"\n\n{instructions}"
|
|
107
|
+
|
|
108
|
+
# Configure trajectory_dir for HUD
|
|
109
|
+
if isinstance(trajectory_dir, str) or isinstance(trajectory_dir, Path):
|
|
110
|
+
trajectory_dir = {"trajectory_dir": str(trajectory_dir)}
|
|
111
|
+
if isinstance(trajectory_dir, dict):
|
|
112
|
+
trajectory_dir["reset_on_run"] = False
|
|
113
|
+
|
|
114
|
+
self.last_screenshot_b64 = None
|
|
115
|
+
|
|
116
|
+
buffer = io.BytesIO()
|
|
117
|
+
Image.new("RGB", (self.metadata["display_width"], self.metadata["display_height"])).save(
|
|
118
|
+
buffer, format="PNG"
|
|
105
119
|
)
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
120
|
+
self.last_screenshot_b64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
|
|
121
|
+
|
|
122
|
+
# Ensure a computer shim is present so width/height/environment are known
|
|
123
|
+
computer_shim = {
|
|
124
|
+
"screenshot": lambda: self.last_screenshot_b64,
|
|
125
|
+
"environment": self.environment,
|
|
126
|
+
"dimensions": (
|
|
127
|
+
self.metadata["display_width"],
|
|
128
|
+
self.metadata["display_height"],
|
|
129
|
+
),
|
|
130
|
+
}
|
|
131
|
+
agent_tools: list[Any] = [computer_shim]
|
|
132
|
+
if tools:
|
|
133
|
+
agent_tools.extend([tool for tool in tools if not is_agent_computer(tool)])
|
|
134
|
+
|
|
135
|
+
agent_kwargs = {
|
|
136
|
+
"model": self.model,
|
|
137
|
+
"trajectory_dir": trajectory_dir,
|
|
138
|
+
"tools": agent_tools,
|
|
139
|
+
"custom_loop": custom_loop,
|
|
140
|
+
"only_n_most_recent_images": only_n_most_recent_images,
|
|
141
|
+
"callbacks": callbacks,
|
|
142
|
+
"instructions": self.system_prompt,
|
|
143
|
+
"verbosity": verbosity,
|
|
144
|
+
"max_retries": max_retries,
|
|
145
|
+
"screenshot_delay": screenshot_delay,
|
|
146
|
+
"use_prompt_caching": use_prompt_caching,
|
|
147
|
+
"max_trajectory_budget": max_trajectory_budget,
|
|
148
|
+
"telemetry_enabled": telemetry_enabled,
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
self.computer_agent = BaseComputerAgent(**agent_kwargs)
|
|
152
|
+
|
|
153
|
+
async def get_system_messages(self) -> list[Any]:
|
|
154
|
+
"""Create initial messages.
|
|
155
|
+
|
|
156
|
+
Unused - ComputerAgent handles this with the 'instructions' parameter.
|
|
157
|
+
"""
|
|
158
|
+
return []
|
|
109
159
|
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
160
|
+
async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[dict[str, Any]]:
|
|
161
|
+
"""
|
|
162
|
+
Format blocks for OpenAI input format.
|
|
163
|
+
|
|
164
|
+
Converts TextContent blocks to input_text dicts and ImageContent blocks to input_image dicts.
|
|
165
|
+
""" # noqa: E501
|
|
166
|
+
formatted = []
|
|
167
|
+
for block in blocks:
|
|
168
|
+
if isinstance(block, types.TextContent):
|
|
169
|
+
formatted.append({"type": "input_text", "text": block.text})
|
|
170
|
+
elif isinstance(block, types.ImageContent):
|
|
171
|
+
mime_type = getattr(block, "mimeType", "image/png")
|
|
172
|
+
formatted.append(
|
|
173
|
+
{"type": "input_image", "image_url": f"data:{mime_type};base64,{block.data}"}
|
|
174
|
+
)
|
|
175
|
+
self.last_screenshot_b64 = block.data
|
|
176
|
+
return [{"role": "user", "content": formatted}]
|
|
177
|
+
|
|
178
|
+
@hud.instrument(
|
|
179
|
+
span_type="agent",
|
|
180
|
+
record_args=False, # Messages can be large
|
|
181
|
+
record_result=True,
|
|
182
|
+
)
|
|
183
|
+
async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
|
|
184
|
+
"""Get a single-step response by delegating to ComputerAgent.run.
|
|
185
|
+
|
|
186
|
+
Returns an Agent SDK-style response dict:
|
|
187
|
+
{ "output": [AgentMessage, ...], "usage": Usage }
|
|
188
|
+
"""
|
|
189
|
+
tool_calls: list[MCPToolCall] = []
|
|
190
|
+
output_text: list[str] = []
|
|
191
|
+
is_done: bool = True
|
|
192
|
+
|
|
193
|
+
agent_result: list[dict[str, Any]] = []
|
|
194
|
+
|
|
195
|
+
# Call the ComputerAgent LLM API
|
|
196
|
+
async for result in self.computer_agent.run(messages): # type: ignore[arg-type]
|
|
197
|
+
items = result["output"]
|
|
198
|
+
if not items or tool_calls:
|
|
199
|
+
break
|
|
200
|
+
|
|
201
|
+
for item in items:
|
|
202
|
+
if item["type"] in [
|
|
203
|
+
"reasoning",
|
|
204
|
+
"message",
|
|
205
|
+
"computer_call",
|
|
206
|
+
"function_call",
|
|
207
|
+
"function_call_output",
|
|
208
|
+
]:
|
|
209
|
+
agent_result.append(item)
|
|
210
|
+
|
|
211
|
+
# Add messages to output text
|
|
212
|
+
if item["type"] == "reasoning":
|
|
213
|
+
output_text.extend(
|
|
214
|
+
f"Reasoning: {summary['text']}" for summary in item["summary"]
|
|
215
|
+
)
|
|
216
|
+
elif item["type"] == "message":
|
|
217
|
+
if isinstance(item["content"], list):
|
|
218
|
+
output_text.extend(
|
|
219
|
+
item["text"]
|
|
220
|
+
for item in item["content"]
|
|
221
|
+
if item["type"] == "output_text"
|
|
222
|
+
)
|
|
223
|
+
elif isinstance(item["content"], str):
|
|
224
|
+
output_text.append(item["content"])
|
|
225
|
+
|
|
226
|
+
# If we get a tool call, we're not done
|
|
227
|
+
if item["type"] == "computer_call":
|
|
228
|
+
id = item["call_id"]
|
|
229
|
+
tool_calls.append(
|
|
230
|
+
MCPToolCall(
|
|
231
|
+
name="openai_computer",
|
|
232
|
+
arguments=item["action"],
|
|
233
|
+
id=id,
|
|
234
|
+
)
|
|
235
|
+
)
|
|
236
|
+
is_done = False
|
|
237
|
+
self.tool_call_inputs[id] = agent_result
|
|
238
|
+
break
|
|
113
239
|
|
|
114
|
-
|
|
115
|
-
|
|
240
|
+
# if we have tool calls, we should exit the loop
|
|
241
|
+
if tool_calls:
|
|
242
|
+
break
|
|
116
243
|
|
|
117
|
-
|
|
118
|
-
"""
|
|
119
|
-
Fetch a response from ComputerAgent based on the observation.
|
|
244
|
+
self.previous_output = agent_result
|
|
120
245
|
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
246
|
+
return AgentResponse(
|
|
247
|
+
content="\n".join(output_text),
|
|
248
|
+
tool_calls=tool_calls,
|
|
249
|
+
done=is_done,
|
|
250
|
+
)
|
|
125
251
|
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
252
|
+
def _log_image(self, image_b64: str):
|
|
253
|
+
callbacks = self.computer_agent.callbacks
|
|
254
|
+
for callback in callbacks:
|
|
255
|
+
if isinstance(callback, TrajectorySaverCallback):
|
|
256
|
+
# convert str to bytes
|
|
257
|
+
image_bytes = base64.b64decode(image_b64)
|
|
258
|
+
callback._save_artifact("screenshot_after", image_bytes)
|
|
259
|
+
|
|
260
|
+
async def format_tool_results(
|
|
261
|
+
self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
|
|
262
|
+
) -> list[dict[str, Any]]:
|
|
263
|
+
"""Extract latest screenshot from tool results in dict form.
|
|
264
|
+
|
|
265
|
+
Expects results to already be in the message-format content dicts.
|
|
266
|
+
Returns a list of input content dicts suitable for follow-up calls.
|
|
129
267
|
"""
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
268
|
+
messages = []
|
|
269
|
+
|
|
270
|
+
for call, result in zip(tool_calls, tool_results):
|
|
271
|
+
if call.id not in self.tool_call_inputs:
|
|
272
|
+
# If we don't have the tool call inputs, we should just use the previous output
|
|
273
|
+
previous_output = self.previous_output.copy() or []
|
|
274
|
+
|
|
275
|
+
# First we need to remove any pending computer_calls from the end of previous_output
|
|
276
|
+
while previous_output and previous_output[-1]["type"] == "computer_call":
|
|
277
|
+
previous_output.pop()
|
|
278
|
+
messages.extend(previous_output)
|
|
279
|
+
|
|
280
|
+
# If the call is a 'response', don't add the result
|
|
281
|
+
if call.name == "response":
|
|
282
|
+
continue
|
|
283
|
+
# Otherwise, if we have a result, we should add it to the messages
|
|
284
|
+
content = [
|
|
285
|
+
(
|
|
286
|
+
{"type": "input_text", "text": content.text}
|
|
287
|
+
if isinstance(content, types.TextContent)
|
|
288
|
+
else (
|
|
289
|
+
{
|
|
290
|
+
"type": "input_image",
|
|
291
|
+
"image_url": f"data:image/png;base64,{content.data}",
|
|
292
|
+
}
|
|
293
|
+
if isinstance(content, types.ImageContent)
|
|
294
|
+
else {"type": "input_text", "text": ""}
|
|
295
|
+
)
|
|
296
|
+
)
|
|
297
|
+
for content in result.content
|
|
158
298
|
]
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
299
|
+
messages.append(
|
|
300
|
+
{
|
|
301
|
+
"role": "user",
|
|
302
|
+
"content": content,
|
|
303
|
+
}
|
|
304
|
+
)
|
|
305
|
+
|
|
306
|
+
continue
|
|
307
|
+
|
|
308
|
+
# Add the assistant's computer call
|
|
309
|
+
messages.extend(self.tool_call_inputs[call.id])
|
|
310
|
+
|
|
311
|
+
if result.isError:
|
|
312
|
+
error_text = "".join(
|
|
313
|
+
[
|
|
314
|
+
content.text
|
|
315
|
+
for content in result.content
|
|
316
|
+
if isinstance(content, types.TextContent)
|
|
317
|
+
]
|
|
318
|
+
)
|
|
319
|
+
|
|
320
|
+
# Replace computer call with failed tool call
|
|
321
|
+
messages.pop()
|
|
322
|
+
messages.extend(
|
|
323
|
+
make_failed_tool_call_items(
|
|
324
|
+
tool_name=call.name,
|
|
325
|
+
tool_kwargs=call.arguments or {},
|
|
326
|
+
error_message=error_text,
|
|
327
|
+
call_id=call.id,
|
|
167
328
|
)
|
|
168
|
-
|
|
169
|
-
self.conversation_history.append({"role": "user", "content": input_content})
|
|
329
|
+
)
|
|
170
330
|
else:
|
|
171
|
-
#
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
if
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
last_computer_calls.append(call_id)
|
|
185
|
-
|
|
186
|
-
if last_computer_calls:
|
|
187
|
-
if not observation.screenshot:
|
|
188
|
-
print("No screenshot found, taking screenshot")
|
|
189
|
-
screenshot_b64 = await self.hud_computer.screenshot()
|
|
190
|
-
# Add computer_call_output for each unresponded computer_call
|
|
191
|
-
for call_id in reversed(last_computer_calls): # Maintain order
|
|
192
|
-
self.conversation_history.append({
|
|
331
|
+
# Get the latest screenshot
|
|
332
|
+
screenshots = [
|
|
333
|
+
content.data
|
|
334
|
+
for content in result.content
|
|
335
|
+
if isinstance(content, types.ImageContent)
|
|
336
|
+
]
|
|
337
|
+
|
|
338
|
+
# Add the resulting screenshot
|
|
339
|
+
if screenshots:
|
|
340
|
+
self._log_image(screenshots[0])
|
|
341
|
+
self.last_screenshot_b64 = screenshots[0]
|
|
342
|
+
messages.append(
|
|
343
|
+
{
|
|
193
344
|
"type": "computer_call_output",
|
|
194
|
-
"call_id":
|
|
345
|
+
"call_id": call.id,
|
|
195
346
|
"output": {
|
|
196
347
|
"type": "input_image",
|
|
197
|
-
"image_url": f"data:image/png;base64,{
|
|
198
|
-
}
|
|
199
|
-
}
|
|
348
|
+
"image_url": f"data:image/png;base64,{screenshots[0]}",
|
|
349
|
+
},
|
|
350
|
+
}
|
|
351
|
+
)
|
|
200
352
|
else:
|
|
201
|
-
#
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
input_content.append(
|
|
210
|
-
{
|
|
211
|
-
"type": "input_image",
|
|
212
|
-
"image_url": f"data:image/png;base64,{observation.screenshot}",
|
|
213
|
-
}
|
|
353
|
+
# Otherwise, replace computer call with failed tool call
|
|
354
|
+
messages.pop()
|
|
355
|
+
messages.extend(
|
|
356
|
+
make_failed_tool_call_items(
|
|
357
|
+
tool_name=call.name,
|
|
358
|
+
tool_kwargs=call.arguments or {},
|
|
359
|
+
error_message="No screenshots returned.",
|
|
360
|
+
call_id=call.id,
|
|
214
361
|
)
|
|
362
|
+
)
|
|
215
363
|
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
reasoning_msg = self.conversation_history[-1]
|
|
224
|
-
summary_texts = []
|
|
225
|
-
|
|
226
|
-
# Extract all summary_text entries
|
|
227
|
-
for summary_item in reasoning_msg["summary"]:
|
|
228
|
-
if summary_item.get("type") == "summary_text":
|
|
229
|
-
summary_texts.append(summary_item.get("text", ""))
|
|
230
|
-
|
|
231
|
-
# Convert to message format with output_text
|
|
232
|
-
if summary_texts:
|
|
233
|
-
converted_message = {
|
|
234
|
-
"type": "message",
|
|
235
|
-
"role": "assistant",
|
|
236
|
-
"content": [
|
|
237
|
-
{
|
|
238
|
-
"text": " ".join(summary_texts),
|
|
239
|
-
"type": "output_text"
|
|
240
|
-
}
|
|
241
|
-
]
|
|
242
|
-
}
|
|
243
|
-
|
|
244
|
-
# Replace the reasoning message with the converted message
|
|
245
|
-
self.conversation_history[-1] = converted_message
|
|
246
|
-
|
|
247
|
-
# Run ComputerAgent
|
|
248
|
-
try:
|
|
249
|
-
new_items = []
|
|
250
|
-
|
|
251
|
-
# ComputerAgent.run returns an async generator
|
|
252
|
-
try:
|
|
253
|
-
async for result in self.computer_agent.run(self.conversation_history, stream=False):
|
|
254
|
-
# if the result has computer_call_output, immediately exit
|
|
255
|
-
if result.get("output", []) and result.get("output", [])[-1].get("type") == "computer_call_output":
|
|
256
|
-
break
|
|
257
|
-
# otherwise add agent output to conversation history
|
|
258
|
-
new_items += result["output"]
|
|
259
|
-
except Exception as e:
|
|
260
|
-
# if the last message is reasoning, change it to output_text
|
|
261
|
-
if new_items and new_items[-1].get("type") == "reasoning":
|
|
262
|
-
new_items[-1] = {
|
|
263
|
-
"type": "message",
|
|
264
|
-
"role": "assistant",
|
|
265
|
-
"content": [
|
|
266
|
-
{
|
|
267
|
-
"text": new_items[-1].get("summary", [{}])[0].get("text", ""),
|
|
268
|
-
"type": "output_text"
|
|
269
|
-
}
|
|
270
|
-
]
|
|
271
|
-
}
|
|
272
|
-
# Check if there are any computer_call items in new_items
|
|
273
|
-
computer_calls = [item for item in new_items if item.get("type") == "computer_call"]
|
|
274
|
-
if computer_calls:
|
|
275
|
-
# Remove computer_call items from new_items
|
|
276
|
-
new_items = [item for item in new_items if item.get("type") != "computer_call"]
|
|
277
|
-
|
|
278
|
-
# Add failed tool call items for each computer call
|
|
279
|
-
for computer_call in computer_calls:
|
|
280
|
-
tool_input = computer_call.get("action", {})
|
|
281
|
-
call_id = computer_call.get("call_id")
|
|
282
|
-
new_items.extend(make_failed_tool_call_items(
|
|
283
|
-
tool_name="computer",
|
|
284
|
-
tool_kwargs=tool_input,
|
|
285
|
-
error_message=repr(e),
|
|
286
|
-
call_id=call_id
|
|
287
|
-
))
|
|
288
|
-
else:
|
|
289
|
-
# add error message to conversation history (fallback for non-computer-call errors)
|
|
290
|
-
new_items.append({
|
|
291
|
-
"type": "user",
|
|
292
|
-
"content": [
|
|
293
|
-
{
|
|
294
|
-
"type": "input_text",
|
|
295
|
-
"text": f"Error during previous attempted action: {repr(e)}"
|
|
296
|
-
}
|
|
297
|
-
]
|
|
298
|
-
})
|
|
299
|
-
|
|
300
|
-
# Check if we captured any actions
|
|
301
|
-
if captured_actions:
|
|
302
|
-
# Extract reasoning from the conversation history
|
|
303
|
-
reasoning = ""
|
|
304
|
-
# Look for the latest reasoning message
|
|
305
|
-
for msg in reversed(new_items):
|
|
306
|
-
if msg.get("type") == "reasoning" and msg.get("summary"):
|
|
307
|
-
reasoning = " ".join([s.get("text", "") for s in msg["summary"] if s.get("type") == "summary_text"])
|
|
308
|
-
break
|
|
309
|
-
elif msg.get("type") == "message" and msg.get("role") == "assistant":
|
|
310
|
-
content = msg.get("content", [])
|
|
311
|
-
if isinstance(content, list):
|
|
312
|
-
reasoning = " ".join([c.get("text", "") for c in content if c.get("type") == "output_text"])
|
|
313
|
-
break
|
|
314
|
-
|
|
315
|
-
# update conversation history
|
|
316
|
-
self.conversation_history += new_items
|
|
317
|
-
|
|
318
|
-
# Add reasoning and logs to each action
|
|
319
|
-
for action in captured_actions:
|
|
320
|
-
action["reasoning"] = reasoning
|
|
321
|
-
action["logs"] = {"conversation_length": len(self.conversation_history)}
|
|
322
|
-
|
|
323
|
-
return captured_actions, False
|
|
324
|
-
|
|
325
|
-
# Check if the last message is "Task completed"
|
|
326
|
-
response_text = ""
|
|
327
|
-
for msg in reversed(new_items):
|
|
328
|
-
if msg.get("type") == "message" and msg.get("role") == "assistant":
|
|
329
|
-
content = msg.get("content", [])
|
|
330
|
-
for c in content:
|
|
331
|
-
if c.get("type") == "output_text":
|
|
332
|
-
response_text = c.get("text", response_text)
|
|
333
|
-
break
|
|
334
|
-
break
|
|
335
|
-
|
|
336
|
-
done = "task completed" in response_text.lower()
|
|
337
|
-
|
|
338
|
-
# update conversation history
|
|
339
|
-
self.conversation_history += new_items
|
|
340
|
-
|
|
341
|
-
response_action = {
|
|
342
|
-
"type": "response",
|
|
343
|
-
"text": response_text,
|
|
344
|
-
"reasoning": response_text,
|
|
345
|
-
"logs": {"conversation_length": len(self.conversation_history)}
|
|
346
|
-
}
|
|
347
|
-
|
|
348
|
-
# Check if this indicates task completion or failure
|
|
349
|
-
if "task is infeasible" in response_text.lower():
|
|
350
|
-
response_action = {"type": "custom", "action": "FAIL"}
|
|
351
|
-
done = True
|
|
352
|
-
|
|
353
|
-
return [response_action], done
|
|
354
|
-
except Exception as e:
|
|
355
|
-
logger.error(f"Error running ComputerAgent: {e}")
|
|
356
|
-
# Return an error response
|
|
357
|
-
error_action = {
|
|
358
|
-
"type": "response",
|
|
359
|
-
"text": f"Error occurred: {str(e)}",
|
|
360
|
-
"reasoning": f"ComputerAgent encountered an error: {str(e)}",
|
|
361
|
-
"logs": {"error": str(e)}
|
|
362
|
-
}
|
|
363
|
-
return [error_action], True
|
|
364
|
-
|
|
365
|
-
except Exception as e:
|
|
366
|
-
logger.error(f"Error in fetch_response: {e}")
|
|
367
|
-
error_action = {
|
|
368
|
-
"type": "response",
|
|
369
|
-
"text": f"Error in agent processing: {str(e)}",
|
|
370
|
-
"reasoning": f"Agent processing error: {str(e)}",
|
|
371
|
-
"logs": {"error": str(e)}
|
|
372
|
-
}
|
|
373
|
-
return [error_action], True
|
|
364
|
+
return messages
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
__all__ = [
|
|
368
|
+
"MCPComputerAgent",
|
|
369
|
+
]
|