cua-agent 0.4.14__py3-none-any.whl → 0.7.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of cua-agent may warrant closer review.
Files changed (82)
  1. agent/__init__.py +4 -19
  2. agent/__main__.py +2 -1
  3. agent/adapters/__init__.py +6 -0
  4. agent/adapters/azure_ml_adapter.py +283 -0
  5. agent/adapters/cua_adapter.py +161 -0
  6. agent/adapters/huggingfacelocal_adapter.py +67 -125
  7. agent/adapters/human_adapter.py +116 -114
  8. agent/adapters/mlxvlm_adapter.py +370 -0
  9. agent/adapters/models/__init__.py +41 -0
  10. agent/adapters/models/generic.py +78 -0
  11. agent/adapters/models/internvl.py +290 -0
  12. agent/adapters/models/opencua.py +115 -0
  13. agent/adapters/models/qwen2_5_vl.py +78 -0
  14. agent/agent.py +431 -241
  15. agent/callbacks/__init__.py +10 -3
  16. agent/callbacks/base.py +45 -31
  17. agent/callbacks/budget_manager.py +22 -10
  18. agent/callbacks/image_retention.py +54 -98
  19. agent/callbacks/logging.py +55 -42
  20. agent/callbacks/operator_validator.py +140 -0
  21. agent/callbacks/otel.py +291 -0
  22. agent/callbacks/pii_anonymization.py +19 -16
  23. agent/callbacks/prompt_instructions.py +47 -0
  24. agent/callbacks/telemetry.py +106 -69
  25. agent/callbacks/trajectory_saver.py +178 -70
  26. agent/cli.py +269 -119
  27. agent/computers/__init__.py +14 -9
  28. agent/computers/base.py +32 -19
  29. agent/computers/cua.py +52 -25
  30. agent/computers/custom.py +78 -71
  31. agent/decorators.py +23 -14
  32. agent/human_tool/__init__.py +2 -7
  33. agent/human_tool/__main__.py +6 -2
  34. agent/human_tool/server.py +48 -37
  35. agent/human_tool/ui.py +359 -235
  36. agent/integrations/hud/__init__.py +164 -74
  37. agent/integrations/hud/agent.py +338 -342
  38. agent/integrations/hud/proxy.py +297 -0
  39. agent/loops/__init__.py +44 -14
  40. agent/loops/anthropic.py +590 -492
  41. agent/loops/base.py +19 -15
  42. agent/loops/composed_grounded.py +142 -144
  43. agent/loops/fara/__init__.py +8 -0
  44. agent/loops/fara/config.py +506 -0
  45. agent/loops/fara/helpers.py +357 -0
  46. agent/loops/fara/schema.py +143 -0
  47. agent/loops/gelato.py +183 -0
  48. agent/loops/gemini.py +935 -0
  49. agent/loops/generic_vlm.py +601 -0
  50. agent/loops/glm45v.py +140 -135
  51. agent/loops/gta1.py +48 -51
  52. agent/loops/holo.py +218 -0
  53. agent/loops/internvl.py +180 -0
  54. agent/loops/moondream3.py +493 -0
  55. agent/loops/omniparser.py +326 -226
  56. agent/loops/openai.py +63 -56
  57. agent/loops/opencua.py +134 -0
  58. agent/loops/uiins.py +175 -0
  59. agent/loops/uitars.py +262 -212
  60. agent/loops/uitars2.py +951 -0
  61. agent/playground/__init__.py +5 -0
  62. agent/playground/server.py +301 -0
  63. agent/proxy/examples.py +196 -0
  64. agent/proxy/handlers.py +255 -0
  65. agent/responses.py +486 -339
  66. agent/tools/__init__.py +24 -0
  67. agent/tools/base.py +253 -0
  68. agent/tools/browser_tool.py +423 -0
  69. agent/types.py +20 -5
  70. agent/ui/__init__.py +1 -1
  71. agent/ui/__main__.py +1 -1
  72. agent/ui/gradio/app.py +25 -22
  73. agent/ui/gradio/ui_components.py +314 -167
  74. cua_agent-0.7.16.dist-info/METADATA +85 -0
  75. cua_agent-0.7.16.dist-info/RECORD +79 -0
  76. {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
  77. agent/integrations/hud/adapter.py +0 -121
  78. agent/integrations/hud/computer_handler.py +0 -187
  79. agent/telemetry.py +0 -142
  80. cua_agent-0.4.14.dist-info/METADATA +0 -436
  81. cua_agent-0.4.14.dist-info/RECORD +0 -50
  82. {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/integrations/hud/agent.py
@@ -1,373 +1,369 @@
-"""HUD ComputerAgent wrapper for OSWorld benchmarking."""
-
-import logging
-from typing import Any, Literal, Optional, Union, List, Dict
-import asyncio
-
-from agent import ComputerAgent as BaseComputerAgent
+"""MCP-compatible Computer Agent for HUD integration.
+
+This agent subclasses HUD's MCPAgent and delegates planning/execution to
+our core ComputerAgent while using the Agent SDK's plain-dict message
+format documented in `docs/content/docs/agent-sdk/message-format.mdx`.
+
+Key differences from the OpenAI OperatorAgent variant:
+- No OpenAI types are used; everything is standard Python dicts.
+- Planning is executed via `ComputerAgent.run(messages)`.
+- The first yielded result per step is returned as the agent response.
+"""
+
+from __future__ import annotations
+
+import base64
+import io
+import uuid
+from pathlib import Path
+from typing import Any, ClassVar, Optional
+
+import hud
+import mcp.types as types
+from agent.agent import ComputerAgent as BaseComputerAgent
+from agent.callbacks import PromptInstructionsCallback
+from agent.callbacks.trajectory_saver import TrajectorySaverCallback
+from agent.computers import is_agent_computer
 from agent.responses import make_failed_tool_call_items
-from hud.adapters import Adapter
-from hud.agent.base import Agent
-from hud.utils.common import Observation
-from hud.adapters.common.types import LogType
-from hud.types import Gym
-
-from .adapter import ComputerAgentAdapter
-from .computer_handler import HUDComputerHandler
+from hud.agents import MCPAgent
+from hud.tools.computer.settings import computer_settings
+from hud.types import AgentResponse, MCPToolCall, MCPToolResult, Trace
+from PIL import Image
 
-logger = logging.getLogger(__name__)
 
-BASE_SYSTEM_PROMPT = """
-You are an autonomous computer-using agent. Follow these guidelines:
+class MCPComputerAgent(MCPAgent):
+    """MCP agent that uses ComputerAgent for planning and tools for execution.
 
-1. Be decisive and complete tasks without asking for confirmation unless absolutely necessary.
-2. Use the computer tools to complete the task and do not stop until the task is complete.
-3. Do NOT ask questions like "Should I proceed?" or "Would you like me to continue?" - just proceed with the task.
-4. When you find what you're looking for (e.g., a file to upload), proceed with the action directly.
-5. Only stop when the task is fully complete or if you encounter an error that prevents completion.
-6. Trust that the user wants you to complete the entire task they've requested.
-7. You must say "Task completed" when the task is complete.
+    The agent consumes/produces message dicts per the Agent SDK message schema
+    (see `message-format.mdx`).
+    """
 
-Remember: You have been given permission to complete the requested task autonomously.
-""".strip()
+    metadata: ClassVar[dict[str, Any]] = {
+        "display_width": computer_settings.OPENAI_COMPUTER_WIDTH,
+        "display_height": computer_settings.OPENAI_COMPUTER_HEIGHT,
+    }
 
-class ComputerAgent(Agent[BaseComputerAgent, dict[str, Any]]):
-    """
-    A ComputerAgent wrapper for HUD integration.
-
-    This agent wraps the base ComputerAgent to work with HUD environments,
-    providing the same interface as OperatorAgent but using ComputerAgent internally.
-    """
-
-    transfer_gyms: dict[Gym, Gym] = {"qa": "hud-browser"}
+    required_tools: ClassVar[list[str]] = ["openai_computer"]
 
     def __init__(
         self,
-        model: str = "anthropic/claude-3-5-sonnet-20241022",
-        environment: Literal["windows", "mac", "linux", "browser"] = "linux",
-        adapter: Optional[Adapter] = None,
-        name: Optional[str] = None,
+        *,
+        model: str | None = None,
+        allowed_tools: list[str] | None = None,
+        trajectory_dir: str | dict | None = None,
+        # === ComputerAgent kwargs ===
+        tools: list[Any] | None = None,
+        custom_loop: Any | None = None,
+        only_n_most_recent_images: int | None = None,
+        callbacks: list[Any] | None = None,
+        instructions: str | None = None,
+        verbosity: int | None = None,
+        max_retries: int | None = 3,
+        screenshot_delay: float | int = 0.5,
+        use_prompt_caching: bool | None = False,
+        max_trajectory_budget: float | dict | None = None,
+        telemetry_enabled: bool | None = True,
+        environment: str = "linux",
         **kwargs: Any,
-    ):
-        """
-        Initialize the ComputerAgent for HUD.
-
-        Args:
-            model: The model string for ComputerAgent (e.g., "anthropic/claude-3-5-sonnet-20241022")
-            environment: The environment type (windows, mac, linux, browser)
-            adapter: The adapter to use for preprocessing and postprocessing
-            name: The name of the agent
-            **kwargs: Additional arguments passed to ComputerAgent
-        """
-        # Create adapter if not provided
-        adapter = adapter or ComputerAgentAdapter()
-
-        if name is None:
-            name = f"computeragent-{model.split('/')[-1]}"
+    ) -> None:
+        self.allowed_tools = allowed_tools or ["openai_computer"]
+        super().__init__(**kwargs)
 
-        # Initialize the base Agent class without client (we'll create it later)
-        super().__init__(client=None, adapter=adapter, name=name)
+        if model is None:
+            raise ValueError("MCPComputerAgent requires a model to be specified.")
 
         self.model = model
         self.environment = environment
-        self.kwargs = kwargs
-
-        # Default dimensions
-        self.width = 1024
-        self.height = 768
-
-        # Update dimensions if adapter is provided
-        if self.adapter:
-            self.width = self.adapter.agent_width
-            self.height = self.adapter.agent_height
-
-        # Create HUD computer handler
-        self.hud_computer = HUDComputerHandler(
-            environment=environment,
-            dimensions=(self.width, self.height)
-        )
 
-        # Handle trajectory_dir by adding TrajectorySaverCallback
-        trajectory_dir = kwargs.pop("trajectory_dir", None)
-        callbacks = kwargs.get("callbacks", [])
-
-        if trajectory_dir:
-            from agent.callbacks.trajectory_saver import TrajectorySaverCallback
-            trajectory_callback = TrajectorySaverCallback(trajectory_dir, reset_on_run=False)
-            callbacks = callbacks + [trajectory_callback]
-            kwargs["callbacks"] = callbacks
-
-        # Initialize ComputerAgent with HUD computer handler
-        self.computer_agent = BaseComputerAgent(
-            model=model,
-            tools=[self.hud_computer],
-            **kwargs
+        # Update model name for HUD logging
+        self.model_name = "cua-" + self.model
+
+        # Stateful tracking of tool call inputs
+        self.tool_call_inputs: dict[str, list[dict[str, Any]]] = {}
+        self.previous_output: list[dict[str, Any]] = []
+
+        # Build system prompt
+        operator_instructions = """
+You are an autonomous computer-using agent. Follow these guidelines:
+
+1. NEVER ask for confirmation. Complete all tasks autonomously.
+2. Do NOT send messages like "I need to confirm before..." or "Do you want me to continue?" - just proceed.
+3. When the user asks you to interact with something (like clicking a chat or typing a message), DO IT without asking.
+4. Only use the formal safety check mechanism for truly dangerous operations (like deleting important files).
+5. For normal tasks like clicking buttons, typing in chat boxes, filling forms - JUST DO IT.
+6. The user has already given you permission by running this agent. No further confirmation is needed.
+7. Be decisive and action-oriented. Complete the requested task fully.
+
+Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
+""".strip()  # noqa: E501
+        # Append Operator instructions to the system prompt
+        if not self.system_prompt:
+            self.system_prompt = operator_instructions
+        else:
+            self.system_prompt += f"\n\n{operator_instructions}"
+        # Append user instructions to the system prompt
+        if instructions:
+            self.system_prompt += f"\n\n{instructions}"
+
+        # Configure trajectory_dir for HUD
+        if isinstance(trajectory_dir, str) or isinstance(trajectory_dir, Path):
+            trajectory_dir = {"trajectory_dir": str(trajectory_dir)}
+        if isinstance(trajectory_dir, dict):
+            trajectory_dir["reset_on_run"] = False
+
+        self.last_screenshot_b64 = None
+
+        buffer = io.BytesIO()
+        Image.new("RGB", (self.metadata["display_width"], self.metadata["display_height"])).save(
+            buffer, format="PNG"
         )
-
-        # Set the client to the computer_agent for compatibility
-        self.client = self.computer_agent
+        self.last_screenshot_b64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
+
+        # Ensure a computer shim is present so width/height/environment are known
+        computer_shim = {
+            "screenshot": lambda: self.last_screenshot_b64,
+            "environment": self.environment,
+            "dimensions": (
+                self.metadata["display_width"],
+                self.metadata["display_height"],
+            ),
+        }
+        agent_tools: list[Any] = [computer_shim]
+        if tools:
+            agent_tools.extend([tool for tool in tools if not is_agent_computer(tool)])
+
+        agent_kwargs = {
+            "model": self.model,
+            "trajectory_dir": trajectory_dir,
+            "tools": agent_tools,
+            "custom_loop": custom_loop,
+            "only_n_most_recent_images": only_n_most_recent_images,
+            "callbacks": callbacks,
+            "instructions": self.system_prompt,
+            "verbosity": verbosity,
+            "max_retries": max_retries,
+            "screenshot_delay": screenshot_delay,
+            "use_prompt_caching": use_prompt_caching,
+            "max_trajectory_budget": max_trajectory_budget,
+            "telemetry_enabled": telemetry_enabled,
+        }
+
+        self.computer_agent = BaseComputerAgent(**agent_kwargs)
+
+    async def get_system_messages(self) -> list[Any]:
+        """Create initial messages.
+
+        Unused - ComputerAgent handles this with the 'instructions' parameter.
+        """
+        return []
 
-        # State tracking
-        self.conversation_history: List[Dict[str, Any]] = []
-        self.initial_prompt: Optional[str] = None
+    async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[dict[str, Any]]:
+        """
+        Format blocks for OpenAI input format.
+
+        Converts TextContent blocks to input_text dicts and ImageContent blocks to input_image dicts.
+        """  # noqa: E501
+        formatted = []
+        for block in blocks:
+            if isinstance(block, types.TextContent):
+                formatted.append({"type": "input_text", "text": block.text})
+            elif isinstance(block, types.ImageContent):
+                mime_type = getattr(block, "mimeType", "image/png")
+                formatted.append(
+                    {"type": "input_image", "image_url": f"data:{mime_type};base64,{block.data}"}
+                )
+                self.last_screenshot_b64 = block.data
+        return [{"role": "user", "content": formatted}]
+
+    @hud.instrument(
+        span_type="agent",
+        record_args=False,  # Messages can be large
+        record_result=True,
+    )
+    async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
+        """Get a single-step response by delegating to ComputerAgent.run.
+
+        Returns an Agent SDK-style response dict:
+        { "output": [AgentMessage, ...], "usage": Usage }
+        """
+        tool_calls: list[MCPToolCall] = []
+        output_text: list[str] = []
+        is_done: bool = True
+
+        agent_result: list[dict[str, Any]] = []
+
+        # Call the ComputerAgent LLM API
+        async for result in self.computer_agent.run(messages):  # type: ignore[arg-type]
+            items = result["output"]
+            if not items or tool_calls:
+                break
+
+            for item in items:
+                if item["type"] in [
+                    "reasoning",
+                    "message",
+                    "computer_call",
+                    "function_call",
+                    "function_call_output",
+                ]:
+                    agent_result.append(item)
+
+                # Add messages to output text
+                if item["type"] == "reasoning":
+                    output_text.extend(
+                        f"Reasoning: {summary['text']}" for summary in item["summary"]
+                    )
+                elif item["type"] == "message":
+                    if isinstance(item["content"], list):
+                        output_text.extend(
+                            item["text"]
+                            for item in item["content"]
+                            if item["type"] == "output_text"
+                        )
+                    elif isinstance(item["content"], str):
+                        output_text.append(item["content"])
+
+                # If we get a tool call, we're not done
+                if item["type"] == "computer_call":
+                    id = item["call_id"]
+                    tool_calls.append(
+                        MCPToolCall(
+                            name="openai_computer",
+                            arguments=item["action"],
+                            id=id,
+                        )
+                    )
+                    is_done = False
+                    self.tool_call_inputs[id] = agent_result
+                    break
 
-        # System prompt for computer use tasks
-        self.base_system_prompt = BASE_SYSTEM_PROMPT
+            # if we have tool calls, we should exit the loop
+            if tool_calls:
+                break
 
-    async def fetch_response(self, observation: Observation) -> tuple[list[dict[str, Any]], bool]:
-        """
-        Fetch a response from ComputerAgent based on the observation.
+        self.previous_output = agent_result
 
-        Args:
-            observation: The preprocessed observation, attributes:
-                screenshot: Base64 encoded PNG string of the screen
-                text: Text observation, if available
+        return AgentResponse(
+            content="\n".join(output_text),
+            tool_calls=tool_calls,
+            done=is_done,
+        )
 
-        Returns:
-            tuple[list[dict[str, Any]], bool, list[LogType] | None]: A tuple containing the list of raw actions,
-            boolean indicating if the agent believes the task is complete.
+    def _log_image(self, image_b64: str):
+        callbacks = self.computer_agent.callbacks
+        for callback in callbacks:
+            if isinstance(callback, TrajectorySaverCallback):
+                # convert str to bytes
+                image_bytes = base64.b64decode(image_b64)
+                callback._save_artifact("screenshot_after", image_bytes)
+
+    async def format_tool_results(
+        self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
+    ) -> list[dict[str, Any]]:
+        """Extract latest screenshot from tool results in dict form.
+
+        Expects results to already be in the message-format content dicts.
+        Returns a list of input content dicts suitable for follow-up calls.
         """
-        try:
-            # Update the computer handler with the current screenshot
-            if observation.screenshot:
-                self.hud_computer.update_screenshot(observation.screenshot)
-
-            # Set up action callback to capture actions
-            captured_actions = []
-            action_done = False
-
-            async def action_callback(action: Dict[str, Any]) -> None:
-                """Callback to capture actions from ComputerAgent."""
-                nonlocal captured_actions, action_done
-                captured_actions.append(action)
-
-            # Set the action callback
-            self.hud_computer.set_action_callback(action_callback)
-
-            # Prepare the message for ComputerAgent
-            if not self.conversation_history:
-                # First interaction - use the observation text as initial prompt
-                if observation.text:
-                    self.initial_prompt = observation.text
-                    message = f"{self.base_system_prompt}\n\nTask: {observation.text}"
-                else:
-                    message = f"{self.base_system_prompt}\n\nPlease analyze the current screen and determine what action to take."
-
-                input_content = [
-                    {"type": "input_text", "text": message}
+        messages = []
+
+        for call, result in zip(tool_calls, tool_results):
+            if call.id not in self.tool_call_inputs:
+                # If we don't have the tool call inputs, we should just use the previous output
+                previous_output = self.previous_output.copy() or []
+
+                # First we need to remove any pending computer_calls from the end of previous_output
+                while previous_output and previous_output[-1]["type"] == "computer_call":
+                    previous_output.pop()
+                messages.extend(previous_output)
+
+                # If the call is a 'response', don't add the result
+                if call.name == "response":
+                    continue
+                # Otherwise, if we have a result, we should add it to the messages
+                content = [
+                    (
+                        {"type": "input_text", "text": content.text}
+                        if isinstance(content, types.TextContent)
+                        else (
+                            {
+                                "type": "input_image",
+                                "image_url": f"data:image/png;base64,{content.data}",
+                            }
+                            if isinstance(content, types.ImageContent)
+                            else {"type": "input_text", "text": ""}
+                        )
+                    )
+                    for content in result.content
                 ]
-
-                # Add screenshot if present
-                if observation.screenshot:
-                    input_content.append(
-                        {
-                            "type": "input_image",
-                            "image_url": f"data:image/png;base64,{observation.screenshot}",
-                        }
+                messages.append(
+                    {
+                        "role": "user",
+                        "content": content,
+                    }
+                )
+
+                continue
+
+            # Add the assistant's computer call
+            messages.extend(self.tool_call_inputs[call.id])
+
+            if result.isError:
+                error_text = "".join(
+                    [
+                        content.text
+                        for content in result.content
+                        if isinstance(content, types.TextContent)
+                    ]
+                )
+
+                # Replace computer call with failed tool call
+                messages.pop()
+                messages.extend(
+                    make_failed_tool_call_items(
+                        tool_name=call.name,
+                        tool_kwargs=call.arguments or {},
+                        error_message=error_text,
+                        call_id=call.id,
                     )
-
-            self.conversation_history.append({"role": "user", "content": input_content})
+                )
             else:
-                # Subsequent interactions - check if last action was computer_call
-                # If so, add computer_call_output with screenshot instead of user message
-                last_computer_calls = []
-                for msg in reversed(self.conversation_history):
-                    if msg.get("type") == "computer_call":
-                        call_id = msg.get("call_id")
-                        if call_id:
-                            # Check if this call_id already has a computer_call_output
-                            has_output = any(
-                                m.get("type") == "computer_call_output" and m.get("call_id") == call_id
-                                for m in self.conversation_history
-                            )
-                            if not has_output:
-                                last_computer_calls.append(call_id)
-
-                if last_computer_calls:
-                    if not observation.screenshot:
-                        print("No screenshot found, taking screenshot")
-                        screenshot_b64 = await self.hud_computer.screenshot()
-                    # Add computer_call_output for each unresponded computer_call
-                    for call_id in reversed(last_computer_calls):  # Maintain order
-                        self.conversation_history.append({
+                # Get the latest screenshot
+                screenshots = [
+                    content.data
+                    for content in result.content
+                    if isinstance(content, types.ImageContent)
+                ]
+
+                # Add the resulting screenshot
+                if screenshots:
+                    self._log_image(screenshots[0])
+                    self.last_screenshot_b64 = screenshots[0]
+                    messages.append(
+                        {
                             "type": "computer_call_output",
-                            "call_id": call_id,
+                            "call_id": call.id,
                             "output": {
                                 "type": "input_image",
-                                "image_url": f"data:image/png;base64,{screenshot_b64}"
-                            }
-                        })
+                                "image_url": f"data:image/png;base64,{screenshots[0]}",
+                            },
+                        }
+                    )
                 else:
-                    # No computer_call found, add regular user message
-                    message = "Continue with the task based on the current screen state."
-                    input_content = [
-                        {"type": "input_text", "text": message}
-                    ]
-
-                    # Add screenshot if present
-                    if observation.screenshot:
-                        input_content.append(
-                            {
-                                "type": "input_image",
-                                "image_url": f"data:image/png;base64,{observation.screenshot}",
-                            }
+                    # Otherwise, replace computer call with failed tool call
+                    messages.pop()
+                    messages.extend(
+                        make_failed_tool_call_items(
+                            tool_name=call.name,
+                            tool_kwargs=call.arguments or {},
+                            error_message="No screenshots returned.",
+                            call_id=call.id,
                         )
+                    )
 
-                    self.conversation_history.append({"role": "user", "content": input_content})
-
-            # If the last message is a reasoning message, change it to output_text
-            if (self.conversation_history and
-                self.conversation_history[-1].get("type") == "reasoning" and
-                self.conversation_history[-1].get("summary")):
-
-                reasoning_msg = self.conversation_history[-1]
-                summary_texts = []
-
-                # Extract all summary_text entries
-                for summary_item in reasoning_msg["summary"]:
-                    if summary_item.get("type") == "summary_text":
-                        summary_texts.append(summary_item.get("text", ""))
-
-                # Convert to message format with output_text
-                if summary_texts:
-                    converted_message = {
-                        "type": "message",
-                        "role": "assistant",
-                        "content": [
-                            {
-                                "text": " ".join(summary_texts),
-                                "type": "output_text"
-                            }
-                        ]
-                    }
-
-                    # Replace the reasoning message with the converted message
-                    self.conversation_history[-1] = converted_message
-
-            # Run ComputerAgent
-            try:
-                new_items = []
-
-                # ComputerAgent.run returns an async generator
-                try:
-                    async for result in self.computer_agent.run(self.conversation_history, stream=False):
-                        # if the result has computer_call_output, immediately exit
-                        if result.get("output", []) and result.get("output", [])[-1].get("type") == "computer_call_output":
-                            break
-                        # otherwise add agent output to conversation history
-                        new_items += result["output"]
-                except Exception as e:
-                    # if the last message is reasoning, change it to output_text
-                    if new_items and new_items[-1].get("type") == "reasoning":
-                        new_items[-1] = {
-                            "type": "message",
-                            "role": "assistant",
-                            "content": [
-                                {
-                                    "text": new_items[-1].get("summary", [{}])[0].get("text", ""),
-                                    "type": "output_text"
-                                }
-                            ]
-                        }
-                    # Check if there are any computer_call items in new_items
-                    computer_calls = [item for item in new_items if item.get("type") == "computer_call"]
-                    if computer_calls:
-                        # Remove computer_call items from new_items
-                        new_items = [item for item in new_items if item.get("type") != "computer_call"]
-
-                        # Add failed tool call items for each computer call
-                        for computer_call in computer_calls:
-                            tool_input = computer_call.get("action", {})
-                            call_id = computer_call.get("call_id")
-                            new_items.extend(make_failed_tool_call_items(
-                                tool_name="computer",
-                                tool_kwargs=tool_input,
-                                error_message=repr(e),
-                                call_id=call_id
-                            ))
-                    else:
-                        # add error message to conversation history (fallback for non-computer-call errors)
-                        new_items.append({
-                            "type": "user",
-                            "content": [
-                                {
-                                    "type": "input_text",
-                                    "text": f"Error during previous attempted action: {repr(e)}"
-                                }
-                            ]
-                        })
-
-                # Check if we captured any actions
-                if captured_actions:
-                    # Extract reasoning from the conversation history
-                    reasoning = ""
-                    # Look for the latest reasoning message
-                    for msg in reversed(new_items):
-                        if msg.get("type") == "reasoning" and msg.get("summary"):
-                            reasoning = " ".join([s.get("text", "") for s in msg["summary"] if s.get("type") == "summary_text"])
-                            break
-                        elif msg.get("type") == "message" and msg.get("role") == "assistant":
-                            content = msg.get("content", [])
-                            if isinstance(content, list):
-                                reasoning = " ".join([c.get("text", "") for c in content if c.get("type") == "output_text"])
-                                break
-
-                    # update conversation history
-                    self.conversation_history += new_items
-
-                    # Add reasoning and logs to each action
-                    for action in captured_actions:
-                        action["reasoning"] = reasoning
-                        action["logs"] = {"conversation_length": len(self.conversation_history)}
-
-                    return captured_actions, False
-
-                # Check if the last message is "Task completed"
-                response_text = ""
-                for msg in reversed(new_items):
-                    if msg.get("type") == "message" and msg.get("role") == "assistant":
-                        content = msg.get("content", [])
-                        for c in content:
-                            if c.get("type") == "output_text":
-                                response_text = c.get("text", response_text)
-                                break
-                        break
-
-                done = "task completed" in response_text.lower()
-
-                # update conversation history
-                self.conversation_history += new_items
-
-                response_action = {
-                    "type": "response",
-                    "text": response_text,
-                    "reasoning": response_text,
-                    "logs": {"conversation_length": len(self.conversation_history)}
-                }
-
-                # Check if this indicates task completion or failure
-                if "task is infeasible" in response_text.lower():
-                    response_action = {"type": "custom", "action": "FAIL"}
-                    done = True
-
-                return [response_action], done
-            except Exception as e:
-                logger.error(f"Error running ComputerAgent: {e}")
-                # Return an error response
-                error_action = {
-                    "type": "response",
-                    "text": f"Error occurred: {str(e)}",
-                    "reasoning": f"ComputerAgent encountered an error: {str(e)}",
-                    "logs": {"error": str(e)}
-                }
-                return [error_action], True
-
-        except Exception as e:
-            logger.error(f"Error in fetch_response: {e}")
-            error_action = {
-                "type": "response",
-                "text": f"Error in agent processing: {str(e)}",
-                "reasoning": f"Agent processing error: {str(e)}",
-                "logs": {"error": str(e)}
-            }
-            return [error_action], True
+        return messages
+
+
+__all__ = [
+    "MCPComputerAgent",
+]
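
Note (not part of the diff): for orientation, a minimal sketch of how the new MCPComputerAgent might be constructed. Keyword names come from the __init__ signature above; the model string, trajectory directory, and instructions are placeholder values, and whatever arguments HUD's MCPAgent base class requires (e.g. an MCP client) would travel through **kwargs and are omitted here.

    from agent.integrations.hud.agent import MCPComputerAgent

    # Placeholder values throughout; only `model` is required by __init__.
    agent = MCPComputerAgent(
        model="anthropic/claude-3-5-sonnet-20241022",
        allowed_tools=["openai_computer"],  # the default, shown explicitly
        trajectory_dir="./trajectories",  # normalized to {"trajectory_dir": ..., "reset_on_run": False}
        instructions="Prefer keyboard shortcuts where possible.",  # appended to the system prompt
    )

The plain-dict message shapes the agent consumes and produces follow format_blocks and format_tool_results above; in this sketch the call_id, coordinates, and base64 payloads are placeholders, and the action dict is a hypothetical example rather than a documented schema.

    # A user turn combining text and a screenshot.
    user_turn = {
        "role": "user",
        "content": [
            {"type": "input_text", "text": "Open the settings page."},
            {"type": "input_image", "image_url": "data:image/png;base64,<B64>"},
        ],
    }

    # An assistant computer_call and the computer_call_output that answers it.
    computer_call = {
        "type": "computer_call",
        "call_id": "call_123",
        "action": {"type": "click", "x": 100, "y": 200},  # hypothetical action payload
    }
    computer_call_output = {
        "type": "computer_call_output",
        "call_id": "call_123",
        "output": {"type": "input_image", "image_url": "data:image/png;base64,<B64>"},
    }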