cua-agent 0.4.22-py3-none-any.whl → 0.7.16-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (79)
  1. agent/__init__.py +4 -10
  2. agent/__main__.py +2 -1
  3. agent/adapters/__init__.py +4 -0
  4. agent/adapters/azure_ml_adapter.py +283 -0
  5. agent/adapters/cua_adapter.py +161 -0
  6. agent/adapters/huggingfacelocal_adapter.py +67 -125
  7. agent/adapters/human_adapter.py +116 -114
  8. agent/adapters/mlxvlm_adapter.py +110 -99
  9. agent/adapters/models/__init__.py +41 -0
  10. agent/adapters/models/generic.py +78 -0
  11. agent/adapters/models/internvl.py +290 -0
  12. agent/adapters/models/opencua.py +115 -0
  13. agent/adapters/models/qwen2_5_vl.py +78 -0
  14. agent/agent.py +337 -185
  15. agent/callbacks/__init__.py +9 -4
  16. agent/callbacks/base.py +45 -31
  17. agent/callbacks/budget_manager.py +22 -10
  18. agent/callbacks/image_retention.py +54 -98
  19. agent/callbacks/logging.py +55 -42
  20. agent/callbacks/operator_validator.py +35 -33
  21. agent/callbacks/otel.py +291 -0
  22. agent/callbacks/pii_anonymization.py +19 -16
  23. agent/callbacks/prompt_instructions.py +47 -0
  24. agent/callbacks/telemetry.py +99 -61
  25. agent/callbacks/trajectory_saver.py +95 -69
  26. agent/cli.py +269 -119
  27. agent/computers/__init__.py +14 -9
  28. agent/computers/base.py +32 -19
  29. agent/computers/cua.py +52 -25
  30. agent/computers/custom.py +78 -71
  31. agent/decorators.py +23 -14
  32. agent/human_tool/__init__.py +2 -7
  33. agent/human_tool/__main__.py +6 -2
  34. agent/human_tool/server.py +48 -37
  35. agent/human_tool/ui.py +359 -235
  36. agent/integrations/hud/__init__.py +38 -99
  37. agent/integrations/hud/agent.py +369 -0
  38. agent/integrations/hud/proxy.py +166 -52
  39. agent/loops/__init__.py +44 -14
  40. agent/loops/anthropic.py +579 -492
  41. agent/loops/base.py +19 -15
  42. agent/loops/composed_grounded.py +136 -150
  43. agent/loops/fara/__init__.py +8 -0
  44. agent/loops/fara/config.py +506 -0
  45. agent/loops/fara/helpers.py +357 -0
  46. agent/loops/fara/schema.py +143 -0
  47. agent/loops/gelato.py +183 -0
  48. agent/loops/gemini.py +935 -0
  49. agent/loops/generic_vlm.py +601 -0
  50. agent/loops/glm45v.py +140 -135
  51. agent/loops/gta1.py +48 -51
  52. agent/loops/holo.py +218 -0
  53. agent/loops/internvl.py +180 -0
  54. agent/loops/moondream3.py +493 -0
  55. agent/loops/omniparser.py +326 -226
  56. agent/loops/openai.py +50 -51
  57. agent/loops/opencua.py +134 -0
  58. agent/loops/uiins.py +175 -0
  59. agent/loops/uitars.py +247 -206
  60. agent/loops/uitars2.py +951 -0
  61. agent/playground/__init__.py +5 -0
  62. agent/playground/server.py +301 -0
  63. agent/proxy/examples.py +61 -57
  64. agent/proxy/handlers.py +46 -39
  65. agent/responses.py +447 -347
  66. agent/tools/__init__.py +24 -0
  67. agent/tools/base.py +253 -0
  68. agent/tools/browser_tool.py +423 -0
  69. agent/types.py +11 -5
  70. agent/ui/__init__.py +1 -1
  71. agent/ui/__main__.py +1 -1
  72. agent/ui/gradio/app.py +25 -22
  73. agent/ui/gradio/ui_components.py +314 -167
  74. cua_agent-0.7.16.dist-info/METADATA +85 -0
  75. cua_agent-0.7.16.dist-info/RECORD +79 -0
  76. {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
  77. cua_agent-0.4.22.dist-info/METADATA +0 -436
  78. cua_agent-0.4.22.dist-info/RECORD +0 -51
  79. {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
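
The detailed diffs below cover the two files under agent/integrations/hud/. A wheel is an ordinary zip archive, so a comparison like this can be reproduced locally; the following is a minimal sketch (not part of the diff), assuming both wheels were fetched first, e.g. with `pip download cua-agent==0.4.22 --no-deps` and `pip download cua-agent==0.7.16 --no-deps`:

# Minimal sketch: diff one packaged module between two downloaded wheels.
import difflib
import zipfile

def read_member(wheel_path: str, member: str) -> list[str]:
    # Wheels are plain zip archives, so zipfile can read any packaged file.
    with zipfile.ZipFile(wheel_path) as wheel:
        return wheel.read(member).decode("utf-8").splitlines(keepends=True)

old = read_member("cua_agent-0.4.22-py3-none-any.whl", "agent/integrations/hud/__init__.py")
new = read_member("cua_agent-0.7.16-py3-none-any.whl", "agent/integrations/hud/__init__.py")
print("".join(difflib.unified_diff(old, new, fromfile="0.4.22", tofile="0.7.16")))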
--- a/agent/integrations/hud/__init__.py
+++ b/agent/integrations/hud/__init__.py
@@ -1,96 +1,23 @@
-"""HUD integration: Generic HuggingFace dataset evaluation runner (CUA proxy).
+"""HUD integration: dataset runners and MCP-based computer agent export.
 
-This module exposes two helpers to evaluate HUD-compatible datasets using
-HUD's OperatorAgent, while proxying model calls through our ComputerAgent via
-`FakeAsyncOpenAI` (see `agent/integrations/hud/agent.py`).
+This module exposes helpers to evaluate HUD-compatible datasets and exports
+the MCP-compatible computer agent implementation.
 
 Exports:
-- run_single_task(dataset_name, *, agent_type="cua-proxy", model=None, allowed_tools=None)
-- run_full_dataset(dataset_name, *, agent_type="cua-proxy", model=None, allowed_tools=None, max_concurrent=30, max_steps=50)
+- run_single_task(dataset, ...)
+- run_full_dataset(dataset, ...)
+- MCPComputerAgent
 """
+
 import time
 from typing import Any, Optional
 
-from PIL import Image
-from datasets import load_dataset, Dataset
-from hud.agents import OperatorAgent
-from hud.datasets import Task, run_dataset
-from hud.tools.computer.settings import computer_settings
+from agent.computers import is_agent_computer
+from datasets import Dataset, load_dataset
 from hud import trace
+from hud.datasets import Task, run_dataset
 
-from agent.agent import ComputerAgent as BaseComputerAgent
-from .proxy import FakeAsyncOpenAI
-
-
-# ---------------------------------------------------------------------------
-# Proxy OperatorAgent
-# ---------------------------------------------------------------------------
-
-
-class ProxyOperatorAgent(OperatorAgent):
-    """OperatorAgent that proxies model calls through our ComputerAgent.
-
-    Accepts the same config keys we pass via hud.run_dataset `agent_config`:
-    - model: str | None
-    - allowed_tools: list[str] | None
-    Additional kwargs are forwarded to OperatorAgent (if any are supported).
-    """
-
-    def __init__(
-        self,
-        *,
-        model: str | None = None,
-        allowed_tools: list[str] | None = None,
-        trajectory_dir: str | dict | None = None,
-        # === ComputerAgent kwargs ===
-        tools: list[Any] | None = None,
-        custom_loop: Any | None = None,
-        only_n_most_recent_images: int | None = None,
-        callbacks: list[Any] | None = None,
-        verbosity: int | None = None,
-        max_retries: int | None = 3,
-        screenshot_delay: float | int = 0.5,
-        use_prompt_caching: bool | None = False,
-        max_trajectory_budget: float | dict | None = None,
-        telemetry_enabled: bool | None = True,
-        **kwargs: Any,
-    ) -> None:
-        model = model or "computer-use-preview"
-        allowed_tools = allowed_tools or ["openai_computer"]
-
-        computer_shim = {
-            'screenshot': lambda: Image.new('RGB', (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT)),
-            'environment': 'linux',
-            'dimensions': (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT)
-        }
-        # Build tools ensuring the computer_shim is included
-        agent_tools: list[Any] = [computer_shim]
-        if tools:
-            agent_tools.extend(tools)
-
-        computer_agent = BaseComputerAgent(
-            model=model,
-            tools=agent_tools,
-            custom_loop=custom_loop,
-            only_n_most_recent_images=only_n_most_recent_images,
-            callbacks=callbacks,
-            verbosity=verbosity,
-            trajectory_dir=trajectory_dir,
-            max_retries=max_retries,
-            screenshot_delay=screenshot_delay,
-            use_prompt_caching=use_prompt_caching,
-            max_trajectory_budget=max_trajectory_budget,
-            telemetry_enabled=telemetry_enabled,
-        )
-        model_client = FakeAsyncOpenAI(computer_agent)
-
-        super().__init__(
-            model_client=model_client,  # type: ignore[arg-type]
-            model=model,
-            allowed_tools=allowed_tools,
-            **kwargs,
-        )
-
+from .agent import MCPComputerAgent
 
 # ---------------------------------------------------------------------------
 # Single-task runner
@@ -108,6 +35,7 @@ async def run_single_task(
     custom_loop: Any | None = None,
     only_n_most_recent_images: int | None = None,
     callbacks: list[Any] | None = None,
+    instructions: str | None = None,
     verbosity: int | None = None,
     trajectory_dir: str | dict | None = None,
    max_retries: int | None = 3,
@@ -116,30 +44,36 @@ async def run_single_task(
     max_trajectory_budget: float | dict | None = None,
     telemetry_enabled: bool | None = True,
 ) -> None:
-    """Load one task from the dataset and execute it with Operator+CUA proxy."""
+    """Load one task from the dataset and execute it with MCPComputerAgent."""
 
     # Load dataset and pick a sample
     if isinstance(dataset, str):
-        dataset = load_dataset(dataset, split="train") # type: ignore[arg-type]
+        dataset = load_dataset(dataset, split="train")  # type: ignore[arg-type]
     elif isinstance(dataset, list):
         dataset = dataset
     else:
         dataset = dataset["train"]
-
+
     sample_task = dataset[task_id]  # type: ignore[index]
     task_prompt = sample_task.get("prompt", f"Task {sample_task.get('id', 0)}")  # type: ignore[attr-defined]
 
+    # Filter any existing Computer tools
+    # The eval framework will add its own Computer tool per task
+    if tools:
+        tools = [tool for tool in tools if not is_agent_computer(tool)]
+
     with trace(name=task_prompt):
         task = Task(**sample_task)  # type: ignore[arg-type]
 
-        agent = ProxyOperatorAgent(
-            model=model,
-            allowed_tools=allowed_tools,
+        agent = MCPComputerAgent(
+            model=model or "computer-use-preview",
+            allowed_tools=allowed_tools or ["openai_computer"],
             # === ComputerAgent kwargs passthrough ===
             tools=tools,
             custom_loop=custom_loop,
             only_n_most_recent_images=only_n_most_recent_images,
             callbacks=callbacks,
+            instructions=instructions,
             verbosity=verbosity,
             trajectory_dir=trajectory_dir,
             max_retries=max_retries,
@@ -150,7 +84,7 @@ async def run_single_task(
         )
         print(f"Running: {task_prompt}")
         result = await agent.run(task, max_steps=10)
-        print(f"✅ Reward: {getattr(result, 'reward')}")
+        print(f"✅ Reward: {result.reward}")
 
 
 # ---------------------------------------------------------------------------
@@ -173,6 +107,7 @@ async def run_full_dataset(
     custom_loop: Any | None = None,
     only_n_most_recent_images: int | None = 5,
     callbacks: list[Any] | None = None,
+    instructions: str | None = None,
     verbosity: int | None = None,
     max_retries: int | None = 3,
     screenshot_delay: float | int = 0.5,
@@ -182,22 +117,25 @@ async def run_full_dataset(
 ) -> list[Any]:
     """Run evaluation across the entire dataset using hud.datasets.run_dataset."""
 
-    # We pass OperatorAgent as the class and provide a config that injects our
-    # FakeAsyncOpenAI per agent instantiation.
-
+    # Run with our MCP-based agent class.
     if isinstance(dataset, str):
-        dataset_name = dataset.split('/')[-1]
+        dataset_name = dataset.split("/")[-1]
         job_name = job_name or f"Evaluation {dataset_name}"
-        dataset = load_dataset(dataset, split=split) # type: ignore[arg-type]
+        dataset = load_dataset(dataset, split=split)  # type: ignore[arg-type]
     else:
         dataset_name = "custom"
         job_name = job_name or f"Evaluation {time.strftime('%H:%M %Y-%m-%d')}"
 
+    # Filter any existing Computer tools
+    # The eval framework will add its own Computer tool per task
+    if tools:
+        tools = [tool for tool in tools if not is_agent_computer(tool)]
+
     # Execute evaluation
     return await run_dataset(
         name=job_name,
         dataset=dataset,
-        agent_class=ProxyOperatorAgent,
+        agent_class=MCPComputerAgent,
         agent_config={
             "model": model,
             "allowed_tools": allowed_tools,
@@ -207,6 +145,7 @@ async def run_full_dataset(
             "custom_loop": custom_loop,
             "only_n_most_recent_images": only_n_most_recent_images,
             "callbacks": callbacks,
+            "instructions": instructions,
             "verbosity": verbosity,
             "max_retries": max_retries,
             "screenshot_delay": screenshot_delay,
@@ -224,5 +163,5 @@ async def run_full_dataset(
 __all__ = [
     "run_single_task",
     "run_full_dataset",
-    "ProxyOperatorAgent",
-]
+    "MCPComputerAgent",
+]
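
Per the updated docstring, the module now exports run_single_task, run_full_dataset, and MCPComputerAgent instead of ProxyOperatorAgent. A minimal usage sketch (not from the package; the dataset id is an illustrative placeholder, and the `task_id` keyword is inferred from the function body above, which indexes the dataset with it):

# Minimal usage sketch; dataset id and model are illustrative placeholders.
import asyncio

from agent.integrations.hud import run_full_dataset, run_single_task

async def main() -> None:
    # Run a single sample from a HUD-compatible dataset.
    await run_single_task(
        "hud-evals/example-dataset",  # hypothetical dataset id
        task_id=0,
        model="computer-use-preview",
    )
    # Or evaluate a whole split via hud.datasets.run_dataset.
    results = await run_full_dataset(
        "hud-evals/example-dataset",
        model="computer-use-preview",
    )
    print(f"{len(results)} task results")

asyncio.run(main())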
--- /dev/null
+++ b/agent/integrations/hud/agent.py
@@ -0,0 +1,369 @@
+"""MCP-compatible Computer Agent for HUD integration.
+
+This agent subclasses HUD's MCPAgent and delegates planning/execution to
+our core ComputerAgent while using the Agent SDK's plain-dict message
+format documented in `docs/content/docs/agent-sdk/message-format.mdx`.
+
+Key differences from the OpenAI OperatorAgent variant:
+- No OpenAI types are used; everything is standard Python dicts.
+- Planning is executed via `ComputerAgent.run(messages)`.
+- The first yielded result per step is returned as the agent response.
+"""
+
+from __future__ import annotations
+
+import base64
+import io
+import uuid
+from pathlib import Path
+from typing import Any, ClassVar, Optional
+
+import hud
+import mcp.types as types
+from agent.agent import ComputerAgent as BaseComputerAgent
+from agent.callbacks import PromptInstructionsCallback
+from agent.callbacks.trajectory_saver import TrajectorySaverCallback
+from agent.computers import is_agent_computer
+from agent.responses import make_failed_tool_call_items
+from hud.agents import MCPAgent
+from hud.tools.computer.settings import computer_settings
+from hud.types import AgentResponse, MCPToolCall, MCPToolResult, Trace
+from PIL import Image
+
+
+class MCPComputerAgent(MCPAgent):
+    """MCP agent that uses ComputerAgent for planning and tools for execution.
+
+    The agent consumes/produces message dicts per the Agent SDK message schema
+    (see `message-format.mdx`).
+    """
+
+    metadata: ClassVar[dict[str, Any]] = {
+        "display_width": computer_settings.OPENAI_COMPUTER_WIDTH,
+        "display_height": computer_settings.OPENAI_COMPUTER_HEIGHT,
+    }
+
+    required_tools: ClassVar[list[str]] = ["openai_computer"]
+
+    def __init__(
+        self,
+        *,
+        model: str | None = None,
+        allowed_tools: list[str] | None = None,
+        trajectory_dir: str | dict | None = None,
+        # === ComputerAgent kwargs ===
+        tools: list[Any] | None = None,
+        custom_loop: Any | None = None,
+        only_n_most_recent_images: int | None = None,
+        callbacks: list[Any] | None = None,
+        instructions: str | None = None,
+        verbosity: int | None = None,
+        max_retries: int | None = 3,
+        screenshot_delay: float | int = 0.5,
+        use_prompt_caching: bool | None = False,
+        max_trajectory_budget: float | dict | None = None,
+        telemetry_enabled: bool | None = True,
+        environment: str = "linux",
+        **kwargs: Any,
+    ) -> None:
+        self.allowed_tools = allowed_tools or ["openai_computer"]
+        super().__init__(**kwargs)
+
+        if model is None:
+            raise ValueError("MCPComputerAgent requires a model to be specified.")
+
+        self.model = model
+        self.environment = environment
+
+        # Update model name for HUD logging
+        self.model_name = "cua-" + self.model
+
+        # Stateful tracking of tool call inputs
+        self.tool_call_inputs: dict[str, list[dict[str, Any]]] = {}
+        self.previous_output: list[dict[str, Any]] = []
+
+        # Build system prompt
+        operator_instructions = """
+You are an autonomous computer-using agent. Follow these guidelines:
+
+1. NEVER ask for confirmation. Complete all tasks autonomously.
+2. Do NOT send messages like "I need to confirm before..." or "Do you want me to continue?" - just proceed.
+3. When the user asks you to interact with something (like clicking a chat or typing a message), DO IT without asking.
+4. Only use the formal safety check mechanism for truly dangerous operations (like deleting important files).
+5. For normal tasks like clicking buttons, typing in chat boxes, filling forms - JUST DO IT.
+6. The user has already given you permission by running this agent. No further confirmation is needed.
+7. Be decisive and action-oriented. Complete the requested task fully.
+
+Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
+""".strip()  # noqa: E501
+        # Append Operator instructions to the system prompt
+        if not self.system_prompt:
+            self.system_prompt = operator_instructions
+        else:
+            self.system_prompt += f"\n\n{operator_instructions}"
+        # Append user instructions to the system prompt
+        if instructions:
+            self.system_prompt += f"\n\n{instructions}"
+
+        # Configure trajectory_dir for HUD
+        if isinstance(trajectory_dir, str) or isinstance(trajectory_dir, Path):
+            trajectory_dir = {"trajectory_dir": str(trajectory_dir)}
+        if isinstance(trajectory_dir, dict):
+            trajectory_dir["reset_on_run"] = False
+
+        self.last_screenshot_b64 = None
+
+        buffer = io.BytesIO()
+        Image.new("RGB", (self.metadata["display_width"], self.metadata["display_height"])).save(
+            buffer, format="PNG"
+        )
+        self.last_screenshot_b64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
+
+        # Ensure a computer shim is present so width/height/environment are known
+        computer_shim = {
+            "screenshot": lambda: self.last_screenshot_b64,
+            "environment": self.environment,
+            "dimensions": (
+                self.metadata["display_width"],
+                self.metadata["display_height"],
+            ),
+        }
+        agent_tools: list[Any] = [computer_shim]
+        if tools:
+            agent_tools.extend([tool for tool in tools if not is_agent_computer(tool)])
+
+        agent_kwargs = {
+            "model": self.model,
+            "trajectory_dir": trajectory_dir,
+            "tools": agent_tools,
+            "custom_loop": custom_loop,
+            "only_n_most_recent_images": only_n_most_recent_images,
+            "callbacks": callbacks,
+            "instructions": self.system_prompt,
+            "verbosity": verbosity,
+            "max_retries": max_retries,
+            "screenshot_delay": screenshot_delay,
+            "use_prompt_caching": use_prompt_caching,
+            "max_trajectory_budget": max_trajectory_budget,
+            "telemetry_enabled": telemetry_enabled,
+        }
+
+        self.computer_agent = BaseComputerAgent(**agent_kwargs)
+
+    async def get_system_messages(self) -> list[Any]:
+        """Create initial messages.
+
+        Unused - ComputerAgent handles this with the 'instructions' parameter.
+        """
+        return []
+
+    async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[dict[str, Any]]:
+        """
+        Format blocks for OpenAI input format.
+
+        Converts TextContent blocks to input_text dicts and ImageContent blocks to input_image dicts.
+        """  # noqa: E501
+        formatted = []
+        for block in blocks:
+            if isinstance(block, types.TextContent):
+                formatted.append({"type": "input_text", "text": block.text})
+            elif isinstance(block, types.ImageContent):
+                mime_type = getattr(block, "mimeType", "image/png")
+                formatted.append(
+                    {"type": "input_image", "image_url": f"data:{mime_type};base64,{block.data}"}
+                )
+                self.last_screenshot_b64 = block.data
+        return [{"role": "user", "content": formatted}]
+
+    @hud.instrument(
+        span_type="agent",
+        record_args=False,  # Messages can be large
+        record_result=True,
+    )
+    async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
+        """Get a single-step response by delegating to ComputerAgent.run.
+
+        Returns an Agent SDK-style response dict:
+        { "output": [AgentMessage, ...], "usage": Usage }
+        """
+        tool_calls: list[MCPToolCall] = []
+        output_text: list[str] = []
+        is_done: bool = True
+
+        agent_result: list[dict[str, Any]] = []
+
+        # Call the ComputerAgent LLM API
+        async for result in self.computer_agent.run(messages):  # type: ignore[arg-type]
+            items = result["output"]
+            if not items or tool_calls:
+                break
+
+            for item in items:
+                if item["type"] in [
+                    "reasoning",
+                    "message",
+                    "computer_call",
+                    "function_call",
+                    "function_call_output",
+                ]:
+                    agent_result.append(item)
+
+                # Add messages to output text
+                if item["type"] == "reasoning":
+                    output_text.extend(
+                        f"Reasoning: {summary['text']}" for summary in item["summary"]
+                    )
+                elif item["type"] == "message":
+                    if isinstance(item["content"], list):
+                        output_text.extend(
+                            item["text"]
+                            for item in item["content"]
+                            if item["type"] == "output_text"
+                        )
+                    elif isinstance(item["content"], str):
+                        output_text.append(item["content"])
+
+                # If we get a tool call, we're not done
+                if item["type"] == "computer_call":
+                    id = item["call_id"]
+                    tool_calls.append(
+                        MCPToolCall(
+                            name="openai_computer",
+                            arguments=item["action"],
+                            id=id,
+                        )
+                    )
+                    is_done = False
+                    self.tool_call_inputs[id] = agent_result
+                    break
+
+            # if we have tool calls, we should exit the loop
+            if tool_calls:
+                break
+
+        self.previous_output = agent_result
+
+        return AgentResponse(
+            content="\n".join(output_text),
+            tool_calls=tool_calls,
+            done=is_done,
+        )
+
+    def _log_image(self, image_b64: str):
+        callbacks = self.computer_agent.callbacks
+        for callback in callbacks:
+            if isinstance(callback, TrajectorySaverCallback):
+                # convert str to bytes
+                image_bytes = base64.b64decode(image_b64)
+                callback._save_artifact("screenshot_after", image_bytes)
+
+    async def format_tool_results(
+        self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
+    ) -> list[dict[str, Any]]:
+        """Extract latest screenshot from tool results in dict form.
+
+        Expects results to already be in the message-format content dicts.
+        Returns a list of input content dicts suitable for follow-up calls.
+        """
+        messages = []
+
+        for call, result in zip(tool_calls, tool_results):
+            if call.id not in self.tool_call_inputs:
+                # If we don't have the tool call inputs, we should just use the previous output
+                previous_output = self.previous_output.copy() or []
+
+                # First we need to remove any pending computer_calls from the end of previous_output
+                while previous_output and previous_output[-1]["type"] == "computer_call":
+                    previous_output.pop()
+                messages.extend(previous_output)
+
+                # If the call is a 'response', don't add the result
+                if call.name == "response":
+                    continue
+                # Otherwise, if we have a result, we should add it to the messages
+                content = [
+                    (
+                        {"type": "input_text", "text": content.text}
+                        if isinstance(content, types.TextContent)
+                        else (
+                            {
+                                "type": "input_image",
+                                "image_url": f"data:image/png;base64,{content.data}",
+                            }
+                            if isinstance(content, types.ImageContent)
+                            else {"type": "input_text", "text": ""}
+                        )
+                    )
+                    for content in result.content
+                ]
+                messages.append(
+                    {
+                        "role": "user",
+                        "content": content,
+                    }
+                )
+
+                continue
+
+            # Add the assistant's computer call
+            messages.extend(self.tool_call_inputs[call.id])
+
+            if result.isError:
+                error_text = "".join(
+                    [
+                        content.text
+                        for content in result.content
+                        if isinstance(content, types.TextContent)
+                    ]
+                )
+
+                # Replace computer call with failed tool call
+                messages.pop()
+                messages.extend(
+                    make_failed_tool_call_items(
+                        tool_name=call.name,
+                        tool_kwargs=call.arguments or {},
+                        error_message=error_text,
+                        call_id=call.id,
+                    )
+                )
+            else:
+                # Get the latest screenshot
+                screenshots = [
+                    content.data
+                    for content in result.content
+                    if isinstance(content, types.ImageContent)
+                ]
+
+                # Add the resulting screenshot
+                if screenshots:
+                    self._log_image(screenshots[0])
+                    self.last_screenshot_b64 = screenshots[0]
+                    messages.append(
+                        {
+                            "type": "computer_call_output",
+                            "call_id": call.id,
+                            "output": {
+                                "type": "input_image",
+                                "image_url": f"data:image/png;base64,{screenshots[0]}",
+                            },
+                        }
+                    )
+                else:
+                    # Otherwise, replace computer call with failed tool call
+                    messages.pop()
+                    messages.extend(
+                        make_failed_tool_call_items(
+                            tool_name=call.name,
+                            tool_kwargs=call.arguments or {},
+                            error_message="No screenshots returned.",
+                            call_id=call.id,
+                        )
+                    )
+
+        return messages
+
+
+__all__ = [
+    "MCPComputerAgent",
+]
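
The docstring above stresses that everything crossing this agent's boundary is a plain Python dict in the Agent SDK message format. The shapes below illustrate what format_blocks, get_response, and format_tool_results exchange; the field values (including the click action payload) are illustrative only:

# Illustrative message shapes; values are made up.
# A user turn as produced by format_blocks():
user_turn = {
    "role": "user",
    "content": [
        {"type": "input_text", "text": "Open the settings page."},
        {"type": "input_image", "image_url": "data:image/png;base64,<b64>"},
    ],
}

# A computer_call item as seen by get_response() (the `action` payload here
# is a hypothetical click; actual actions come from the model):
computer_call = {
    "type": "computer_call",
    "call_id": "call_0",
    "action": {"type": "click", "x": 100, "y": 200},
}

# The matching computer_call_output that format_tool_results() appends once
# the tool returns a screenshot:
computer_call_output = {
    "type": "computer_call_output",
    "call_id": "call_0",
    "output": {"type": "input_image", "image_url": "data:image/png;base64,<b64>"},
}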