cua-agent 0.4.34__py3-none-any.whl → 0.4.36__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent/__init__.py +4 -10
- agent/__main__.py +2 -1
- agent/adapters/huggingfacelocal_adapter.py +54 -61
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +110 -99
- agent/adapters/models/__init__.py +14 -6
- agent/adapters/models/generic.py +7 -4
- agent/adapters/models/internvl.py +66 -30
- agent/adapters/models/opencua.py +23 -8
- agent/adapters/models/qwen2_5_vl.py +7 -4
- agent/agent.py +184 -158
- agent/callbacks/__init__.py +4 -4
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +18 -13
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +3 -1
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/telemetry.py +67 -61
- agent/callbacks/trajectory_saver.py +90 -70
- agent/cli.py +115 -110
- agent/computers/__init__.py +13 -8
- agent/computers/base.py +32 -19
- agent/computers/cua.py +33 -25
- agent/computers/custom.py +78 -71
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +235 -185
- agent/integrations/hud/__init__.py +15 -21
- agent/integrations/hud/agent.py +101 -83
- agent/integrations/hud/proxy.py +90 -57
- agent/loops/__init__.py +25 -21
- agent/loops/anthropic.py +537 -483
- agent/loops/base.py +13 -14
- agent/loops/composed_grounded.py +135 -149
- agent/loops/gemini.py +31 -12
- agent/loops/glm45v.py +135 -133
- agent/loops/gta1.py +47 -50
- agent/loops/holo.py +4 -2
- agent/loops/internvl.py +6 -11
- agent/loops/moondream3.py +36 -12
- agent/loops/omniparser.py +215 -210
- agent/loops/openai.py +49 -50
- agent/loops/opencua.py +29 -41
- agent/loops/qwen.py +510 -0
- agent/loops/uitars.py +237 -202
- agent/proxy/examples.py +54 -50
- agent/proxy/handlers.py +27 -34
- agent/responses.py +330 -330
- agent/types.py +11 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +23 -18
- agent/ui/gradio/ui_components.py +310 -161
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/METADATA +18 -10
- cua_agent-0.4.36.dist-info/RECORD +64 -0
- cua_agent-0.4.34.dist-info/RECORD +0 -63
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/WHEEL +0 -0
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/entry_points.txt +0 -0

agent/integrations/hud/__init__.py
CHANGED

@@ -8,21 +8,22 @@ Exports:
     - run_full_dataset(dataset, ...)
     - MCPComputerAgent
 """
+
 import time
 from typing import Any, Optional
 
 from agent.computers import is_agent_computer
-from datasets import
-from hud.datasets import Task, run_dataset
+from datasets import Dataset, load_dataset
 from hud import trace
+from hud.datasets import Task, run_dataset
 
 from .agent import MCPComputerAgent
 
-
 # ---------------------------------------------------------------------------
 # Single-task runner
 # ---------------------------------------------------------------------------
 
+
 async def run_single_task(
     dataset: str | Dataset | list[dict[str, Any]],
     *,

@@ -47,24 +48,20 @@ async def run_single_task(
 
     # Load dataset and pick a sample
     if isinstance(dataset, str):
-        dataset = load_dataset(dataset, split="train")
+        dataset = load_dataset(dataset, split="train")  # type: ignore[arg-type]
     elif isinstance(dataset, list):
         dataset = dataset
     else:
         dataset = dataset["train"]
-
+
     sample_task = dataset[task_id]  # type: ignore[index]
     task_prompt = sample_task.get("prompt", f"Task {sample_task.get('id', 0)}")  # type: ignore[attr-defined]
 
     # Filter any existing Computer tools
     # The eval framework will add its own Computer tool per task
     if tools:
-        tools = [
-
-            for tool in tools
-            if not is_agent_computer(tool)
-        ]
-
+        tools = [tool for tool in tools if not is_agent_computer(tool)]
+
     with trace(name=task_prompt):
         task = Task(**sample_task)  # type: ignore[arg-type]
 
@@ -87,13 +84,14 @@ async def run_single_task(
     )
     print(f"Running: {task_prompt}")
     result = await agent.run(task, max_steps=10)
-    print(f"✅ Reward: {
+    print(f"✅ Reward: {result.reward}")
 
 
 # ---------------------------------------------------------------------------
 # Full-dataset runner
 # ---------------------------------------------------------------------------
 
+
 async def run_full_dataset(
     dataset: str | Dataset | list[dict[str, Any]],
     *,

@@ -121,9 +119,9 @@ async def run_full_dataset(
 
     # Run with our MCP-based agent class.
     if isinstance(dataset, str):
-        dataset_name = dataset.split(
+        dataset_name = dataset.split("/")[-1]
         job_name = job_name or f"Evaluation {dataset_name}"
-        dataset = load_dataset(dataset, split=split)
+        dataset = load_dataset(dataset, split=split)  # type: ignore[arg-type]
     else:
         dataset_name = "custom"
         job_name = job_name or f"Evaluation {time.strftime('%H:%M %Y-%m-%d')}"

@@ -131,12 +129,8 @@ async def run_full_dataset(
     # Filter any existing Computer tools
     # The eval framework will add its own Computer tool per task
     if tools:
-        tools = [
-
-            for tool in tools
-            if not is_agent_computer(tool)
-        ]
-
+        tools = [tool for tool in tools if not is_agent_computer(tool)]
+
     # Execute evaluation
     return await run_dataset(
         name=job_name,

@@ -170,4 +164,4 @@ __all__ = [
     "run_single_task",
     "run_full_dataset",
     "MCPComputerAgent",
-]
+]
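
For orientation, a hedged usage sketch of the two runners above; the dataset id, task index, and split are hypothetical, and only arguments visible in this diff are used:

import asyncio

from agent.integrations.hud import run_full_dataset, run_single_task

async def main() -> None:
    # Run one sample of a dataset by index (task_id is indexed into the split above).
    await run_single_task("my-org/my-hud-tasks", task_id=0)

    # Evaluate a whole split; returns whatever hud's run_dataset() returns.
    results = await run_full_dataset("my-org/my-hud-tasks", split="train")
    print(results)

asyncio.run(main())
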
agent/integrations/hud/agent.py
CHANGED

@@ -9,26 +9,26 @@ Key differences from the OpenAI OperatorAgent variant:
 - Planning is executed via `ComputerAgent.run(messages)`.
 - The first yielded result per step is returned as the agent response.
 """
+
 from __future__ import annotations
 
+import base64
 import io
+import uuid
+from pathlib import Path
 from typing import Any, ClassVar, Optional
 
+import hud
+import mcp.types as types
 from agent.agent import ComputerAgent as BaseComputerAgent
 from agent.callbacks import PromptInstructionsCallback
 from agent.callbacks.trajectory_saver import TrajectorySaverCallback
+from agent.computers import is_agent_computer
+from agent.responses import make_failed_tool_call_items
 from hud.agents import MCPAgent
 from hud.tools.computer.settings import computer_settings
 from hud.types import AgentResponse, MCPToolCall, MCPToolResult, Trace
-
-from agent.responses import make_failed_tool_call_items
-from agent.computers import is_agent_computer
 from PIL import Image
-import mcp.types as types
-import hud
-import uuid
-import base64
-from pathlib import Path
 
 
 class MCPComputerAgent(MCPAgent):
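
The module docstring above notes that planning runs through `ComputerAgent.run(messages)` and that only the first yielded result per step is used. A minimal sketch of that consumption pattern, assuming `run()` is an async generator yielding dicts with an "output" list (the shape visible later in this diff):

async def first_step(agent, messages: list[dict]) -> dict | None:
    # Take the first yielded result and stop; later yields belong to later steps.
    async for result in agent.run(messages):
        return result
    return None
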
@@ -114,8 +114,10 @@ class MCPComputerAgent(MCPAgent):
         self.last_screenshot_b64 = None
 
         buffer = io.BytesIO()
-        Image.new(
-
+        Image.new("RGB", (self.metadata["display_width"], self.metadata["display_height"])).save(
+            buffer, format="PNG"
+        )
+        self.last_screenshot_b64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
 
         # Ensure a computer shim is present so width/height/environment are known
         computer_shim = {
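
The hunk above seeds `last_screenshot_b64` with a blank PNG at the advertised display size. The same PIL-plus-base64 pattern as a self-contained sketch (the 1920x1080 size is an arbitrary stand-in for the metadata values):

import base64
import io

from PIL import Image

buffer = io.BytesIO()
# Render a blank RGB frame and encode it as the base64 PNG string expected downstream.
Image.new("RGB", (1920, 1080)).save(buffer, format="PNG")
placeholder_b64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
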
@@ -128,12 +130,8 @@ class MCPComputerAgent(MCPAgent):
         }
         agent_tools: list[Any] = [computer_shim]
         if tools:
-            agent_tools.extend([
-
-                for tool in tools
-                if not is_agent_computer(tool)
-            ])
-
+            agent_tools.extend([tool for tool in tools if not is_agent_computer(tool)])
+
         agent_kwargs = {
             "model": self.model,
             "trajectory_dir": trajectory_dir,

@@ -150,9 +148,7 @@ class MCPComputerAgent(MCPAgent):
             "telemetry_enabled": telemetry_enabled,
         }
 
-        self.computer_agent = BaseComputerAgent(
-            **agent_kwargs
-        )
+        self.computer_agent = BaseComputerAgent(**agent_kwargs)
 
     async def get_system_messages(self) -> list[Any]:
         """Create initial messages.

@@ -161,9 +157,7 @@ class MCPComputerAgent(MCPAgent):
         """
         return []
 
-    async def format_blocks(
-        self, blocks: list[types.ContentBlock]
-    ) -> list[dict[str, Any]]:
+    async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[dict[str, Any]]:
         """
         Format blocks for OpenAI input format.
 
@@ -200,42 +194,49 @@ class MCPComputerAgent(MCPAgent):
 
         # Call the ComputerAgent LLM API
         async for result in self.computer_agent.run(messages):  # type: ignore[arg-type]
-            items = result[
+            items = result["output"]
             if not items or tool_calls:
                 break
 
             for item in items:
-                if item[
+                if item["type"] in [
+                    "reasoning",
+                    "message",
+                    "computer_call",
+                    "function_call",
+                    "function_call_output",
+                ]:
                     agent_result.append(item)
-
+
                 # Add messages to output text
-                if item[
+                if item["type"] == "reasoning":
                     output_text.extend(
-                        f"Reasoning: {summary['text']}"
-                        for summary in item['summary']
+                        f"Reasoning: {summary['text']}" for summary in item["summary"]
                     )
-                elif item[
-                    if isinstance(item[
+                elif item["type"] == "message":
+                    if isinstance(item["content"], list):
                         output_text.extend(
-                            item[
-                            for item in item[
-                            if item[
+                            item["text"]
+                            for item in item["content"]
+                            if item["type"] == "output_text"
                         )
-                elif isinstance(item[
-                    output_text.append(item[
-
+                    elif isinstance(item["content"], str):
+                        output_text.append(item["content"])
+
                 # If we get a tool call, we're not done
-                if item[
+                if item["type"] == "computer_call":
                     id = item["call_id"]
-                    tool_calls.append(
-
-
-
-
+                    tool_calls.append(
+                        MCPToolCall(
+                            name="openai_computer",
+                            arguments=item["action"],
+                            id=id,
+                        )
+                    )
                     is_done = False
                     self.tool_call_inputs[id] = agent_result
                     break
-
+
         # if we have tool calls, we should exit the loop
         if tool_calls:
             break

@@ -247,7 +248,7 @@ class MCPComputerAgent(MCPAgent):
             tool_calls=tool_calls,
             done=is_done,
         )
-
+
     def _log_image(self, image_b64: str):
         callbacks = self.computer_agent.callbacks
         for callback in callbacks:

@@ -257,9 +258,7 @@ class MCPComputerAgent(MCPAgent):
                 callback._save_artifact("screenshot_after", image_bytes)
 
     async def format_tool_results(
-        self,
-        tool_calls: list[MCPToolCall],
-        tool_results: list[MCPToolResult]
+        self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
     ) -> list[dict[str, Any]]:
         """Extract latest screenshot from tool results in dict form.
 
@@ -274,45 +273,60 @@ class MCPComputerAgent(MCPAgent):
            previous_output = self.previous_output.copy() or []
 
            # First we need to remove any pending computer_calls from the end of previous_output
-           while previous_output and previous_output[-1][
+           while previous_output and previous_output[-1]["type"] == "computer_call":
                previous_output.pop()
            messages.extend(previous_output)
 
            # If the call is a 'response', don't add the result
-           if call.name ==
+           if call.name == "response":
                continue
            # Otherwise, if we have a result, we should add it to the messages
            content = [
-
-
-
+               (
+                   {"type": "input_text", "text": content.text}
+                   if isinstance(content, types.TextContent)
+                   else (
+                       {
+                           "type": "input_image",
+                           "image_url": f"data:image/png;base64,{content.data}",
+                       }
+                       if isinstance(content, types.ImageContent)
+                       else {"type": "input_text", "text": ""}
+                   )
+               )
                for content in result.content
            ]
-           messages.append(
-
-
-
+           messages.append(
+               {
+                   "role": "user",
+                   "content": content,
+               }
+           )
 
            continue
-
+
            # Add the assistant's computer call
            messages.extend(self.tool_call_inputs[call.id])
-
+
            if result.isError:
-               error_text = "".join(
-
-
-
-
+               error_text = "".join(
+                   [
+                       content.text
+                       for content in result.content
+                       if isinstance(content, types.TextContent)
+                   ]
+               )
 
                # Replace computer call with failed tool call
                messages.pop()
-               messages.extend(
-
-
-
-
-
+               messages.extend(
+                   make_failed_tool_call_items(
+                       tool_name=call.name,
+                       tool_kwargs=call.arguments or {},
+                       error_message=error_text,
+                       call_id=call.id,
+                   )
+               )
            else:
                # Get the latest screenshot
                screenshots = [

@@ -325,23 +339,27 @@ class MCPComputerAgent(MCPAgent):
               if screenshots:
                   self._log_image(screenshots[0])
                   self.last_screenshot_b64 = screenshots[0]
-                  messages.append(
-
-
-
-                      "
-
-
-
+                  messages.append(
+                      {
+                          "type": "computer_call_output",
+                          "call_id": call.id,
+                          "output": {
+                              "type": "input_image",
+                              "image_url": f"data:image/png;base64,{screenshots[0]}",
+                          },
+                      }
+                  )
               else:
                   # Otherwise, replace computer call with failed tool call
                   messages.pop()
-                  messages.extend(
-
-
-
-
-
+                  messages.extend(
+                      make_failed_tool_call_items(
+                          tool_name=call.name,
+                          tool_kwargs=call.arguments or {},
+                          error_message="No screenshots returned.",
+                          call_id=call.id,
+                      )
+                  )
 
        return messages
 
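
The `format_tool_results` hunks above convert MCP `TextContent`/`ImageContent` blocks into OpenAI-style `input_text`/`input_image` dicts. A standalone sketch of that mapping, assuming the `mcp` Python SDK's pydantic content models:

import mcp.types as types

def to_openai_input(content) -> dict:
    # Text blocks become input_text; image blocks become base64 data-URL input_image.
    if isinstance(content, types.TextContent):
        return {"type": "input_text", "text": content.text}
    if isinstance(content, types.ImageContent):
        return {"type": "input_image", "image_url": f"data:image/png;base64,{content.data}"}
    return {"type": "input_text", "text": ""}

print(to_openai_input(types.TextContent(type="text", text="done")))
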
agent/integrations/hud/proxy.py
CHANGED

@@ -7,30 +7,33 @@ OpenAI-like response blocks. We intentionally only support a single-step call
 by consuming the first yielded result from `ComputerAgent.run()`.
 """
 
-import traceback
 import time
+import traceback
 import uuid
 from typing import Any, Dict, List, Optional
 
 from agent.agent import ComputerAgent as BaseComputerAgent
 from agent.callbacks import PromptInstructionsCallback
-from hud.tools.computer.settings import computer_settings
-from PIL import Image
 from hud.agents import OperatorAgent
+from hud.tools.computer.settings import computer_settings
 
 # OpenAI Responses typed models (required)
 from openai.types.responses import (
     Response,
+    ResponseComputerToolCall,
     ResponseInputParam,
     ResponseOutputItem,
-    ResponseComputerToolCall,
     ResponseOutputMessage,
     ResponseOutputText,
     ResponseReasoningItem,
     ResponseUsage,
 )
+from PIL import Image
 
-def _map_agent_output_to_openai_blocks(output_items: List[Dict[str, Any]]) -> List[ResponseOutputItem]:
+
+def _map_agent_output_to_openai_blocks(
+    output_items: List[Dict[str, Any]],
+) -> List[ResponseOutputItem]:
     """Map our agent output items to OpenAI ResponseOutputItem typed models.
 
     Only a subset is supported: computer_call, assistant message (text), and reasoning.
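
A hedged usage sketch for the mapper defined above; the input item is hypothetical, with the click action shaped like an OpenAI computer-use action rather than taken from this diff:

from agent.integrations.hud.proxy import _map_agent_output_to_openai_blocks

items = [
    {
        "type": "computer_call",
        "call_id": "call_123",
        "action": {"type": "click", "button": "left", "x": 100, "y": 200},
    }
]
blocks = _map_agent_output_to_openai_blocks(items)
print(blocks[0].type)  # "computer_call"
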
@@ -40,14 +43,16 @@ def _map_agent_output_to_openai_blocks(output_items: List[Dict[str, Any]]) -> Li
     for item in output_items or []:
         t = item.get("type")
         if t == "computer_call":
-            comp = ResponseComputerToolCall.model_validate(
-
-
-
-
-
-
-
+            comp = ResponseComputerToolCall.model_validate(
+                {
+                    "id": item.get("id") or f"cu_{uuid.uuid4().hex}",
+                    "type": "computer_call",
+                    "call_id": item["call_id"],
+                    "action": item["action"],
+                    "pending_safety_checks": item.get("pending_safety_checks", []),
+                    "status": "completed",
+                }
+            )
             blocks.append(comp)
             # we will exit early here as the responses api only supports a single step
             break

@@ -55,31 +60,38 @@ def _map_agent_output_to_openai_blocks(output_items: List[Dict[str, Any]]) -> Li
            content_blocks: List[ResponseOutputText] = []
            for c in item.get("content", []) or []:
                content_blocks.append(
-                   ResponseOutputText.model_validate(
-
-
-
-
+                   ResponseOutputText.model_validate(
+                       {
+                           "type": "output_text",
+                           "text": c["text"],
+                           "annotations": [],
+                       }
+                   )
                )
            if content_blocks:
-               msg = ResponseOutputMessage.model_validate(
-
-
-
-
-
-
+               msg = ResponseOutputMessage.model_validate(
+                   {
+                       "id": item.get("id") or f"msg_{uuid.uuid4()}",
+                       "type": "message",
+                       "role": "assistant",
+                       "status": "completed",
+                       "content": [ct.model_dump() for ct in content_blocks],
+                   }
+               )
                blocks.append(msg)
        elif t == "reasoning":
-           reasoning = ResponseReasoningItem.model_validate(
-
-
-
-
+           reasoning = ResponseReasoningItem.model_validate(
+               {
+                   "id": item.get("id") or f"rsn_{uuid.uuid4()}",
+                   "type": "reasoning",
+                   "summary": item["summary"],
+               }
+           )
            blocks.append(reasoning)
        # Unhandled types are ignored
    return blocks
 
+
 def _to_plain_dict_list(items: Any) -> List[Dict[str, Any]]:
     out: List[Dict[str, Any]] = []
     for it in list(items):

@@ -92,6 +104,7 @@ def _to_plain_dict_list(items: Any) -> List[Dict[str, Any]]:
             out.append(dict(it))  # may raise if not mapping
     return out
 
+
 class FakeAsyncOpenAI:
     """Minimal fake OpenAI client with only `responses.create` implemented.
 
@@ -132,10 +145,12 @@ class FakeAsyncOpenAI:
         # Pre-pend instructions message
         effective_input = full_input
         if instructions:
-            effective_input = [
-
-
-
+            effective_input = [
+                {
+                    "role": "user",
+                    "content": instructions,
+                }
+            ] + full_input
 
         # Run a single iteration of the ComputerAgent
         agent_result: Optional[Dict[str, Any]] = None

@@ -152,32 +167,43 @@ class FakeAsyncOpenAI:
         blocks_to_cache = full_input + output
         for b in blocks_to_cache:
             bid = getattr(b, "id", None) or f"tmp-{hash(repr(b))}"
-            self.blocks_cache[bid] = b
+            self.blocks_cache[bid] = b  # type: ignore[assignment]
             block_ids.append(bid)
         response_id = agent_result.get("id") or f"fake-{int(time.time()*1000)}"
         self.context_cache[response_id] = block_ids
 
         try:
-            return Response.model_validate(
-
-
-
-
-
-
-
-
-
-
-                "
-
-
-
-
-
+            return Response.model_validate(
+                {
+                    "id": response_id,
+                    "created_at": time.time(),
+                    "object": "response",
+                    "model": model,
+                    "output": output,
+                    "parallel_tool_calls": False,
+                    "tool_choice": "auto",
+                    "tools": [],
+                    "previous_response_id": previous_response_id,
+                    "usage": ResponseUsage.model_validate(
+                        {
+                            "input_tokens": usage.get("input_tokens", 0),
+                            "output_tokens": usage.get("output_tokens", 0),
+                            "total_tokens": usage.get("total_tokens", 0),
+                            "input_tokens_details": usage.get(
+                                "input_tokens_details", {"cached_tokens": 0}
+                            ),
+                            "output_tokens_details": usage.get(
+                                "output_tokens_details", {"reasoning_tokens": 0}
+                            ),
+                        }
+                    ),
+                }
+            )
         except Exception as e:
-            print(
+            print(
+                f"Error while validating agent response (attempt {attempt + 1}/{max_retries}): ",
+                e,
+            )
             if attempt == max_retries - 1:
                 print(traceback.format_exc())
                 raise e

@@ -221,9 +247,15 @@ class ProxyOperatorAgent(OperatorAgent):
         allowed_tools = allowed_tools or ["openai_computer"]
 
         computer_shim = {
-
-
-
+            "screenshot": lambda: Image.new(
+                "RGB",
+                (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT),
+            ),
+            "environment": "linux",
+            "dimensions": (
+                computer_settings.OPENAI_COMPUTER_WIDTH,
+                computer_settings.OPENAI_COMPUTER_HEIGHT,
+            ),
         }
         # Build tools ensuring the computer_shim is included
         agent_tools: list[Any] = [computer_shim]

@@ -258,6 +290,7 @@ class ProxyOperatorAgent(OperatorAgent):
             **kwargs,
         )
 
+
 __all__ = [
     "FakeAsyncOpenAI",
     "ProxyOperatorAgent",