cua-agent 0.2.10__py3-none-any.whl → 0.2.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic.
- agent/core/messages.py +40 -7
- agent/providers/anthropic/tools/computer.py +7 -167
- agent/providers/openai/tools/computer.py +10 -40
- agent/ui/__main__.py +15 -0
- agent/ui/gradio/app.py +32 -33
- {cua_agent-0.2.10.dist-info → cua_agent-0.2.11.dist-info}/METADATA +1 -1
- {cua_agent-0.2.10.dist-info → cua_agent-0.2.11.dist-info}/RECORD +9 -8
- {cua_agent-0.2.10.dist-info → cua_agent-0.2.11.dist-info}/WHEEL +0 -0
- {cua_agent-0.2.10.dist-info → cua_agent-0.2.11.dist-info}/entry_points.txt +0 -0
agent/core/messages.py
CHANGED
@@ -81,16 +81,27 @@ class StandardMessageManager:
         if not self.config.num_images_to_keep:
             return messages

-        # Find user messages
+        # Find messages with images (both user messages and tool call outputs)
         image_messages = []
         for msg in messages:
+            has_image = False
+
+            # Check user messages with images
             if msg["role"] == "user" and isinstance(msg["content"], list):
                 has_image = any(
                     item.get("type") == "image_url" or item.get("type") == "image"
                     for item in msg["content"]
                 )
-
-
+
+            # Check assistant messages with tool calls that have images
+            elif msg["role"] == "assistant" and isinstance(msg["content"], list):
+                for item in msg["content"]:
+                    if item.get("type") == "tool_result" and "base64_image" in item:
+                        has_image = True
+                        break
+
+            if has_image:
+                image_messages.append(msg)

         # If we don't have more images than the limit, return all messages
         if len(image_messages) <= self.config.num_images_to_keep:

@@ -100,13 +111,35 @@ class StandardMessageManager:
         images_to_keep = image_messages[-self.config.num_images_to_keep :]
         images_to_remove = image_messages[: -self.config.num_images_to_keep]

-        # Create a new message list
+        # Create a new message list, removing images from older messages
         result = []
         for msg in messages:
             if msg in images_to_remove:
-                #
-
-
+                # Remove images from this message but keep the text content
+                if msg["role"] == "user" and isinstance(msg["content"], list):
+                    # Keep only text content, remove images
+                    new_content = [
+                        item for item in msg["content"]
+                        if item.get("type") not in ["image_url", "image"]
+                    ]
+                    if new_content: # Only add if there's still content
+                        result.append({"role": msg["role"], "content": new_content})
+                elif msg["role"] == "assistant" and isinstance(msg["content"], list):
+                    # Remove base64_image from tool_result items
+                    new_content = []
+                    for item in msg["content"]:
+                        if item.get("type") == "tool_result" and "base64_image" in item:
+                            # Create a copy without the base64_image
+                            new_item = {k: v for k, v in item.items() if k != "base64_image"}
+                            new_content.append(new_item)
+                        else:
+                            new_content.append(item)
+                    result.append({"role": msg["role"], "content": new_content})
+                else:
+                    # For other message types, keep as is
+                    result.append(msg)
+            else:
+                result.append(msg)

         return result

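The change above makes image trimming aware of screenshots embedded in assistant `tool_result` items, not just user image parts, and strips images from older messages instead of dropping those messages outright. A minimal standalone sketch of that behavior, using a hypothetical `trim_images` helper rather than the actual `StandardMessageManager` method:

```python
def trim_images(messages, num_images_to_keep=1):
    """Keep only the most recent screenshots; strip images from older messages."""
    image_messages = []
    for msg in messages:
        has_image = False
        if msg["role"] == "user" and isinstance(msg["content"], list):
            has_image = any(item.get("type") in ("image_url", "image") for item in msg["content"])
        elif msg["role"] == "assistant" and isinstance(msg["content"], list):
            has_image = any(
                item.get("type") == "tool_result" and "base64_image" in item
                for item in msg["content"]
            )
        if has_image:
            image_messages.append(msg)

    if len(image_messages) <= num_images_to_keep:
        return messages

    images_to_remove = image_messages[:-num_images_to_keep]
    result = []
    for msg in messages:
        if msg in images_to_remove and isinstance(msg["content"], list):
            # Drop image parts and inline base64 payloads, keep the text.
            kept = [
                {k: v for k, v in item.items() if k != "base64_image"}
                for item in msg["content"]
                if item.get("type") not in ("image_url", "image")
            ]
            if kept:
                result.append({"role": msg["role"], "content": kept})
        else:
            result.append(msg)
    return result


# Example: two user turns with screenshots, keep only the newest one.
history = [
    {"role": "user", "content": [{"type": "text", "text": "click the button"},
                                 {"type": "image", "source": "old screenshot"}]},
    {"role": "user", "content": [{"type": "text", "text": "now scroll down"},
                                 {"type": "image", "source": "new screenshot"}]},
]
print(trim_images(history, num_images_to_keep=1))
```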
agent/providers/anthropic/tools/computer.py
CHANGED

@@ -205,26 +205,6 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
             self.logger.info(f" Coordinates: ({x}, {y})")

             try:
-                # Take pre-action screenshot to get current dimensions
-                pre_screenshot = await self.computer.interface.screenshot()
-                pre_img = Image.open(io.BytesIO(pre_screenshot))
-
-                # Scale image to match screen dimensions if needed
-                if pre_img.size != (self.width, self.height):
-                    self.logger.info(
-                        f"Scaling image from {pre_img.size} to {self.width}x{self.height} to match screen dimensions"
-                    )
-                    if not isinstance(self.width, int) or not isinstance(self.height, int):
-                        raise ToolError("Screen dimensions must be integers")
-                    size = (int(self.width), int(self.height))
-                    pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
-                    # Save the scaled image back to bytes
-                    buffer = io.BytesIO()
-                    pre_img.save(buffer, format="PNG")
-                    pre_screenshot = buffer.getvalue()
-
-                self.logger.info(f" Current dimensions: {pre_img.width}x{pre_img.height}")
-
                 # Perform the click action
                 if action == "left_click":
                     self.logger.info(f"Clicking at ({x}, {y})")

@@ -242,45 +222,14 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
                 # Wait briefly for any UI changes
                 await asyncio.sleep(0.5)

-                # Take and save post-action screenshot
-                post_screenshot = await self.computer.interface.screenshot()
-                post_img = Image.open(io.BytesIO(post_screenshot))
-
-                # Scale post-action image if needed
-                if post_img.size != (self.width, self.height):
-                    self.logger.info(
-                        f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
-                    )
-                    post_img = post_img.resize(
-                        (self.width, self.height), Image.Resampling.LANCZOS
-                    )
-                    buffer = io.BytesIO()
-                    post_img.save(buffer, format="PNG")
-                    post_screenshot = buffer.getvalue()
-
                 return ToolResult(
                     output=f"Performed {action} at ({x}, {y})",
-                    base64_image=base64.b64encode(post_screenshot).decode(),
                 )
             except Exception as e:
                 self.logger.error(f"Error during {action} action: {str(e)}")
                 raise ToolError(f"Failed to perform {action}: {str(e)}")
         else:
             try:
-                # Take pre-action screenshot
-                pre_screenshot = await self.computer.interface.screenshot()
-                pre_img = Image.open(io.BytesIO(pre_screenshot))
-
-                # Scale image if needed
-                if pre_img.size != (self.width, self.height):
-                    self.logger.info(
-                        f"Scaling image from {pre_img.size} to {self.width}x{self.height}"
-                    )
-                    if not isinstance(self.width, int) or not isinstance(self.height, int):
-                        raise ToolError("Screen dimensions must be integers")
-                    size = (int(self.width), int(self.height))
-                    pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
-
                 # Perform the click action
                 if action == "left_click":
                     self.logger.info("Performing left click at current position")

@@ -295,25 +244,8 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
                 # Wait briefly for any UI changes
                 await asyncio.sleep(0.5)

-                # Take post-action screenshot
-                post_screenshot = await self.computer.interface.screenshot()
-                post_img = Image.open(io.BytesIO(post_screenshot))
-
-                # Scale post-action image if needed
-                if post_img.size != (self.width, self.height):
-                    self.logger.info(
-                        f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
-                    )
-                    post_img = post_img.resize(
-                        (self.width, self.height), Image.Resampling.LANCZOS
-                    )
-                    buffer = io.BytesIO()
-                    post_img.save(buffer, format="PNG")
-                    post_screenshot = buffer.getvalue()
-
                 return ToolResult(
                     output=f"Performed {action} at current position",
-                    base64_image=base64.b64encode(post_screenshot).decode(),
                 )
             except Exception as e:
                 self.logger.error(f"Error during {action} action: {str(e)}")

@@ -328,20 +260,6 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
                 raise ToolError(f"{text} must be a string")

             try:
-                # Take pre-action screenshot
-                pre_screenshot = await self.computer.interface.screenshot()
-                pre_img = Image.open(io.BytesIO(pre_screenshot))
-
-                # Scale image if needed
-                if pre_img.size != (self.width, self.height):
-                    self.logger.info(
-                        f"Scaling image from {pre_img.size} to {self.width}x{self.height}"
-                    )
-                    if not isinstance(self.width, int) or not isinstance(self.height, int):
-                        raise ToolError("Screen dimensions must be integers")
-                    size = (int(self.width), int(self.height))
-                    pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
-
                 if action == "key":
                     # Special handling for page up/down on macOS
                     if text.lower() in ["pagedown", "page_down", "page down"]:

@@ -378,25 +296,8 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
                     # Wait briefly for UI changes
                     await asyncio.sleep(0.5)

-                    # Take post-action screenshot
-                    post_screenshot = await self.computer.interface.screenshot()
-                    post_img = Image.open(io.BytesIO(post_screenshot))
-
-                    # Scale post-action image if needed
-                    if post_img.size != (self.width, self.height):
-                        self.logger.info(
-                            f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
-                        )
-                        post_img = post_img.resize(
-                            (self.width, self.height), Image.Resampling.LANCZOS
-                        )
-                        buffer = io.BytesIO()
-                        post_img.save(buffer, format="PNG")
-                        post_screenshot = buffer.getvalue()
-
                     return ToolResult(
                         output=f"Pressed key: {output_text}",
-                        base64_image=base64.b64encode(post_screenshot).decode(),
                     )

                 elif action == "type":

@@ -406,66 +307,13 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
                     # Wait briefly for UI changes
                     await asyncio.sleep(0.5)

-                    # Take post-action screenshot
-                    post_screenshot = await self.computer.interface.screenshot()
-                    post_img = Image.open(io.BytesIO(post_screenshot))
-
-                    # Scale post-action image if needed
-                    if post_img.size != (self.width, self.height):
-                        self.logger.info(
-                            f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
-                        )
-                        post_img = post_img.resize(
-                            (self.width, self.height), Image.Resampling.LANCZOS
-                        )
-                        buffer = io.BytesIO()
-                        post_img.save(buffer, format="PNG")
-                        post_screenshot = buffer.getvalue()
-
                     return ToolResult(
                         output=f"Typed text: {text}",
-                        base64_image=base64.b64encode(post_screenshot).decode(),
                     )
             except Exception as e:
                 self.logger.error(f"Error during {action} action: {str(e)}")
                 raise ToolError(f"Failed to perform {action}: {str(e)}")

-        elif action in ("screenshot", "cursor_position"):
-            if text is not None:
-                raise ToolError(f"text is not accepted for {action}")
-            if coordinate is not None:
-                raise ToolError(f"coordinate is not accepted for {action}")
-
-            try:
-                if action == "screenshot":
-                    # Take screenshot
-                    screenshot = await self.computer.interface.screenshot()
-                    img = Image.open(io.BytesIO(screenshot))
-
-                    # Scale image if needed
-                    if img.size != (self.width, self.height):
-                        self.logger.info(
-                            f"Scaling image from {img.size} to {self.width}x{self.height}"
-                        )
-                        if not isinstance(self.width, int) or not isinstance(self.height, int):
-                            raise ToolError("Screen dimensions must be integers")
-                        size = (int(self.width), int(self.height))
-                        img = img.resize(size, Image.Resampling.LANCZOS)
-                        buffer = io.BytesIO()
-                        img.save(buffer, format="PNG")
-                        screenshot = buffer.getvalue()
-
-                    return ToolResult(base64_image=base64.b64encode(screenshot).decode())
-
-                elif action == "cursor_position":
-                    pos = await self.computer.interface.get_cursor_position()
-                    x, y = pos # Unpack the tuple
-                    return ToolResult(output=f"X={int(x)},Y={int(y)}")
-
-            except Exception as e:
-                self.logger.error(f"Error during {action} action: {str(e)}")
-                raise ToolError(f"Failed to perform {action}: {str(e)}")
-
         elif action == "scroll":
             # Implement scroll action
             direction = kwargs.get("direction", "down")

@@ -487,28 +335,20 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
                 # Wait briefly for UI changes
                 await asyncio.sleep(0.5)

-                # Take post-action screenshot
-                post_screenshot = await self.computer.interface.screenshot()
-                post_img = Image.open(io.BytesIO(post_screenshot))
-
-                # Scale post-action image if needed
-                if post_img.size != (self.width, self.height):
-                    self.logger.info(
-                        f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
-                    )
-                    post_img = post_img.resize((self.width, self.height), Image.Resampling.LANCZOS)
-                    buffer = io.BytesIO()
-                    post_img.save(buffer, format="PNG")
-                    post_screenshot = buffer.getvalue()
-
                 return ToolResult(
                     output=f"Scrolled {direction} by {amount} steps",
-                    base64_image=base64.b64encode(post_screenshot).decode(),
                 )
             except Exception as e:
                 self.logger.error(f"Error during scroll action: {str(e)}")
                 raise ToolError(f"Failed to perform scroll: {str(e)}")

+        elif action == "screenshot":
+            # Take screenshot
+            return await self.screenshot()
+        elif action == "cursor_position":
+            pos = await self.computer.interface.get_cursor_position()
+            x, y = pos # Unpack the tuple
+            return ToolResult(output=f"X={int(x)},Y={int(y)}")
         raise ToolError(f"Invalid action: {action}")

     async def screenshot(self):
agent/providers/openai/tools/computer.py
CHANGED

@@ -61,9 +61,6 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
     computer: Computer # The CUA Computer instance
     logger = logging.getLogger(__name__)

-    _screenshot_delay = 1.0 # macOS is generally faster than X11
-    _scaling_enabled = True
-
     def __init__(self, computer: Computer):
         """Initialize the computer tool.


@@ -185,26 +182,23 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
             raise ToolError(f"Failed to execute {type}: {str(e)}")

     async def handle_click(self, button: str, x: int, y: int) -> ToolResult:
-        """Handle
+        """Handle mouse clicks."""
         try:
-            # Perform
+            # Perform the click based on button type
             if button == "left":
                 await self.computer.interface.left_click(x, y)
             elif button == "right":
                 await self.computer.interface.right_click(x, y)
             elif button == "double":
                 await self.computer.interface.double_click(x, y)
+            else:
+                raise ToolError(f"Unsupported button type: {button}")

-            # Wait for UI to update
-            await asyncio.sleep(0.
-
-            # Take screenshot after action
-            screenshot = await self.computer.interface.screenshot()
-            base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
+            # Wait briefly for UI to update
+            await asyncio.sleep(0.3)

             return ToolResult(
                 output=f"Performed {button} click at ({x}, {y})",
-                base64_image=base64_screenshot,
             )
         except Exception as e:
             self.logger.error(f"Error in handle_click: {str(e)}")

@@ -218,11 +212,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):

             await asyncio.sleep(0.3)

-
-            screenshot = await self.computer.interface.screenshot()
-            base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
-
-            return ToolResult(output=f"Typed: {text}", base64_image=base64_screenshot)
+            return ToolResult(output=f"Typed: {text}")
         except Exception as e:
             self.logger.error(f"Error in handle_typing: {str(e)}")
             raise ToolError(f"Failed to type '{text}': {str(e)}")

@@ -254,11 +244,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
             # Wait briefly
             await asyncio.sleep(0.3)

-
-            screenshot = await self.computer.interface.screenshot()
-            base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
-
-            return ToolResult(output=f"Pressed key: {key}", base64_image=base64_screenshot)
+            return ToolResult(output=f"Pressed key: {key}")
         except Exception as e:
             self.logger.error(f"Error in handle_key: {str(e)}")
             raise ToolError(f"Failed to press key '{key}': {str(e)}")

@@ -272,11 +258,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
             # Wait briefly
             await asyncio.sleep(0.2)

-
-            screenshot = await self.computer.interface.screenshot()
-            base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
-
-            return ToolResult(output=f"Moved cursor to ({x}, {y})", base64_image=base64_screenshot)
+            return ToolResult(output=f"Moved cursor to ({x}, {y})")
         except Exception as e:
             self.logger.error(f"Error in handle_mouse_move: {str(e)}")
             raise ToolError(f"Failed to move cursor to ({x}, {y}): {str(e)}")

@@ -296,14 +278,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
             # Wait for UI to update
             await asyncio.sleep(0.5)

-
-            screenshot = await self.computer.interface.screenshot()
-            base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
-
-            return ToolResult(
-                output=f"Scrolled at ({x}, {y}) with delta ({scroll_x}, {scroll_y})",
-                base64_image=base64_screenshot,
-            )
+            return ToolResult(output=f"Scrolled at ({x}, {y}) by ({scroll_x}, {scroll_y})")
         except Exception as e:
             self.logger.error(f"Error in handle_scroll: {str(e)}")
             raise ToolError(f"Failed to scroll at ({x}, {y}): {str(e)}")

@@ -331,13 +306,8 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
             # Wait for UI to update
             await asyncio.sleep(0.5)

-            # Take screenshot after action
-            screenshot = await self.computer.interface.screenshot()
-            base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
-
             return ToolResult(
                 output=f"Dragged from ({path[0]['x']}, {path[0]['y']}) to ({path[-1]['x']}, {path[-1]['y']})",
-                base64_image=base64_screenshot,
             )
         except Exception as e:
             self.logger.error(f"Error in handle_drag: {str(e)}")
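Across both computer tools, action handlers no longer capture and inline a post-action screenshot; they return a text-only `ToolResult` and leave screenshots to an explicit `screenshot` action. A minimal sketch of the resulting handler shape, with stand-in types (the real `ToolResult`/`ToolError` classes and the `interface` object live in the provider packages):

```python
import asyncio
from dataclasses import dataclass
from typing import Optional


@dataclass
class ToolResult:
    # Stand-in for the package's ToolResult: the screenshot field is optional
    # and, after this change, usually left empty by action handlers.
    output: Optional[str] = None
    base64_image: Optional[str] = None


async def handle_click(interface, button: str, x: int, y: int) -> ToolResult:
    # Mirrors the new handler shape: perform the action, wait briefly,
    # and return text only; no screenshot is attached here.
    if button == "left":
        await interface.left_click(x, y)
    elif button == "right":
        await interface.right_click(x, y)
    elif button == "double":
        await interface.double_click(x, y)
    else:
        raise ValueError(f"Unsupported button type: {button}")
    await asyncio.sleep(0.3)
    return ToolResult(output=f"Performed {button} click at ({x}, {y})")
```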
agent/ui/__main__.py
ADDED
@@ -0,0 +1,15 @@
+"""
+Main entry point for agent.ui module.
+
+This allows running the agent UI with:
+    python -m agent.ui
+
+Instead of:
+    python -m agent.ui.gradio.app
+"""
+
+from .gradio.app import create_gradio_ui
+
+if __name__ == "__main__":
+    app = create_gradio_ui()
+    app.launch(share=False, inbrowser=True)
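The new entry point simply builds the Gradio UI and launches it. A usage sketch, assuming the package is installed; `server_port`, `inbrowser`, and `share` are standard Gradio `launch()` options:

```python
from agent.ui.gradio.app import create_gradio_ui

# Same UI as `python -m agent.ui`, but with explicit launch options.
app = create_gradio_ui()
app.launch(server_port=7860, inbrowser=False, share=False)
```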
agent/ui/gradio/app.py
CHANGED
@@ -137,6 +137,7 @@ MODEL_MAPPINGS = {
     "openai": {
         # Default to operator CUA model
         "default": "computer-use-preview",
+        "OpenAI: Computer-Use Preview": "computer-use-preview",
         # Map standard OpenAI model names to CUA-specific model names
         "gpt-4-turbo": "computer-use-preview",
         "gpt-4o": "computer-use-preview",

@@ -147,9 +148,17 @@ MODEL_MAPPINGS = {
     "anthropic": {
         # Default to newest model
         "default": "claude-3-7-sonnet-20250219",
+        # New Claude 4 models
+        "Anthropic: Claude 4 Opus (20250514)": "claude-opus-4-20250514",
+        "Anthropic: Claude 4 Sonnet (20250514)": "claude-sonnet-4-20250514",
+        "claude-opus-4-20250514": "claude-opus-4-20250514",
+        "claude-sonnet-4-20250514": "claude-sonnet-4-20250514",
+
         # Specific Claude models for CUA
-        "
+        "Anthropic: Claude 3.7 Sonnet (20250219)": "claude-3-5-sonnet-20240620",
+        "Anthropic: Claude 3.5 Sonnet (20240620)": "claude-3-7-sonnet-20250219",
         "claude-3-7-sonnet-20250219": "claude-3-7-sonnet-20250219",
+        "claude-3-5-sonnet-20240620": "claude-3-5-sonnet-20240620",
         # Map standard model names to CUA-specific model names
         "claude-3-opus": "claude-3-7-sonnet-20250219",
         "claude-3-sonnet": "claude-3-5-sonnet-20240620",

@@ -209,12 +218,12 @@ def get_provider_and_model(model_name: str, loop_provider: str) -> tuple:
     if agent_loop == AgentLoop.OPENAI:
         provider = LLMProvider.OPENAI
         model_name_to_use = MODEL_MAPPINGS["openai"].get(
-            model_name
+            model_name, MODEL_MAPPINGS["openai"]["default"]
         )
     elif agent_loop == AgentLoop.ANTHROPIC:
         provider = LLMProvider.ANTHROPIC
         model_name_to_use = MODEL_MAPPINGS["anthropic"].get(
-            model_name
+            model_name, MODEL_MAPPINGS["anthropic"]["default"]
         )
     elif agent_loop == AgentLoop.OMNI:
         # Determine provider and clean model name based on the full string from UI

@@ -234,33 +243,11 @@ def get_provider_and_model(model_name: str, loop_provider: str) -> tuple:
             cleaned_model_name = model_name.split("OMNI: Ollama ", 1)[1]
         elif model_name.startswith("OMNI: Claude "):
             provider = LLMProvider.ANTHROPIC
-            # Extract the canonical model name based on the UI string
-            # e.g., "OMNI: Claude 3.7 Sonnet (20250219)" -> "3.7 Sonnet" and "20250219"
-            parts = model_name.split(" (")
-            model_key_part = parts[0].replace("OMNI: Claude ", "")
-            date_part = parts[1].replace(")", "") if len(parts) > 1 else ""
-
-            # Normalize the extracted key part for comparison
-            # "3.7 Sonnet" -> "37sonnet"
-            model_key_part_norm = model_key_part.lower().replace(".", "").replace(" ", "")

-
-
-
-
-                # "claude-3-7-sonnet-20250219" -> "claude37sonnet20250219"
-                key_anthropic_norm = key_anthropic.lower().replace("-", "")
-
-                # Check if the normalized canonical key starts with "claude" + normalized extracted part
-                # AND contains the date part.
-                if (
-                    key_anthropic_norm.startswith("claude" + model_key_part_norm)
-                    and date_part in key_anthropic_norm
-                ):
-                    cleaned_model_name = (
-                        val_anthropic # Use the canonical name like "claude-3-7-sonnet-20250219"
-                    )
-                    break
+            model_name = model_name.replace("OMNI: ", "Anthropic: ")
+            cleaned_model_name = MODEL_MAPPINGS["anthropic"].get(
+                model_name, MODEL_MAPPINGS["anthropic"]["default"]
+            )
         elif model_name.startswith("OMNI: OpenAI "):
             provider = LLMProvider.OPENAI
             # Extract the model part, e.g., "GPT-4o mini"

@@ -309,6 +296,8 @@ def get_provider_and_model(model_name: str, loop_provider: str) -> tuple:
         model_name_to_use = MODEL_MAPPINGS["openai"]["default"]
         agent_loop = AgentLoop.OPENAI

+    print(f"Mapping {model_name} and {loop_provider} to {provider}, {model_name_to_use}, {agent_loop}")
+
     return provider, model_name_to_use, agent_loop


@@ -453,6 +442,9 @@ def create_gradio_ui(
     # Always show models regardless of API key availability
     openai_models = ["OpenAI: Computer-Use Preview"]
     anthropic_models = [
+        "Anthropic: Claude 4 Opus (20250514)",
+        "Anthropic: Claude 4 Sonnet (20250514)",
+
         "Anthropic: Claude 3.7 Sonnet (20250219)",
         "Anthropic: Claude 3.5 Sonnet (20240620)",
     ]

@@ -460,6 +452,8 @@ def create_gradio_ui(
         "OMNI: OpenAI GPT-4o",
         "OMNI: OpenAI GPT-4o mini",
         "OMNI: OpenAI GPT-4.5-preview",
+        "OMNI: Claude 4 Opus (20250514)",
+        "OMNI: Claude 4 Sonnet (20250514)",
         "OMNI: Claude 3.7 Sonnet (20250219)",
         "OMNI: Claude 3.5 Sonnet (20240620)"
     ]

@@ -729,20 +723,25 @@ if __name__ == "__main__":
             with gr.Accordion("Computer Configuration", open=True):
                 # Computer configuration options
                 computer_os = gr.Radio(
-                    choices=["macos", "linux"],
+                    choices=["macos", "linux", "windows"],
                     label="Operating System",
                     value="macos",
                     info="Select the operating system for the computer",
                 )

-
+                is_windows = platform.system().lower() == "windows"
                 is_mac = platform.system().lower() == "darwin"

+                providers = ["cloud"]
+                if is_mac:
+                    providers += ["lume"]
+                elif is_windows:
+                    providers += ["winsandbox"]
+
                 computer_provider = gr.Radio(
-                    choices=
+                    choices=providers,
                     label="Provider",
                     value="lume" if is_mac else "cloud",
-                    visible=is_mac,
                     info="Select the computer provider",
                 )

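The lookups in `get_provider_and_model` now pass a second argument to `dict.get`, so unknown UI labels fall back to the provider default instead of yielding `None`. A minimal sketch with an illustrative subset of the mapping and a hypothetical `resolve_anthropic_model` helper:

```python
# Illustrative subset; the full table lives in agent/ui/gradio/app.py.
MODEL_MAPPINGS = {
    "anthropic": {
        "default": "claude-3-7-sonnet-20250219",
        "Anthropic: Claude 4 Opus (20250514)": "claude-opus-4-20250514",
    },
}


def resolve_anthropic_model(ui_label: str) -> str:
    # Hypothetical helper: unknown labels now resolve to the provider default
    # instead of None, matching the new `.get(name, default)` calls above.
    table = MODEL_MAPPINGS["anthropic"]
    return table.get(ui_label, table["default"])


print(resolve_anthropic_model("Anthropic: Claude 4 Opus (20250514)"))  # claude-opus-4-20250514
print(resolve_anthropic_model("Some Unknown Label"))                   # claude-3-7-sonnet-20250219
```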
{cua_agent-0.2.10.dist-info → cua_agent-0.2.11.dist-info}/RECORD
CHANGED

@@ -5,7 +5,7 @@ agent/core/base.py,sha256=7hD1rosM-JjyruwSplD4-5YO6BaO1a1bD7bjFYGGUrg,8315
 agent/core/callbacks.py,sha256=uAoJo4rHpVf1d8rzEBFdtSud9jRndPLwDoC4U4uYZlw,6386
 agent/core/experiment.py,sha256=Ywj6q3JZFDKicfPuQsDl0vSN55HS7-Cnk3u3EcUCKe8,8866
 agent/core/factory.py,sha256=zzlCdibctqhf8Uta-SrvE-G7h59wAw-7SGhHiGvS9GY,4608
-agent/core/messages.py,sha256=
+agent/core/messages.py,sha256=2bMR_U6A9q1QgzAT5pGaC4FwLs5KNFC6OlW0ZoYSYOY,14335
 agent/core/provider_config.py,sha256=jB3fLsEsf806HQZ8jtzfSq4bCYGYONBeuCOoog_Nv_Y,768
 agent/core/telemetry.py,sha256=HElPd32k_w2SJ6t-Cc3j_2-AKdLbFwh2YlM8QViDgRw,4790
 agent/core/tools.py,sha256=53aPme3O8U91n122Smu3TGbyGjQQe2zDimaZgKkFNi0,878

@@ -32,7 +32,7 @@ agent/providers/anthropic/tools/__init__.py,sha256=JyZwuVtPUnZwRSZBSCdQv9yxbLCsy
 agent/providers/anthropic/tools/base.py,sha256=WnRDbqO25tQzLpS2RU2ZXTLF5wd5IqU7SiyRAglQat4,2752
 agent/providers/anthropic/tools/bash.py,sha256=QODuFjWuHM4GgGTqK2HizSyYqGqQwX70AdwrFiGSp2Q,2218
 agent/providers/anthropic/tools/collection.py,sha256=RBK_6hxfHExR-EOxadiLl0OznmFj07nyIUjFgaYZ6Eo,960
-agent/providers/anthropic/tools/computer.py,sha256=
+agent/providers/anthropic/tools/computer.py,sha256=GRmEOyZGQ6Sw7jNx39-WEWdYqQ0X0E5hW2nE2z-52a8,16979
 agent/providers/anthropic/tools/edit.py,sha256=EGRP61MDA4Oue1D7Q-_vLpd6LdGbdBA1Z4HSZ66DbmI,13465
 agent/providers/anthropic/tools/manager.py,sha256=yNvgTkfEqnOz5isDF0RxvmBMZB0uh2PipFEH-PUXpoY,2020
 agent/providers/anthropic/tools/run.py,sha256=xhXdnBK1di9muaO44CEirL9hpGy3NmKbjfMpyeVmn8Y,1595

@@ -62,7 +62,7 @@ agent/providers/openai/loop.py,sha256=_MyjPu4rpHpTxS2nTSRLHrCbSDkZPK5WEG1APKGP-1
 agent/providers/openai/response_handler.py,sha256=K8v_92uSr9R74Y5INY4naeEZZZm35CLIl4h74MBZhsw,7953
 agent/providers/openai/tools/__init__.py,sha256=-KbHMWcd2OVTk5RYQ3ACBEMygwbH-VW6n_98p0lwM4A,344
 agent/providers/openai/tools/base.py,sha256=Np_BC9Cm6TslK99etE9hVTtsBlcEaGhoNCK3NXdB_Lw,2474
-agent/providers/openai/tools/computer.py,sha256=
+agent/providers/openai/tools/computer.py,sha256=g5GzfVC3j0YlyLu1ixaSqHpxLQQ8Zcr_bbyFqm3HyfM,12497
 agent/providers/openai/tools/manager.py,sha256=-wM641dLf8vcv6QF9x_ViGJeDl2YTuUV93j6u7GBI18,3903
 agent/providers/openai/types.py,sha256=0mFUxeFy23fJhMwc6lAFVXKngg2fJIXkPS5oV284V1M,898
 agent/providers/openai/utils.py,sha256=YeCZWIqOFSeugWoqAS0rhxOKAfL-9uN9nrYSBGBgPdc,3175

@@ -78,9 +78,10 @@ agent/providers/uitars/tools/manager.py,sha256=2dK9STtz6NuZG3i0nH7ZuHJpb7vKJ2mOV
 agent/providers/uitars/utils.py,sha256=493STTEEJcVhVbQgR0e8rNTI1DjkxUx8IgIv3wkJ1SU,8878
 agent/telemetry.py,sha256=pVGxbj0ewnvq4EGj28CydN4a1iOfvZR_XKL3vIOqhOM,390
 agent/ui/__init__.py,sha256=ohhxJLBin6k1hl5sKcmBST8mgh23WXgAXz3pN4f470E,45
+agent/ui/__main__.py,sha256=Ah2575SAf7hI8QCsCKx-W4A6QcFsnFPdqMArmJ4H9ic,299
 agent/ui/gradio/__init__.py,sha256=ANKZhv1HqsLheWbLVBlyRQ7Q5qGeXuPi5jDs8vu-ZMo,579
-agent/ui/gradio/app.py,sha256=
-cua_agent-0.2.
-cua_agent-0.2.
-cua_agent-0.2.
-cua_agent-0.2.
+agent/ui/gradio/app.py,sha256=8zTmQavVZH_FWcxpaqBlvfX6Su3WQDMAlbiwUN4roMA,70313
+cua_agent-0.2.11.dist-info/METADATA,sha256=p8rB3P7N4u6vuQET7wZsOMFNLBKamdjws2cA1xZNLmM,12689
+cua_agent-0.2.11.dist-info/WHEEL,sha256=tSfRZzRHthuv7vxpI4aehrdN9scLjk-dCJkPLzkHxGg,90
+cua_agent-0.2.11.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
+cua_agent-0.2.11.dist-info/RECORD,,

{cua_agent-0.2.10.dist-info → cua_agent-0.2.11.dist-info}/WHEEL
File without changes

{cua_agent-0.2.10.dist-info → cua_agent-0.2.11.dist-info}/entry_points.txt
File without changes