cua-agent 0.2.9 (py3-none-any wheel) → 0.2.11 (py3-none-any wheel)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. (The original page links to more details; the link target is not preserved in this text.)

agent/core/base.py CHANGED
@@ -5,7 +5,6 @@ import asyncio
5
5
  from abc import ABC, abstractmethod
6
6
  from typing import Any, AsyncGenerator, Dict, List, Optional
7
7
 
8
- from agent.providers.omni.parser import ParseResult
9
8
  from computer import Computer
10
9
  from .messages import StandardMessageManager, ImageRetentionConfig
11
10
  from .types import AgentResponse
@@ -207,7 +206,7 @@ class BaseLoop(ABC):
207
206
  # EVENT HOOKS / CALLBACKS
208
207
  ###########################################
209
208
 
210
- async def handle_screenshot(self, screenshot_base64: str, action_type: str = "", parsed_screen: Optional[ParseResult] = None) -> None:
209
+ async def handle_screenshot(self, screenshot_base64: str, action_type: str = "", parsed_screen: Optional[dict] = None) -> None:
211
210
  """Process a screenshot through callback managers
212
211
 
213
212
  Args:
agent/core/callbacks.py CHANGED
@@ -6,8 +6,6 @@ from abc import ABC, abstractmethod
6
6
  from datetime import datetime
7
7
  from typing import Any, Dict, List, Optional, Protocol
8
8
 
9
- from agent.providers.omni.parser import ParseResult
10
-
11
9
  logger = logging.getLogger(__name__)
12
10
 
13
11
  class ContentCallback(Protocol):
@@ -117,7 +115,7 @@ class CallbackManager:
117
115
  for handler in self.handlers:
118
116
  await handler.on_error(error, **kwargs)
119
117
 
120
- async def on_screenshot(self, screenshot_base64: str, action_type: str = "", parsed_screen: Optional[ParseResult] = None) -> None:
118
+ async def on_screenshot(self, screenshot_base64: str, action_type: str = "", parsed_screen: Optional[dict] = None) -> None:
121
119
  """Called when a screenshot is taken.
122
120
 
123
121
  Args:
@@ -166,7 +164,7 @@ class CallbackHandler(ABC):
166
164
  pass
167
165
 
168
166
  @abstractmethod
169
- async def on_screenshot(self, screenshot_base64: str, action_type: str = "", parsed_screen: Optional[ParseResult] = None) -> None:
167
+ async def on_screenshot(self, screenshot_base64: str, action_type: str = "", parsed_screen: Optional[dict] = None) -> None:
170
168
  """Called when a screenshot is taken.
171
169
 
172
170
  Args:
agent/core/messages.py CHANGED
@@ -5,7 +5,6 @@ import json
5
5
  from typing import Any, Dict, List, Optional, Union, Tuple
6
6
  from dataclasses import dataclass
7
7
  import re
8
- from ..providers.omni.parser import ParseResult
9
8
 
10
9
  logger = logging.getLogger(__name__)
11
10
 
@@ -82,16 +81,27 @@ class StandardMessageManager:
82
81
  if not self.config.num_images_to_keep:
83
82
  return messages
84
83
 
85
- # Find user messages with images
84
+ # Find messages with images (both user messages and tool call outputs)
86
85
  image_messages = []
87
86
  for msg in messages:
87
+ has_image = False
88
+
89
+ # Check user messages with images
88
90
  if msg["role"] == "user" and isinstance(msg["content"], list):
89
91
  has_image = any(
90
92
  item.get("type") == "image_url" or item.get("type") == "image"
91
93
  for item in msg["content"]
92
94
  )
93
- if has_image:
94
- image_messages.append(msg)
95
+
96
+ # Check assistant messages with tool calls that have images
97
+ elif msg["role"] == "assistant" and isinstance(msg["content"], list):
98
+ for item in msg["content"]:
99
+ if item.get("type") == "tool_result" and "base64_image" in item:
100
+ has_image = True
101
+ break
102
+
103
+ if has_image:
104
+ image_messages.append(msg)
95
105
 
96
106
  # If we don't have more images than the limit, return all messages
97
107
  if len(image_messages) <= self.config.num_images_to_keep:
@@ -101,13 +111,35 @@ class StandardMessageManager:
101
111
  images_to_keep = image_messages[-self.config.num_images_to_keep :]
102
112
  images_to_remove = image_messages[: -self.config.num_images_to_keep]
103
113
 
104
- # Create a new message list without the older images
114
+ # Create a new message list, removing images from older messages
105
115
  result = []
106
116
  for msg in messages:
107
117
  if msg in images_to_remove:
108
- # Skip this message
109
- continue
110
- result.append(msg)
118
+ # Remove images from this message but keep the text content
119
+ if msg["role"] == "user" and isinstance(msg["content"], list):
120
+ # Keep only text content, remove images
121
+ new_content = [
122
+ item for item in msg["content"]
123
+ if item.get("type") not in ["image_url", "image"]
124
+ ]
125
+ if new_content: # Only add if there's still content
126
+ result.append({"role": msg["role"], "content": new_content})
127
+ elif msg["role"] == "assistant" and isinstance(msg["content"], list):
128
+ # Remove base64_image from tool_result items
129
+ new_content = []
130
+ for item in msg["content"]:
131
+ if item.get("type") == "tool_result" and "base64_image" in item:
132
+ # Create a copy without the base64_image
133
+ new_item = {k: v for k, v in item.items() if k != "base64_image"}
134
+ new_content.append(new_item)
135
+ else:
136
+ new_content.append(item)
137
+ result.append({"role": msg["role"], "content": new_content})
138
+ else:
139
+ # For other message types, keep as is
140
+ result.append(msg)
141
+ else:
142
+ result.append(msg)
111
143
 
112
144
  return result
113
145
 
agent/providers/anthropic/tools/computer.py CHANGED
@@ -205,26 +205,6 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
205
205
  self.logger.info(f" Coordinates: ({x}, {y})")
206
206
 
207
207
  try:
208
- # Take pre-action screenshot to get current dimensions
209
- pre_screenshot = await self.computer.interface.screenshot()
210
- pre_img = Image.open(io.BytesIO(pre_screenshot))
211
-
212
- # Scale image to match screen dimensions if needed
213
- if pre_img.size != (self.width, self.height):
214
- self.logger.info(
215
- f"Scaling image from {pre_img.size} to {self.width}x{self.height} to match screen dimensions"
216
- )
217
- if not isinstance(self.width, int) or not isinstance(self.height, int):
218
- raise ToolError("Screen dimensions must be integers")
219
- size = (int(self.width), int(self.height))
220
- pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
221
- # Save the scaled image back to bytes
222
- buffer = io.BytesIO()
223
- pre_img.save(buffer, format="PNG")
224
- pre_screenshot = buffer.getvalue()
225
-
226
- self.logger.info(f" Current dimensions: {pre_img.width}x{pre_img.height}")
227
-
228
208
  # Perform the click action
229
209
  if action == "left_click":
230
210
  self.logger.info(f"Clicking at ({x}, {y})")
@@ -242,45 +222,14 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
242
222
  # Wait briefly for any UI changes
243
223
  await asyncio.sleep(0.5)
244
224
 
245
- # Take and save post-action screenshot
246
- post_screenshot = await self.computer.interface.screenshot()
247
- post_img = Image.open(io.BytesIO(post_screenshot))
248
-
249
- # Scale post-action image if needed
250
- if post_img.size != (self.width, self.height):
251
- self.logger.info(
252
- f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
253
- )
254
- post_img = post_img.resize(
255
- (self.width, self.height), Image.Resampling.LANCZOS
256
- )
257
- buffer = io.BytesIO()
258
- post_img.save(buffer, format="PNG")
259
- post_screenshot = buffer.getvalue()
260
-
261
225
  return ToolResult(
262
226
  output=f"Performed {action} at ({x}, {y})",
263
- base64_image=base64.b64encode(post_screenshot).decode(),
264
227
  )
265
228
  except Exception as e:
266
229
  self.logger.error(f"Error during {action} action: {str(e)}")
267
230
  raise ToolError(f"Failed to perform {action}: {str(e)}")
268
231
  else:
269
232
  try:
270
- # Take pre-action screenshot
271
- pre_screenshot = await self.computer.interface.screenshot()
272
- pre_img = Image.open(io.BytesIO(pre_screenshot))
273
-
274
- # Scale image if needed
275
- if pre_img.size != (self.width, self.height):
276
- self.logger.info(
277
- f"Scaling image from {pre_img.size} to {self.width}x{self.height}"
278
- )
279
- if not isinstance(self.width, int) or not isinstance(self.height, int):
280
- raise ToolError("Screen dimensions must be integers")
281
- size = (int(self.width), int(self.height))
282
- pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
283
-
284
233
  # Perform the click action
285
234
  if action == "left_click":
286
235
  self.logger.info("Performing left click at current position")
@@ -295,25 +244,8 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
295
244
  # Wait briefly for any UI changes
296
245
  await asyncio.sleep(0.5)
297
246
 
298
- # Take post-action screenshot
299
- post_screenshot = await self.computer.interface.screenshot()
300
- post_img = Image.open(io.BytesIO(post_screenshot))
301
-
302
- # Scale post-action image if needed
303
- if post_img.size != (self.width, self.height):
304
- self.logger.info(
305
- f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
306
- )
307
- post_img = post_img.resize(
308
- (self.width, self.height), Image.Resampling.LANCZOS
309
- )
310
- buffer = io.BytesIO()
311
- post_img.save(buffer, format="PNG")
312
- post_screenshot = buffer.getvalue()
313
-
314
247
  return ToolResult(
315
248
  output=f"Performed {action} at current position",
316
- base64_image=base64.b64encode(post_screenshot).decode(),
317
249
  )
318
250
  except Exception as e:
319
251
  self.logger.error(f"Error during {action} action: {str(e)}")
@@ -328,20 +260,6 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
328
260
  raise ToolError(f"{text} must be a string")
329
261
 
330
262
  try:
331
- # Take pre-action screenshot
332
- pre_screenshot = await self.computer.interface.screenshot()
333
- pre_img = Image.open(io.BytesIO(pre_screenshot))
334
-
335
- # Scale image if needed
336
- if pre_img.size != (self.width, self.height):
337
- self.logger.info(
338
- f"Scaling image from {pre_img.size} to {self.width}x{self.height}"
339
- )
340
- if not isinstance(self.width, int) or not isinstance(self.height, int):
341
- raise ToolError("Screen dimensions must be integers")
342
- size = (int(self.width), int(self.height))
343
- pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
344
-
345
263
  if action == "key":
346
264
  # Special handling for page up/down on macOS
347
265
  if text.lower() in ["pagedown", "page_down", "page down"]:
@@ -378,25 +296,8 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
378
296
  # Wait briefly for UI changes
379
297
  await asyncio.sleep(0.5)
380
298
 
381
- # Take post-action screenshot
382
- post_screenshot = await self.computer.interface.screenshot()
383
- post_img = Image.open(io.BytesIO(post_screenshot))
384
-
385
- # Scale post-action image if needed
386
- if post_img.size != (self.width, self.height):
387
- self.logger.info(
388
- f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
389
- )
390
- post_img = post_img.resize(
391
- (self.width, self.height), Image.Resampling.LANCZOS
392
- )
393
- buffer = io.BytesIO()
394
- post_img.save(buffer, format="PNG")
395
- post_screenshot = buffer.getvalue()
396
-
397
299
  return ToolResult(
398
300
  output=f"Pressed key: {output_text}",
399
- base64_image=base64.b64encode(post_screenshot).decode(),
400
301
  )
401
302
 
402
303
  elif action == "type":
@@ -406,66 +307,13 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
406
307
  # Wait briefly for UI changes
407
308
  await asyncio.sleep(0.5)
408
309
 
409
- # Take post-action screenshot
410
- post_screenshot = await self.computer.interface.screenshot()
411
- post_img = Image.open(io.BytesIO(post_screenshot))
412
-
413
- # Scale post-action image if needed
414
- if post_img.size != (self.width, self.height):
415
- self.logger.info(
416
- f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
417
- )
418
- post_img = post_img.resize(
419
- (self.width, self.height), Image.Resampling.LANCZOS
420
- )
421
- buffer = io.BytesIO()
422
- post_img.save(buffer, format="PNG")
423
- post_screenshot = buffer.getvalue()
424
-
425
310
  return ToolResult(
426
311
  output=f"Typed text: {text}",
427
- base64_image=base64.b64encode(post_screenshot).decode(),
428
312
  )
429
313
  except Exception as e:
430
314
  self.logger.error(f"Error during {action} action: {str(e)}")
431
315
  raise ToolError(f"Failed to perform {action}: {str(e)}")
432
316
 
433
- elif action in ("screenshot", "cursor_position"):
434
- if text is not None:
435
- raise ToolError(f"text is not accepted for {action}")
436
- if coordinate is not None:
437
- raise ToolError(f"coordinate is not accepted for {action}")
438
-
439
- try:
440
- if action == "screenshot":
441
- # Take screenshot
442
- screenshot = await self.computer.interface.screenshot()
443
- img = Image.open(io.BytesIO(screenshot))
444
-
445
- # Scale image if needed
446
- if img.size != (self.width, self.height):
447
- self.logger.info(
448
- f"Scaling image from {img.size} to {self.width}x{self.height}"
449
- )
450
- if not isinstance(self.width, int) or not isinstance(self.height, int):
451
- raise ToolError("Screen dimensions must be integers")
452
- size = (int(self.width), int(self.height))
453
- img = img.resize(size, Image.Resampling.LANCZOS)
454
- buffer = io.BytesIO()
455
- img.save(buffer, format="PNG")
456
- screenshot = buffer.getvalue()
457
-
458
- return ToolResult(base64_image=base64.b64encode(screenshot).decode())
459
-
460
- elif action == "cursor_position":
461
- pos = await self.computer.interface.get_cursor_position()
462
- x, y = pos # Unpack the tuple
463
- return ToolResult(output=f"X={int(x)},Y={int(y)}")
464
-
465
- except Exception as e:
466
- self.logger.error(f"Error during {action} action: {str(e)}")
467
- raise ToolError(f"Failed to perform {action}: {str(e)}")
468
-
469
317
  elif action == "scroll":
470
318
  # Implement scroll action
471
319
  direction = kwargs.get("direction", "down")
@@ -487,28 +335,20 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
487
335
  # Wait briefly for UI changes
488
336
  await asyncio.sleep(0.5)
489
337
 
490
- # Take post-action screenshot
491
- post_screenshot = await self.computer.interface.screenshot()
492
- post_img = Image.open(io.BytesIO(post_screenshot))
493
-
494
- # Scale post-action image if needed
495
- if post_img.size != (self.width, self.height):
496
- self.logger.info(
497
- f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
498
- )
499
- post_img = post_img.resize((self.width, self.height), Image.Resampling.LANCZOS)
500
- buffer = io.BytesIO()
501
- post_img.save(buffer, format="PNG")
502
- post_screenshot = buffer.getvalue()
503
-
504
338
  return ToolResult(
505
339
  output=f"Scrolled {direction} by {amount} steps",
506
- base64_image=base64.b64encode(post_screenshot).decode(),
507
340
  )
508
341
  except Exception as e:
509
342
  self.logger.error(f"Error during scroll action: {str(e)}")
510
343
  raise ToolError(f"Failed to perform scroll: {str(e)}")
511
344
 
345
+ elif action == "screenshot":
346
+ # Take screenshot
347
+ return await self.screenshot()
348
+ elif action == "cursor_position":
349
+ pos = await self.computer.interface.get_cursor_position()
350
+ x, y = pos # Unpack the tuple
351
+ return ToolResult(output=f"X={int(x)},Y={int(y)}")
512
352
  raise ToolError(f"Invalid action: {action}")
513
353
 
514
354
  async def screenshot(self):
agent/providers/anthropic/utils.py CHANGED
@@ -4,7 +4,6 @@ import logging
4
4
  import re
5
5
  from typing import Any, Dict, List, Optional, Tuple, cast
6
6
  from anthropic.types.beta import BetaMessage
7
- from ..omni.parser import ParseResult
8
7
  from ...core.types import AgentResponse
9
8
  from datetime import datetime
10
9
 
@@ -188,7 +187,7 @@ def from_anthropic_format(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]
188
187
  async def to_agent_response_format(
189
188
  response: BetaMessage,
190
189
  messages: List[Dict[str, Any]],
191
- parsed_screen: Optional[ParseResult] = None,
190
+ parsed_screen: Optional[dict] = None,
192
191
  parser: Optional[Any] = None,
193
192
  model: Optional[str] = None,
194
193
  ) -> AgentResponse:
agent/providers/openai/tools/computer.py CHANGED
@@ -61,9 +61,6 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
61
61
  computer: Computer # The CUA Computer instance
62
62
  logger = logging.getLogger(__name__)
63
63
 
64
- _screenshot_delay = 1.0 # macOS is generally faster than X11
65
- _scaling_enabled = True
66
-
67
64
  def __init__(self, computer: Computer):
68
65
  """Initialize the computer tool.
69
66
 
@@ -185,26 +182,23 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
185
182
  raise ToolError(f"Failed to execute {type}: {str(e)}")
186
183
 
187
184
  async def handle_click(self, button: str, x: int, y: int) -> ToolResult:
188
- """Handle different click actions."""
185
+ """Handle mouse clicks."""
189
186
  try:
190
- # Perform requested click action
187
+ # Perform the click based on button type
191
188
  if button == "left":
192
189
  await self.computer.interface.left_click(x, y)
193
190
  elif button == "right":
194
191
  await self.computer.interface.right_click(x, y)
195
192
  elif button == "double":
196
193
  await self.computer.interface.double_click(x, y)
194
+ else:
195
+ raise ToolError(f"Unsupported button type: {button}")
197
196
 
198
- # Wait for UI to update
199
- await asyncio.sleep(0.5)
200
-
201
- # Take screenshot after action
202
- screenshot = await self.computer.interface.screenshot()
203
- base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
197
+ # Wait briefly for UI to update
198
+ await asyncio.sleep(0.3)
204
199
 
205
200
  return ToolResult(
206
201
  output=f"Performed {button} click at ({x}, {y})",
207
- base64_image=base64_screenshot,
208
202
  )
209
203
  except Exception as e:
210
204
  self.logger.error(f"Error in handle_click: {str(e)}")
@@ -218,11 +212,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
218
212
 
219
213
  await asyncio.sleep(0.3)
220
214
 
221
- # Take screenshot after typing
222
- screenshot = await self.computer.interface.screenshot()
223
- base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
224
-
225
- return ToolResult(output=f"Typed: {text}", base64_image=base64_screenshot)
215
+ return ToolResult(output=f"Typed: {text}")
226
216
  except Exception as e:
227
217
  self.logger.error(f"Error in handle_typing: {str(e)}")
228
218
  raise ToolError(f"Failed to type '{text}': {str(e)}")
@@ -254,11 +244,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
254
244
  # Wait briefly
255
245
  await asyncio.sleep(0.3)
256
246
 
257
- # Take screenshot after action
258
- screenshot = await self.computer.interface.screenshot()
259
- base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
260
-
261
- return ToolResult(output=f"Pressed key: {key}", base64_image=base64_screenshot)
247
+ return ToolResult(output=f"Pressed key: {key}")
262
248
  except Exception as e:
263
249
  self.logger.error(f"Error in handle_key: {str(e)}")
264
250
  raise ToolError(f"Failed to press key '{key}': {str(e)}")
@@ -272,11 +258,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
272
258
  # Wait briefly
273
259
  await asyncio.sleep(0.2)
274
260
 
275
- # Take screenshot after action
276
- screenshot = await self.computer.interface.screenshot()
277
- base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
278
-
279
- return ToolResult(output=f"Moved cursor to ({x}, {y})", base64_image=base64_screenshot)
261
+ return ToolResult(output=f"Moved cursor to ({x}, {y})")
280
262
  except Exception as e:
281
263
  self.logger.error(f"Error in handle_mouse_move: {str(e)}")
282
264
  raise ToolError(f"Failed to move cursor to ({x}, {y}): {str(e)}")
@@ -296,14 +278,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
296
278
  # Wait for UI to update
297
279
  await asyncio.sleep(0.5)
298
280
 
299
- # Take screenshot after action
300
- screenshot = await self.computer.interface.screenshot()
301
- base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
302
-
303
- return ToolResult(
304
- output=f"Scrolled at ({x}, {y}) with delta ({scroll_x}, {scroll_y})",
305
- base64_image=base64_screenshot,
306
- )
281
+ return ToolResult(output=f"Scrolled at ({x}, {y}) by ({scroll_x}, {scroll_y})")
307
282
  except Exception as e:
308
283
  self.logger.error(f"Error in handle_scroll: {str(e)}")
309
284
  raise ToolError(f"Failed to scroll at ({x}, {y}): {str(e)}")
@@ -331,13 +306,8 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
331
306
  # Wait for UI to update
332
307
  await asyncio.sleep(0.5)
333
308
 
334
- # Take screenshot after action
335
- screenshot = await self.computer.interface.screenshot()
336
- base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
337
-
338
309
  return ToolResult(
339
310
  output=f"Dragged from ({path[0]['x']}, {path[0]['y']}) to ({path[-1]['x']}, {path[-1]['y']})",
340
- base64_image=base64_screenshot,
341
311
  )
342
312
  except Exception as e:
343
313
  self.logger.error(f"Error in handle_drag: {str(e)}")
agent/ui/__main__.py ADDED
@@ -0,0 +1,15 @@
1
+ """
2
+ Main entry point for agent.ui module.
3
+
4
+ This allows running the agent UI with:
5
+ python -m agent.ui
6
+
7
+ Instead of:
8
+ python -m agent.ui.gradio.app
9
+ """
10
+
11
+ from .gradio.app import create_gradio_ui
12
+
13
+ if __name__ == "__main__":
14
+ app = create_gradio_ui()
15
+ app.launch(share=False, inbrowser=True)
agent/ui/gradio/app.py CHANGED
@@ -41,7 +41,6 @@ from typing import cast
41
41
  # Import from agent package
42
42
  from agent.core.types import AgentResponse
43
43
  from agent.core.callbacks import DefaultCallbackHandler
44
- from agent.providers.omni.parser import ParseResult
45
44
  from computer import Computer
46
45
 
47
46
  from agent import ComputerAgent, AgentLoop, LLM, LLMProvider
@@ -103,7 +102,7 @@ class GradioChatScreenshotHandler(DefaultCallbackHandler):
103
102
  self,
104
103
  screenshot_base64: str,
105
104
  action_type: str = "",
106
- parsed_screen: Optional[ParseResult] = None,
105
+ parsed_screen: Optional[dict] = None,
107
106
  ) -> None:
108
107
  """Add screenshot to chatbot when a screenshot is taken and update the annotated image.
109
108
 
@@ -138,6 +137,7 @@ MODEL_MAPPINGS = {
138
137
  "openai": {
139
138
  # Default to operator CUA model
140
139
  "default": "computer-use-preview",
140
+ "OpenAI: Computer-Use Preview": "computer-use-preview",
141
141
  # Map standard OpenAI model names to CUA-specific model names
142
142
  "gpt-4-turbo": "computer-use-preview",
143
143
  "gpt-4o": "computer-use-preview",
@@ -148,9 +148,17 @@ MODEL_MAPPINGS = {
148
148
  "anthropic": {
149
149
  # Default to newest model
150
150
  "default": "claude-3-7-sonnet-20250219",
151
+ # New Claude 4 models
152
+ "Anthropic: Claude 4 Opus (20250514)": "claude-opus-4-20250514",
153
+ "Anthropic: Claude 4 Sonnet (20250514)": "claude-sonnet-4-20250514",
154
+ "claude-opus-4-20250514": "claude-opus-4-20250514",
155
+ "claude-sonnet-4-20250514": "claude-sonnet-4-20250514",
156
+
151
157
  # Specific Claude models for CUA
152
- "claude-3-5-sonnet-20240620": "claude-3-5-sonnet-20240620",
158
+ "Anthropic: Claude 3.7 Sonnet (20250219)": "claude-3-5-sonnet-20240620",
159
+ "Anthropic: Claude 3.5 Sonnet (20240620)": "claude-3-7-sonnet-20250219",
153
160
  "claude-3-7-sonnet-20250219": "claude-3-7-sonnet-20250219",
161
+ "claude-3-5-sonnet-20240620": "claude-3-5-sonnet-20240620",
154
162
  # Map standard model names to CUA-specific model names
155
163
  "claude-3-opus": "claude-3-7-sonnet-20250219",
156
164
  "claude-3-sonnet": "claude-3-5-sonnet-20240620",
@@ -210,12 +218,12 @@ def get_provider_and_model(model_name: str, loop_provider: str) -> tuple:
210
218
  if agent_loop == AgentLoop.OPENAI:
211
219
  provider = LLMProvider.OPENAI
212
220
  model_name_to_use = MODEL_MAPPINGS["openai"].get(
213
- model_name.lower(), MODEL_MAPPINGS["openai"]["default"]
221
+ model_name, MODEL_MAPPINGS["openai"]["default"]
214
222
  )
215
223
  elif agent_loop == AgentLoop.ANTHROPIC:
216
224
  provider = LLMProvider.ANTHROPIC
217
225
  model_name_to_use = MODEL_MAPPINGS["anthropic"].get(
218
- model_name.lower(), MODEL_MAPPINGS["anthropic"]["default"]
226
+ model_name, MODEL_MAPPINGS["anthropic"]["default"]
219
227
  )
220
228
  elif agent_loop == AgentLoop.OMNI:
221
229
  # Determine provider and clean model name based on the full string from UI
@@ -235,33 +243,11 @@ def get_provider_and_model(model_name: str, loop_provider: str) -> tuple:
235
243
  cleaned_model_name = model_name.split("OMNI: Ollama ", 1)[1]
236
244
  elif model_name.startswith("OMNI: Claude "):
237
245
  provider = LLMProvider.ANTHROPIC
238
- # Extract the canonical model name based on the UI string
239
- # e.g., "OMNI: Claude 3.7 Sonnet (20250219)" -> "3.7 Sonnet" and "20250219"
240
- parts = model_name.split(" (")
241
- model_key_part = parts[0].replace("OMNI: Claude ", "")
242
- date_part = parts[1].replace(")", "") if len(parts) > 1 else ""
243
-
244
- # Normalize the extracted key part for comparison
245
- # "3.7 Sonnet" -> "37sonnet"
246
- model_key_part_norm = model_key_part.lower().replace(".", "").replace(" ", "")
247
246
 
248
- cleaned_model_name = MODEL_MAPPINGS["omni"]["default"] # Default if not found
249
- # Find the canonical name in the main Anthropic map
250
- for key_anthropic, val_anthropic in MODEL_MAPPINGS["anthropic"].items():
251
- # Normalize the canonical key for comparison
252
- # "claude-3-7-sonnet-20250219" -> "claude37sonnet20250219"
253
- key_anthropic_norm = key_anthropic.lower().replace("-", "")
254
-
255
- # Check if the normalized canonical key starts with "claude" + normalized extracted part
256
- # AND contains the date part.
257
- if (
258
- key_anthropic_norm.startswith("claude" + model_key_part_norm)
259
- and date_part in key_anthropic_norm
260
- ):
261
- cleaned_model_name = (
262
- val_anthropic # Use the canonical name like "claude-3-7-sonnet-20250219"
263
- )
264
- break
247
+ model_name = model_name.replace("OMNI: ", "Anthropic: ")
248
+ cleaned_model_name = MODEL_MAPPINGS["anthropic"].get(
249
+ model_name, MODEL_MAPPINGS["anthropic"]["default"]
250
+ )
265
251
  elif model_name.startswith("OMNI: OpenAI "):
266
252
  provider = LLMProvider.OPENAI
267
253
  # Extract the model part, e.g., "GPT-4o mini"
@@ -310,6 +296,8 @@ def get_provider_and_model(model_name: str, loop_provider: str) -> tuple:
310
296
  model_name_to_use = MODEL_MAPPINGS["openai"]["default"]
311
297
  agent_loop = AgentLoop.OPENAI
312
298
 
299
+ print(f"Mapping {model_name} and {loop_provider} to {provider}, {model_name_to_use}, {agent_loop}")
300
+
313
301
  return provider, model_name_to_use, agent_loop
314
302
 
315
303
 
@@ -454,6 +442,9 @@ def create_gradio_ui(
454
442
  # Always show models regardless of API key availability
455
443
  openai_models = ["OpenAI: Computer-Use Preview"]
456
444
  anthropic_models = [
445
+ "Anthropic: Claude 4 Opus (20250514)",
446
+ "Anthropic: Claude 4 Sonnet (20250514)",
447
+
457
448
  "Anthropic: Claude 3.7 Sonnet (20250219)",
458
449
  "Anthropic: Claude 3.5 Sonnet (20240620)",
459
450
  ]
@@ -461,6 +452,8 @@ def create_gradio_ui(
461
452
  "OMNI: OpenAI GPT-4o",
462
453
  "OMNI: OpenAI GPT-4o mini",
463
454
  "OMNI: OpenAI GPT-4.5-preview",
455
+ "OMNI: Claude 4 Opus (20250514)",
456
+ "OMNI: Claude 4 Sonnet (20250514)",
464
457
  "OMNI: Claude 3.7 Sonnet (20250219)",
465
458
  "OMNI: Claude 3.5 Sonnet (20240620)"
466
459
  ]
@@ -730,20 +723,25 @@ if __name__ == "__main__":
730
723
  with gr.Accordion("Computer Configuration", open=True):
731
724
  # Computer configuration options
732
725
  computer_os = gr.Radio(
733
- choices=["macos", "linux"],
726
+ choices=["macos", "linux", "windows"],
734
727
  label="Operating System",
735
728
  value="macos",
736
729
  info="Select the operating system for the computer",
737
730
  )
738
731
 
739
- # Detect if current device is MacOS
732
+ is_windows = platform.system().lower() == "windows"
740
733
  is_mac = platform.system().lower() == "darwin"
741
734
 
735
+ providers = ["cloud"]
736
+ if is_mac:
737
+ providers += ["lume"]
738
+ elif is_windows:
739
+ providers += ["winsandbox"]
740
+
742
741
  computer_provider = gr.Radio(
743
- choices=["cloud", "lume"],
742
+ choices=providers,
744
743
  label="Provider",
745
744
  value="lume" if is_mac else "cloud",
746
- visible=is_mac,
747
745
  info="Select the computer provider",
748
746
  )
749
747
 
cua_agent-0.2.9.dist-info/METADATA → cua_agent-0.2.11.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cua-agent
3
- Version: 0.2.9
3
+ Version: 0.2.11
4
4
  Summary: CUA (Computer Use) Agent for AI-driven computer interaction
5
5
  Author-Email: TryCua <gh@trycua.com>
6
6
  Requires-Python: >=3.11
cua_agent-0.2.9.dist-info/RECORD → cua_agent-0.2.11.dist-info/RECORD CHANGED
@@ -1,11 +1,11 @@
1
1
  agent/__init__.py,sha256=guFGtorDBF6R5hVep0Bvci3_sUJfBlcsq9ss5Kwrej8,1484
2
2
  agent/core/__init__.py,sha256=3x4XmLSj40-sjUMOtxOuM82RnOQl0I5AwURk5wW_9GE,514
3
3
  agent/core/agent.py,sha256=HUfBe7Uam3TObAmf6KH0GDKuNCNunNmmMcuxS7aZg0Q,8332
4
- agent/core/base.py,sha256=AiSjnBAcHhZIca4KWBP1vQRE3HyikAPkr4Ij9WDevZQ,8374
5
- agent/core/callbacks.py,sha256=FKAxyajJ-ZJ5SxNXoupNcrm0GYBgjOjJEsStqst0EAk,6453
4
+ agent/core/base.py,sha256=7hD1rosM-JjyruwSplD4-5YO6BaO1a1bD7bjFYGGUrg,8315
5
+ agent/core/callbacks.py,sha256=uAoJo4rHpVf1d8rzEBFdtSud9jRndPLwDoC4U4uYZlw,6386
6
6
  agent/core/experiment.py,sha256=Ywj6q3JZFDKicfPuQsDl0vSN55HS7-Cnk3u3EcUCKe8,8866
7
7
  agent/core/factory.py,sha256=zzlCdibctqhf8Uta-SrvE-G7h59wAw-7SGhHiGvS9GY,4608
8
- agent/core/messages.py,sha256=anhZ0G82ti6uNst7brIPBALxLET9dMP_VfEvvGtpYMA,12521
8
+ agent/core/messages.py,sha256=2bMR_U6A9q1QgzAT5pGaC4FwLs5KNFC6OlW0ZoYSYOY,14335
9
9
  agent/core/provider_config.py,sha256=jB3fLsEsf806HQZ8jtzfSq4bCYGYONBeuCOoog_Nv_Y,768
10
10
  agent/core/telemetry.py,sha256=HElPd32k_w2SJ6t-Cc3j_2-AKdLbFwh2YlM8QViDgRw,4790
11
11
  agent/core/tools.py,sha256=53aPme3O8U91n122Smu3TGbyGjQQe2zDimaZgKkFNi0,878
@@ -32,12 +32,12 @@ agent/providers/anthropic/tools/__init__.py,sha256=JyZwuVtPUnZwRSZBSCdQv9yxbLCsy
32
32
  agent/providers/anthropic/tools/base.py,sha256=WnRDbqO25tQzLpS2RU2ZXTLF5wd5IqU7SiyRAglQat4,2752
33
33
  agent/providers/anthropic/tools/bash.py,sha256=QODuFjWuHM4GgGTqK2HizSyYqGqQwX70AdwrFiGSp2Q,2218
34
34
  agent/providers/anthropic/tools/collection.py,sha256=RBK_6hxfHExR-EOxadiLl0OznmFj07nyIUjFgaYZ6Eo,960
35
- agent/providers/anthropic/tools/computer.py,sha256=3MWMGOy_xDXOg5B7CvQDVNts2WQ9NcAyfzsLEMxt5ME,25627
35
+ agent/providers/anthropic/tools/computer.py,sha256=GRmEOyZGQ6Sw7jNx39-WEWdYqQ0X0E5hW2nE2z-52a8,16979
36
36
  agent/providers/anthropic/tools/edit.py,sha256=EGRP61MDA4Oue1D7Q-_vLpd6LdGbdBA1Z4HSZ66DbmI,13465
37
37
  agent/providers/anthropic/tools/manager.py,sha256=yNvgTkfEqnOz5isDF0RxvmBMZB0uh2PipFEH-PUXpoY,2020
38
38
  agent/providers/anthropic/tools/run.py,sha256=xhXdnBK1di9muaO44CEirL9hpGy3NmKbjfMpyeVmn8Y,1595
39
39
  agent/providers/anthropic/types.py,sha256=SF00kOMC1ui8j9Ah56KaeiR2cL394qCHjFIsBpXxt5w,421
40
- agent/providers/anthropic/utils.py,sha256=qDp0bFGQhK1dG9U461iaeCiyoVUsksXmD43g9cedRW8,14367
40
+ agent/providers/anthropic/utils.py,sha256=6-lANH2-PjnYcZ_n8uGPbkbk9pqIUad5wh07zzslz3Q,14322
41
41
  agent/providers/omni/__init__.py,sha256=5ix67iJdtQNGuGJEjEOF65PwFWO7vdo1QlXD28bRbW4,179
42
42
  agent/providers/omni/api_handler.py,sha256=7CpD43lYAqTyNKWfrD8XcM9ekbajqKCTH9p0TWtEQyg,1163
43
43
  agent/providers/omni/clients/anthropic.py,sha256=nC_lj3UwrLqx9TIew58yxLqKwrH1_LwJD6EqVSEfp3g,3670
@@ -62,7 +62,7 @@ agent/providers/openai/loop.py,sha256=_MyjPu4rpHpTxS2nTSRLHrCbSDkZPK5WEG1APKGP-1
62
62
  agent/providers/openai/response_handler.py,sha256=K8v_92uSr9R74Y5INY4naeEZZZm35CLIl4h74MBZhsw,7953
63
63
  agent/providers/openai/tools/__init__.py,sha256=-KbHMWcd2OVTk5RYQ3ACBEMygwbH-VW6n_98p0lwM4A,344
64
64
  agent/providers/openai/tools/base.py,sha256=Np_BC9Cm6TslK99etE9hVTtsBlcEaGhoNCK3NXdB_Lw,2474
65
- agent/providers/openai/tools/computer.py,sha256=Jo243sNNy3_N1kO07tTMe2YWbzJGLUzOHOj5CGfwwM4,13924
65
+ agent/providers/openai/tools/computer.py,sha256=g5GzfVC3j0YlyLu1ixaSqHpxLQQ8Zcr_bbyFqm3HyfM,12497
66
66
  agent/providers/openai/tools/manager.py,sha256=-wM641dLf8vcv6QF9x_ViGJeDl2YTuUV93j6u7GBI18,3903
67
67
  agent/providers/openai/types.py,sha256=0mFUxeFy23fJhMwc6lAFVXKngg2fJIXkPS5oV284V1M,898
68
68
  agent/providers/openai/utils.py,sha256=YeCZWIqOFSeugWoqAS0rhxOKAfL-9uN9nrYSBGBgPdc,3175
@@ -78,9 +78,10 @@ agent/providers/uitars/tools/manager.py,sha256=2dK9STtz6NuZG3i0nH7ZuHJpb7vKJ2mOV
78
78
  agent/providers/uitars/utils.py,sha256=493STTEEJcVhVbQgR0e8rNTI1DjkxUx8IgIv3wkJ1SU,8878
79
79
  agent/telemetry.py,sha256=pVGxbj0ewnvq4EGj28CydN4a1iOfvZR_XKL3vIOqhOM,390
80
80
  agent/ui/__init__.py,sha256=ohhxJLBin6k1hl5sKcmBST8mgh23WXgAXz3pN4f470E,45
81
+ agent/ui/__main__.py,sha256=Ah2575SAf7hI8QCsCKx-W4A6QcFsnFPdqMArmJ4H9ic,299
81
82
  agent/ui/gradio/__init__.py,sha256=ANKZhv1HqsLheWbLVBlyRQ7Q5qGeXuPi5jDs8vu-ZMo,579
82
- agent/ui/gradio/app.py,sha256=StBehGfPJhE6ywnxU3CHDPkZrOm_2XMT1Npepf89G5c,70675
83
- cua_agent-0.2.9.dist-info/METADATA,sha256=YyGSdA5NmYLUnxwOwqcPdWLNRRN5wNXRaO4eaPTZxwM,12688
84
- cua_agent-0.2.9.dist-info/WHEEL,sha256=tSfRZzRHthuv7vxpI4aehrdN9scLjk-dCJkPLzkHxGg,90
85
- cua_agent-0.2.9.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
86
- cua_agent-0.2.9.dist-info/RECORD,,
83
+ agent/ui/gradio/app.py,sha256=8zTmQavVZH_FWcxpaqBlvfX6Su3WQDMAlbiwUN4roMA,70313
84
+ cua_agent-0.2.11.dist-info/METADATA,sha256=p8rB3P7N4u6vuQET7wZsOMFNLBKamdjws2cA1xZNLmM,12689
85
+ cua_agent-0.2.11.dist-info/WHEEL,sha256=tSfRZzRHthuv7vxpI4aehrdN9scLjk-dCJkPLzkHxGg,90
86
+ cua_agent-0.2.11.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
87
+ cua_agent-0.2.11.dist-info/RECORD,,