cua-agent 0.2.10__tar.gz → 0.2.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. See the package registry page for more details.

Files changed (86)
  1. {cua_agent-0.2.10 → cua_agent-0.2.11}/PKG-INFO +1 -1
  2. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/core/messages.py +40 -7
  3. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/anthropic/tools/computer.py +7 -167
  4. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/openai/tools/computer.py +10 -40
  5. cua_agent-0.2.11/agent/ui/__main__.py +15 -0
  6. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/ui/gradio/app.py +32 -33
  7. {cua_agent-0.2.10 → cua_agent-0.2.11}/pyproject.toml +3 -3
  8. {cua_agent-0.2.10 → cua_agent-0.2.11}/README.md +0 -0
  9. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/__init__.py +0 -0
  10. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/core/__init__.py +0 -0
  11. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/core/agent.py +0 -0
  12. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/core/base.py +0 -0
  13. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/core/callbacks.py +0 -0
  14. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/core/experiment.py +0 -0
  15. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/core/factory.py +0 -0
  16. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/core/provider_config.py +0 -0
  17. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/core/telemetry.py +0 -0
  18. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/core/tools/__init__.py +0 -0
  19. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/core/tools/base.py +0 -0
  20. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/core/tools/bash.py +0 -0
  21. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/core/tools/collection.py +0 -0
  22. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/core/tools/computer.py +0 -0
  23. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/core/tools/edit.py +0 -0
  24. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/core/tools/manager.py +0 -0
  25. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/core/tools.py +0 -0
  26. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/core/types.py +0 -0
  27. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/core/visualization.py +0 -0
  28. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/__init__.py +0 -0
  29. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/anthropic/__init__.py +0 -0
  30. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/anthropic/api/client.py +0 -0
  31. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/anthropic/api/logging.py +0 -0
  32. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/anthropic/api_handler.py +0 -0
  33. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/anthropic/callbacks/__init__.py +0 -0
  34. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/anthropic/callbacks/manager.py +0 -0
  35. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/anthropic/loop.py +0 -0
  36. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/anthropic/prompts.py +0 -0
  37. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/anthropic/response_handler.py +0 -0
  38. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/anthropic/tools/__init__.py +0 -0
  39. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/anthropic/tools/base.py +0 -0
  40. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/anthropic/tools/bash.py +0 -0
  41. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/anthropic/tools/collection.py +0 -0
  42. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/anthropic/tools/edit.py +0 -0
  43. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/anthropic/tools/manager.py +0 -0
  44. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/anthropic/tools/run.py +0 -0
  45. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/anthropic/types.py +0 -0
  46. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/anthropic/utils.py +0 -0
  47. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/omni/__init__.py +0 -0
  48. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/omni/api_handler.py +0 -0
  49. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/omni/clients/anthropic.py +0 -0
  50. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/omni/clients/base.py +0 -0
  51. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/omni/clients/oaicompat.py +0 -0
  52. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/omni/clients/ollama.py +0 -0
  53. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/omni/clients/openai.py +0 -0
  54. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/omni/clients/utils.py +0 -0
  55. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/omni/image_utils.py +0 -0
  56. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/omni/loop.py +0 -0
  57. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/omni/parser.py +0 -0
  58. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/omni/prompts.py +0 -0
  59. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/omni/tools/__init__.py +0 -0
  60. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/omni/tools/base.py +0 -0
  61. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/omni/tools/bash.py +0 -0
  62. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/omni/tools/computer.py +0 -0
  63. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/omni/tools/manager.py +0 -0
  64. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/omni/utils.py +0 -0
  65. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/openai/__init__.py +0 -0
  66. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/openai/api_handler.py +0 -0
  67. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/openai/loop.py +0 -0
  68. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/openai/response_handler.py +0 -0
  69. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/openai/tools/__init__.py +0 -0
  70. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/openai/tools/base.py +0 -0
  71. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/openai/tools/manager.py +0 -0
  72. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/openai/types.py +0 -0
  73. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/openai/utils.py +0 -0
  74. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/uitars/__init__.py +0 -0
  75. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/uitars/clients/base.py +0 -0
  76. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/uitars/clients/mlxvlm.py +0 -0
  77. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/uitars/clients/oaicompat.py +0 -0
  78. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/uitars/loop.py +0 -0
  79. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/uitars/prompts.py +0 -0
  80. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/uitars/tools/__init__.py +0 -0
  81. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/uitars/tools/computer.py +0 -0
  82. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/uitars/tools/manager.py +0 -0
  83. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/providers/uitars/utils.py +0 -0
  84. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/telemetry.py +0 -0
  85. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/ui/__init__.py +0 -0
  86. {cua_agent-0.2.10 → cua_agent-0.2.11}/agent/ui/gradio/__init__.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cua-agent
3
- Version: 0.2.10
3
+ Version: 0.2.11
4
4
  Summary: CUA (Computer Use) Agent for AI-driven computer interaction
5
5
  Author-Email: TryCua <gh@trycua.com>
6
6
  Requires-Python: >=3.11
@@ -81,16 +81,27 @@ class StandardMessageManager:
81
81
  if not self.config.num_images_to_keep:
82
82
  return messages
83
83
 
84
- # Find user messages with images
84
+ # Find messages with images (both user messages and tool call outputs)
85
85
  image_messages = []
86
86
  for msg in messages:
87
+ has_image = False
88
+
89
+ # Check user messages with images
87
90
  if msg["role"] == "user" and isinstance(msg["content"], list):
88
91
  has_image = any(
89
92
  item.get("type") == "image_url" or item.get("type") == "image"
90
93
  for item in msg["content"]
91
94
  )
92
- if has_image:
93
- image_messages.append(msg)
95
+
96
+ # Check assistant messages with tool calls that have images
97
+ elif msg["role"] == "assistant" and isinstance(msg["content"], list):
98
+ for item in msg["content"]:
99
+ if item.get("type") == "tool_result" and "base64_image" in item:
100
+ has_image = True
101
+ break
102
+
103
+ if has_image:
104
+ image_messages.append(msg)
94
105
 
95
106
  # If we don't have more images than the limit, return all messages
96
107
  if len(image_messages) <= self.config.num_images_to_keep:
@@ -100,13 +111,35 @@ class StandardMessageManager:
100
111
  images_to_keep = image_messages[-self.config.num_images_to_keep :]
101
112
  images_to_remove = image_messages[: -self.config.num_images_to_keep]
102
113
 
103
- # Create a new message list without the older images
114
+ # Create a new message list, removing images from older messages
104
115
  result = []
105
116
  for msg in messages:
106
117
  if msg in images_to_remove:
107
- # Skip this message
108
- continue
109
- result.append(msg)
118
+ # Remove images from this message but keep the text content
119
+ if msg["role"] == "user" and isinstance(msg["content"], list):
120
+ # Keep only text content, remove images
121
+ new_content = [
122
+ item for item in msg["content"]
123
+ if item.get("type") not in ["image_url", "image"]
124
+ ]
125
+ if new_content: # Only add if there's still content
126
+ result.append({"role": msg["role"], "content": new_content})
127
+ elif msg["role"] == "assistant" and isinstance(msg["content"], list):
128
+ # Remove base64_image from tool_result items
129
+ new_content = []
130
+ for item in msg["content"]:
131
+ if item.get("type") == "tool_result" and "base64_image" in item:
132
+ # Create a copy without the base64_image
133
+ new_item = {k: v for k, v in item.items() if k != "base64_image"}
134
+ new_content.append(new_item)
135
+ else:
136
+ new_content.append(item)
137
+ result.append({"role": msg["role"], "content": new_content})
138
+ else:
139
+ # For other message types, keep as is
140
+ result.append(msg)
141
+ else:
142
+ result.append(msg)
110
143
 
111
144
  return result
112
145
 
@@ -205,26 +205,6 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
205
205
  self.logger.info(f" Coordinates: ({x}, {y})")
206
206
 
207
207
  try:
208
- # Take pre-action screenshot to get current dimensions
209
- pre_screenshot = await self.computer.interface.screenshot()
210
- pre_img = Image.open(io.BytesIO(pre_screenshot))
211
-
212
- # Scale image to match screen dimensions if needed
213
- if pre_img.size != (self.width, self.height):
214
- self.logger.info(
215
- f"Scaling image from {pre_img.size} to {self.width}x{self.height} to match screen dimensions"
216
- )
217
- if not isinstance(self.width, int) or not isinstance(self.height, int):
218
- raise ToolError("Screen dimensions must be integers")
219
- size = (int(self.width), int(self.height))
220
- pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
221
- # Save the scaled image back to bytes
222
- buffer = io.BytesIO()
223
- pre_img.save(buffer, format="PNG")
224
- pre_screenshot = buffer.getvalue()
225
-
226
- self.logger.info(f" Current dimensions: {pre_img.width}x{pre_img.height}")
227
-
228
208
  # Perform the click action
229
209
  if action == "left_click":
230
210
  self.logger.info(f"Clicking at ({x}, {y})")
@@ -242,45 +222,14 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
242
222
  # Wait briefly for any UI changes
243
223
  await asyncio.sleep(0.5)
244
224
 
245
- # Take and save post-action screenshot
246
- post_screenshot = await self.computer.interface.screenshot()
247
- post_img = Image.open(io.BytesIO(post_screenshot))
248
-
249
- # Scale post-action image if needed
250
- if post_img.size != (self.width, self.height):
251
- self.logger.info(
252
- f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
253
- )
254
- post_img = post_img.resize(
255
- (self.width, self.height), Image.Resampling.LANCZOS
256
- )
257
- buffer = io.BytesIO()
258
- post_img.save(buffer, format="PNG")
259
- post_screenshot = buffer.getvalue()
260
-
261
225
  return ToolResult(
262
226
  output=f"Performed {action} at ({x}, {y})",
263
- base64_image=base64.b64encode(post_screenshot).decode(),
264
227
  )
265
228
  except Exception as e:
266
229
  self.logger.error(f"Error during {action} action: {str(e)}")
267
230
  raise ToolError(f"Failed to perform {action}: {str(e)}")
268
231
  else:
269
232
  try:
270
- # Take pre-action screenshot
271
- pre_screenshot = await self.computer.interface.screenshot()
272
- pre_img = Image.open(io.BytesIO(pre_screenshot))
273
-
274
- # Scale image if needed
275
- if pre_img.size != (self.width, self.height):
276
- self.logger.info(
277
- f"Scaling image from {pre_img.size} to {self.width}x{self.height}"
278
- )
279
- if not isinstance(self.width, int) or not isinstance(self.height, int):
280
- raise ToolError("Screen dimensions must be integers")
281
- size = (int(self.width), int(self.height))
282
- pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
283
-
284
233
  # Perform the click action
285
234
  if action == "left_click":
286
235
  self.logger.info("Performing left click at current position")
@@ -295,25 +244,8 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
295
244
  # Wait briefly for any UI changes
296
245
  await asyncio.sleep(0.5)
297
246
 
298
- # Take post-action screenshot
299
- post_screenshot = await self.computer.interface.screenshot()
300
- post_img = Image.open(io.BytesIO(post_screenshot))
301
-
302
- # Scale post-action image if needed
303
- if post_img.size != (self.width, self.height):
304
- self.logger.info(
305
- f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
306
- )
307
- post_img = post_img.resize(
308
- (self.width, self.height), Image.Resampling.LANCZOS
309
- )
310
- buffer = io.BytesIO()
311
- post_img.save(buffer, format="PNG")
312
- post_screenshot = buffer.getvalue()
313
-
314
247
  return ToolResult(
315
248
  output=f"Performed {action} at current position",
316
- base64_image=base64.b64encode(post_screenshot).decode(),
317
249
  )
318
250
  except Exception as e:
319
251
  self.logger.error(f"Error during {action} action: {str(e)}")
@@ -328,20 +260,6 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
328
260
  raise ToolError(f"{text} must be a string")
329
261
 
330
262
  try:
331
- # Take pre-action screenshot
332
- pre_screenshot = await self.computer.interface.screenshot()
333
- pre_img = Image.open(io.BytesIO(pre_screenshot))
334
-
335
- # Scale image if needed
336
- if pre_img.size != (self.width, self.height):
337
- self.logger.info(
338
- f"Scaling image from {pre_img.size} to {self.width}x{self.height}"
339
- )
340
- if not isinstance(self.width, int) or not isinstance(self.height, int):
341
- raise ToolError("Screen dimensions must be integers")
342
- size = (int(self.width), int(self.height))
343
- pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
344
-
345
263
  if action == "key":
346
264
  # Special handling for page up/down on macOS
347
265
  if text.lower() in ["pagedown", "page_down", "page down"]:
@@ -378,25 +296,8 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
378
296
  # Wait briefly for UI changes
379
297
  await asyncio.sleep(0.5)
380
298
 
381
- # Take post-action screenshot
382
- post_screenshot = await self.computer.interface.screenshot()
383
- post_img = Image.open(io.BytesIO(post_screenshot))
384
-
385
- # Scale post-action image if needed
386
- if post_img.size != (self.width, self.height):
387
- self.logger.info(
388
- f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
389
- )
390
- post_img = post_img.resize(
391
- (self.width, self.height), Image.Resampling.LANCZOS
392
- )
393
- buffer = io.BytesIO()
394
- post_img.save(buffer, format="PNG")
395
- post_screenshot = buffer.getvalue()
396
-
397
299
  return ToolResult(
398
300
  output=f"Pressed key: {output_text}",
399
- base64_image=base64.b64encode(post_screenshot).decode(),
400
301
  )
401
302
 
402
303
  elif action == "type":
@@ -406,66 +307,13 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
406
307
  # Wait briefly for UI changes
407
308
  await asyncio.sleep(0.5)
408
309
 
409
- # Take post-action screenshot
410
- post_screenshot = await self.computer.interface.screenshot()
411
- post_img = Image.open(io.BytesIO(post_screenshot))
412
-
413
- # Scale post-action image if needed
414
- if post_img.size != (self.width, self.height):
415
- self.logger.info(
416
- f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
417
- )
418
- post_img = post_img.resize(
419
- (self.width, self.height), Image.Resampling.LANCZOS
420
- )
421
- buffer = io.BytesIO()
422
- post_img.save(buffer, format="PNG")
423
- post_screenshot = buffer.getvalue()
424
-
425
310
  return ToolResult(
426
311
  output=f"Typed text: {text}",
427
- base64_image=base64.b64encode(post_screenshot).decode(),
428
312
  )
429
313
  except Exception as e:
430
314
  self.logger.error(f"Error during {action} action: {str(e)}")
431
315
  raise ToolError(f"Failed to perform {action}: {str(e)}")
432
316
 
433
- elif action in ("screenshot", "cursor_position"):
434
- if text is not None:
435
- raise ToolError(f"text is not accepted for {action}")
436
- if coordinate is not None:
437
- raise ToolError(f"coordinate is not accepted for {action}")
438
-
439
- try:
440
- if action == "screenshot":
441
- # Take screenshot
442
- screenshot = await self.computer.interface.screenshot()
443
- img = Image.open(io.BytesIO(screenshot))
444
-
445
- # Scale image if needed
446
- if img.size != (self.width, self.height):
447
- self.logger.info(
448
- f"Scaling image from {img.size} to {self.width}x{self.height}"
449
- )
450
- if not isinstance(self.width, int) or not isinstance(self.height, int):
451
- raise ToolError("Screen dimensions must be integers")
452
- size = (int(self.width), int(self.height))
453
- img = img.resize(size, Image.Resampling.LANCZOS)
454
- buffer = io.BytesIO()
455
- img.save(buffer, format="PNG")
456
- screenshot = buffer.getvalue()
457
-
458
- return ToolResult(base64_image=base64.b64encode(screenshot).decode())
459
-
460
- elif action == "cursor_position":
461
- pos = await self.computer.interface.get_cursor_position()
462
- x, y = pos # Unpack the tuple
463
- return ToolResult(output=f"X={int(x)},Y={int(y)}")
464
-
465
- except Exception as e:
466
- self.logger.error(f"Error during {action} action: {str(e)}")
467
- raise ToolError(f"Failed to perform {action}: {str(e)}")
468
-
469
317
  elif action == "scroll":
470
318
  # Implement scroll action
471
319
  direction = kwargs.get("direction", "down")
@@ -487,28 +335,20 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
487
335
  # Wait briefly for UI changes
488
336
  await asyncio.sleep(0.5)
489
337
 
490
- # Take post-action screenshot
491
- post_screenshot = await self.computer.interface.screenshot()
492
- post_img = Image.open(io.BytesIO(post_screenshot))
493
-
494
- # Scale post-action image if needed
495
- if post_img.size != (self.width, self.height):
496
- self.logger.info(
497
- f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
498
- )
499
- post_img = post_img.resize((self.width, self.height), Image.Resampling.LANCZOS)
500
- buffer = io.BytesIO()
501
- post_img.save(buffer, format="PNG")
502
- post_screenshot = buffer.getvalue()
503
-
504
338
  return ToolResult(
505
339
  output=f"Scrolled {direction} by {amount} steps",
506
- base64_image=base64.b64encode(post_screenshot).decode(),
507
340
  )
508
341
  except Exception as e:
509
342
  self.logger.error(f"Error during scroll action: {str(e)}")
510
343
  raise ToolError(f"Failed to perform scroll: {str(e)}")
511
344
 
345
+ elif action == "screenshot":
346
+ # Take screenshot
347
+ return await self.screenshot()
348
+ elif action == "cursor_position":
349
+ pos = await self.computer.interface.get_cursor_position()
350
+ x, y = pos # Unpack the tuple
351
+ return ToolResult(output=f"X={int(x)},Y={int(y)}")
512
352
  raise ToolError(f"Invalid action: {action}")
513
353
 
514
354
  async def screenshot(self):
@@ -61,9 +61,6 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
61
61
  computer: Computer # The CUA Computer instance
62
62
  logger = logging.getLogger(__name__)
63
63
 
64
- _screenshot_delay = 1.0 # macOS is generally faster than X11
65
- _scaling_enabled = True
66
-
67
64
  def __init__(self, computer: Computer):
68
65
  """Initialize the computer tool.
69
66
 
@@ -185,26 +182,23 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
185
182
  raise ToolError(f"Failed to execute {type}: {str(e)}")
186
183
 
187
184
  async def handle_click(self, button: str, x: int, y: int) -> ToolResult:
188
- """Handle different click actions."""
185
+ """Handle mouse clicks."""
189
186
  try:
190
- # Perform requested click action
187
+ # Perform the click based on button type
191
188
  if button == "left":
192
189
  await self.computer.interface.left_click(x, y)
193
190
  elif button == "right":
194
191
  await self.computer.interface.right_click(x, y)
195
192
  elif button == "double":
196
193
  await self.computer.interface.double_click(x, y)
194
+ else:
195
+ raise ToolError(f"Unsupported button type: {button}")
197
196
 
198
- # Wait for UI to update
199
- await asyncio.sleep(0.5)
200
-
201
- # Take screenshot after action
202
- screenshot = await self.computer.interface.screenshot()
203
- base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
197
+ # Wait briefly for UI to update
198
+ await asyncio.sleep(0.3)
204
199
 
205
200
  return ToolResult(
206
201
  output=f"Performed {button} click at ({x}, {y})",
207
- base64_image=base64_screenshot,
208
202
  )
209
203
  except Exception as e:
210
204
  self.logger.error(f"Error in handle_click: {str(e)}")
@@ -218,11 +212,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
218
212
 
219
213
  await asyncio.sleep(0.3)
220
214
 
221
- # Take screenshot after typing
222
- screenshot = await self.computer.interface.screenshot()
223
- base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
224
-
225
- return ToolResult(output=f"Typed: {text}", base64_image=base64_screenshot)
215
+ return ToolResult(output=f"Typed: {text}")
226
216
  except Exception as e:
227
217
  self.logger.error(f"Error in handle_typing: {str(e)}")
228
218
  raise ToolError(f"Failed to type '{text}': {str(e)}")
@@ -254,11 +244,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
254
244
  # Wait briefly
255
245
  await asyncio.sleep(0.3)
256
246
 
257
- # Take screenshot after action
258
- screenshot = await self.computer.interface.screenshot()
259
- base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
260
-
261
- return ToolResult(output=f"Pressed key: {key}", base64_image=base64_screenshot)
247
+ return ToolResult(output=f"Pressed key: {key}")
262
248
  except Exception as e:
263
249
  self.logger.error(f"Error in handle_key: {str(e)}")
264
250
  raise ToolError(f"Failed to press key '{key}': {str(e)}")
@@ -272,11 +258,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
272
258
  # Wait briefly
273
259
  await asyncio.sleep(0.2)
274
260
 
275
- # Take screenshot after action
276
- screenshot = await self.computer.interface.screenshot()
277
- base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
278
-
279
- return ToolResult(output=f"Moved cursor to ({x}, {y})", base64_image=base64_screenshot)
261
+ return ToolResult(output=f"Moved cursor to ({x}, {y})")
280
262
  except Exception as e:
281
263
  self.logger.error(f"Error in handle_mouse_move: {str(e)}")
282
264
  raise ToolError(f"Failed to move cursor to ({x}, {y}): {str(e)}")
@@ -296,14 +278,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
296
278
  # Wait for UI to update
297
279
  await asyncio.sleep(0.5)
298
280
 
299
- # Take screenshot after action
300
- screenshot = await self.computer.interface.screenshot()
301
- base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
302
-
303
- return ToolResult(
304
- output=f"Scrolled at ({x}, {y}) with delta ({scroll_x}, {scroll_y})",
305
- base64_image=base64_screenshot,
306
- )
281
+ return ToolResult(output=f"Scrolled at ({x}, {y}) by ({scroll_x}, {scroll_y})")
307
282
  except Exception as e:
308
283
  self.logger.error(f"Error in handle_scroll: {str(e)}")
309
284
  raise ToolError(f"Failed to scroll at ({x}, {y}): {str(e)}")
@@ -331,13 +306,8 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
331
306
  # Wait for UI to update
332
307
  await asyncio.sleep(0.5)
333
308
 
334
- # Take screenshot after action
335
- screenshot = await self.computer.interface.screenshot()
336
- base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
337
-
338
309
  return ToolResult(
339
310
  output=f"Dragged from ({path[0]['x']}, {path[0]['y']}) to ({path[-1]['x']}, {path[-1]['y']})",
340
- base64_image=base64_screenshot,
341
311
  )
342
312
  except Exception as e:
343
313
  self.logger.error(f"Error in handle_drag: {str(e)}")
@@ -0,0 +1,15 @@
1
+ """
2
+ Main entry point for agent.ui module.
3
+
4
+ This allows running the agent UI with:
5
+ python -m agent.ui
6
+
7
+ Instead of:
8
+ python -m agent.ui.gradio.app
9
+ """
10
+
11
+ from .gradio.app import create_gradio_ui
12
+
13
+ if __name__ == "__main__":
14
+ app = create_gradio_ui()
15
+ app.launch(share=False, inbrowser=True)
@@ -137,6 +137,7 @@ MODEL_MAPPINGS = {
137
137
  "openai": {
138
138
  # Default to operator CUA model
139
139
  "default": "computer-use-preview",
140
+ "OpenAI: Computer-Use Preview": "computer-use-preview",
140
141
  # Map standard OpenAI model names to CUA-specific model names
141
142
  "gpt-4-turbo": "computer-use-preview",
142
143
  "gpt-4o": "computer-use-preview",
@@ -147,9 +148,17 @@ MODEL_MAPPINGS = {
147
148
  "anthropic": {
148
149
  # Default to newest model
149
150
  "default": "claude-3-7-sonnet-20250219",
151
+ # New Claude 4 models
152
+ "Anthropic: Claude 4 Opus (20250514)": "claude-opus-4-20250514",
153
+ "Anthropic: Claude 4 Sonnet (20250514)": "claude-sonnet-4-20250514",
154
+ "claude-opus-4-20250514": "claude-opus-4-20250514",
155
+ "claude-sonnet-4-20250514": "claude-sonnet-4-20250514",
156
+
150
157
  # Specific Claude models for CUA
151
- "claude-3-5-sonnet-20240620": "claude-3-5-sonnet-20240620",
158
+ "Anthropic: Claude 3.7 Sonnet (20250219)": "claude-3-5-sonnet-20240620",
159
+ "Anthropic: Claude 3.5 Sonnet (20240620)": "claude-3-7-sonnet-20250219",
152
160
  "claude-3-7-sonnet-20250219": "claude-3-7-sonnet-20250219",
161
+ "claude-3-5-sonnet-20240620": "claude-3-5-sonnet-20240620",
153
162
  # Map standard model names to CUA-specific model names
154
163
  "claude-3-opus": "claude-3-7-sonnet-20250219",
155
164
  "claude-3-sonnet": "claude-3-5-sonnet-20240620",
@@ -209,12 +218,12 @@ def get_provider_and_model(model_name: str, loop_provider: str) -> tuple:
209
218
  if agent_loop == AgentLoop.OPENAI:
210
219
  provider = LLMProvider.OPENAI
211
220
  model_name_to_use = MODEL_MAPPINGS["openai"].get(
212
- model_name.lower(), MODEL_MAPPINGS["openai"]["default"]
221
+ model_name, MODEL_MAPPINGS["openai"]["default"]
213
222
  )
214
223
  elif agent_loop == AgentLoop.ANTHROPIC:
215
224
  provider = LLMProvider.ANTHROPIC
216
225
  model_name_to_use = MODEL_MAPPINGS["anthropic"].get(
217
- model_name.lower(), MODEL_MAPPINGS["anthropic"]["default"]
226
+ model_name, MODEL_MAPPINGS["anthropic"]["default"]
218
227
  )
219
228
  elif agent_loop == AgentLoop.OMNI:
220
229
  # Determine provider and clean model name based on the full string from UI
@@ -234,33 +243,11 @@ def get_provider_and_model(model_name: str, loop_provider: str) -> tuple:
234
243
  cleaned_model_name = model_name.split("OMNI: Ollama ", 1)[1]
235
244
  elif model_name.startswith("OMNI: Claude "):
236
245
  provider = LLMProvider.ANTHROPIC
237
- # Extract the canonical model name based on the UI string
238
- # e.g., "OMNI: Claude 3.7 Sonnet (20250219)" -> "3.7 Sonnet" and "20250219"
239
- parts = model_name.split(" (")
240
- model_key_part = parts[0].replace("OMNI: Claude ", "")
241
- date_part = parts[1].replace(")", "") if len(parts) > 1 else ""
242
-
243
- # Normalize the extracted key part for comparison
244
- # "3.7 Sonnet" -> "37sonnet"
245
- model_key_part_norm = model_key_part.lower().replace(".", "").replace(" ", "")
246
246
 
247
- cleaned_model_name = MODEL_MAPPINGS["omni"]["default"] # Default if not found
248
- # Find the canonical name in the main Anthropic map
249
- for key_anthropic, val_anthropic in MODEL_MAPPINGS["anthropic"].items():
250
- # Normalize the canonical key for comparison
251
- # "claude-3-7-sonnet-20250219" -> "claude37sonnet20250219"
252
- key_anthropic_norm = key_anthropic.lower().replace("-", "")
253
-
254
- # Check if the normalized canonical key starts with "claude" + normalized extracted part
255
- # AND contains the date part.
256
- if (
257
- key_anthropic_norm.startswith("claude" + model_key_part_norm)
258
- and date_part in key_anthropic_norm
259
- ):
260
- cleaned_model_name = (
261
- val_anthropic # Use the canonical name like "claude-3-7-sonnet-20250219"
262
- )
263
- break
247
+ model_name = model_name.replace("OMNI: ", "Anthropic: ")
248
+ cleaned_model_name = MODEL_MAPPINGS["anthropic"].get(
249
+ model_name, MODEL_MAPPINGS["anthropic"]["default"]
250
+ )
264
251
  elif model_name.startswith("OMNI: OpenAI "):
265
252
  provider = LLMProvider.OPENAI
266
253
  # Extract the model part, e.g., "GPT-4o mini"
@@ -309,6 +296,8 @@ def get_provider_and_model(model_name: str, loop_provider: str) -> tuple:
309
296
  model_name_to_use = MODEL_MAPPINGS["openai"]["default"]
310
297
  agent_loop = AgentLoop.OPENAI
311
298
 
299
+ print(f"Mapping {model_name} and {loop_provider} to {provider}, {model_name_to_use}, {agent_loop}")
300
+
312
301
  return provider, model_name_to_use, agent_loop
313
302
 
314
303
 
@@ -453,6 +442,9 @@ def create_gradio_ui(
453
442
  # Always show models regardless of API key availability
454
443
  openai_models = ["OpenAI: Computer-Use Preview"]
455
444
  anthropic_models = [
445
+ "Anthropic: Claude 4 Opus (20250514)",
446
+ "Anthropic: Claude 4 Sonnet (20250514)",
447
+
456
448
  "Anthropic: Claude 3.7 Sonnet (20250219)",
457
449
  "Anthropic: Claude 3.5 Sonnet (20240620)",
458
450
  ]
@@ -460,6 +452,8 @@ def create_gradio_ui(
460
452
  "OMNI: OpenAI GPT-4o",
461
453
  "OMNI: OpenAI GPT-4o mini",
462
454
  "OMNI: OpenAI GPT-4.5-preview",
455
+ "OMNI: Claude 4 Opus (20250514)",
456
+ "OMNI: Claude 4 Sonnet (20250514)",
463
457
  "OMNI: Claude 3.7 Sonnet (20250219)",
464
458
  "OMNI: Claude 3.5 Sonnet (20240620)"
465
459
  ]
@@ -729,20 +723,25 @@ if __name__ == "__main__":
729
723
  with gr.Accordion("Computer Configuration", open=True):
730
724
  # Computer configuration options
731
725
  computer_os = gr.Radio(
732
- choices=["macos", "linux"],
726
+ choices=["macos", "linux", "windows"],
733
727
  label="Operating System",
734
728
  value="macos",
735
729
  info="Select the operating system for the computer",
736
730
  )
737
731
 
738
- # Detect if current device is MacOS
732
+ is_windows = platform.system().lower() == "windows"
739
733
  is_mac = platform.system().lower() == "darwin"
740
734
 
735
+ providers = ["cloud"]
736
+ if is_mac:
737
+ providers += ["lume"]
738
+ elif is_windows:
739
+ providers += ["winsandbox"]
740
+
741
741
  computer_provider = gr.Radio(
742
- choices=["cloud", "lume"],
742
+ choices=providers,
743
743
  label="Provider",
744
744
  value="lume" if is_mac else "cloud",
745
- visible=is_mac,
746
745
  info="Select the computer provider",
747
746
  )
748
747
 
@@ -6,7 +6,7 @@ build-backend = "pdm.backend"
6
6
 
7
7
  [project]
8
8
  name = "cua-agent"
9
- version = "0.2.10"
9
+ version = "0.2.11"
10
10
  description = "CUA (Computer Use) Agent for AI-driven computer interaction"
11
11
  readme = "README.md"
12
12
  authors = [
@@ -109,7 +109,7 @@ target-version = [
109
109
 
110
110
  [tool.ruff]
111
111
  line-length = 100
112
- target-version = "0.2.10"
112
+ target-version = "0.2.11"
113
113
  select = [
114
114
  "E",
115
115
  "F",
@@ -123,7 +123,7 @@ docstring-code-format = true
123
123
 
124
124
  [tool.mypy]
125
125
  strict = true
126
- python_version = "0.2.10"
126
+ python_version = "0.2.11"
127
127
  ignore_missing_imports = true
128
128
  disallow_untyped_defs = true
129
129
  check_untyped_defs = true
File without changes
File without changes