cua-agent 0.2.10__tar.gz → 0.2.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (86) hide show
  1. {cua_agent-0.2.10 → cua_agent-0.2.12}/PKG-INFO +4 -5
  2. {cua_agent-0.2.10 → cua_agent-0.2.12}/README.md +1 -4
  3. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/__init__.py +1 -1
  4. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/agent.py +0 -2
  5. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/messages.py +40 -7
  6. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/telemetry.py +1 -1
  7. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/tools/computer.py +7 -167
  8. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/loop.py +0 -2
  9. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/openai/tools/computer.py +10 -40
  10. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/uitars/loop.py +0 -2
  11. cua_agent-0.2.12/agent/ui/__main__.py +15 -0
  12. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/ui/gradio/app.py +39 -33
  13. {cua_agent-0.2.10 → cua_agent-0.2.12}/pyproject.toml +7 -4
  14. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/__init__.py +0 -0
  15. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/base.py +0 -0
  16. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/callbacks.py +0 -0
  17. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/experiment.py +0 -0
  18. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/factory.py +0 -0
  19. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/provider_config.py +0 -0
  20. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/tools/__init__.py +0 -0
  21. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/tools/base.py +0 -0
  22. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/tools/bash.py +0 -0
  23. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/tools/collection.py +0 -0
  24. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/tools/computer.py +0 -0
  25. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/tools/edit.py +0 -0
  26. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/tools/manager.py +0 -0
  27. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/tools.py +0 -0
  28. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/types.py +0 -0
  29. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/visualization.py +0 -0
  30. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/__init__.py +0 -0
  31. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/__init__.py +0 -0
  32. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/api/client.py +0 -0
  33. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/api/logging.py +0 -0
  34. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/api_handler.py +0 -0
  35. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/callbacks/__init__.py +0 -0
  36. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/callbacks/manager.py +0 -0
  37. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/loop.py +0 -0
  38. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/prompts.py +0 -0
  39. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/response_handler.py +0 -0
  40. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/tools/__init__.py +0 -0
  41. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/tools/base.py +0 -0
  42. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/tools/bash.py +0 -0
  43. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/tools/collection.py +0 -0
  44. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/tools/edit.py +0 -0
  45. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/tools/manager.py +0 -0
  46. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/tools/run.py +0 -0
  47. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/types.py +0 -0
  48. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/utils.py +0 -0
  49. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/__init__.py +0 -0
  50. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/api_handler.py +0 -0
  51. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/clients/anthropic.py +0 -0
  52. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/clients/base.py +0 -0
  53. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/clients/oaicompat.py +0 -0
  54. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/clients/ollama.py +0 -0
  55. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/clients/openai.py +0 -0
  56. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/clients/utils.py +0 -0
  57. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/image_utils.py +0 -0
  58. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/parser.py +0 -0
  59. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/prompts.py +0 -0
  60. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/tools/__init__.py +0 -0
  61. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/tools/base.py +0 -0
  62. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/tools/bash.py +0 -0
  63. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/tools/computer.py +0 -0
  64. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/tools/manager.py +0 -0
  65. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/utils.py +0 -0
  66. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/openai/__init__.py +0 -0
  67. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/openai/api_handler.py +0 -0
  68. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/openai/loop.py +0 -0
  69. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/openai/response_handler.py +0 -0
  70. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/openai/tools/__init__.py +0 -0
  71. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/openai/tools/base.py +0 -0
  72. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/openai/tools/manager.py +0 -0
  73. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/openai/types.py +0 -0
  74. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/openai/utils.py +0 -0
  75. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/uitars/__init__.py +0 -0
  76. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/uitars/clients/base.py +0 -0
  77. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/uitars/clients/mlxvlm.py +0 -0
  78. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/uitars/clients/oaicompat.py +0 -0
  79. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/uitars/prompts.py +0 -0
  80. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/uitars/tools/__init__.py +0 -0
  81. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/uitars/tools/computer.py +0 -0
  82. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/uitars/tools/manager.py +0 -0
  83. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/uitars/utils.py +0 -0
  84. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/telemetry.py +0 -0
  85. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/ui/__init__.py +0 -0
  86. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/ui/gradio/__init__.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cua-agent
3
- Version: 0.2.10
3
+ Version: 0.2.12
4
4
  Summary: CUA (Computer Use) Agent for AI-driven computer interaction
5
5
  Author-Email: TryCua <gh@trycua.com>
6
6
  Requires-Python: >=3.11
@@ -24,6 +24,7 @@ Requires-Dist: httpx<0.29.0,>=0.27.0; extra == "openai"
24
24
  Provides-Extra: uitars
25
25
  Requires-Dist: httpx<0.29.0,>=0.27.0; extra == "uitars"
26
26
  Provides-Extra: uitars-mlx
27
+ Requires-Dist: mlx-vlm>=0.1.27; sys_platform == "darwin" and extra == "uitars-mlx"
27
28
  Provides-Extra: ui
28
29
  Requires-Dist: gradio<6.0.0,>=5.23.3; extra == "ui"
29
30
  Requires-Dist: python-dotenv<2.0.0,>=1.0.1; extra == "ui"
@@ -67,6 +68,7 @@ Requires-Dist: requests<3.0.0,>=2.31.0; extra == "all"
67
68
  Requires-Dist: ollama<0.5.0,>=0.4.7; extra == "all"
68
69
  Requires-Dist: gradio<6.0.0,>=5.23.3; extra == "all"
69
70
  Requires-Dist: python-dotenv<2.0.0,>=1.0.1; extra == "all"
71
+ Requires-Dist: mlx-vlm>=0.1.27; sys_platform == "darwin" and extra == "all"
70
72
  Description-Content-Type: text/markdown
71
73
 
72
74
  <div align="center">
@@ -105,10 +107,7 @@ pip install "cua-agent[anthropic]" # Anthropic Cua Loop
105
107
  pip install "cua-agent[uitars]" # UI-Tars support
106
108
  pip install "cua-agent[omni]" # Cua Loop based on OmniParser (includes Ollama for local models)
107
109
  pip install "cua-agent[ui]" # Gradio UI for the agent
108
-
109
- # For local UI-TARS with MLX support, you need to manually install mlx-vlm:
110
- pip install "cua-agent[uitars-mlx]"
111
- pip install git+https://github.com/ddupont808/mlx-vlm.git@stable/fix/qwen2-position-id # PR: https://github.com/Blaizzy/mlx-vlm/pull/349
110
+ pip install "cua-agent[uitars-mlx]" # MLX UI-Tars support
112
111
  ```
113
112
 
114
113
  ## Run
@@ -34,10 +34,7 @@ pip install "cua-agent[anthropic]" # Anthropic Cua Loop
34
34
  pip install "cua-agent[uitars]" # UI-Tars support
35
35
  pip install "cua-agent[omni]" # Cua Loop based on OmniParser (includes Ollama for local models)
36
36
  pip install "cua-agent[ui]" # Gradio UI for the agent
37
-
38
- # For local UI-TARS with MLX support, you need to manually install mlx-vlm:
39
- pip install "cua-agent[uitars-mlx]"
40
- pip install git+https://github.com/ddupont808/mlx-vlm.git@stable/fix/qwen2-position-id # PR: https://github.com/Blaizzy/mlx-vlm/pull/349
37
+ pip install "cua-agent[uitars-mlx]" # MLX UI-Tars support
41
38
  ```
42
39
 
43
40
  ## Run
@@ -6,7 +6,7 @@ import logging
6
6
  __version__ = "0.1.0"
7
7
 
8
8
  # Initialize logging
9
- logger = logging.getLogger("cua.agent")
9
+ logger = logging.getLogger("agent")
10
10
 
11
11
  # Initialize telemetry when the package is imported
12
12
  try:
@@ -11,10 +11,8 @@ from .types import AgentResponse
11
11
  from .factory import LoopFactory
12
12
  from .provider_config import DEFAULT_MODELS, ENV_VARS
13
13
 
14
- logging.basicConfig(level=logging.INFO)
15
14
  logger = logging.getLogger(__name__)
16
15
 
17
-
18
16
  class ComputerAgent:
19
17
  """A computer agent that can perform automated tasks using natural language instructions."""
20
18
 
@@ -81,16 +81,27 @@ class StandardMessageManager:
81
81
  if not self.config.num_images_to_keep:
82
82
  return messages
83
83
 
84
- # Find user messages with images
84
+ # Find messages with images (both user messages and tool call outputs)
85
85
  image_messages = []
86
86
  for msg in messages:
87
+ has_image = False
88
+
89
+ # Check user messages with images
87
90
  if msg["role"] == "user" and isinstance(msg["content"], list):
88
91
  has_image = any(
89
92
  item.get("type") == "image_url" or item.get("type") == "image"
90
93
  for item in msg["content"]
91
94
  )
92
- if has_image:
93
- image_messages.append(msg)
95
+
96
+ # Check assistant messages with tool calls that have images
97
+ elif msg["role"] == "assistant" and isinstance(msg["content"], list):
98
+ for item in msg["content"]:
99
+ if item.get("type") == "tool_result" and "base64_image" in item:
100
+ has_image = True
101
+ break
102
+
103
+ if has_image:
104
+ image_messages.append(msg)
94
105
 
95
106
  # If we don't have more images than the limit, return all messages
96
107
  if len(image_messages) <= self.config.num_images_to_keep:
@@ -100,13 +111,35 @@ class StandardMessageManager:
100
111
  images_to_keep = image_messages[-self.config.num_images_to_keep :]
101
112
  images_to_remove = image_messages[: -self.config.num_images_to_keep]
102
113
 
103
- # Create a new message list without the older images
114
+ # Create a new message list, removing images from older messages
104
115
  result = []
105
116
  for msg in messages:
106
117
  if msg in images_to_remove:
107
- # Skip this message
108
- continue
109
- result.append(msg)
118
+ # Remove images from this message but keep the text content
119
+ if msg["role"] == "user" and isinstance(msg["content"], list):
120
+ # Keep only text content, remove images
121
+ new_content = [
122
+ item for item in msg["content"]
123
+ if item.get("type") not in ["image_url", "image"]
124
+ ]
125
+ if new_content: # Only add if there's still content
126
+ result.append({"role": msg["role"], "content": new_content})
127
+ elif msg["role"] == "assistant" and isinstance(msg["content"], list):
128
+ # Remove base64_image from tool_result items
129
+ new_content = []
130
+ for item in msg["content"]:
131
+ if item.get("type") == "tool_result" and "base64_image" in item:
132
+ # Create a copy without the base64_image
133
+ new_item = {k: v for k, v in item.items() if k != "base64_image"}
134
+ new_content.append(new_item)
135
+ else:
136
+ new_content.append(item)
137
+ result.append({"role": msg["role"], "content": new_content})
138
+ else:
139
+ # For other message types, keep as is
140
+ result.append(msg)
141
+ else:
142
+ result.append(msg)
110
143
 
111
144
  return result
112
145
 
@@ -34,7 +34,7 @@ flush = _default_flush
34
34
  is_telemetry_enabled = _default_is_telemetry_enabled
35
35
  is_telemetry_globally_disabled = _default_is_telemetry_globally_disabled
36
36
 
37
- logger = logging.getLogger("cua.agent.telemetry")
37
+ logger = logging.getLogger("agent.telemetry")
38
38
 
39
39
  try:
40
40
  # Import from core telemetry
@@ -205,26 +205,6 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
205
205
  self.logger.info(f" Coordinates: ({x}, {y})")
206
206
 
207
207
  try:
208
- # Take pre-action screenshot to get current dimensions
209
- pre_screenshot = await self.computer.interface.screenshot()
210
- pre_img = Image.open(io.BytesIO(pre_screenshot))
211
-
212
- # Scale image to match screen dimensions if needed
213
- if pre_img.size != (self.width, self.height):
214
- self.logger.info(
215
- f"Scaling image from {pre_img.size} to {self.width}x{self.height} to match screen dimensions"
216
- )
217
- if not isinstance(self.width, int) or not isinstance(self.height, int):
218
- raise ToolError("Screen dimensions must be integers")
219
- size = (int(self.width), int(self.height))
220
- pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
221
- # Save the scaled image back to bytes
222
- buffer = io.BytesIO()
223
- pre_img.save(buffer, format="PNG")
224
- pre_screenshot = buffer.getvalue()
225
-
226
- self.logger.info(f" Current dimensions: {pre_img.width}x{pre_img.height}")
227
-
228
208
  # Perform the click action
229
209
  if action == "left_click":
230
210
  self.logger.info(f"Clicking at ({x}, {y})")
@@ -242,45 +222,14 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
242
222
  # Wait briefly for any UI changes
243
223
  await asyncio.sleep(0.5)
244
224
 
245
- # Take and save post-action screenshot
246
- post_screenshot = await self.computer.interface.screenshot()
247
- post_img = Image.open(io.BytesIO(post_screenshot))
248
-
249
- # Scale post-action image if needed
250
- if post_img.size != (self.width, self.height):
251
- self.logger.info(
252
- f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
253
- )
254
- post_img = post_img.resize(
255
- (self.width, self.height), Image.Resampling.LANCZOS
256
- )
257
- buffer = io.BytesIO()
258
- post_img.save(buffer, format="PNG")
259
- post_screenshot = buffer.getvalue()
260
-
261
225
  return ToolResult(
262
226
  output=f"Performed {action} at ({x}, {y})",
263
- base64_image=base64.b64encode(post_screenshot).decode(),
264
227
  )
265
228
  except Exception as e:
266
229
  self.logger.error(f"Error during {action} action: {str(e)}")
267
230
  raise ToolError(f"Failed to perform {action}: {str(e)}")
268
231
  else:
269
232
  try:
270
- # Take pre-action screenshot
271
- pre_screenshot = await self.computer.interface.screenshot()
272
- pre_img = Image.open(io.BytesIO(pre_screenshot))
273
-
274
- # Scale image if needed
275
- if pre_img.size != (self.width, self.height):
276
- self.logger.info(
277
- f"Scaling image from {pre_img.size} to {self.width}x{self.height}"
278
- )
279
- if not isinstance(self.width, int) or not isinstance(self.height, int):
280
- raise ToolError("Screen dimensions must be integers")
281
- size = (int(self.width), int(self.height))
282
- pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
283
-
284
233
  # Perform the click action
285
234
  if action == "left_click":
286
235
  self.logger.info("Performing left click at current position")
@@ -295,25 +244,8 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
295
244
  # Wait briefly for any UI changes
296
245
  await asyncio.sleep(0.5)
297
246
 
298
- # Take post-action screenshot
299
- post_screenshot = await self.computer.interface.screenshot()
300
- post_img = Image.open(io.BytesIO(post_screenshot))
301
-
302
- # Scale post-action image if needed
303
- if post_img.size != (self.width, self.height):
304
- self.logger.info(
305
- f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
306
- )
307
- post_img = post_img.resize(
308
- (self.width, self.height), Image.Resampling.LANCZOS
309
- )
310
- buffer = io.BytesIO()
311
- post_img.save(buffer, format="PNG")
312
- post_screenshot = buffer.getvalue()
313
-
314
247
  return ToolResult(
315
248
  output=f"Performed {action} at current position",
316
- base64_image=base64.b64encode(post_screenshot).decode(),
317
249
  )
318
250
  except Exception as e:
319
251
  self.logger.error(f"Error during {action} action: {str(e)}")
@@ -328,20 +260,6 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
328
260
  raise ToolError(f"{text} must be a string")
329
261
 
330
262
  try:
331
- # Take pre-action screenshot
332
- pre_screenshot = await self.computer.interface.screenshot()
333
- pre_img = Image.open(io.BytesIO(pre_screenshot))
334
-
335
- # Scale image if needed
336
- if pre_img.size != (self.width, self.height):
337
- self.logger.info(
338
- f"Scaling image from {pre_img.size} to {self.width}x{self.height}"
339
- )
340
- if not isinstance(self.width, int) or not isinstance(self.height, int):
341
- raise ToolError("Screen dimensions must be integers")
342
- size = (int(self.width), int(self.height))
343
- pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
344
-
345
263
  if action == "key":
346
264
  # Special handling for page up/down on macOS
347
265
  if text.lower() in ["pagedown", "page_down", "page down"]:
@@ -378,25 +296,8 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
378
296
  # Wait briefly for UI changes
379
297
  await asyncio.sleep(0.5)
380
298
 
381
- # Take post-action screenshot
382
- post_screenshot = await self.computer.interface.screenshot()
383
- post_img = Image.open(io.BytesIO(post_screenshot))
384
-
385
- # Scale post-action image if needed
386
- if post_img.size != (self.width, self.height):
387
- self.logger.info(
388
- f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
389
- )
390
- post_img = post_img.resize(
391
- (self.width, self.height), Image.Resampling.LANCZOS
392
- )
393
- buffer = io.BytesIO()
394
- post_img.save(buffer, format="PNG")
395
- post_screenshot = buffer.getvalue()
396
-
397
299
  return ToolResult(
398
300
  output=f"Pressed key: {output_text}",
399
- base64_image=base64.b64encode(post_screenshot).decode(),
400
301
  )
401
302
 
402
303
  elif action == "type":
@@ -406,66 +307,13 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
406
307
  # Wait briefly for UI changes
407
308
  await asyncio.sleep(0.5)
408
309
 
409
- # Take post-action screenshot
410
- post_screenshot = await self.computer.interface.screenshot()
411
- post_img = Image.open(io.BytesIO(post_screenshot))
412
-
413
- # Scale post-action image if needed
414
- if post_img.size != (self.width, self.height):
415
- self.logger.info(
416
- f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
417
- )
418
- post_img = post_img.resize(
419
- (self.width, self.height), Image.Resampling.LANCZOS
420
- )
421
- buffer = io.BytesIO()
422
- post_img.save(buffer, format="PNG")
423
- post_screenshot = buffer.getvalue()
424
-
425
310
  return ToolResult(
426
311
  output=f"Typed text: {text}",
427
- base64_image=base64.b64encode(post_screenshot).decode(),
428
312
  )
429
313
  except Exception as e:
430
314
  self.logger.error(f"Error during {action} action: {str(e)}")
431
315
  raise ToolError(f"Failed to perform {action}: {str(e)}")
432
316
 
433
- elif action in ("screenshot", "cursor_position"):
434
- if text is not None:
435
- raise ToolError(f"text is not accepted for {action}")
436
- if coordinate is not None:
437
- raise ToolError(f"coordinate is not accepted for {action}")
438
-
439
- try:
440
- if action == "screenshot":
441
- # Take screenshot
442
- screenshot = await self.computer.interface.screenshot()
443
- img = Image.open(io.BytesIO(screenshot))
444
-
445
- # Scale image if needed
446
- if img.size != (self.width, self.height):
447
- self.logger.info(
448
- f"Scaling image from {img.size} to {self.width}x{self.height}"
449
- )
450
- if not isinstance(self.width, int) or not isinstance(self.height, int):
451
- raise ToolError("Screen dimensions must be integers")
452
- size = (int(self.width), int(self.height))
453
- img = img.resize(size, Image.Resampling.LANCZOS)
454
- buffer = io.BytesIO()
455
- img.save(buffer, format="PNG")
456
- screenshot = buffer.getvalue()
457
-
458
- return ToolResult(base64_image=base64.b64encode(screenshot).decode())
459
-
460
- elif action == "cursor_position":
461
- pos = await self.computer.interface.get_cursor_position()
462
- x, y = pos # Unpack the tuple
463
- return ToolResult(output=f"X={int(x)},Y={int(y)}")
464
-
465
- except Exception as e:
466
- self.logger.error(f"Error during {action} action: {str(e)}")
467
- raise ToolError(f"Failed to perform {action}: {str(e)}")
468
-
469
317
  elif action == "scroll":
470
318
  # Implement scroll action
471
319
  direction = kwargs.get("direction", "down")
@@ -487,28 +335,20 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
487
335
  # Wait briefly for UI changes
488
336
  await asyncio.sleep(0.5)
489
337
 
490
- # Take post-action screenshot
491
- post_screenshot = await self.computer.interface.screenshot()
492
- post_img = Image.open(io.BytesIO(post_screenshot))
493
-
494
- # Scale post-action image if needed
495
- if post_img.size != (self.width, self.height):
496
- self.logger.info(
497
- f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
498
- )
499
- post_img = post_img.resize((self.width, self.height), Image.Resampling.LANCZOS)
500
- buffer = io.BytesIO()
501
- post_img.save(buffer, format="PNG")
502
- post_screenshot = buffer.getvalue()
503
-
504
338
  return ToolResult(
505
339
  output=f"Scrolled {direction} by {amount} steps",
506
- base64_image=base64.b64encode(post_screenshot).decode(),
507
340
  )
508
341
  except Exception as e:
509
342
  self.logger.error(f"Error during scroll action: {str(e)}")
510
343
  raise ToolError(f"Failed to perform scroll: {str(e)}")
511
344
 
345
+ elif action == "screenshot":
346
+ # Take screenshot
347
+ return await self.screenshot()
348
+ elif action == "cursor_position":
349
+ pos = await self.computer.interface.get_cursor_position()
350
+ x, y = pos # Unpack the tuple
351
+ return ToolResult(output=f"X={int(x)},Y={int(y)}")
512
352
  raise ToolError(f"Invalid action: {action}")
513
353
 
514
354
  async def screenshot(self):
@@ -26,10 +26,8 @@ from .api_handler import OmniAPIHandler
26
26
  from .tools.manager import ToolManager
27
27
  from .tools import ToolResult
28
28
 
29
- logging.basicConfig(level=logging.INFO)
30
29
  logger = logging.getLogger(__name__)
31
30
 
32
-
33
31
  def extract_data(input_string: str, data_type: str) -> str:
34
32
  """Extract content from code blocks."""
35
33
  pattern = f"```{data_type}" + r"(.*?)(```|$)"
@@ -61,9 +61,6 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
61
61
  computer: Computer # The CUA Computer instance
62
62
  logger = logging.getLogger(__name__)
63
63
 
64
- _screenshot_delay = 1.0 # macOS is generally faster than X11
65
- _scaling_enabled = True
66
-
67
64
  def __init__(self, computer: Computer):
68
65
  """Initialize the computer tool.
69
66
 
@@ -185,26 +182,23 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
185
182
  raise ToolError(f"Failed to execute {type}: {str(e)}")
186
183
 
187
184
  async def handle_click(self, button: str, x: int, y: int) -> ToolResult:
188
- """Handle different click actions."""
185
+ """Handle mouse clicks."""
189
186
  try:
190
- # Perform requested click action
187
+ # Perform the click based on button type
191
188
  if button == "left":
192
189
  await self.computer.interface.left_click(x, y)
193
190
  elif button == "right":
194
191
  await self.computer.interface.right_click(x, y)
195
192
  elif button == "double":
196
193
  await self.computer.interface.double_click(x, y)
194
+ else:
195
+ raise ToolError(f"Unsupported button type: {button}")
197
196
 
198
- # Wait for UI to update
199
- await asyncio.sleep(0.5)
200
-
201
- # Take screenshot after action
202
- screenshot = await self.computer.interface.screenshot()
203
- base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
197
+ # Wait briefly for UI to update
198
+ await asyncio.sleep(0.3)
204
199
 
205
200
  return ToolResult(
206
201
  output=f"Performed {button} click at ({x}, {y})",
207
- base64_image=base64_screenshot,
208
202
  )
209
203
  except Exception as e:
210
204
  self.logger.error(f"Error in handle_click: {str(e)}")
@@ -218,11 +212,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
218
212
 
219
213
  await asyncio.sleep(0.3)
220
214
 
221
- # Take screenshot after typing
222
- screenshot = await self.computer.interface.screenshot()
223
- base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
224
-
225
- return ToolResult(output=f"Typed: {text}", base64_image=base64_screenshot)
215
+ return ToolResult(output=f"Typed: {text}")
226
216
  except Exception as e:
227
217
  self.logger.error(f"Error in handle_typing: {str(e)}")
228
218
  raise ToolError(f"Failed to type '{text}': {str(e)}")
@@ -254,11 +244,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
254
244
  # Wait briefly
255
245
  await asyncio.sleep(0.3)
256
246
 
257
- # Take screenshot after action
258
- screenshot = await self.computer.interface.screenshot()
259
- base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
260
-
261
- return ToolResult(output=f"Pressed key: {key}", base64_image=base64_screenshot)
247
+ return ToolResult(output=f"Pressed key: {key}")
262
248
  except Exception as e:
263
249
  self.logger.error(f"Error in handle_key: {str(e)}")
264
250
  raise ToolError(f"Failed to press key '{key}': {str(e)}")
@@ -272,11 +258,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
272
258
  # Wait briefly
273
259
  await asyncio.sleep(0.2)
274
260
 
275
- # Take screenshot after action
276
- screenshot = await self.computer.interface.screenshot()
277
- base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
278
-
279
- return ToolResult(output=f"Moved cursor to ({x}, {y})", base64_image=base64_screenshot)
261
+ return ToolResult(output=f"Moved cursor to ({x}, {y})")
280
262
  except Exception as e:
281
263
  self.logger.error(f"Error in handle_mouse_move: {str(e)}")
282
264
  raise ToolError(f"Failed to move cursor to ({x}, {y}): {str(e)}")
@@ -296,14 +278,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
296
278
  # Wait for UI to update
297
279
  await asyncio.sleep(0.5)
298
280
 
299
- # Take screenshot after action
300
- screenshot = await self.computer.interface.screenshot()
301
- base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
302
-
303
- return ToolResult(
304
- output=f"Scrolled at ({x}, {y}) with delta ({scroll_x}, {scroll_y})",
305
- base64_image=base64_screenshot,
306
- )
281
+ return ToolResult(output=f"Scrolled at ({x}, {y}) by ({scroll_x}, {scroll_y})")
307
282
  except Exception as e:
308
283
  self.logger.error(f"Error in handle_scroll: {str(e)}")
309
284
  raise ToolError(f"Failed to scroll at ({x}, {y}): {str(e)}")
@@ -331,13 +306,8 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
331
306
  # Wait for UI to update
332
307
  await asyncio.sleep(0.5)
333
308
 
334
- # Take screenshot after action
335
- screenshot = await self.computer.interface.screenshot()
336
- base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
337
-
338
309
  return ToolResult(
339
310
  output=f"Dragged from ({path[0]['x']}, {path[0]['y']}) to ({path[-1]['x']}, {path[-1]['y']})",
340
- base64_image=base64_screenshot,
341
311
  )
342
312
  except Exception as e:
343
313
  self.logger.error(f"Error in handle_drag: {str(e)}")
@@ -25,10 +25,8 @@ from .prompts import COMPUTER_USE, SYSTEM_PROMPT, MAC_SPECIFIC_NOTES
25
25
  from .clients.oaicompat import OAICompatClient
26
26
  from .clients.mlxvlm import MLXVLMUITarsClient
27
27
 
28
- logging.basicConfig(level=logging.INFO)
29
28
  logger = logging.getLogger(__name__)
30
29
 
31
-
32
30
  class UITARSLoop(BaseLoop):
33
31
  """UI-TARS-specific implementation of the agent loop.
34
32
 
@@ -0,0 +1,15 @@
1
+ """
2
+ Main entry point for agent.ui module.
3
+
4
+ This allows running the agent UI with:
5
+ python -m agent.ui
6
+
7
+ Instead of:
8
+ python -m agent.ui.gradio.app
9
+ """
10
+
11
+ from .gradio.app import create_gradio_ui
12
+
13
+ if __name__ == "__main__":
14
+ app = create_gradio_ui()
15
+ app.launch(share=False, inbrowser=True)
@@ -132,11 +132,19 @@ class GradioChatScreenshotHandler(DefaultCallbackHandler):
132
132
  # Detect if current device is MacOS
133
133
  is_mac = platform.system().lower() == "darwin"
134
134
 
135
+ # Detect if lume is available (host device is macOS)
136
+ is_lume_available = is_mac or (os.environ.get("PYLUME_HOST", "localhost") != "localhost")
137
+
138
+ print("PYLUME_HOST: ", os.environ.get("PYLUME_HOST", "localhost"))
139
+ print("is_mac: ", is_mac)
140
+ print("Lume available: ", is_lume_available)
141
+
135
142
  # Map model names to specific provider model names
136
143
  MODEL_MAPPINGS = {
137
144
  "openai": {
138
145
  # Default to operator CUA model
139
146
  "default": "computer-use-preview",
147
+ "OpenAI: Computer-Use Preview": "computer-use-preview",
140
148
  # Map standard OpenAI model names to CUA-specific model names
141
149
  "gpt-4-turbo": "computer-use-preview",
142
150
  "gpt-4o": "computer-use-preview",
@@ -147,9 +155,17 @@ MODEL_MAPPINGS = {
147
155
  "anthropic": {
148
156
  # Default to newest model
149
157
  "default": "claude-3-7-sonnet-20250219",
158
+ # New Claude 4 models
159
+ "Anthropic: Claude 4 Opus (20250514)": "claude-opus-4-20250514",
160
+ "Anthropic: Claude 4 Sonnet (20250514)": "claude-sonnet-4-20250514",
161
+ "claude-opus-4-20250514": "claude-opus-4-20250514",
162
+ "claude-sonnet-4-20250514": "claude-sonnet-4-20250514",
163
+
150
164
  # Specific Claude models for CUA
151
- "claude-3-5-sonnet-20240620": "claude-3-5-sonnet-20240620",
165
+ "Anthropic: Claude 3.7 Sonnet (20250219)": "claude-3-7-sonnet-20250219",
166
+ "Anthropic: Claude 3.5 Sonnet (20240620)": "claude-3-5-sonnet-20240620",
152
167
  "claude-3-7-sonnet-20250219": "claude-3-7-sonnet-20250219",
168
+ "claude-3-5-sonnet-20240620": "claude-3-5-sonnet-20240620",
153
169
  # Map standard model names to CUA-specific model names
154
170
  "claude-3-opus": "claude-3-7-sonnet-20250219",
155
171
  "claude-3-sonnet": "claude-3-5-sonnet-20240620",
@@ -209,12 +225,12 @@ def get_provider_and_model(model_name: str, loop_provider: str) -> tuple:
209
225
  if agent_loop == AgentLoop.OPENAI:
210
226
  provider = LLMProvider.OPENAI
211
227
  model_name_to_use = MODEL_MAPPINGS["openai"].get(
212
- model_name.lower(), MODEL_MAPPINGS["openai"]["default"]
228
+ model_name, MODEL_MAPPINGS["openai"]["default"]
213
229
  )
214
230
  elif agent_loop == AgentLoop.ANTHROPIC:
215
231
  provider = LLMProvider.ANTHROPIC
216
232
  model_name_to_use = MODEL_MAPPINGS["anthropic"].get(
217
- model_name.lower(), MODEL_MAPPINGS["anthropic"]["default"]
233
+ model_name, MODEL_MAPPINGS["anthropic"]["default"]
218
234
  )
219
235
  elif agent_loop == AgentLoop.OMNI:
220
236
  # Determine provider and clean model name based on the full string from UI
@@ -234,33 +250,11 @@ def get_provider_and_model(model_name: str, loop_provider: str) -> tuple:
234
250
  cleaned_model_name = model_name.split("OMNI: Ollama ", 1)[1]
235
251
  elif model_name.startswith("OMNI: Claude "):
236
252
  provider = LLMProvider.ANTHROPIC
237
- # Extract the canonical model name based on the UI string
238
- # e.g., "OMNI: Claude 3.7 Sonnet (20250219)" -> "3.7 Sonnet" and "20250219"
239
- parts = model_name.split(" (")
240
- model_key_part = parts[0].replace("OMNI: Claude ", "")
241
- date_part = parts[1].replace(")", "") if len(parts) > 1 else ""
242
-
243
- # Normalize the extracted key part for comparison
244
- # "3.7 Sonnet" -> "37sonnet"
245
- model_key_part_norm = model_key_part.lower().replace(".", "").replace(" ", "")
246
253
 
247
- cleaned_model_name = MODEL_MAPPINGS["omni"]["default"] # Default if not found
248
- # Find the canonical name in the main Anthropic map
249
- for key_anthropic, val_anthropic in MODEL_MAPPINGS["anthropic"].items():
250
- # Normalize the canonical key for comparison
251
- # "claude-3-7-sonnet-20250219" -> "claude37sonnet20250219"
252
- key_anthropic_norm = key_anthropic.lower().replace("-", "")
253
-
254
- # Check if the normalized canonical key starts with "claude" + normalized extracted part
255
- # AND contains the date part.
256
- if (
257
- key_anthropic_norm.startswith("claude" + model_key_part_norm)
258
- and date_part in key_anthropic_norm
259
- ):
260
- cleaned_model_name = (
261
- val_anthropic # Use the canonical name like "claude-3-7-sonnet-20250219"
262
- )
263
- break
254
+ model_name = model_name.replace("OMNI: ", "Anthropic: ")
255
+ cleaned_model_name = MODEL_MAPPINGS["anthropic"].get(
256
+ model_name, MODEL_MAPPINGS["anthropic"]["default"]
257
+ )
264
258
  elif model_name.startswith("OMNI: OpenAI "):
265
259
  provider = LLMProvider.OPENAI
266
260
  # Extract the model part, e.g., "GPT-4o mini"
@@ -309,6 +303,8 @@ def get_provider_and_model(model_name: str, loop_provider: str) -> tuple:
309
303
  model_name_to_use = MODEL_MAPPINGS["openai"]["default"]
310
304
  agent_loop = AgentLoop.OPENAI
311
305
 
306
+ print(f"Mapping {model_name} and {loop_provider} to {provider}, {model_name_to_use}, {agent_loop}")
307
+
312
308
  return provider, model_name_to_use, agent_loop
313
309
 
314
310
 
@@ -453,6 +449,9 @@ def create_gradio_ui(
453
449
  # Always show models regardless of API key availability
454
450
  openai_models = ["OpenAI: Computer-Use Preview"]
455
451
  anthropic_models = [
452
+ "Anthropic: Claude 4 Opus (20250514)",
453
+ "Anthropic: Claude 4 Sonnet (20250514)",
454
+
456
455
  "Anthropic: Claude 3.7 Sonnet (20250219)",
457
456
  "Anthropic: Claude 3.5 Sonnet (20240620)",
458
457
  ]
@@ -460,6 +459,8 @@ def create_gradio_ui(
460
459
  "OMNI: OpenAI GPT-4o",
461
460
  "OMNI: OpenAI GPT-4o mini",
462
461
  "OMNI: OpenAI GPT-4.5-preview",
462
+ "OMNI: Claude 4 Opus (20250514)",
463
+ "OMNI: Claude 4 Sonnet (20250514)",
463
464
  "OMNI: Claude 3.7 Sonnet (20250219)",
464
465
  "OMNI: Claude 3.5 Sonnet (20240620)"
465
466
  ]
@@ -729,20 +730,25 @@ if __name__ == "__main__":
729
730
  with gr.Accordion("Computer Configuration", open=True):
730
731
  # Computer configuration options
731
732
  computer_os = gr.Radio(
732
- choices=["macos", "linux"],
733
+ choices=["macos", "linux", "windows"],
733
734
  label="Operating System",
734
735
  value="macos",
735
736
  info="Select the operating system for the computer",
736
737
  )
737
738
 
738
- # Detect if current device is MacOS
739
+ is_windows = platform.system().lower() == "windows"
739
740
  is_mac = platform.system().lower() == "darwin"
740
741
 
742
+ providers = ["cloud"]
743
+ if is_lume_available:
744
+ providers += ["lume"]
745
+ if is_windows:
746
+ providers += ["winsandbox"]
747
+
741
748
  computer_provider = gr.Radio(
742
- choices=["cloud", "lume"],
749
+ choices=providers,
743
750
  label="Provider",
744
751
  value="lume" if is_mac else "cloud",
745
- visible=is_mac,
746
752
  info="Select the computer provider",
747
753
  )
748
754
 
@@ -6,7 +6,7 @@ build-backend = "pdm.backend"
6
6
 
7
7
  [project]
8
8
  name = "cua-agent"
9
- version = "0.2.10"
9
+ version = "0.2.12"
10
10
  description = "CUA (Computer Use) Agent for AI-driven computer interaction"
11
11
  readme = "README.md"
12
12
  authors = [
@@ -39,7 +39,9 @@ openai = [
39
39
  uitars = [
40
40
  "httpx>=0.27.0,<0.29.0",
41
41
  ]
42
- uitars-mlx = []
42
+ uitars-mlx = [
43
+ "mlx-vlm>=0.1.27; sys_platform == 'darwin'",
44
+ ]
43
45
  ui = [
44
46
  "gradio>=5.23.3,<6.0.0",
45
47
  "python-dotenv>=1.0.1,<2.0.0",
@@ -86,6 +88,7 @@ all = [
86
88
  "ollama>=0.4.7,<0.5.0",
87
89
  "gradio>=5.23.3,<6.0.0",
88
90
  "python-dotenv>=1.0.1,<2.0.0",
91
+ "mlx-vlm>=0.1.27; sys_platform == 'darwin'",
89
92
  ]
90
93
 
91
94
  [tool.pdm]
@@ -109,7 +112,7 @@ target-version = [
109
112
 
110
113
  [tool.ruff]
111
114
  line-length = 100
112
- target-version = "0.2.10"
115
+ target-version = "py311"
113
116
  select = [
114
117
  "E",
115
118
  "F",
@@ -123,7 +126,7 @@ docstring-code-format = true
123
126
 
124
127
  [tool.mypy]
125
128
  strict = true
126
- python_version = "0.2.10"
129
+ python_version = "3.11"
127
130
  ignore_missing_imports = true
128
131
  disallow_untyped_defs = true
129
132
  check_untyped_defs = true