cua-agent 0.2.10__tar.gz → 0.2.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- {cua_agent-0.2.10 → cua_agent-0.2.12}/PKG-INFO +4 -5
- {cua_agent-0.2.10 → cua_agent-0.2.12}/README.md +1 -4
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/__init__.py +1 -1
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/agent.py +0 -2
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/messages.py +40 -7
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/telemetry.py +1 -1
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/tools/computer.py +7 -167
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/loop.py +0 -2
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/openai/tools/computer.py +10 -40
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/uitars/loop.py +0 -2
- cua_agent-0.2.12/agent/ui/__main__.py +15 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/ui/gradio/app.py +39 -33
- {cua_agent-0.2.10 → cua_agent-0.2.12}/pyproject.toml +7 -4
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/__init__.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/base.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/callbacks.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/experiment.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/factory.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/provider_config.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/tools/__init__.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/tools/base.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/tools/bash.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/tools/collection.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/tools/computer.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/tools/edit.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/tools/manager.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/tools.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/types.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/visualization.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/__init__.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/__init__.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/api/client.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/api/logging.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/api_handler.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/callbacks/__init__.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/callbacks/manager.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/loop.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/prompts.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/response_handler.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/tools/__init__.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/tools/base.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/tools/bash.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/tools/collection.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/tools/edit.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/tools/manager.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/tools/run.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/types.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/utils.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/__init__.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/api_handler.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/clients/anthropic.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/clients/base.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/clients/oaicompat.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/clients/ollama.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/clients/openai.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/clients/utils.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/image_utils.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/parser.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/prompts.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/tools/__init__.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/tools/base.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/tools/bash.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/tools/computer.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/tools/manager.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/utils.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/openai/__init__.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/openai/api_handler.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/openai/loop.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/openai/response_handler.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/openai/tools/__init__.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/openai/tools/base.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/openai/tools/manager.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/openai/types.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/openai/utils.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/uitars/__init__.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/uitars/clients/base.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/uitars/clients/mlxvlm.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/uitars/clients/oaicompat.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/uitars/prompts.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/uitars/tools/__init__.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/uitars/tools/computer.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/uitars/tools/manager.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/uitars/utils.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/telemetry.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/ui/__init__.py +0 -0
- {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/ui/gradio/__init__.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: cua-agent
|
|
3
|
-
Version: 0.2.10
|
|
3
|
+
Version: 0.2.12
|
|
4
4
|
Summary: CUA (Computer Use) Agent for AI-driven computer interaction
|
|
5
5
|
Author-Email: TryCua <gh@trycua.com>
|
|
6
6
|
Requires-Python: >=3.11
|
|
@@ -24,6 +24,7 @@ Requires-Dist: httpx<0.29.0,>=0.27.0; extra == "openai"
|
|
|
24
24
|
Provides-Extra: uitars
|
|
25
25
|
Requires-Dist: httpx<0.29.0,>=0.27.0; extra == "uitars"
|
|
26
26
|
Provides-Extra: uitars-mlx
|
|
27
|
+
Requires-Dist: mlx-vlm>=0.1.27; sys_platform == "darwin" and extra == "uitars-mlx"
|
|
27
28
|
Provides-Extra: ui
|
|
28
29
|
Requires-Dist: gradio<6.0.0,>=5.23.3; extra == "ui"
|
|
29
30
|
Requires-Dist: python-dotenv<2.0.0,>=1.0.1; extra == "ui"
|
|
@@ -67,6 +68,7 @@ Requires-Dist: requests<3.0.0,>=2.31.0; extra == "all"
|
|
|
67
68
|
Requires-Dist: ollama<0.5.0,>=0.4.7; extra == "all"
|
|
68
69
|
Requires-Dist: gradio<6.0.0,>=5.23.3; extra == "all"
|
|
69
70
|
Requires-Dist: python-dotenv<2.0.0,>=1.0.1; extra == "all"
|
|
71
|
+
Requires-Dist: mlx-vlm>=0.1.27; sys_platform == "darwin" and extra == "all"
|
|
70
72
|
Description-Content-Type: text/markdown
|
|
71
73
|
|
|
72
74
|
<div align="center">
|
|
@@ -105,10 +107,7 @@ pip install "cua-agent[anthropic]" # Anthropic Cua Loop
|
|
|
105
107
|
pip install "cua-agent[uitars]" # UI-Tars support
|
|
106
108
|
pip install "cua-agent[omni]" # Cua Loop based on OmniParser (includes Ollama for local models)
|
|
107
109
|
pip install "cua-agent[ui]" # Gradio UI for the agent
|
|
108
|
-
|
|
109
|
-
# For local UI-TARS with MLX support, you need to manually install mlx-vlm:
|
|
110
|
-
pip install "cua-agent[uitars-mlx]"
|
|
111
|
-
pip install git+https://github.com/ddupont808/mlx-vlm.git@stable/fix/qwen2-position-id # PR: https://github.com/Blaizzy/mlx-vlm/pull/349
|
|
110
|
+
pip install "cua-agent[uitars-mlx]" # MLX UI-Tars support
|
|
112
111
|
```
|
|
113
112
|
|
|
114
113
|
## Run
|
|
@@ -34,10 +34,7 @@ pip install "cua-agent[anthropic]" # Anthropic Cua Loop
|
|
|
34
34
|
pip install "cua-agent[uitars]" # UI-Tars support
|
|
35
35
|
pip install "cua-agent[omni]" # Cua Loop based on OmniParser (includes Ollama for local models)
|
|
36
36
|
pip install "cua-agent[ui]" # Gradio UI for the agent
|
|
37
|
-
|
|
38
|
-
# For local UI-TARS with MLX support, you need to manually install mlx-vlm:
|
|
39
|
-
pip install "cua-agent[uitars-mlx]"
|
|
40
|
-
pip install git+https://github.com/ddupont808/mlx-vlm.git@stable/fix/qwen2-position-id # PR: https://github.com/Blaizzy/mlx-vlm/pull/349
|
|
37
|
+
pip install "cua-agent[uitars-mlx]" # MLX UI-Tars support
|
|
41
38
|
```
|
|
42
39
|
|
|
43
40
|
## Run
|
|
@@ -11,10 +11,8 @@ from .types import AgentResponse
|
|
|
11
11
|
from .factory import LoopFactory
|
|
12
12
|
from .provider_config import DEFAULT_MODELS, ENV_VARS
|
|
13
13
|
|
|
14
|
-
logging.basicConfig(level=logging.INFO)
|
|
15
14
|
logger = logging.getLogger(__name__)
|
|
16
15
|
|
|
17
|
-
|
|
18
16
|
class ComputerAgent:
|
|
19
17
|
"""A computer agent that can perform automated tasks using natural language instructions."""
|
|
20
18
|
|
|
@@ -81,16 +81,27 @@ class StandardMessageManager:
|
|
|
81
81
|
if not self.config.num_images_to_keep:
|
|
82
82
|
return messages
|
|
83
83
|
|
|
84
|
-
# Find user messages
|
|
84
|
+
# Find messages with images (both user messages and tool call outputs)
|
|
85
85
|
image_messages = []
|
|
86
86
|
for msg in messages:
|
|
87
|
+
has_image = False
|
|
88
|
+
|
|
89
|
+
# Check user messages with images
|
|
87
90
|
if msg["role"] == "user" and isinstance(msg["content"], list):
|
|
88
91
|
has_image = any(
|
|
89
92
|
item.get("type") == "image_url" or item.get("type") == "image"
|
|
90
93
|
for item in msg["content"]
|
|
91
94
|
)
|
|
92
|
-
|
|
93
|
-
|
|
95
|
+
|
|
96
|
+
# Check assistant messages with tool calls that have images
|
|
97
|
+
elif msg["role"] == "assistant" and isinstance(msg["content"], list):
|
|
98
|
+
for item in msg["content"]:
|
|
99
|
+
if item.get("type") == "tool_result" and "base64_image" in item:
|
|
100
|
+
has_image = True
|
|
101
|
+
break
|
|
102
|
+
|
|
103
|
+
if has_image:
|
|
104
|
+
image_messages.append(msg)
|
|
94
105
|
|
|
95
106
|
# If we don't have more images than the limit, return all messages
|
|
96
107
|
if len(image_messages) <= self.config.num_images_to_keep:
|
|
@@ -100,13 +111,35 @@ class StandardMessageManager:
|
|
|
100
111
|
images_to_keep = image_messages[-self.config.num_images_to_keep :]
|
|
101
112
|
images_to_remove = image_messages[: -self.config.num_images_to_keep]
|
|
102
113
|
|
|
103
|
-
# Create a new message list
|
|
114
|
+
# Create a new message list, removing images from older messages
|
|
104
115
|
result = []
|
|
105
116
|
for msg in messages:
|
|
106
117
|
if msg in images_to_remove:
|
|
107
|
-
#
|
|
108
|
-
|
|
109
|
-
|
|
118
|
+
# Remove images from this message but keep the text content
|
|
119
|
+
if msg["role"] == "user" and isinstance(msg["content"], list):
|
|
120
|
+
# Keep only text content, remove images
|
|
121
|
+
new_content = [
|
|
122
|
+
item for item in msg["content"]
|
|
123
|
+
if item.get("type") not in ["image_url", "image"]
|
|
124
|
+
]
|
|
125
|
+
if new_content: # Only add if there's still content
|
|
126
|
+
result.append({"role": msg["role"], "content": new_content})
|
|
127
|
+
elif msg["role"] == "assistant" and isinstance(msg["content"], list):
|
|
128
|
+
# Remove base64_image from tool_result items
|
|
129
|
+
new_content = []
|
|
130
|
+
for item in msg["content"]:
|
|
131
|
+
if item.get("type") == "tool_result" and "base64_image" in item:
|
|
132
|
+
# Create a copy without the base64_image
|
|
133
|
+
new_item = {k: v for k, v in item.items() if k != "base64_image"}
|
|
134
|
+
new_content.append(new_item)
|
|
135
|
+
else:
|
|
136
|
+
new_content.append(item)
|
|
137
|
+
result.append({"role": msg["role"], "content": new_content})
|
|
138
|
+
else:
|
|
139
|
+
# For other message types, keep as is
|
|
140
|
+
result.append(msg)
|
|
141
|
+
else:
|
|
142
|
+
result.append(msg)
|
|
110
143
|
|
|
111
144
|
return result
|
|
112
145
|
|
|
@@ -34,7 +34,7 @@ flush = _default_flush
|
|
|
34
34
|
is_telemetry_enabled = _default_is_telemetry_enabled
|
|
35
35
|
is_telemetry_globally_disabled = _default_is_telemetry_globally_disabled
|
|
36
36
|
|
|
37
|
-
logger = logging.getLogger("
|
|
37
|
+
logger = logging.getLogger("agent.telemetry")
|
|
38
38
|
|
|
39
39
|
try:
|
|
40
40
|
# Import from core telemetry
|
|
@@ -205,26 +205,6 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
|
|
|
205
205
|
self.logger.info(f" Coordinates: ({x}, {y})")
|
|
206
206
|
|
|
207
207
|
try:
|
|
208
|
-
# Take pre-action screenshot to get current dimensions
|
|
209
|
-
pre_screenshot = await self.computer.interface.screenshot()
|
|
210
|
-
pre_img = Image.open(io.BytesIO(pre_screenshot))
|
|
211
|
-
|
|
212
|
-
# Scale image to match screen dimensions if needed
|
|
213
|
-
if pre_img.size != (self.width, self.height):
|
|
214
|
-
self.logger.info(
|
|
215
|
-
f"Scaling image from {pre_img.size} to {self.width}x{self.height} to match screen dimensions"
|
|
216
|
-
)
|
|
217
|
-
if not isinstance(self.width, int) or not isinstance(self.height, int):
|
|
218
|
-
raise ToolError("Screen dimensions must be integers")
|
|
219
|
-
size = (int(self.width), int(self.height))
|
|
220
|
-
pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
|
|
221
|
-
# Save the scaled image back to bytes
|
|
222
|
-
buffer = io.BytesIO()
|
|
223
|
-
pre_img.save(buffer, format="PNG")
|
|
224
|
-
pre_screenshot = buffer.getvalue()
|
|
225
|
-
|
|
226
|
-
self.logger.info(f" Current dimensions: {pre_img.width}x{pre_img.height}")
|
|
227
|
-
|
|
228
208
|
# Perform the click action
|
|
229
209
|
if action == "left_click":
|
|
230
210
|
self.logger.info(f"Clicking at ({x}, {y})")
|
|
@@ -242,45 +222,14 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
|
|
|
242
222
|
# Wait briefly for any UI changes
|
|
243
223
|
await asyncio.sleep(0.5)
|
|
244
224
|
|
|
245
|
-
# Take and save post-action screenshot
|
|
246
|
-
post_screenshot = await self.computer.interface.screenshot()
|
|
247
|
-
post_img = Image.open(io.BytesIO(post_screenshot))
|
|
248
|
-
|
|
249
|
-
# Scale post-action image if needed
|
|
250
|
-
if post_img.size != (self.width, self.height):
|
|
251
|
-
self.logger.info(
|
|
252
|
-
f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
|
|
253
|
-
)
|
|
254
|
-
post_img = post_img.resize(
|
|
255
|
-
(self.width, self.height), Image.Resampling.LANCZOS
|
|
256
|
-
)
|
|
257
|
-
buffer = io.BytesIO()
|
|
258
|
-
post_img.save(buffer, format="PNG")
|
|
259
|
-
post_screenshot = buffer.getvalue()
|
|
260
|
-
|
|
261
225
|
return ToolResult(
|
|
262
226
|
output=f"Performed {action} at ({x}, {y})",
|
|
263
|
-
base64_image=base64.b64encode(post_screenshot).decode(),
|
|
264
227
|
)
|
|
265
228
|
except Exception as e:
|
|
266
229
|
self.logger.error(f"Error during {action} action: {str(e)}")
|
|
267
230
|
raise ToolError(f"Failed to perform {action}: {str(e)}")
|
|
268
231
|
else:
|
|
269
232
|
try:
|
|
270
|
-
# Take pre-action screenshot
|
|
271
|
-
pre_screenshot = await self.computer.interface.screenshot()
|
|
272
|
-
pre_img = Image.open(io.BytesIO(pre_screenshot))
|
|
273
|
-
|
|
274
|
-
# Scale image if needed
|
|
275
|
-
if pre_img.size != (self.width, self.height):
|
|
276
|
-
self.logger.info(
|
|
277
|
-
f"Scaling image from {pre_img.size} to {self.width}x{self.height}"
|
|
278
|
-
)
|
|
279
|
-
if not isinstance(self.width, int) or not isinstance(self.height, int):
|
|
280
|
-
raise ToolError("Screen dimensions must be integers")
|
|
281
|
-
size = (int(self.width), int(self.height))
|
|
282
|
-
pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
|
|
283
|
-
|
|
284
233
|
# Perform the click action
|
|
285
234
|
if action == "left_click":
|
|
286
235
|
self.logger.info("Performing left click at current position")
|
|
@@ -295,25 +244,8 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
|
|
|
295
244
|
# Wait briefly for any UI changes
|
|
296
245
|
await asyncio.sleep(0.5)
|
|
297
246
|
|
|
298
|
-
# Take post-action screenshot
|
|
299
|
-
post_screenshot = await self.computer.interface.screenshot()
|
|
300
|
-
post_img = Image.open(io.BytesIO(post_screenshot))
|
|
301
|
-
|
|
302
|
-
# Scale post-action image if needed
|
|
303
|
-
if post_img.size != (self.width, self.height):
|
|
304
|
-
self.logger.info(
|
|
305
|
-
f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
|
|
306
|
-
)
|
|
307
|
-
post_img = post_img.resize(
|
|
308
|
-
(self.width, self.height), Image.Resampling.LANCZOS
|
|
309
|
-
)
|
|
310
|
-
buffer = io.BytesIO()
|
|
311
|
-
post_img.save(buffer, format="PNG")
|
|
312
|
-
post_screenshot = buffer.getvalue()
|
|
313
|
-
|
|
314
247
|
return ToolResult(
|
|
315
248
|
output=f"Performed {action} at current position",
|
|
316
|
-
base64_image=base64.b64encode(post_screenshot).decode(),
|
|
317
249
|
)
|
|
318
250
|
except Exception as e:
|
|
319
251
|
self.logger.error(f"Error during {action} action: {str(e)}")
|
|
@@ -328,20 +260,6 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
|
|
|
328
260
|
raise ToolError(f"{text} must be a string")
|
|
329
261
|
|
|
330
262
|
try:
|
|
331
|
-
# Take pre-action screenshot
|
|
332
|
-
pre_screenshot = await self.computer.interface.screenshot()
|
|
333
|
-
pre_img = Image.open(io.BytesIO(pre_screenshot))
|
|
334
|
-
|
|
335
|
-
# Scale image if needed
|
|
336
|
-
if pre_img.size != (self.width, self.height):
|
|
337
|
-
self.logger.info(
|
|
338
|
-
f"Scaling image from {pre_img.size} to {self.width}x{self.height}"
|
|
339
|
-
)
|
|
340
|
-
if not isinstance(self.width, int) or not isinstance(self.height, int):
|
|
341
|
-
raise ToolError("Screen dimensions must be integers")
|
|
342
|
-
size = (int(self.width), int(self.height))
|
|
343
|
-
pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
|
|
344
|
-
|
|
345
263
|
if action == "key":
|
|
346
264
|
# Special handling for page up/down on macOS
|
|
347
265
|
if text.lower() in ["pagedown", "page_down", "page down"]:
|
|
@@ -378,25 +296,8 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
|
|
|
378
296
|
# Wait briefly for UI changes
|
|
379
297
|
await asyncio.sleep(0.5)
|
|
380
298
|
|
|
381
|
-
# Take post-action screenshot
|
|
382
|
-
post_screenshot = await self.computer.interface.screenshot()
|
|
383
|
-
post_img = Image.open(io.BytesIO(post_screenshot))
|
|
384
|
-
|
|
385
|
-
# Scale post-action image if needed
|
|
386
|
-
if post_img.size != (self.width, self.height):
|
|
387
|
-
self.logger.info(
|
|
388
|
-
f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
|
|
389
|
-
)
|
|
390
|
-
post_img = post_img.resize(
|
|
391
|
-
(self.width, self.height), Image.Resampling.LANCZOS
|
|
392
|
-
)
|
|
393
|
-
buffer = io.BytesIO()
|
|
394
|
-
post_img.save(buffer, format="PNG")
|
|
395
|
-
post_screenshot = buffer.getvalue()
|
|
396
|
-
|
|
397
299
|
return ToolResult(
|
|
398
300
|
output=f"Pressed key: {output_text}",
|
|
399
|
-
base64_image=base64.b64encode(post_screenshot).decode(),
|
|
400
301
|
)
|
|
401
302
|
|
|
402
303
|
elif action == "type":
|
|
@@ -406,66 +307,13 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
|
|
|
406
307
|
# Wait briefly for UI changes
|
|
407
308
|
await asyncio.sleep(0.5)
|
|
408
309
|
|
|
409
|
-
# Take post-action screenshot
|
|
410
|
-
post_screenshot = await self.computer.interface.screenshot()
|
|
411
|
-
post_img = Image.open(io.BytesIO(post_screenshot))
|
|
412
|
-
|
|
413
|
-
# Scale post-action image if needed
|
|
414
|
-
if post_img.size != (self.width, self.height):
|
|
415
|
-
self.logger.info(
|
|
416
|
-
f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
|
|
417
|
-
)
|
|
418
|
-
post_img = post_img.resize(
|
|
419
|
-
(self.width, self.height), Image.Resampling.LANCZOS
|
|
420
|
-
)
|
|
421
|
-
buffer = io.BytesIO()
|
|
422
|
-
post_img.save(buffer, format="PNG")
|
|
423
|
-
post_screenshot = buffer.getvalue()
|
|
424
|
-
|
|
425
310
|
return ToolResult(
|
|
426
311
|
output=f"Typed text: {text}",
|
|
427
|
-
base64_image=base64.b64encode(post_screenshot).decode(),
|
|
428
312
|
)
|
|
429
313
|
except Exception as e:
|
|
430
314
|
self.logger.error(f"Error during {action} action: {str(e)}")
|
|
431
315
|
raise ToolError(f"Failed to perform {action}: {str(e)}")
|
|
432
316
|
|
|
433
|
-
elif action in ("screenshot", "cursor_position"):
|
|
434
|
-
if text is not None:
|
|
435
|
-
raise ToolError(f"text is not accepted for {action}")
|
|
436
|
-
if coordinate is not None:
|
|
437
|
-
raise ToolError(f"coordinate is not accepted for {action}")
|
|
438
|
-
|
|
439
|
-
try:
|
|
440
|
-
if action == "screenshot":
|
|
441
|
-
# Take screenshot
|
|
442
|
-
screenshot = await self.computer.interface.screenshot()
|
|
443
|
-
img = Image.open(io.BytesIO(screenshot))
|
|
444
|
-
|
|
445
|
-
# Scale image if needed
|
|
446
|
-
if img.size != (self.width, self.height):
|
|
447
|
-
self.logger.info(
|
|
448
|
-
f"Scaling image from {img.size} to {self.width}x{self.height}"
|
|
449
|
-
)
|
|
450
|
-
if not isinstance(self.width, int) or not isinstance(self.height, int):
|
|
451
|
-
raise ToolError("Screen dimensions must be integers")
|
|
452
|
-
size = (int(self.width), int(self.height))
|
|
453
|
-
img = img.resize(size, Image.Resampling.LANCZOS)
|
|
454
|
-
buffer = io.BytesIO()
|
|
455
|
-
img.save(buffer, format="PNG")
|
|
456
|
-
screenshot = buffer.getvalue()
|
|
457
|
-
|
|
458
|
-
return ToolResult(base64_image=base64.b64encode(screenshot).decode())
|
|
459
|
-
|
|
460
|
-
elif action == "cursor_position":
|
|
461
|
-
pos = await self.computer.interface.get_cursor_position()
|
|
462
|
-
x, y = pos # Unpack the tuple
|
|
463
|
-
return ToolResult(output=f"X={int(x)},Y={int(y)}")
|
|
464
|
-
|
|
465
|
-
except Exception as e:
|
|
466
|
-
self.logger.error(f"Error during {action} action: {str(e)}")
|
|
467
|
-
raise ToolError(f"Failed to perform {action}: {str(e)}")
|
|
468
|
-
|
|
469
317
|
elif action == "scroll":
|
|
470
318
|
# Implement scroll action
|
|
471
319
|
direction = kwargs.get("direction", "down")
|
|
@@ -487,28 +335,20 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
|
|
|
487
335
|
# Wait briefly for UI changes
|
|
488
336
|
await asyncio.sleep(0.5)
|
|
489
337
|
|
|
490
|
-
# Take post-action screenshot
|
|
491
|
-
post_screenshot = await self.computer.interface.screenshot()
|
|
492
|
-
post_img = Image.open(io.BytesIO(post_screenshot))
|
|
493
|
-
|
|
494
|
-
# Scale post-action image if needed
|
|
495
|
-
if post_img.size != (self.width, self.height):
|
|
496
|
-
self.logger.info(
|
|
497
|
-
f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
|
|
498
|
-
)
|
|
499
|
-
post_img = post_img.resize((self.width, self.height), Image.Resampling.LANCZOS)
|
|
500
|
-
buffer = io.BytesIO()
|
|
501
|
-
post_img.save(buffer, format="PNG")
|
|
502
|
-
post_screenshot = buffer.getvalue()
|
|
503
|
-
|
|
504
338
|
return ToolResult(
|
|
505
339
|
output=f"Scrolled {direction} by {amount} steps",
|
|
506
|
-
base64_image=base64.b64encode(post_screenshot).decode(),
|
|
507
340
|
)
|
|
508
341
|
except Exception as e:
|
|
509
342
|
self.logger.error(f"Error during scroll action: {str(e)}")
|
|
510
343
|
raise ToolError(f"Failed to perform scroll: {str(e)}")
|
|
511
344
|
|
|
345
|
+
elif action == "screenshot":
|
|
346
|
+
# Take screenshot
|
|
347
|
+
return await self.screenshot()
|
|
348
|
+
elif action == "cursor_position":
|
|
349
|
+
pos = await self.computer.interface.get_cursor_position()
|
|
350
|
+
x, y = pos # Unpack the tuple
|
|
351
|
+
return ToolResult(output=f"X={int(x)},Y={int(y)}")
|
|
512
352
|
raise ToolError(f"Invalid action: {action}")
|
|
513
353
|
|
|
514
354
|
async def screenshot(self):
|
|
@@ -26,10 +26,8 @@ from .api_handler import OmniAPIHandler
|
|
|
26
26
|
from .tools.manager import ToolManager
|
|
27
27
|
from .tools import ToolResult
|
|
28
28
|
|
|
29
|
-
logging.basicConfig(level=logging.INFO)
|
|
30
29
|
logger = logging.getLogger(__name__)
|
|
31
30
|
|
|
32
|
-
|
|
33
31
|
def extract_data(input_string: str, data_type: str) -> str:
|
|
34
32
|
"""Extract content from code blocks."""
|
|
35
33
|
pattern = f"```{data_type}" + r"(.*?)(```|$)"
|
|
@@ -61,9 +61,6 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
|
|
|
61
61
|
computer: Computer # The CUA Computer instance
|
|
62
62
|
logger = logging.getLogger(__name__)
|
|
63
63
|
|
|
64
|
-
_screenshot_delay = 1.0 # macOS is generally faster than X11
|
|
65
|
-
_scaling_enabled = True
|
|
66
|
-
|
|
67
64
|
def __init__(self, computer: Computer):
|
|
68
65
|
"""Initialize the computer tool.
|
|
69
66
|
|
|
@@ -185,26 +182,23 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
|
|
|
185
182
|
raise ToolError(f"Failed to execute {type}: {str(e)}")
|
|
186
183
|
|
|
187
184
|
async def handle_click(self, button: str, x: int, y: int) -> ToolResult:
|
|
188
|
-
"""Handle
|
|
185
|
+
"""Handle mouse clicks."""
|
|
189
186
|
try:
|
|
190
|
-
# Perform
|
|
187
|
+
# Perform the click based on button type
|
|
191
188
|
if button == "left":
|
|
192
189
|
await self.computer.interface.left_click(x, y)
|
|
193
190
|
elif button == "right":
|
|
194
191
|
await self.computer.interface.right_click(x, y)
|
|
195
192
|
elif button == "double":
|
|
196
193
|
await self.computer.interface.double_click(x, y)
|
|
194
|
+
else:
|
|
195
|
+
raise ToolError(f"Unsupported button type: {button}")
|
|
197
196
|
|
|
198
|
-
# Wait for UI to update
|
|
199
|
-
await asyncio.sleep(0.
|
|
200
|
-
|
|
201
|
-
# Take screenshot after action
|
|
202
|
-
screenshot = await self.computer.interface.screenshot()
|
|
203
|
-
base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
|
|
197
|
+
# Wait briefly for UI to update
|
|
198
|
+
await asyncio.sleep(0.3)
|
|
204
199
|
|
|
205
200
|
return ToolResult(
|
|
206
201
|
output=f"Performed {button} click at ({x}, {y})",
|
|
207
|
-
base64_image=base64_screenshot,
|
|
208
202
|
)
|
|
209
203
|
except Exception as e:
|
|
210
204
|
self.logger.error(f"Error in handle_click: {str(e)}")
|
|
@@ -218,11 +212,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
|
|
|
218
212
|
|
|
219
213
|
await asyncio.sleep(0.3)
|
|
220
214
|
|
|
221
|
-
|
|
222
|
-
screenshot = await self.computer.interface.screenshot()
|
|
223
|
-
base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
|
|
224
|
-
|
|
225
|
-
return ToolResult(output=f"Typed: {text}", base64_image=base64_screenshot)
|
|
215
|
+
return ToolResult(output=f"Typed: {text}")
|
|
226
216
|
except Exception as e:
|
|
227
217
|
self.logger.error(f"Error in handle_typing: {str(e)}")
|
|
228
218
|
raise ToolError(f"Failed to type '{text}': {str(e)}")
|
|
@@ -254,11 +244,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
|
|
|
254
244
|
# Wait briefly
|
|
255
245
|
await asyncio.sleep(0.3)
|
|
256
246
|
|
|
257
|
-
|
|
258
|
-
screenshot = await self.computer.interface.screenshot()
|
|
259
|
-
base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
|
|
260
|
-
|
|
261
|
-
return ToolResult(output=f"Pressed key: {key}", base64_image=base64_screenshot)
|
|
247
|
+
return ToolResult(output=f"Pressed key: {key}")
|
|
262
248
|
except Exception as e:
|
|
263
249
|
self.logger.error(f"Error in handle_key: {str(e)}")
|
|
264
250
|
raise ToolError(f"Failed to press key '{key}': {str(e)}")
|
|
@@ -272,11 +258,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
|
|
|
272
258
|
# Wait briefly
|
|
273
259
|
await asyncio.sleep(0.2)
|
|
274
260
|
|
|
275
|
-
|
|
276
|
-
screenshot = await self.computer.interface.screenshot()
|
|
277
|
-
base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
|
|
278
|
-
|
|
279
|
-
return ToolResult(output=f"Moved cursor to ({x}, {y})", base64_image=base64_screenshot)
|
|
261
|
+
return ToolResult(output=f"Moved cursor to ({x}, {y})")
|
|
280
262
|
except Exception as e:
|
|
281
263
|
self.logger.error(f"Error in handle_mouse_move: {str(e)}")
|
|
282
264
|
raise ToolError(f"Failed to move cursor to ({x}, {y}): {str(e)}")
|
|
@@ -296,14 +278,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
|
|
|
296
278
|
# Wait for UI to update
|
|
297
279
|
await asyncio.sleep(0.5)
|
|
298
280
|
|
|
299
|
-
|
|
300
|
-
screenshot = await self.computer.interface.screenshot()
|
|
301
|
-
base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
|
|
302
|
-
|
|
303
|
-
return ToolResult(
|
|
304
|
-
output=f"Scrolled at ({x}, {y}) with delta ({scroll_x}, {scroll_y})",
|
|
305
|
-
base64_image=base64_screenshot,
|
|
306
|
-
)
|
|
281
|
+
return ToolResult(output=f"Scrolled at ({x}, {y}) by ({scroll_x}, {scroll_y})")
|
|
307
282
|
except Exception as e:
|
|
308
283
|
self.logger.error(f"Error in handle_scroll: {str(e)}")
|
|
309
284
|
raise ToolError(f"Failed to scroll at ({x}, {y}): {str(e)}")
|
|
@@ -331,13 +306,8 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
|
|
|
331
306
|
# Wait for UI to update
|
|
332
307
|
await asyncio.sleep(0.5)
|
|
333
308
|
|
|
334
|
-
# Take screenshot after action
|
|
335
|
-
screenshot = await self.computer.interface.screenshot()
|
|
336
|
-
base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
|
|
337
|
-
|
|
338
309
|
return ToolResult(
|
|
339
310
|
output=f"Dragged from ({path[0]['x']}, {path[0]['y']}) to ({path[-1]['x']}, {path[-1]['y']})",
|
|
340
|
-
base64_image=base64_screenshot,
|
|
341
311
|
)
|
|
342
312
|
except Exception as e:
|
|
343
313
|
self.logger.error(f"Error in handle_drag: {str(e)}")
|
|
@@ -25,10 +25,8 @@ from .prompts import COMPUTER_USE, SYSTEM_PROMPT, MAC_SPECIFIC_NOTES
|
|
|
25
25
|
from .clients.oaicompat import OAICompatClient
|
|
26
26
|
from .clients.mlxvlm import MLXVLMUITarsClient
|
|
27
27
|
|
|
28
|
-
logging.basicConfig(level=logging.INFO)
|
|
29
28
|
logger = logging.getLogger(__name__)
|
|
30
29
|
|
|
31
|
-
|
|
32
30
|
class UITARSLoop(BaseLoop):
|
|
33
31
|
"""UI-TARS-specific implementation of the agent loop.
|
|
34
32
|
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Main entry point for agent.ui module.
|
|
3
|
+
|
|
4
|
+
This allows running the agent UI with:
|
|
5
|
+
python -m agent.ui
|
|
6
|
+
|
|
7
|
+
Instead of:
|
|
8
|
+
python -m agent.ui.gradio.app
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from .gradio.app import create_gradio_ui
|
|
12
|
+
|
|
13
|
+
if __name__ == "__main__":
|
|
14
|
+
app = create_gradio_ui()
|
|
15
|
+
app.launch(share=False, inbrowser=True)
|
|
@@ -132,11 +132,19 @@ class GradioChatScreenshotHandler(DefaultCallbackHandler):
|
|
|
132
132
|
# Detect if current device is MacOS
|
|
133
133
|
is_mac = platform.system().lower() == "darwin"
|
|
134
134
|
|
|
135
|
+
# Detect if lume is available (host device is macOS)
|
|
136
|
+
is_lume_available = is_mac or (os.environ.get("PYLUME_HOST", "localhost") != "localhost")
|
|
137
|
+
|
|
138
|
+
print("PYLUME_HOST: ", os.environ.get("PYLUME_HOST", "localhost"))
|
|
139
|
+
print("is_mac: ", is_mac)
|
|
140
|
+
print("Lume available: ", is_lume_available)
|
|
141
|
+
|
|
135
142
|
# Map model names to specific provider model names
|
|
136
143
|
MODEL_MAPPINGS = {
|
|
137
144
|
"openai": {
|
|
138
145
|
# Default to operator CUA model
|
|
139
146
|
"default": "computer-use-preview",
|
|
147
|
+
"OpenAI: Computer-Use Preview": "computer-use-preview",
|
|
140
148
|
# Map standard OpenAI model names to CUA-specific model names
|
|
141
149
|
"gpt-4-turbo": "computer-use-preview",
|
|
142
150
|
"gpt-4o": "computer-use-preview",
|
|
@@ -147,9 +155,17 @@ MODEL_MAPPINGS = {
|
|
|
147
155
|
"anthropic": {
|
|
148
156
|
# Default to newest model
|
|
149
157
|
"default": "claude-3-7-sonnet-20250219",
|
|
158
|
+
# New Claude 4 models
|
|
159
|
+
"Anthropic: Claude 4 Opus (20250514)": "claude-opus-4-20250514",
|
|
160
|
+
"Anthropic: Claude 4 Sonnet (20250514)": "claude-sonnet-4-20250514",
|
|
161
|
+
"claude-opus-4-20250514": "claude-opus-4-20250514",
|
|
162
|
+
"claude-sonnet-4-20250514": "claude-sonnet-4-20250514",
|
|
163
|
+
|
|
150
164
|
# Specific Claude models for CUA
|
|
151
|
-
"
|
|
165
|
+
"Anthropic: Claude 3.7 Sonnet (20250219)": "claude-3-7-sonnet-20250219",
|
|
166
|
+
"Anthropic: Claude 3.5 Sonnet (20240620)": "claude-3-5-sonnet-20240620",
|
|
152
167
|
"claude-3-7-sonnet-20250219": "claude-3-7-sonnet-20250219",
|
|
168
|
+
"claude-3-5-sonnet-20240620": "claude-3-5-sonnet-20240620",
|
|
153
169
|
# Map standard model names to CUA-specific model names
|
|
154
170
|
"claude-3-opus": "claude-3-7-sonnet-20250219",
|
|
155
171
|
"claude-3-sonnet": "claude-3-5-sonnet-20240620",
|
|
@@ -209,12 +225,12 @@ def get_provider_and_model(model_name: str, loop_provider: str) -> tuple:
|
|
|
209
225
|
if agent_loop == AgentLoop.OPENAI:
|
|
210
226
|
provider = LLMProvider.OPENAI
|
|
211
227
|
model_name_to_use = MODEL_MAPPINGS["openai"].get(
|
|
212
|
-
model_name
|
|
228
|
+
model_name, MODEL_MAPPINGS["openai"]["default"]
|
|
213
229
|
)
|
|
214
230
|
elif agent_loop == AgentLoop.ANTHROPIC:
|
|
215
231
|
provider = LLMProvider.ANTHROPIC
|
|
216
232
|
model_name_to_use = MODEL_MAPPINGS["anthropic"].get(
|
|
217
|
-
model_name
|
|
233
|
+
model_name, MODEL_MAPPINGS["anthropic"]["default"]
|
|
218
234
|
)
|
|
219
235
|
elif agent_loop == AgentLoop.OMNI:
|
|
220
236
|
# Determine provider and clean model name based on the full string from UI
|
|
@@ -234,33 +250,11 @@ def get_provider_and_model(model_name: str, loop_provider: str) -> tuple:
|
|
|
234
250
|
cleaned_model_name = model_name.split("OMNI: Ollama ", 1)[1]
|
|
235
251
|
elif model_name.startswith("OMNI: Claude "):
|
|
236
252
|
provider = LLMProvider.ANTHROPIC
|
|
237
|
-
# Extract the canonical model name based on the UI string
|
|
238
|
-
# e.g., "OMNI: Claude 3.7 Sonnet (20250219)" -> "3.7 Sonnet" and "20250219"
|
|
239
|
-
parts = model_name.split(" (")
|
|
240
|
-
model_key_part = parts[0].replace("OMNI: Claude ", "")
|
|
241
|
-
date_part = parts[1].replace(")", "") if len(parts) > 1 else ""
|
|
242
|
-
|
|
243
|
-
# Normalize the extracted key part for comparison
|
|
244
|
-
# "3.7 Sonnet" -> "37sonnet"
|
|
245
|
-
model_key_part_norm = model_key_part.lower().replace(".", "").replace(" ", "")
|
|
246
253
|
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
# "claude-3-7-sonnet-20250219" -> "claude37sonnet20250219"
|
|
252
|
-
key_anthropic_norm = key_anthropic.lower().replace("-", "")
|
|
253
|
-
|
|
254
|
-
# Check if the normalized canonical key starts with "claude" + normalized extracted part
|
|
255
|
-
# AND contains the date part.
|
|
256
|
-
if (
|
|
257
|
-
key_anthropic_norm.startswith("claude" + model_key_part_norm)
|
|
258
|
-
and date_part in key_anthropic_norm
|
|
259
|
-
):
|
|
260
|
-
cleaned_model_name = (
|
|
261
|
-
val_anthropic # Use the canonical name like "claude-3-7-sonnet-20250219"
|
|
262
|
-
)
|
|
263
|
-
break
|
|
254
|
+
model_name = model_name.replace("OMNI: ", "Anthropic: ")
|
|
255
|
+
cleaned_model_name = MODEL_MAPPINGS["anthropic"].get(
|
|
256
|
+
model_name, MODEL_MAPPINGS["anthropic"]["default"]
|
|
257
|
+
)
|
|
264
258
|
elif model_name.startswith("OMNI: OpenAI "):
|
|
265
259
|
provider = LLMProvider.OPENAI
|
|
266
260
|
# Extract the model part, e.g., "GPT-4o mini"
|
|
@@ -309,6 +303,8 @@ def get_provider_and_model(model_name: str, loop_provider: str) -> tuple:
|
|
|
309
303
|
model_name_to_use = MODEL_MAPPINGS["openai"]["default"]
|
|
310
304
|
agent_loop = AgentLoop.OPENAI
|
|
311
305
|
|
|
306
|
+
print(f"Mapping {model_name} and {loop_provider} to {provider}, {model_name_to_use}, {agent_loop}")
|
|
307
|
+
|
|
312
308
|
return provider, model_name_to_use, agent_loop
|
|
313
309
|
|
|
314
310
|
|
|
@@ -453,6 +449,9 @@ def create_gradio_ui(
|
|
|
453
449
|
# Always show models regardless of API key availability
|
|
454
450
|
openai_models = ["OpenAI: Computer-Use Preview"]
|
|
455
451
|
anthropic_models = [
|
|
452
|
+
"Anthropic: Claude 4 Opus (20250514)",
|
|
453
|
+
"Anthropic: Claude 4 Sonnet (20250514)",
|
|
454
|
+
|
|
456
455
|
"Anthropic: Claude 3.7 Sonnet (20250219)",
|
|
457
456
|
"Anthropic: Claude 3.5 Sonnet (20240620)",
|
|
458
457
|
]
|
|
@@ -460,6 +459,8 @@ def create_gradio_ui(
|
|
|
460
459
|
"OMNI: OpenAI GPT-4o",
|
|
461
460
|
"OMNI: OpenAI GPT-4o mini",
|
|
462
461
|
"OMNI: OpenAI GPT-4.5-preview",
|
|
462
|
+
"OMNI: Claude 4 Opus (20250514)",
|
|
463
|
+
"OMNI: Claude 4 Sonnet (20250514)",
|
|
463
464
|
"OMNI: Claude 3.7 Sonnet (20250219)",
|
|
464
465
|
"OMNI: Claude 3.5 Sonnet (20240620)"
|
|
465
466
|
]
|
|
@@ -729,20 +730,25 @@ if __name__ == "__main__":
|
|
|
729
730
|
with gr.Accordion("Computer Configuration", open=True):
|
|
730
731
|
# Computer configuration options
|
|
731
732
|
computer_os = gr.Radio(
|
|
732
|
-
choices=["macos", "linux"],
|
|
733
|
+
choices=["macos", "linux", "windows"],
|
|
733
734
|
label="Operating System",
|
|
734
735
|
value="macos",
|
|
735
736
|
info="Select the operating system for the computer",
|
|
736
737
|
)
|
|
737
738
|
|
|
738
|
-
|
|
739
|
+
is_windows = platform.system().lower() == "windows"
|
|
739
740
|
is_mac = platform.system().lower() == "darwin"
|
|
740
741
|
|
|
742
|
+
providers = ["cloud"]
|
|
743
|
+
if is_lume_available:
|
|
744
|
+
providers += ["lume"]
|
|
745
|
+
if is_windows:
|
|
746
|
+
providers += ["winsandbox"]
|
|
747
|
+
|
|
741
748
|
computer_provider = gr.Radio(
|
|
742
|
-
choices=
|
|
749
|
+
choices=providers,
|
|
743
750
|
label="Provider",
|
|
744
751
|
value="lume" if is_mac else "cloud",
|
|
745
|
-
visible=is_mac,
|
|
746
752
|
info="Select the computer provider",
|
|
747
753
|
)
|
|
748
754
|
|
|
@@ -6,7 +6,7 @@ build-backend = "pdm.backend"
|
|
|
6
6
|
|
|
7
7
|
[project]
|
|
8
8
|
name = "cua-agent"
|
|
9
|
-
version = "0.2.10"
|
|
9
|
+
version = "0.2.12"
|
|
10
10
|
description = "CUA (Computer Use) Agent for AI-driven computer interaction"
|
|
11
11
|
readme = "README.md"
|
|
12
12
|
authors = [
|
|
@@ -39,7 +39,9 @@ openai = [
|
|
|
39
39
|
uitars = [
|
|
40
40
|
"httpx>=0.27.0,<0.29.0",
|
|
41
41
|
]
|
|
42
|
-
uitars-mlx = [
|
|
42
|
+
uitars-mlx = [
|
|
43
|
+
"mlx-vlm>=0.1.27; sys_platform == 'darwin'",
|
|
44
|
+
]
|
|
43
45
|
ui = [
|
|
44
46
|
"gradio>=5.23.3,<6.0.0",
|
|
45
47
|
"python-dotenv>=1.0.1,<2.0.0",
|
|
@@ -86,6 +88,7 @@ all = [
|
|
|
86
88
|
"ollama>=0.4.7,<0.5.0",
|
|
87
89
|
"gradio>=5.23.3,<6.0.0",
|
|
88
90
|
"python-dotenv>=1.0.1,<2.0.0",
|
|
91
|
+
"mlx-vlm>=0.1.27; sys_platform == 'darwin'",
|
|
89
92
|
]
|
|
90
93
|
|
|
91
94
|
[tool.pdm]
|
|
@@ -109,7 +112,7 @@ target-version = [
|
|
|
109
112
|
|
|
110
113
|
[tool.ruff]
|
|
111
114
|
line-length = 100
|
|
112
|
-
target-version = "0.2.
|
|
115
|
+
target-version = "0.2.12"
|
|
113
116
|
select = [
|
|
114
117
|
"E",
|
|
115
118
|
"F",
|
|
@@ -123,7 +126,7 @@ docstring-code-format = true
|
|
|
123
126
|
|
|
124
127
|
[tool.mypy]
|
|
125
128
|
strict = true
|
|
126
|
-
python_version = "0.2.
|
|
129
|
+
python_version = "0.2.12"
|
|
127
130
|
ignore_missing_imports = true
|
|
128
131
|
disallow_untyped_defs = true
|
|
129
132
|
check_untyped_defs = true
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|