cua-agent 0.2.9__tar.gz → 0.2.11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- {cua_agent-0.2.9 → cua_agent-0.2.11}/PKG-INFO +1 -1
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/core/base.py +1 -2
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/core/callbacks.py +2 -4
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/core/messages.py +40 -8
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/anthropic/tools/computer.py +7 -167
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/anthropic/utils.py +1 -2
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/openai/tools/computer.py +10 -40
- cua_agent-0.2.11/agent/ui/__main__.py +15 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/ui/gradio/app.py +33 -35
- {cua_agent-0.2.9 → cua_agent-0.2.11}/pyproject.toml +3 -3
- {cua_agent-0.2.9 → cua_agent-0.2.11}/README.md +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/__init__.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/core/__init__.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/core/agent.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/core/experiment.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/core/factory.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/core/provider_config.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/core/telemetry.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/core/tools/__init__.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/core/tools/base.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/core/tools/bash.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/core/tools/collection.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/core/tools/computer.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/core/tools/edit.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/core/tools/manager.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/core/tools.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/core/types.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/core/visualization.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/__init__.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/anthropic/__init__.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/anthropic/api/client.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/anthropic/api/logging.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/anthropic/api_handler.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/anthropic/callbacks/__init__.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/anthropic/callbacks/manager.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/anthropic/loop.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/anthropic/prompts.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/anthropic/response_handler.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/anthropic/tools/__init__.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/anthropic/tools/base.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/anthropic/tools/bash.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/anthropic/tools/collection.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/anthropic/tools/edit.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/anthropic/tools/manager.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/anthropic/tools/run.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/anthropic/types.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/omni/__init__.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/omni/api_handler.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/omni/clients/anthropic.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/omni/clients/base.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/omni/clients/oaicompat.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/omni/clients/ollama.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/omni/clients/openai.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/omni/clients/utils.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/omni/image_utils.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/omni/loop.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/omni/parser.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/omni/prompts.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/omni/tools/__init__.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/omni/tools/base.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/omni/tools/bash.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/omni/tools/computer.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/omni/tools/manager.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/omni/utils.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/openai/__init__.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/openai/api_handler.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/openai/loop.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/openai/response_handler.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/openai/tools/__init__.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/openai/tools/base.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/openai/tools/manager.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/openai/types.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/openai/utils.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/uitars/__init__.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/uitars/clients/base.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/uitars/clients/mlxvlm.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/uitars/clients/oaicompat.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/uitars/loop.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/uitars/prompts.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/uitars/tools/__init__.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/uitars/tools/computer.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/uitars/tools/manager.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/uitars/utils.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/telemetry.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/ui/__init__.py +0 -0
- {cua_agent-0.2.9 → cua_agent-0.2.11}/agent/ui/gradio/__init__.py +0 -0
|
@@ -5,7 +5,6 @@ import asyncio
|
|
|
5
5
|
from abc import ABC, abstractmethod
|
|
6
6
|
from typing import Any, AsyncGenerator, Dict, List, Optional
|
|
7
7
|
|
|
8
|
-
from agent.providers.omni.parser import ParseResult
|
|
9
8
|
from computer import Computer
|
|
10
9
|
from .messages import StandardMessageManager, ImageRetentionConfig
|
|
11
10
|
from .types import AgentResponse
|
|
@@ -207,7 +206,7 @@ class BaseLoop(ABC):
|
|
|
207
206
|
# EVENT HOOKS / CALLBACKS
|
|
208
207
|
###########################################
|
|
209
208
|
|
|
210
|
-
async def handle_screenshot(self, screenshot_base64: str, action_type: str = "", parsed_screen: Optional[ParseResult] = None) -> None:
|
|
209
|
+
async def handle_screenshot(self, screenshot_base64: str, action_type: str = "", parsed_screen: Optional[dict] = None) -> None:
|
|
211
210
|
"""Process a screenshot through callback managers
|
|
212
211
|
|
|
213
212
|
Args:
|
|
@@ -6,8 +6,6 @@ from abc import ABC, abstractmethod
|
|
|
6
6
|
from datetime import datetime
|
|
7
7
|
from typing import Any, Dict, List, Optional, Protocol
|
|
8
8
|
|
|
9
|
-
from agent.providers.omni.parser import ParseResult
|
|
10
|
-
|
|
11
9
|
logger = logging.getLogger(__name__)
|
|
12
10
|
|
|
13
11
|
class ContentCallback(Protocol):
|
|
@@ -117,7 +115,7 @@ class CallbackManager:
|
|
|
117
115
|
for handler in self.handlers:
|
|
118
116
|
await handler.on_error(error, **kwargs)
|
|
119
117
|
|
|
120
|
-
async def on_screenshot(self, screenshot_base64: str, action_type: str = "", parsed_screen: Optional[ParseResult] = None) -> None:
|
|
118
|
+
async def on_screenshot(self, screenshot_base64: str, action_type: str = "", parsed_screen: Optional[dict] = None) -> None:
|
|
121
119
|
"""Called when a screenshot is taken.
|
|
122
120
|
|
|
123
121
|
Args:
|
|
@@ -166,7 +164,7 @@ class CallbackHandler(ABC):
|
|
|
166
164
|
pass
|
|
167
165
|
|
|
168
166
|
@abstractmethod
|
|
169
|
-
async def on_screenshot(self, screenshot_base64: str, action_type: str = "", parsed_screen: Optional[ParseResult] = None) -> None:
|
|
167
|
+
async def on_screenshot(self, screenshot_base64: str, action_type: str = "", parsed_screen: Optional[dict] = None) -> None:
|
|
170
168
|
"""Called when a screenshot is taken.
|
|
171
169
|
|
|
172
170
|
Args:
|
|
@@ -5,7 +5,6 @@ import json
|
|
|
5
5
|
from typing import Any, Dict, List, Optional, Union, Tuple
|
|
6
6
|
from dataclasses import dataclass
|
|
7
7
|
import re
|
|
8
|
-
from ..providers.omni.parser import ParseResult
|
|
9
8
|
|
|
10
9
|
logger = logging.getLogger(__name__)
|
|
11
10
|
|
|
@@ -82,16 +81,27 @@ class StandardMessageManager:
|
|
|
82
81
|
if not self.config.num_images_to_keep:
|
|
83
82
|
return messages
|
|
84
83
|
|
|
85
|
-
# Find user messages
|
|
84
|
+
# Find messages with images (both user messages and tool call outputs)
|
|
86
85
|
image_messages = []
|
|
87
86
|
for msg in messages:
|
|
87
|
+
has_image = False
|
|
88
|
+
|
|
89
|
+
# Check user messages with images
|
|
88
90
|
if msg["role"] == "user" and isinstance(msg["content"], list):
|
|
89
91
|
has_image = any(
|
|
90
92
|
item.get("type") == "image_url" or item.get("type") == "image"
|
|
91
93
|
for item in msg["content"]
|
|
92
94
|
)
|
|
93
|
-
|
|
94
|
-
|
|
95
|
+
|
|
96
|
+
# Check assistant messages with tool calls that have images
|
|
97
|
+
elif msg["role"] == "assistant" and isinstance(msg["content"], list):
|
|
98
|
+
for item in msg["content"]:
|
|
99
|
+
if item.get("type") == "tool_result" and "base64_image" in item:
|
|
100
|
+
has_image = True
|
|
101
|
+
break
|
|
102
|
+
|
|
103
|
+
if has_image:
|
|
104
|
+
image_messages.append(msg)
|
|
95
105
|
|
|
96
106
|
# If we don't have more images than the limit, return all messages
|
|
97
107
|
if len(image_messages) <= self.config.num_images_to_keep:
|
|
@@ -101,13 +111,35 @@ class StandardMessageManager:
|
|
|
101
111
|
images_to_keep = image_messages[-self.config.num_images_to_keep :]
|
|
102
112
|
images_to_remove = image_messages[: -self.config.num_images_to_keep]
|
|
103
113
|
|
|
104
|
-
# Create a new message list
|
|
114
|
+
# Create a new message list, removing images from older messages
|
|
105
115
|
result = []
|
|
106
116
|
for msg in messages:
|
|
107
117
|
if msg in images_to_remove:
|
|
108
|
-
#
|
|
109
|
-
|
|
110
|
-
|
|
118
|
+
# Remove images from this message but keep the text content
|
|
119
|
+
if msg["role"] == "user" and isinstance(msg["content"], list):
|
|
120
|
+
# Keep only text content, remove images
|
|
121
|
+
new_content = [
|
|
122
|
+
item for item in msg["content"]
|
|
123
|
+
if item.get("type") not in ["image_url", "image"]
|
|
124
|
+
]
|
|
125
|
+
if new_content: # Only add if there's still content
|
|
126
|
+
result.append({"role": msg["role"], "content": new_content})
|
|
127
|
+
elif msg["role"] == "assistant" and isinstance(msg["content"], list):
|
|
128
|
+
# Remove base64_image from tool_result items
|
|
129
|
+
new_content = []
|
|
130
|
+
for item in msg["content"]:
|
|
131
|
+
if item.get("type") == "tool_result" and "base64_image" in item:
|
|
132
|
+
# Create a copy without the base64_image
|
|
133
|
+
new_item = {k: v for k, v in item.items() if k != "base64_image"}
|
|
134
|
+
new_content.append(new_item)
|
|
135
|
+
else:
|
|
136
|
+
new_content.append(item)
|
|
137
|
+
result.append({"role": msg["role"], "content": new_content})
|
|
138
|
+
else:
|
|
139
|
+
# For other message types, keep as is
|
|
140
|
+
result.append(msg)
|
|
141
|
+
else:
|
|
142
|
+
result.append(msg)
|
|
111
143
|
|
|
112
144
|
return result
|
|
113
145
|
|
|
@@ -205,26 +205,6 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
|
|
|
205
205
|
self.logger.info(f" Coordinates: ({x}, {y})")
|
|
206
206
|
|
|
207
207
|
try:
|
|
208
|
-
# Take pre-action screenshot to get current dimensions
|
|
209
|
-
pre_screenshot = await self.computer.interface.screenshot()
|
|
210
|
-
pre_img = Image.open(io.BytesIO(pre_screenshot))
|
|
211
|
-
|
|
212
|
-
# Scale image to match screen dimensions if needed
|
|
213
|
-
if pre_img.size != (self.width, self.height):
|
|
214
|
-
self.logger.info(
|
|
215
|
-
f"Scaling image from {pre_img.size} to {self.width}x{self.height} to match screen dimensions"
|
|
216
|
-
)
|
|
217
|
-
if not isinstance(self.width, int) or not isinstance(self.height, int):
|
|
218
|
-
raise ToolError("Screen dimensions must be integers")
|
|
219
|
-
size = (int(self.width), int(self.height))
|
|
220
|
-
pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
|
|
221
|
-
# Save the scaled image back to bytes
|
|
222
|
-
buffer = io.BytesIO()
|
|
223
|
-
pre_img.save(buffer, format="PNG")
|
|
224
|
-
pre_screenshot = buffer.getvalue()
|
|
225
|
-
|
|
226
|
-
self.logger.info(f" Current dimensions: {pre_img.width}x{pre_img.height}")
|
|
227
|
-
|
|
228
208
|
# Perform the click action
|
|
229
209
|
if action == "left_click":
|
|
230
210
|
self.logger.info(f"Clicking at ({x}, {y})")
|
|
@@ -242,45 +222,14 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
|
|
|
242
222
|
# Wait briefly for any UI changes
|
|
243
223
|
await asyncio.sleep(0.5)
|
|
244
224
|
|
|
245
|
-
# Take and save post-action screenshot
|
|
246
|
-
post_screenshot = await self.computer.interface.screenshot()
|
|
247
|
-
post_img = Image.open(io.BytesIO(post_screenshot))
|
|
248
|
-
|
|
249
|
-
# Scale post-action image if needed
|
|
250
|
-
if post_img.size != (self.width, self.height):
|
|
251
|
-
self.logger.info(
|
|
252
|
-
f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
|
|
253
|
-
)
|
|
254
|
-
post_img = post_img.resize(
|
|
255
|
-
(self.width, self.height), Image.Resampling.LANCZOS
|
|
256
|
-
)
|
|
257
|
-
buffer = io.BytesIO()
|
|
258
|
-
post_img.save(buffer, format="PNG")
|
|
259
|
-
post_screenshot = buffer.getvalue()
|
|
260
|
-
|
|
261
225
|
return ToolResult(
|
|
262
226
|
output=f"Performed {action} at ({x}, {y})",
|
|
263
|
-
base64_image=base64.b64encode(post_screenshot).decode(),
|
|
264
227
|
)
|
|
265
228
|
except Exception as e:
|
|
266
229
|
self.logger.error(f"Error during {action} action: {str(e)}")
|
|
267
230
|
raise ToolError(f"Failed to perform {action}: {str(e)}")
|
|
268
231
|
else:
|
|
269
232
|
try:
|
|
270
|
-
# Take pre-action screenshot
|
|
271
|
-
pre_screenshot = await self.computer.interface.screenshot()
|
|
272
|
-
pre_img = Image.open(io.BytesIO(pre_screenshot))
|
|
273
|
-
|
|
274
|
-
# Scale image if needed
|
|
275
|
-
if pre_img.size != (self.width, self.height):
|
|
276
|
-
self.logger.info(
|
|
277
|
-
f"Scaling image from {pre_img.size} to {self.width}x{self.height}"
|
|
278
|
-
)
|
|
279
|
-
if not isinstance(self.width, int) or not isinstance(self.height, int):
|
|
280
|
-
raise ToolError("Screen dimensions must be integers")
|
|
281
|
-
size = (int(self.width), int(self.height))
|
|
282
|
-
pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
|
|
283
|
-
|
|
284
233
|
# Perform the click action
|
|
285
234
|
if action == "left_click":
|
|
286
235
|
self.logger.info("Performing left click at current position")
|
|
@@ -295,25 +244,8 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
|
|
|
295
244
|
# Wait briefly for any UI changes
|
|
296
245
|
await asyncio.sleep(0.5)
|
|
297
246
|
|
|
298
|
-
# Take post-action screenshot
|
|
299
|
-
post_screenshot = await self.computer.interface.screenshot()
|
|
300
|
-
post_img = Image.open(io.BytesIO(post_screenshot))
|
|
301
|
-
|
|
302
|
-
# Scale post-action image if needed
|
|
303
|
-
if post_img.size != (self.width, self.height):
|
|
304
|
-
self.logger.info(
|
|
305
|
-
f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
|
|
306
|
-
)
|
|
307
|
-
post_img = post_img.resize(
|
|
308
|
-
(self.width, self.height), Image.Resampling.LANCZOS
|
|
309
|
-
)
|
|
310
|
-
buffer = io.BytesIO()
|
|
311
|
-
post_img.save(buffer, format="PNG")
|
|
312
|
-
post_screenshot = buffer.getvalue()
|
|
313
|
-
|
|
314
247
|
return ToolResult(
|
|
315
248
|
output=f"Performed {action} at current position",
|
|
316
|
-
base64_image=base64.b64encode(post_screenshot).decode(),
|
|
317
249
|
)
|
|
318
250
|
except Exception as e:
|
|
319
251
|
self.logger.error(f"Error during {action} action: {str(e)}")
|
|
@@ -328,20 +260,6 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
|
|
|
328
260
|
raise ToolError(f"{text} must be a string")
|
|
329
261
|
|
|
330
262
|
try:
|
|
331
|
-
# Take pre-action screenshot
|
|
332
|
-
pre_screenshot = await self.computer.interface.screenshot()
|
|
333
|
-
pre_img = Image.open(io.BytesIO(pre_screenshot))
|
|
334
|
-
|
|
335
|
-
# Scale image if needed
|
|
336
|
-
if pre_img.size != (self.width, self.height):
|
|
337
|
-
self.logger.info(
|
|
338
|
-
f"Scaling image from {pre_img.size} to {self.width}x{self.height}"
|
|
339
|
-
)
|
|
340
|
-
if not isinstance(self.width, int) or not isinstance(self.height, int):
|
|
341
|
-
raise ToolError("Screen dimensions must be integers")
|
|
342
|
-
size = (int(self.width), int(self.height))
|
|
343
|
-
pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
|
|
344
|
-
|
|
345
263
|
if action == "key":
|
|
346
264
|
# Special handling for page up/down on macOS
|
|
347
265
|
if text.lower() in ["pagedown", "page_down", "page down"]:
|
|
@@ -378,25 +296,8 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
|
|
|
378
296
|
# Wait briefly for UI changes
|
|
379
297
|
await asyncio.sleep(0.5)
|
|
380
298
|
|
|
381
|
-
# Take post-action screenshot
|
|
382
|
-
post_screenshot = await self.computer.interface.screenshot()
|
|
383
|
-
post_img = Image.open(io.BytesIO(post_screenshot))
|
|
384
|
-
|
|
385
|
-
# Scale post-action image if needed
|
|
386
|
-
if post_img.size != (self.width, self.height):
|
|
387
|
-
self.logger.info(
|
|
388
|
-
f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
|
|
389
|
-
)
|
|
390
|
-
post_img = post_img.resize(
|
|
391
|
-
(self.width, self.height), Image.Resampling.LANCZOS
|
|
392
|
-
)
|
|
393
|
-
buffer = io.BytesIO()
|
|
394
|
-
post_img.save(buffer, format="PNG")
|
|
395
|
-
post_screenshot = buffer.getvalue()
|
|
396
|
-
|
|
397
299
|
return ToolResult(
|
|
398
300
|
output=f"Pressed key: {output_text}",
|
|
399
|
-
base64_image=base64.b64encode(post_screenshot).decode(),
|
|
400
301
|
)
|
|
401
302
|
|
|
402
303
|
elif action == "type":
|
|
@@ -406,66 +307,13 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
|
|
|
406
307
|
# Wait briefly for UI changes
|
|
407
308
|
await asyncio.sleep(0.5)
|
|
408
309
|
|
|
409
|
-
# Take post-action screenshot
|
|
410
|
-
post_screenshot = await self.computer.interface.screenshot()
|
|
411
|
-
post_img = Image.open(io.BytesIO(post_screenshot))
|
|
412
|
-
|
|
413
|
-
# Scale post-action image if needed
|
|
414
|
-
if post_img.size != (self.width, self.height):
|
|
415
|
-
self.logger.info(
|
|
416
|
-
f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
|
|
417
|
-
)
|
|
418
|
-
post_img = post_img.resize(
|
|
419
|
-
(self.width, self.height), Image.Resampling.LANCZOS
|
|
420
|
-
)
|
|
421
|
-
buffer = io.BytesIO()
|
|
422
|
-
post_img.save(buffer, format="PNG")
|
|
423
|
-
post_screenshot = buffer.getvalue()
|
|
424
|
-
|
|
425
310
|
return ToolResult(
|
|
426
311
|
output=f"Typed text: {text}",
|
|
427
|
-
base64_image=base64.b64encode(post_screenshot).decode(),
|
|
428
312
|
)
|
|
429
313
|
except Exception as e:
|
|
430
314
|
self.logger.error(f"Error during {action} action: {str(e)}")
|
|
431
315
|
raise ToolError(f"Failed to perform {action}: {str(e)}")
|
|
432
316
|
|
|
433
|
-
elif action in ("screenshot", "cursor_position"):
|
|
434
|
-
if text is not None:
|
|
435
|
-
raise ToolError(f"text is not accepted for {action}")
|
|
436
|
-
if coordinate is not None:
|
|
437
|
-
raise ToolError(f"coordinate is not accepted for {action}")
|
|
438
|
-
|
|
439
|
-
try:
|
|
440
|
-
if action == "screenshot":
|
|
441
|
-
# Take screenshot
|
|
442
|
-
screenshot = await self.computer.interface.screenshot()
|
|
443
|
-
img = Image.open(io.BytesIO(screenshot))
|
|
444
|
-
|
|
445
|
-
# Scale image if needed
|
|
446
|
-
if img.size != (self.width, self.height):
|
|
447
|
-
self.logger.info(
|
|
448
|
-
f"Scaling image from {img.size} to {self.width}x{self.height}"
|
|
449
|
-
)
|
|
450
|
-
if not isinstance(self.width, int) or not isinstance(self.height, int):
|
|
451
|
-
raise ToolError("Screen dimensions must be integers")
|
|
452
|
-
size = (int(self.width), int(self.height))
|
|
453
|
-
img = img.resize(size, Image.Resampling.LANCZOS)
|
|
454
|
-
buffer = io.BytesIO()
|
|
455
|
-
img.save(buffer, format="PNG")
|
|
456
|
-
screenshot = buffer.getvalue()
|
|
457
|
-
|
|
458
|
-
return ToolResult(base64_image=base64.b64encode(screenshot).decode())
|
|
459
|
-
|
|
460
|
-
elif action == "cursor_position":
|
|
461
|
-
pos = await self.computer.interface.get_cursor_position()
|
|
462
|
-
x, y = pos # Unpack the tuple
|
|
463
|
-
return ToolResult(output=f"X={int(x)},Y={int(y)}")
|
|
464
|
-
|
|
465
|
-
except Exception as e:
|
|
466
|
-
self.logger.error(f"Error during {action} action: {str(e)}")
|
|
467
|
-
raise ToolError(f"Failed to perform {action}: {str(e)}")
|
|
468
|
-
|
|
469
317
|
elif action == "scroll":
|
|
470
318
|
# Implement scroll action
|
|
471
319
|
direction = kwargs.get("direction", "down")
|
|
@@ -487,28 +335,20 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
|
|
|
487
335
|
# Wait briefly for UI changes
|
|
488
336
|
await asyncio.sleep(0.5)
|
|
489
337
|
|
|
490
|
-
# Take post-action screenshot
|
|
491
|
-
post_screenshot = await self.computer.interface.screenshot()
|
|
492
|
-
post_img = Image.open(io.BytesIO(post_screenshot))
|
|
493
|
-
|
|
494
|
-
# Scale post-action image if needed
|
|
495
|
-
if post_img.size != (self.width, self.height):
|
|
496
|
-
self.logger.info(
|
|
497
|
-
f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
|
|
498
|
-
)
|
|
499
|
-
post_img = post_img.resize((self.width, self.height), Image.Resampling.LANCZOS)
|
|
500
|
-
buffer = io.BytesIO()
|
|
501
|
-
post_img.save(buffer, format="PNG")
|
|
502
|
-
post_screenshot = buffer.getvalue()
|
|
503
|
-
|
|
504
338
|
return ToolResult(
|
|
505
339
|
output=f"Scrolled {direction} by {amount} steps",
|
|
506
|
-
base64_image=base64.b64encode(post_screenshot).decode(),
|
|
507
340
|
)
|
|
508
341
|
except Exception as e:
|
|
509
342
|
self.logger.error(f"Error during scroll action: {str(e)}")
|
|
510
343
|
raise ToolError(f"Failed to perform scroll: {str(e)}")
|
|
511
344
|
|
|
345
|
+
elif action == "screenshot":
|
|
346
|
+
# Take screenshot
|
|
347
|
+
return await self.screenshot()
|
|
348
|
+
elif action == "cursor_position":
|
|
349
|
+
pos = await self.computer.interface.get_cursor_position()
|
|
350
|
+
x, y = pos # Unpack the tuple
|
|
351
|
+
return ToolResult(output=f"X={int(x)},Y={int(y)}")
|
|
512
352
|
raise ToolError(f"Invalid action: {action}")
|
|
513
353
|
|
|
514
354
|
async def screenshot(self):
|
|
@@ -4,7 +4,6 @@ import logging
|
|
|
4
4
|
import re
|
|
5
5
|
from typing import Any, Dict, List, Optional, Tuple, cast
|
|
6
6
|
from anthropic.types.beta import BetaMessage
|
|
7
|
-
from ..omni.parser import ParseResult
|
|
8
7
|
from ...core.types import AgentResponse
|
|
9
8
|
from datetime import datetime
|
|
10
9
|
|
|
@@ -188,7 +187,7 @@ def from_anthropic_format(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]
|
|
|
188
187
|
async def to_agent_response_format(
|
|
189
188
|
response: BetaMessage,
|
|
190
189
|
messages: List[Dict[str, Any]],
|
|
191
|
-
parsed_screen: Optional[ParseResult] = None,
|
|
190
|
+
parsed_screen: Optional[dict] = None,
|
|
192
191
|
parser: Optional[Any] = None,
|
|
193
192
|
model: Optional[str] = None,
|
|
194
193
|
) -> AgentResponse:
|
|
@@ -61,9 +61,6 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
|
|
|
61
61
|
computer: Computer # The CUA Computer instance
|
|
62
62
|
logger = logging.getLogger(__name__)
|
|
63
63
|
|
|
64
|
-
_screenshot_delay = 1.0 # macOS is generally faster than X11
|
|
65
|
-
_scaling_enabled = True
|
|
66
|
-
|
|
67
64
|
def __init__(self, computer: Computer):
|
|
68
65
|
"""Initialize the computer tool.
|
|
69
66
|
|
|
@@ -185,26 +182,23 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
|
|
|
185
182
|
raise ToolError(f"Failed to execute {type}: {str(e)}")
|
|
186
183
|
|
|
187
184
|
async def handle_click(self, button: str, x: int, y: int) -> ToolResult:
|
|
188
|
-
"""Handle
|
|
185
|
+
"""Handle mouse clicks."""
|
|
189
186
|
try:
|
|
190
|
-
# Perform
|
|
187
|
+
# Perform the click based on button type
|
|
191
188
|
if button == "left":
|
|
192
189
|
await self.computer.interface.left_click(x, y)
|
|
193
190
|
elif button == "right":
|
|
194
191
|
await self.computer.interface.right_click(x, y)
|
|
195
192
|
elif button == "double":
|
|
196
193
|
await self.computer.interface.double_click(x, y)
|
|
194
|
+
else:
|
|
195
|
+
raise ToolError(f"Unsupported button type: {button}")
|
|
197
196
|
|
|
198
|
-
# Wait for UI to update
|
|
199
|
-
await asyncio.sleep(0.
|
|
200
|
-
|
|
201
|
-
# Take screenshot after action
|
|
202
|
-
screenshot = await self.computer.interface.screenshot()
|
|
203
|
-
base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
|
|
197
|
+
# Wait briefly for UI to update
|
|
198
|
+
await asyncio.sleep(0.3)
|
|
204
199
|
|
|
205
200
|
return ToolResult(
|
|
206
201
|
output=f"Performed {button} click at ({x}, {y})",
|
|
207
|
-
base64_image=base64_screenshot,
|
|
208
202
|
)
|
|
209
203
|
except Exception as e:
|
|
210
204
|
self.logger.error(f"Error in handle_click: {str(e)}")
|
|
@@ -218,11 +212,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
|
|
|
218
212
|
|
|
219
213
|
await asyncio.sleep(0.3)
|
|
220
214
|
|
|
221
|
-
|
|
222
|
-
screenshot = await self.computer.interface.screenshot()
|
|
223
|
-
base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
|
|
224
|
-
|
|
225
|
-
return ToolResult(output=f"Typed: {text}", base64_image=base64_screenshot)
|
|
215
|
+
return ToolResult(output=f"Typed: {text}")
|
|
226
216
|
except Exception as e:
|
|
227
217
|
self.logger.error(f"Error in handle_typing: {str(e)}")
|
|
228
218
|
raise ToolError(f"Failed to type '{text}': {str(e)}")
|
|
@@ -254,11 +244,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
|
|
|
254
244
|
# Wait briefly
|
|
255
245
|
await asyncio.sleep(0.3)
|
|
256
246
|
|
|
257
|
-
|
|
258
|
-
screenshot = await self.computer.interface.screenshot()
|
|
259
|
-
base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
|
|
260
|
-
|
|
261
|
-
return ToolResult(output=f"Pressed key: {key}", base64_image=base64_screenshot)
|
|
247
|
+
return ToolResult(output=f"Pressed key: {key}")
|
|
262
248
|
except Exception as e:
|
|
263
249
|
self.logger.error(f"Error in handle_key: {str(e)}")
|
|
264
250
|
raise ToolError(f"Failed to press key '{key}': {str(e)}")
|
|
@@ -272,11 +258,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
|
|
|
272
258
|
# Wait briefly
|
|
273
259
|
await asyncio.sleep(0.2)
|
|
274
260
|
|
|
275
|
-
|
|
276
|
-
screenshot = await self.computer.interface.screenshot()
|
|
277
|
-
base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
|
|
278
|
-
|
|
279
|
-
return ToolResult(output=f"Moved cursor to ({x}, {y})", base64_image=base64_screenshot)
|
|
261
|
+
return ToolResult(output=f"Moved cursor to ({x}, {y})")
|
|
280
262
|
except Exception as e:
|
|
281
263
|
self.logger.error(f"Error in handle_mouse_move: {str(e)}")
|
|
282
264
|
raise ToolError(f"Failed to move cursor to ({x}, {y}): {str(e)}")
|
|
@@ -296,14 +278,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
|
|
|
296
278
|
# Wait for UI to update
|
|
297
279
|
await asyncio.sleep(0.5)
|
|
298
280
|
|
|
299
|
-
|
|
300
|
-
screenshot = await self.computer.interface.screenshot()
|
|
301
|
-
base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
|
|
302
|
-
|
|
303
|
-
return ToolResult(
|
|
304
|
-
output=f"Scrolled at ({x}, {y}) with delta ({scroll_x}, {scroll_y})",
|
|
305
|
-
base64_image=base64_screenshot,
|
|
306
|
-
)
|
|
281
|
+
return ToolResult(output=f"Scrolled at ({x}, {y}) by ({scroll_x}, {scroll_y})")
|
|
307
282
|
except Exception as e:
|
|
308
283
|
self.logger.error(f"Error in handle_scroll: {str(e)}")
|
|
309
284
|
raise ToolError(f"Failed to scroll at ({x}, {y}): {str(e)}")
|
|
@@ -331,13 +306,8 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
|
|
|
331
306
|
# Wait for UI to update
|
|
332
307
|
await asyncio.sleep(0.5)
|
|
333
308
|
|
|
334
|
-
# Take screenshot after action
|
|
335
|
-
screenshot = await self.computer.interface.screenshot()
|
|
336
|
-
base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
|
|
337
|
-
|
|
338
309
|
return ToolResult(
|
|
339
310
|
output=f"Dragged from ({path[0]['x']}, {path[0]['y']}) to ({path[-1]['x']}, {path[-1]['y']})",
|
|
340
|
-
base64_image=base64_screenshot,
|
|
341
311
|
)
|
|
342
312
|
except Exception as e:
|
|
343
313
|
self.logger.error(f"Error in handle_drag: {str(e)}")
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Main entry point for agent.ui module.
|
|
3
|
+
|
|
4
|
+
This allows running the agent UI with:
|
|
5
|
+
python -m agent.ui
|
|
6
|
+
|
|
7
|
+
Instead of:
|
|
8
|
+
python -m agent.ui.gradio.app
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from .gradio.app import create_gradio_ui
|
|
12
|
+
|
|
13
|
+
if __name__ == "__main__":
|
|
14
|
+
app = create_gradio_ui()
|
|
15
|
+
app.launch(share=False, inbrowser=True)
|
|
@@ -41,7 +41,6 @@ from typing import cast
|
|
|
41
41
|
# Import from agent package
|
|
42
42
|
from agent.core.types import AgentResponse
|
|
43
43
|
from agent.core.callbacks import DefaultCallbackHandler
|
|
44
|
-
from agent.providers.omni.parser import ParseResult
|
|
45
44
|
from computer import Computer
|
|
46
45
|
|
|
47
46
|
from agent import ComputerAgent, AgentLoop, LLM, LLMProvider
|
|
@@ -103,7 +102,7 @@ class GradioChatScreenshotHandler(DefaultCallbackHandler):
|
|
|
103
102
|
self,
|
|
104
103
|
screenshot_base64: str,
|
|
105
104
|
action_type: str = "",
|
|
106
|
-
parsed_screen: Optional[ParseResult] = None,
|
|
105
|
+
parsed_screen: Optional[dict] = None,
|
|
107
106
|
) -> None:
|
|
108
107
|
"""Add screenshot to chatbot when a screenshot is taken and update the annotated image.
|
|
109
108
|
|
|
@@ -138,6 +137,7 @@ MODEL_MAPPINGS = {
|
|
|
138
137
|
"openai": {
|
|
139
138
|
# Default to operator CUA model
|
|
140
139
|
"default": "computer-use-preview",
|
|
140
|
+
"OpenAI: Computer-Use Preview": "computer-use-preview",
|
|
141
141
|
# Map standard OpenAI model names to CUA-specific model names
|
|
142
142
|
"gpt-4-turbo": "computer-use-preview",
|
|
143
143
|
"gpt-4o": "computer-use-preview",
|
|
@@ -148,9 +148,17 @@ MODEL_MAPPINGS = {
|
|
|
148
148
|
"anthropic": {
|
|
149
149
|
# Default to newest model
|
|
150
150
|
"default": "claude-3-7-sonnet-20250219",
|
|
151
|
+
# New Claude 4 models
|
|
152
|
+
"Anthropic: Claude 4 Opus (20250514)": "claude-opus-4-20250514",
|
|
153
|
+
"Anthropic: Claude 4 Sonnet (20250514)": "claude-sonnet-4-20250514",
|
|
154
|
+
"claude-opus-4-20250514": "claude-opus-4-20250514",
|
|
155
|
+
"claude-sonnet-4-20250514": "claude-sonnet-4-20250514",
|
|
156
|
+
|
|
151
157
|
# Specific Claude models for CUA
|
|
152
|
-
"
|
|
158
|
+
"Anthropic: Claude 3.7 Sonnet (20250219)": "claude-3-7-sonnet-20250219",
|
|
159
|
+
"Anthropic: Claude 3.5 Sonnet (20240620)": "claude-3-5-sonnet-20240620",
|
|
153
160
|
"claude-3-7-sonnet-20250219": "claude-3-7-sonnet-20250219",
|
|
161
|
+
"claude-3-5-sonnet-20240620": "claude-3-5-sonnet-20240620",
|
|
154
162
|
# Map standard model names to CUA-specific model names
|
|
155
163
|
"claude-3-opus": "claude-3-7-sonnet-20250219",
|
|
156
164
|
"claude-3-sonnet": "claude-3-5-sonnet-20240620",
|
|
@@ -210,12 +218,12 @@ def get_provider_and_model(model_name: str, loop_provider: str) -> tuple:
|
|
|
210
218
|
if agent_loop == AgentLoop.OPENAI:
|
|
211
219
|
provider = LLMProvider.OPENAI
|
|
212
220
|
model_name_to_use = MODEL_MAPPINGS["openai"].get(
|
|
213
|
-
model_name
|
|
221
|
+
model_name, MODEL_MAPPINGS["openai"]["default"]
|
|
214
222
|
)
|
|
215
223
|
elif agent_loop == AgentLoop.ANTHROPIC:
|
|
216
224
|
provider = LLMProvider.ANTHROPIC
|
|
217
225
|
model_name_to_use = MODEL_MAPPINGS["anthropic"].get(
|
|
218
|
-
model_name
|
|
226
|
+
model_name, MODEL_MAPPINGS["anthropic"]["default"]
|
|
219
227
|
)
|
|
220
228
|
elif agent_loop == AgentLoop.OMNI:
|
|
221
229
|
# Determine provider and clean model name based on the full string from UI
|
|
@@ -235,33 +243,11 @@ def get_provider_and_model(model_name: str, loop_provider: str) -> tuple:
|
|
|
235
243
|
cleaned_model_name = model_name.split("OMNI: Ollama ", 1)[1]
|
|
236
244
|
elif model_name.startswith("OMNI: Claude "):
|
|
237
245
|
provider = LLMProvider.ANTHROPIC
|
|
238
|
-
# Extract the canonical model name based on the UI string
|
|
239
|
-
# e.g., "OMNI: Claude 3.7 Sonnet (20250219)" -> "3.7 Sonnet" and "20250219"
|
|
240
|
-
parts = model_name.split(" (")
|
|
241
|
-
model_key_part = parts[0].replace("OMNI: Claude ", "")
|
|
242
|
-
date_part = parts[1].replace(")", "") if len(parts) > 1 else ""
|
|
243
|
-
|
|
244
|
-
# Normalize the extracted key part for comparison
|
|
245
|
-
# "3.7 Sonnet" -> "37sonnet"
|
|
246
|
-
model_key_part_norm = model_key_part.lower().replace(".", "").replace(" ", "")
|
|
247
246
|
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
# "claude-3-7-sonnet-20250219" -> "claude37sonnet20250219"
|
|
253
|
-
key_anthropic_norm = key_anthropic.lower().replace("-", "")
|
|
254
|
-
|
|
255
|
-
# Check if the normalized canonical key starts with "claude" + normalized extracted part
|
|
256
|
-
# AND contains the date part.
|
|
257
|
-
if (
|
|
258
|
-
key_anthropic_norm.startswith("claude" + model_key_part_norm)
|
|
259
|
-
and date_part in key_anthropic_norm
|
|
260
|
-
):
|
|
261
|
-
cleaned_model_name = (
|
|
262
|
-
val_anthropic # Use the canonical name like "claude-3-7-sonnet-20250219"
|
|
263
|
-
)
|
|
264
|
-
break
|
|
247
|
+
model_name = model_name.replace("OMNI: ", "Anthropic: ")
|
|
248
|
+
cleaned_model_name = MODEL_MAPPINGS["anthropic"].get(
|
|
249
|
+
model_name, MODEL_MAPPINGS["anthropic"]["default"]
|
|
250
|
+
)
|
|
265
251
|
elif model_name.startswith("OMNI: OpenAI "):
|
|
266
252
|
provider = LLMProvider.OPENAI
|
|
267
253
|
# Extract the model part, e.g., "GPT-4o mini"
|
|
@@ -310,6 +296,8 @@ def get_provider_and_model(model_name: str, loop_provider: str) -> tuple:
|
|
|
310
296
|
model_name_to_use = MODEL_MAPPINGS["openai"]["default"]
|
|
311
297
|
agent_loop = AgentLoop.OPENAI
|
|
312
298
|
|
|
299
|
+
print(f"Mapping {model_name} and {loop_provider} to {provider}, {model_name_to_use}, {agent_loop}")
|
|
300
|
+
|
|
313
301
|
return provider, model_name_to_use, agent_loop
|
|
314
302
|
|
|
315
303
|
|
|
@@ -454,6 +442,9 @@ def create_gradio_ui(
|
|
|
454
442
|
# Always show models regardless of API key availability
|
|
455
443
|
openai_models = ["OpenAI: Computer-Use Preview"]
|
|
456
444
|
anthropic_models = [
|
|
445
|
+
"Anthropic: Claude 4 Opus (20250514)",
|
|
446
|
+
"Anthropic: Claude 4 Sonnet (20250514)",
|
|
447
|
+
|
|
457
448
|
"Anthropic: Claude 3.7 Sonnet (20250219)",
|
|
458
449
|
"Anthropic: Claude 3.5 Sonnet (20240620)",
|
|
459
450
|
]
|
|
@@ -461,6 +452,8 @@ def create_gradio_ui(
|
|
|
461
452
|
"OMNI: OpenAI GPT-4o",
|
|
462
453
|
"OMNI: OpenAI GPT-4o mini",
|
|
463
454
|
"OMNI: OpenAI GPT-4.5-preview",
|
|
455
|
+
"OMNI: Claude 4 Opus (20250514)",
|
|
456
|
+
"OMNI: Claude 4 Sonnet (20250514)",
|
|
464
457
|
"OMNI: Claude 3.7 Sonnet (20250219)",
|
|
465
458
|
"OMNI: Claude 3.5 Sonnet (20240620)"
|
|
466
459
|
]
|
|
@@ -730,20 +723,25 @@ if __name__ == "__main__":
|
|
|
730
723
|
with gr.Accordion("Computer Configuration", open=True):
|
|
731
724
|
# Computer configuration options
|
|
732
725
|
computer_os = gr.Radio(
|
|
733
|
-
choices=["macos", "linux"],
|
|
726
|
+
choices=["macos", "linux", "windows"],
|
|
734
727
|
label="Operating System",
|
|
735
728
|
value="macos",
|
|
736
729
|
info="Select the operating system for the computer",
|
|
737
730
|
)
|
|
738
731
|
|
|
739
|
-
|
|
732
|
+
is_windows = platform.system().lower() == "windows"
|
|
740
733
|
is_mac = platform.system().lower() == "darwin"
|
|
741
734
|
|
|
735
|
+
providers = ["cloud"]
|
|
736
|
+
if is_mac:
|
|
737
|
+
providers += ["lume"]
|
|
738
|
+
elif is_windows:
|
|
739
|
+
providers += ["winsandbox"]
|
|
740
|
+
|
|
742
741
|
computer_provider = gr.Radio(
|
|
743
|
-
choices=
|
|
742
|
+
choices=providers,
|
|
744
743
|
label="Provider",
|
|
745
744
|
value="lume" if is_mac else "cloud",
|
|
746
|
-
visible=is_mac,
|
|
747
745
|
info="Select the computer provider",
|
|
748
746
|
)
|
|
749
747
|
|
|
@@ -6,7 +6,7 @@ build-backend = "pdm.backend"
|
|
|
6
6
|
|
|
7
7
|
[project]
|
|
8
8
|
name = "cua-agent"
|
|
9
|
-
version = "0.2.
|
|
9
|
+
version = "0.2.11"
|
|
10
10
|
description = "CUA (Computer Use) Agent for AI-driven computer interaction"
|
|
11
11
|
readme = "README.md"
|
|
12
12
|
authors = [
|
|
@@ -109,7 +109,7 @@ target-version = [
|
|
|
109
109
|
|
|
110
110
|
[tool.ruff]
|
|
111
111
|
line-length = 100
|
|
112
|
-
target-version = "0.2.
|
|
112
|
+
target-version = "0.2.11"
|
|
113
113
|
select = [
|
|
114
114
|
"E",
|
|
115
115
|
"F",
|
|
@@ -123,7 +123,7 @@ docstring-code-format = true
|
|
|
123
123
|
|
|
124
124
|
[tool.mypy]
|
|
125
125
|
strict = true
|
|
126
|
-
python_version = "0.2.
|
|
126
|
+
python_version = "0.2.11"
|
|
127
127
|
ignore_missing_imports = true
|
|
128
128
|
disallow_untyped_defs = true
|
|
129
129
|
check_untyped_defs = true
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|