cua-agent 0.1.34__py3-none-any.whl → 0.1.37__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic.
- agent/core/factory.py +1 -0
- agent/core/provider_config.py +2 -0
- agent/core/types.py +1 -0
- agent/providers/uitars/clients/mlxvlm.py +263 -0
- agent/providers/uitars/loop.py +25 -12
- agent/providers/uitars/utils.py +1 -1
- agent/ui/gradio/app.py +120 -34
- {cua_agent-0.1.34.dist-info → cua_agent-0.1.37.dist-info}/METADATA +30 -3
- {cua_agent-0.1.34.dist-info → cua_agent-0.1.37.dist-info}/RECORD +11 -10
- {cua_agent-0.1.34.dist-info → cua_agent-0.1.37.dist-info}/WHEEL +0 -0
- {cua_agent-0.1.34.dist-info → cua_agent-0.1.37.dist-info}/entry_points.txt +0 -0
agent/core/factory.py
CHANGED
agent/core/provider_config.py
CHANGED
@@ -8,6 +8,7 @@ DEFAULT_MODELS = {
     LLMProvider.ANTHROPIC: "claude-3-7-sonnet-20250219",
     LLMProvider.OLLAMA: "gemma3:4b-it-q4_K_M",
     LLMProvider.OAICOMPAT: "Qwen2.5-VL-7B-Instruct",
+    LLMProvider.MLXVLM: "mlx-community/UI-TARS-1.5-7B-4bit",
 }
 
 # Map providers to their environment variable names
@@ -16,4 +17,5 @@ ENV_VARS = {
     LLMProvider.ANTHROPIC: "ANTHROPIC_API_KEY",
     LLMProvider.OLLAMA: "none",
     LLMProvider.OAICOMPAT: "none", # OpenAI-compatible API typically doesn't require an API key
+    LLMProvider.MLXVLM: "none", # MLX VLM typically doesn't require an API key
 }
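Taken together, these two hunks register a default model and a "no API key" entry for the new MLXVLM provider. As a rough sketch of how provider maps like these are typically consumed, the `resolve_provider_defaults` helper below is hypothetical and not code from this package; the enum and map contents are restated only so the sketch runs standalone.

```python
import os
from enum import Enum

# Hypothetical stand-ins for the package's real enum and maps, shown only to
# illustrate how DEFAULT_MODELS / ENV_VARS style lookups are typically used.
class LLMProvider(str, Enum):
    OAICOMPAT = "oaicompat"
    MLXVLM = "mlxvlm"

DEFAULT_MODELS = {
    LLMProvider.OAICOMPAT: "Qwen2.5-VL-7B-Instruct",
    LLMProvider.MLXVLM: "mlx-community/UI-TARS-1.5-7B-4bit",
}
ENV_VARS = {
    LLMProvider.OAICOMPAT: "none",
    LLMProvider.MLXVLM: "none",
}

def resolve_provider_defaults(provider: LLMProvider) -> tuple[str, str | None]:
    """Return (default_model, api_key); an ENV_VARS value of "none" means no key is needed."""
    env_var = ENV_VARS[provider]
    api_key = None if env_var == "none" else os.environ.get(env_var)
    return DEFAULT_MODELS[provider], api_key

print(resolve_provider_defaults(LLMProvider.MLXVLM))
# ('mlx-community/UI-TARS-1.5-7B-4bit', None)
```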
agent/core/types.py
CHANGED
agent/providers/uitars/clients/mlxvlm.py
ADDED
@@ -0,0 +1,263 @@
+"""MLX LVM client implementation."""
+
+import io
+import logging
+import base64
+import tempfile
+import os
+import re
+import math
+from typing import Dict, List, Optional, Any, cast, Tuple
+from PIL import Image
+
+from .base import BaseUITarsClient
+import mlx.core as mx
+from mlx_vlm import load, generate
+from mlx_vlm.prompt_utils import apply_chat_template
+from mlx_vlm.utils import load_config
+from transformers.tokenization_utils import PreTrainedTokenizer
+
+logger = logging.getLogger(__name__)
+
+# Constants for smart_resize
+IMAGE_FACTOR = 28
+MIN_PIXELS = 100 * 28 * 28
+MAX_PIXELS = 16384 * 28 * 28
+MAX_RATIO = 200
+
+def round_by_factor(number: float, factor: int) -> int:
+    """Returns the closest integer to 'number' that is divisible by 'factor'."""
+    return round(number / factor) * factor
+
+def ceil_by_factor(number: float, factor: int) -> int:
+    """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
+    return math.ceil(number / factor) * factor
+
+def floor_by_factor(number: float, factor: int) -> int:
+    """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
+    return math.floor(number / factor) * factor
+
+def smart_resize(
+    height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
+) -> tuple[int, int]:
+    """
+    Rescales the image so that the following conditions are met:
+
+    1. Both dimensions (height and width) are divisible by 'factor'.
+    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
+    3. The aspect ratio of the image is maintained as closely as possible.
+    """
+    if max(height, width) / min(height, width) > MAX_RATIO:
+        raise ValueError(
+            f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
+        )
+    h_bar = max(factor, round_by_factor(height, factor))
+    w_bar = max(factor, round_by_factor(width, factor))
+    if h_bar * w_bar > max_pixels:
+        beta = math.sqrt((height * width) / max_pixels)
+        h_bar = floor_by_factor(height / beta, factor)
+        w_bar = floor_by_factor(width / beta, factor)
+    elif h_bar * w_bar < min_pixels:
+        beta = math.sqrt(min_pixels / (height * width))
+        h_bar = ceil_by_factor(height * beta, factor)
+        w_bar = ceil_by_factor(width * beta, factor)
+    return h_bar, w_bar
+
+class MLXVLMUITarsClient(BaseUITarsClient):
+    """MLX LVM client implementation class."""
+
+    def __init__(
+        self,
+        model: str = "mlx-community/UI-TARS-1.5-7B-4bit"
+    ):
+        """Initialize MLX LVM client.
+
+        Args:
+            model: Model name or path (defaults to mlx-community/UI-TARS-1.5-7B-4bit)
+        """
+        # Load model and processor
+        model_obj, processor = load(
+            model,
+            processor_kwargs={"min_pixels": MIN_PIXELS, "max_pixels": MAX_PIXELS}
+        )
+        self.config = load_config(model)
+        self.model = model_obj
+        self.processor = processor
+        self.model_name = model
+
+    def _process_coordinates(self, text: str, original_size: Tuple[int, int], model_size: Tuple[int, int]) -> str:
+        """Process coordinates in box tokens based on image resizing using smart_resize approach.
+
+        Args:
+            text: Text containing box tokens
+            original_size: Original image size (width, height)
+            model_size: Model processed image size (width, height)
+
+        Returns:
+            Text with processed coordinates
+        """
+        # Find all box tokens
+        box_pattern = r"<\|box_start\|>\((\d+),\s*(\d+)\)<\|box_end\|>"
+
+        def process_coords(match):
+            model_x, model_y = int(match.group(1)), int(match.group(2))
+            # Scale coordinates from model space to original image space
+            # Both original_size and model_size are in (width, height) format
+            new_x = int(model_x * original_size[0] / model_size[0]) # Width
+            new_y = int(model_y * original_size[1] / model_size[1]) # Height
+            return f"<|box_start|>({new_x},{new_y})<|box_end|>"
+
+        return re.sub(box_pattern, process_coords, text)
+
+    async def run_interleaved(
+        self, messages: List[Dict[str, Any]], system: str, max_tokens: Optional[int] = None
+    ) -> Dict[str, Any]:
+        """Run interleaved chat completion.
+
+        Args:
+            messages: List of message dicts
+            system: System prompt
+            max_tokens: Optional max tokens override
+
+        Returns:
+            Response dict
+        """
+        # Ensure the system message is included
+        if not any(msg.get("role") == "system" for msg in messages):
+            messages = [{"role": "system", "content": system}] + messages
+
+        # Create a deep copy of messages to avoid modifying the original
+        processed_messages = messages.copy()
+
+        # Extract images and process messages
+        images = []
+        original_sizes = {} # Track original sizes of images for coordinate mapping
+        model_sizes = {} # Track model processed sizes
+        image_index = 0
+
+        for msg_idx, msg in enumerate(messages):
+            content = msg.get("content", [])
+            if not isinstance(content, list):
+                continue
+
+            # Create a copy of the content list to modify
+            processed_content = []
+
+            for item_idx, item in enumerate(content):
+                if item.get("type") == "image_url":
+                    image_url = item.get("image_url", {}).get("url", "")
+                    pil_image = None
+
+                    if image_url.startswith("data:image/"):
+                        # Extract base64 data
+                        base64_data = image_url.split(',')[1]
+                        # Convert base64 to PIL Image
+                        image_data = base64.b64decode(base64_data)
+                        pil_image = Image.open(io.BytesIO(image_data))
+                    else:
+                        # Handle file path or URL
+                        pil_image = Image.open(image_url)
+
+                    # Store original image size for coordinate mapping
+                    original_size = pil_image.size
+                    original_sizes[image_index] = original_size
+
+                    # Use smart_resize to determine model size
+                    # Note: smart_resize expects (height, width) but PIL gives (width, height)
+                    height, width = original_size[1], original_size[0]
+                    new_height, new_width = smart_resize(height, width)
+                    # Store model size in (width, height) format for consistent coordinate processing
+                    model_sizes[image_index] = (new_width, new_height)
+
+                    # Resize the image using the calculated dimensions from smart_resize
+                    resized_image = pil_image.resize((new_width, new_height))
+                    images.append(resized_image)
+                    image_index += 1
+
+                # Copy items to processed content list
+                processed_content.append(item.copy())
+
+            # Update the processed message content
+            processed_messages[msg_idx] = msg.copy()
+            processed_messages[msg_idx]["content"] = processed_content
+
+        logger.info(f"resized {len(images)} from {original_sizes[0]} to {model_sizes[0]}")
+
+        # Process user text input with box coordinates after image processing
+        # Swap original_size and model_size arguments for inverse transformation
+        for msg_idx, msg in enumerate(processed_messages):
+            if msg.get("role") == "user" and isinstance(msg.get("content"), str):
+                if "<|box_start|>" in msg.get("content") and original_sizes and model_sizes and 0 in original_sizes and 0 in model_sizes:
+                    orig_size = original_sizes[0]
+                    model_size = model_sizes[0]
+                    # Swap arguments to perform inverse transformation for user input
+                    processed_messages[msg_idx]["content"] = self._process_coordinates(msg["content"], model_size, orig_size)
+
+        try:
+            # Format prompt according to model requirements using the processor directly
+            prompt = self.processor.apply_chat_template(
+                processed_messages,
+                tokenize=False,
+                add_generation_prompt=True
+            )
+            tokenizer = cast(PreTrainedTokenizer, self.processor)
+
+            print("generating response...")
+
+            # Generate response
+            text_content, usage = generate(
+                self.model,
+                tokenizer,
+                str(prompt),
+                images,
+                verbose=False,
+                max_tokens=max_tokens
+            )
+
+            from pprint import pprint
+            print("DEBUG - AGENT GENERATION --------")
+            pprint(text_content)
+            print("DEBUG - AGENT GENERATION --------")
+        except Exception as e:
+            logger.error(f"Error generating response: {str(e)}")
+            return {
+                "choices": [
+                    {
+                        "message": {
+                            "role": "assistant",
+                            "content": f"Error generating response: {str(e)}"
+                        },
+                        "finish_reason": "error"
+                    }
+                ],
+                "model": self.model_name,
+                "error": str(e)
+            }
+
+        # Process coordinates in the response back to original image space
+        if original_sizes and model_sizes and 0 in original_sizes and 0 in model_sizes:
+            # Get original image size and model size (using the first image)
+            orig_size = original_sizes[0]
+            model_size = model_sizes[0]
+
+            # Check if output contains box tokens that need processing
+            if "<|box_start|>" in text_content:
+                # Process coordinates from model space back to original image space
+                text_content = self._process_coordinates(text_content, orig_size, model_size)
+
+        # Format response to match OpenAI format
+        response = {
+            "choices": [
+                {
+                    "message": {
+                        "role": "assistant",
+                        "content": text_content
+                    },
+                    "finish_reason": "stop"
+                }
+            ],
+            "model": self.model_name,
+            "usage": usage
+        }
+
+        return response

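The heart of the new client is a resize-and-rescale round trip: screenshots are shrunk to a 28-pixel-aligned size before inference, and any `<|box_start|>(x,y)<|box_end|>` coordinates the model emits are scaled back to the original screenshot. Below is a minimal standalone sketch of that mapping; `smart_resize` is restated from the file above, and the screenshot size and click point are purely illustrative (the real client does this rewrite with a regex over box tokens).

```python
import math

# Constants as defined in the new mlxvlm.py
IMAGE_FACTOR, MIN_PIXELS, MAX_PIXELS = 28, 100 * 28 * 28, 16384 * 28 * 28

def smart_resize(height, width, factor=IMAGE_FACTOR, min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS):
    """Pick a (height, width) divisible by `factor` with pixel count in [min_pixels, max_pixels]."""
    h_bar = max(factor, round(height / factor) * factor)
    w_bar = max(factor, round(width / factor) * factor)
    if h_bar * w_bar > max_pixels:
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = math.floor(height / beta / factor) * factor
        w_bar = math.floor(width / beta / factor) * factor
    elif h_bar * w_bar < min_pixels:
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = math.ceil(height * beta / factor) * factor
        w_bar = math.ceil(width * beta / factor) * factor
    return h_bar, w_bar

# A 2560x1440 screenshot is resized for the model, then a predicted click at
# (640, 360) in model space is scaled back to original pixel coordinates.
orig_w, orig_h = 2560, 1440
model_h, model_w = smart_resize(orig_h, orig_w)
model_x, model_y = 640, 360
orig_x = int(model_x * orig_w / model_w)
orig_y = int(model_y * orig_h / model_h)
print((model_w, model_h), (orig_x, orig_y))  # e.g. (2548, 1428) (643, 363)
```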
agent/providers/uitars/loop.py
CHANGED
@@ -23,6 +23,7 @@ from .tools.computer import ToolResult
 from .prompts import COMPUTER_USE, SYSTEM_PROMPT, MAC_SPECIFIC_NOTES
 
 from .clients.oaicompat import OAICompatClient
+from .clients.mlxvlm import MLXVLMUITarsClient
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -44,6 +45,7 @@ class UITARSLoop(BaseLoop):
         computer: Computer,
         api_key: str,
         model: str,
+        provider: Optional[LLMProvider] = None,
         provider_base_url: Optional[str] = "http://localhost:8000/v1",
         only_n_most_recent_images: Optional[int] = 2,
         base_dir: Optional[str] = "trajectories",
@@ -64,9 +66,10 @@ class UITARSLoop(BaseLoop):
             max_retries: Maximum number of retries for API calls
             retry_delay: Delay between retries in seconds
             save_trajectory: Whether to save trajectory data
+            provider: The LLM provider to use (defaults to OAICOMPAT if not specified)
         """
         # Set provider before initializing base class
-        self.provider = LLMProvider.OAICOMPAT
+        self.provider = provider or LLMProvider.OAICOMPAT
         self.provider_base_url = provider_base_url
 
         # Initialize message manager with image retention config
@@ -113,7 +116,7 @@ class UITARSLoop(BaseLoop):
             logger.error(f"Error initializing tool manager: {str(e)}")
             logger.warning("Will attempt to initialize tools on first use.")
 
-        # Initialize client for the
+        # Initialize client for the selected provider
         try:
             await self.initialize_client()
         except Exception as e:
@@ -128,18 +131,28 @@ class UITARSLoop(BaseLoop):
         """Initialize the appropriate client.
 
         Implements abstract method from BaseLoop to set up the specific
-        provider client
+        provider client based on the configured provider.
         """
         try:
-
-
-
-
-
-
-
-
-
+            if self.provider == LLMProvider.MLXVLM:
+                logger.info(f"Initializing MLX VLM client for UI-TARS with model {self.model}...")
+
+                self.client = MLXVLMUITarsClient(
+                    model=self.model,
+                )
+
+                logger.info(f"Initialized MLX VLM client with model {self.model}")
+            else:
+                # Default to OAICompat client for other providers
+                logger.info(f"Initializing OAICompat client for UI-TARS with model {self.model}...")
+
+                self.client = OAICompatClient(
+                    api_key=self.api_key or "EMPTY", # Local endpoints typically don't require an API key
+                    model=self.model,
+                    provider_base_url=self.provider_base_url,
+                )
+
+                logger.info(f"Initialized OAICompat client with model {self.model}")
         except Exception as e:
             logger.error(f"Error initializing client: {str(e)}")
             self.client = None

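With the new `provider` argument, the UI-TARS loop can be pointed at the local MLX backend instead of the default OpenAI-compatible endpoint. A hedged sketch of selecting it through the public API follows, mirroring the README example included in this release; the import paths and the `agent.run()` iteration shown here are assumptions, not taken from this diff.

```python
# Sketch only: pointing the UI-TARS loop at the local MLX backend.
# Import paths and agent.run() usage are assumed, not shown in this diff.
import asyncio
from agent import AgentLoop, ComputerAgent, LLM, LLMProvider  # assumed import path
from computer import Computer  # assumed import path

async def run_uitars_locally() -> None:
    async with Computer() as macos_computer:  # assumed async context-manager usage
        agent = ComputerAgent(
            computer=macos_computer,
            loop=AgentLoop.UITARS,
            model=LLM(provider=LLMProvider.MLXVLM, name="mlx-community/UI-TARS-1.5-7B-4bit"),
        )
        async for result in agent.run("Open Safari and go to trycua.com"):
            print(result)

if __name__ == "__main__":
    asyncio.run(run_uitars_locally())
```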
agent/providers/uitars/utils.py
CHANGED
agent/ui/gradio/app.py
CHANGED
@@ -164,8 +164,10 @@ MODEL_MAPPINGS = {
         "claude-3-7-sonnet-20250219": "claude-3-7-sonnet-20250219",
     },
     "uitars": {
-        # UI-TARS models
-        "default": "
+        # UI-TARS models using MLXVLM provider
+        "default": "mlx-community/UI-TARS-1.5-7B-4bit",
+        "mlx-community/UI-TARS-1.5-7B-4bit": "mlx-community/UI-TARS-1.5-7B-4bit",
+        "mlx-community/UI-TARS-1.5-7B-6bit": "mlx-community/UI-TARS-1.5-7B-6bit"
     },
     "ollama": {
         # For Ollama models, we keep the original name
@@ -288,8 +290,16 @@ def get_provider_and_model(model_name: str, loop_provider: str) -> tuple:
         model_name_to_use = cleaned_model_name
         # agent_loop remains AgentLoop.OMNI
     elif agent_loop == AgentLoop.UITARS:
-        provider
-
+        # For UITARS, use MLXVLM provider for the MLX models, OAICOMPAT for custom
+        if model_name == "Custom model...":
+            provider = LLMProvider.OAICOMPAT
+            model_name_to_use = "tgi"
+        else:
+            provider = LLMProvider.MLXVLM
+            # Get the model name from the mappings or use as-is if not found
+            model_name_to_use = MODEL_MAPPINGS["uitars"].get(
+                model_name, model_name if model_name else MODEL_MAPPINGS["uitars"]["default"]
+            )
     else:
         # Default to OpenAI if unrecognized loop
         provider = LLMProvider.OPENAI
@@ -412,25 +422,23 @@ def create_gradio_ui(
     openai_api_key = os.environ.get("OPENAI_API_KEY", "")
     anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY", "")
 
-    #
-    openai_models = []
-    anthropic_models = [
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    ]
-    omni_models += ["OMNI: Claude 3.7 Sonnet (20250219)", "OMNI: Claude 3.5 Sonnet (20240620)"]
+    # Always show models regardless of API key availability
+    openai_models = ["OpenAI: Computer-Use Preview"]
+    anthropic_models = [
+        "Anthropic: Claude 3.7 Sonnet (20250219)",
+        "Anthropic: Claude 3.5 Sonnet (20240620)",
+    ]
+    omni_models = [
+        "OMNI: OpenAI GPT-4o",
+        "OMNI: OpenAI GPT-4o mini",
+        "OMNI: OpenAI GPT-4.5-preview",
+        "OMNI: Claude 3.7 Sonnet (20250219)",
+        "OMNI: Claude 3.5 Sonnet (20240620)"
+    ]
+
+    # Check if API keys are available
+    has_openai_key = bool(openai_api_key)
+    has_anthropic_key = bool(anthropic_api_key)
 
     # Get Ollama models for OMNI
     ollama_models = get_ollama_models()
@@ -442,7 +450,11 @@
         "OPENAI": openai_models,
         "ANTHROPIC": anthropic_models,
         "OMNI": omni_models + ["Custom model..."], # Add custom model option
-        "UITARS": [
+        "UITARS": [
+            "mlx-community/UI-TARS-1.5-7B-4bit",
+            "mlx-community/UI-TARS-1.5-7B-6bit",
+            "Custom model..."
+        ], # UI-TARS options with MLX models
     }
 
     # --- Apply Saved Settings (override defaults if available) ---
@@ -568,16 +580,51 @@ if __name__ == "__main__":
             elif "Custom model..." not in models:
                 models.append("Custom model...")
 
-
-
-
+            # Show both OpenAI and Anthropic key inputs for OMNI if keys aren't set
+            return [
+                gr.update(choices=models, value=models[0] if models else "Custom model...", interactive=True),
+                gr.update(visible=not has_openai_key),
+                gr.update(visible=not has_anthropic_key)
+            ]
+        elif loop == "OPENAI":
+            # Show only OpenAI key input for OPENAI loop if key isn't set
+            if not models:
+                return [
+                    gr.update(choices=["No models available"], value="No models available", interactive=True),
+                    gr.update(visible=not has_openai_key),
+                    gr.update(visible=False)
+                ]
+            return [
+                gr.update(choices=models, value=models[0] if models else None, interactive=True),
+                gr.update(visible=not has_openai_key),
+                gr.update(visible=False)
+            ]
+        elif loop == "ANTHROPIC":
+            # Show only Anthropic key input for ANTHROPIC loop if key isn't set
+            if not models:
+                return [
+                    gr.update(choices=["No models available"], value="No models available", interactive=True),
+                    gr.update(visible=False),
+                    gr.update(visible=not has_anthropic_key)
+                ]
+            return [
+                gr.update(choices=models, value=models[0] if models else None, interactive=True),
+                gr.update(visible=False),
+                gr.update(visible=not has_anthropic_key)
+            ]
         else:
-            # For other providers,
+            # For other providers (like UITARS), don't show API key inputs
             if not models:
-                return
-                choices=["No models available"], value="No models available", interactive=True
-
-
+                return [
+                    gr.update(choices=["No models available"], value="No models available", interactive=True),
+                    gr.update(visible=False),
+                    gr.update(visible=False)
+                ]
+            return [
+                gr.update(choices=models, value=models[0] if models else None, interactive=True),
+                gr.update(visible=False),
+                gr.update(visible=False)
+            ]
 
     # Create the Gradio interface with advanced UI
     with gr.Blocks(title="Computer-Use Agent") as demo:
@@ -646,6 +693,27 @@ if __name__ == "__main__":
                 interactive=True,
             )
 
+            # Add API key inputs for OpenAI and Anthropic
+            with gr.Group(visible=not has_openai_key and (initial_loop == "OPENAI" or initial_loop == "OMNI")) as openai_key_group:
+                openai_api_key_input = gr.Textbox(
+                    label="OpenAI API Key",
+                    placeholder="Enter your OpenAI API key",
+                    value="",
+                    interactive=True,
+                    type="password",
+                    info="Required for OpenAI models"
+                )
+
+            with gr.Group(visible=not has_anthropic_key and (initial_loop == "ANTHROPIC" or initial_loop == "OMNI")) as anthropic_key_group:
+                anthropic_api_key_input = gr.Textbox(
+                    label="Anthropic API Key",
+                    placeholder="Enter your Anthropic API key",
+                    value="",
+                    interactive=True,
+                    type="password",
+                    info="Required for Anthropic models"
+                )
+
             # Add custom model textbox (only visible when "Custom model..." is selected)
             custom_model = gr.Textbox(
                 label="Custom Model Name",
@@ -724,6 +792,8 @@ if __name__ == "__main__":
             recent_imgs,
             custom_url_value=None,
             custom_api_key=None,
+            openai_key_input=None,
+            anthropic_key_input=None,
         ):
             if not history:
                 yield history
@@ -769,9 +839,15 @@ if __name__ == "__main__":
                     f"DEBUG - Using custom API key for model: {final_model_name_to_send}"
                 )
             elif provider == LLMProvider.OPENAI:
-
+                # Use OpenAI key from input if provided, otherwise use environment variable
+                api_key = openai_key_input if openai_key_input else (openai_api_key or os.environ.get("OPENAI_API_KEY", ""))
+                if openai_key_input:
+                    print(f"DEBUG - Using provided OpenAI API key from UI")
             elif provider == LLMProvider.ANTHROPIC:
-
+                # Use Anthropic key from input if provided, otherwise use environment variable
+                api_key = anthropic_key_input if anthropic_key_input else (anthropic_api_key or os.environ.get("ANTHROPIC_API_KEY", ""))
+                if anthropic_key_input:
+                    print(f"DEBUG - Using provided Anthropic API key from UI")
             else:
                 # For Ollama or default OAICOMPAT (without custom key), no key needed/expected
                 api_key = ""
@@ -917,6 +993,8 @@ if __name__ == "__main__":
                 recent_images,
                 provider_base_url,
                 provider_api_key,
+                openai_api_key_input,
+                anthropic_api_key_input,
             ],
             [chatbot_history],
         )
@@ -947,6 +1025,14 @@ if __name__ == "__main__":
             outputs=[custom_model, provider_base_url, provider_api_key],
             queue=False, # Process immediately without queueing
         )
+
+        # Connect agent_loop changes to model selection and API key visibility
+        agent_loop.change(
+            fn=update_model_choices,
+            inputs=[agent_loop],
+            outputs=[model_choice, openai_key_group, anthropic_key_group],
+            queue=False, # Process immediately without queueing
+        )
 
         # Function to update the code display based on configuration and chat history
         def update_code_display(agent_loop, model_choice_val, custom_model_val, chat_history, provider_base_url, recent_images_val, save_trajectory_val):

{cua_agent-0.1.34.dist-info → cua_agent-0.1.37.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cua-agent
-Version: 0.1.34
+Version: 0.1.37
 Summary: CUA (Computer Use) Agent for AI-driven computer interaction
 Author-Email: TryCua <gh@trycua.com>
 Requires-Python: >=3.10
@@ -23,6 +23,7 @@ Requires-Dist: openai<2.0.0,>=1.14.0; extra == "openai"
 Requires-Dist: httpx<0.29.0,>=0.27.0; extra == "openai"
 Provides-Extra: uitars
 Requires-Dist: httpx<0.29.0,>=0.27.0; extra == "uitars"
+Provides-Extra: uitars-mlx
 Provides-Extra: ui
 Requires-Dist: gradio<6.0.0,>=5.23.3; extra == "ui"
 Requires-Dist: python-dotenv<2.0.0,>=1.0.1; extra == "ui"
@@ -102,6 +103,7 @@ pip install "cua-agent[all]"
 pip install "cua-agent[openai]" # OpenAI Cua Loop
 pip install "cua-agent[anthropic]" # Anthropic Cua Loop
 pip install "cua-agent[uitars]" # UI-Tars support
+pip install "cua-agent[uitars-mlx]" # local UI-Tars support with MLXVLM
 pip install "cua-agent[omni]" # Cua Loop based on OmniParser (includes Ollama for local models)
 pip install "cua-agent[ui]" # Gradio UI for the agent
 ```
@@ -206,7 +208,32 @@ The Gradio UI provides:
 
 ### Using UI-TARS
 
-
+The UI-TARS models are available in two forms:
+
+1. **MLX UI-TARS models** (Default): These models run locally using MLXVLM provider
+   - `mlx-community/UI-TARS-1.5-7B-4bit` (default) - 4-bit quantized version
+   - `mlx-community/UI-TARS-1.5-7B-6bit` - 6-bit quantized version for higher quality
+
+```python
+agent = ComputerAgent(
+    computer=macos_computer,
+    loop=AgentLoop.UITARS,
+    model=LLM(provider=LLMProvider.MLXVLM, name="mlx-community/UI-TARS-1.5-7B-4bit")
+)
+```
+
+2. **OpenAI-compatible UI-TARS**: For using the original ByteDance model
+   - If you want to use the original ByteDance UI-TARS model via an OpenAI-compatible API, follow the [deployment guide](https://github.com/bytedance/UI-TARS/blob/main/README_deploy.md)
+   - This will give you a provider URL like `https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1` which you can use in the code or Gradio UI:
+
+```python
+agent = ComputerAgent(
+    computer=macos_computer,
+    loop=AgentLoop.UITARS,
+    model=LLM(provider=LLMProvider.OAICOMPAT, name="tgi",
+              provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1")
+)
+```
 
 ## Agent Loops
 
@@ -216,7 +243,7 @@ The `cua-agent` package provides three agent loops variations, based on differen
 |:-----------|:-----------------|:------------|:-------------|
 | `AgentLoop.OPENAI` | • `computer_use_preview` | Use OpenAI Operator CUA model | Not Required |
 | `AgentLoop.ANTHROPIC` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219` | Use Anthropic Computer-Use | Not Required |
-| `AgentLoop.UITARS` | • `ByteDance-Seed/UI-TARS-1.5-7B` | Uses
+| `AgentLoop.UITARS` | • `mlx-community/UI-TARS-1.5-7B-4bit` (default)<br>• `mlx-community/UI-TARS-1.5-7B-6bit`<br>• `ByteDance-Seed/UI-TARS-1.5-7B` (via openAI-compatible endpoint) | Uses UI-TARS models with MLXVLM (default) or OAICOMPAT providers | Not Required |
 | `AgentLoop.OMNI` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4`<br>• `phi4`<br>• `phi4-mini`<br>• `gemma3`<br>• `...`<br>• `Any Ollama or OpenAI-compatible model` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser |
 
 ## AgentResponse

{cua_agent-0.1.34.dist-info → cua_agent-0.1.37.dist-info}/RECORD
CHANGED
@@ -4,9 +4,9 @@ agent/core/agent.py,sha256=HUfBe7Uam3TObAmf6KH0GDKuNCNunNmmMcuxS7aZg0Q,8332
 agent/core/base.py,sha256=2sg8B2VqUKImRlkLTNj5lx-Oarlu7_GoMR6MbNzSY9Q,8078
 agent/core/callbacks.py,sha256=FKAxyajJ-ZJ5SxNXoupNcrm0GYBgjOjJEsStqst0EAk,6453
 agent/core/experiment.py,sha256=Ywj6q3JZFDKicfPuQsDl0vSN55HS7-Cnk3u3EcUCKe8,8866
-agent/core/factory.py,sha256=
+agent/core/factory.py,sha256=zzlCdibctqhf8Uta-SrvE-G7h59wAw-7SGhHiGvS9GY,4608
 agent/core/messages.py,sha256=-OVMDqcxK5MUHPEkHliK29XFJYMRAc1keFvzrUyrOmM,16231
-agent/core/provider_config.py,sha256=
+agent/core/provider_config.py,sha256=jB3fLsEsf806HQZ8jtzfSq4bCYGYONBeuCOoog_Nv_Y,768
 agent/core/telemetry.py,sha256=HElPd32k_w2SJ6t-Cc3j_2-AKdLbFwh2YlM8QViDgRw,4790
 agent/core/tools.py,sha256=Jes2CFCFqC727WWHbO-sG7V03rBHnQe5X7Oi9ZkuScI,877
 agent/core/tools/__init__.py,sha256=xZen-PqUp2dUaMEHJowXCQm33_5Sxhsx9PSoD0rq6tI,489
@@ -16,7 +16,7 @@ agent/core/tools/collection.py,sha256=NuwTn6dXSyznxWodfmFDQwUlxxaGb4oBPym4AEJABS
 agent/core/tools/computer.py,sha256=lT_aW3huoYpcM8kffuokELupSz_WZG_qkaW1gITRC58,3892
 agent/core/tools/edit.py,sha256=kv4jTKCM0VXrnoNErf7mT-xlr81-7T8v49_VA9y_L4Y,2005
 agent/core/tools/manager.py,sha256=IRsCXjGc076nncQuyIjODoafnHTDhrf9sP5B4q5Pcdo,1742
-agent/core/types.py,sha256=
+agent/core/types.py,sha256=tkT-PqjgjL0oWVBRFkHAGWVwYx2Byp7PlUWSpvw_-h8,2442
 agent/core/visualization.py,sha256=1DuFF5sSeSf5BRSevBMDxml9-ajl7BQLFm5KBUwMbI8,6573
 agent/providers/__init__.py,sha256=b4tIBAaIB1V7p8V0BWipHVnMhfHH_OuVgP4OWGSHdD8,194
 agent/providers/anthropic/__init__.py,sha256=Mj11IZnVshZ2iHkvg4Z5-jrQIaD1WvzDz2Zk_pMwqIA,149
@@ -68,18 +68,19 @@ agent/providers/openai/types.py,sha256=0mFUxeFy23fJhMwc6lAFVXKngg2fJIXkPS5oV284V
 agent/providers/openai/utils.py,sha256=YeCZWIqOFSeugWoqAS0rhxOKAfL-9uN9nrYSBGBgPdc,3175
 agent/providers/uitars/__init__.py,sha256=sq5OMVJP9E_sok9tIiKJreGkjmNWXPMObjPTClYv1es,38
 agent/providers/uitars/clients/base.py,sha256=5w8Ajmq1JiPyUQJUAq1lSkfpA8_Ts80NQiDxPMTtQrI,948
+agent/providers/uitars/clients/mlxvlm.py,sha256=lMnN6ecMmWHf_l7khJ2iJHHvT7PE4XagUjrWhB0zEhc,10893
 agent/providers/uitars/clients/oaicompat.py,sha256=uYjwrGCVpFi8wj4kcaJ905ABiY6ksJZXaLlM61B2DUA,8907
-agent/providers/uitars/loop.py,sha256=
+agent/providers/uitars/loop.py,sha256=4-cgQteixPy03vp7xWezd6jWpuPkBmlLS3tizaOmd0U,23494
 agent/providers/uitars/prompts.py,sha256=_pQNd438mFpZKZT0aMl6Bd0_GgQxuy9y08kQAMPi9UM,2536
 agent/providers/uitars/tools/__init__.py,sha256=0hc3W6u5TvcXYztYKIyve_C2G3XMfwt_y7grmH0ZHC0,29
 agent/providers/uitars/tools/computer.py,sha256=TeIg_aCtMroxWOBJEiYY_YI4krW_C3pYu51tgGsVUYU,11808
 agent/providers/uitars/tools/manager.py,sha256=2dK9STtz6NuZG3i0nH7ZuHJpb7vKJ2mOVbxGsb0t8lQ,1945
-agent/providers/uitars/utils.py,sha256=
+agent/providers/uitars/utils.py,sha256=493STTEEJcVhVbQgR0e8rNTI1DjkxUx8IgIv3wkJ1SU,8878
 agent/telemetry.py,sha256=pVGxbj0ewnvq4EGj28CydN4a1iOfvZR_XKL3vIOqhOM,390
 agent/ui/__init__.py,sha256=ohhxJLBin6k1hl5sKcmBST8mgh23WXgAXz3pN4f470E,45
 agent/ui/gradio/__init__.py,sha256=ANKZhv1HqsLheWbLVBlyRQ7Q5qGeXuPi5jDs8vu-ZMo,579
-agent/ui/gradio/app.py,sha256=
-cua_agent-0.1.
-cua_agent-0.1.
-cua_agent-0.1.
-cua_agent-0.1.
+agent/ui/gradio/app.py,sha256=uj6cT0sFgnaN_a7JMy-OMKyOVEiKhwl3b5bJ7RamUQY,50090
+cua_agent-0.1.37.dist-info/METADATA,sha256=Zvtfyd23U2UJTko82x6z5jzaJEuySC2TdQddMkHny28,12514
+cua_agent-0.1.37.dist-info/WHEEL,sha256=tSfRZzRHthuv7vxpI4aehrdN9scLjk-dCJkPLzkHxGg,90
+cua_agent-0.1.37.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
+cua_agent-0.1.37.dist-info/RECORD,,

{cua_agent-0.1.34.dist-info → cua_agent-0.1.37.dist-info}/WHEEL
File without changes

{cua_agent-0.1.34.dist-info → cua_agent-0.1.37.dist-info}/entry_points.txt
File without changes