cua-agent 0.1.22__py3-none-any.whl → 0.1.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of cua-agent might be problematic.
- agent/__init__.py +1 -1
- agent/core/agent.py +9 -4
- agent/core/factory.py +3 -5
- agent/core/provider_config.py +4 -2
- agent/core/types.py +41 -1
- agent/providers/omni/__init__.py +1 -1
- agent/providers/omni/clients/oaicompat.py +177 -0
- agent/providers/omni/loop.py +25 -1
- agent/providers/omni/tools/manager.py +1 -1
- agent/ui/__init__.py +1 -0
- agent/ui/gradio/__init__.py +21 -0
- agent/ui/gradio/app.py +872 -0
- {cua_agent-0.1.22.dist-info → cua_agent-0.1.24.dist-info}/METADATA +74 -2
- {cua_agent-0.1.22.dist-info → cua_agent-0.1.24.dist-info}/RECORD +16 -14
- agent/core/README.md +0 -101
- agent/providers/omni/types.py +0 -47
- {cua_agent-0.1.22.dist-info → cua_agent-0.1.24.dist-info}/WHEEL +0 -0
- {cua_agent-0.1.22.dist-info → cua_agent-0.1.24.dist-info}/entry_points.txt +0 -0
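The headline addition is the new `agent.ui.gradio` subpackage. As a quick orientation before the full diff, here is a minimal launch sketch; it assumes the 0.1.24 wheel is installed and an OpenAI or Anthropic API key is exported, and it uses only the entry points visible in the added `agent/ui/gradio/app.py` shown below:

```python
# Minimal sketch: launch the new advanced UI shipped in 0.1.24.
# Assumes `pip install cua-agent==0.1.24` and OPENAI_API_KEY in the environment.
from agent.ui.gradio.app import create_gradio_ui

demo = create_gradio_ui(provider_name="openai", model_name="gpt-4o")
demo.launch(share=False)  # same launch settings used by test_cua() in the diff
```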
agent/ui/gradio/app.py
ADDED
@@ -0,0 +1,872 @@

````python
"""
Advanced Gradio UI for Computer-Use Agent

This is a Gradio interface for the Computer-Use Agent
with an advanced UI for model selection and configuration.

Supported Agent Loops and Models:
- AgentLoop.OPENAI: Uses OpenAI Operator CUA model
    • computer_use_preview

- AgentLoop.ANTHROPIC: Uses Anthropic Computer-Use models
    • claude-3-5-sonnet-20240620
    • claude-3-7-sonnet-20250219

- AgentLoop.OMNI (experimental): Uses OmniParser for element pixel-detection
    • claude-3-5-sonnet-20240620
    • claude-3-7-sonnet-20250219
    • gpt-4.5-preview
    • gpt-4o
    • gpt-4

Requirements:
- Mac with Apple Silicon (M1/M2/M3/M4)
- macOS 14 (Sonoma) or newer
- Python 3.10+
- Lume CLI installed (https://github.com/trycua/cua)
- OpenAI or Anthropic API key
"""

import os
import asyncio
import logging
from typing import Dict, List, Optional, AsyncGenerator, Any, Tuple, Union
import gradio as gr

# Import from agent package
from agent.core.types import AgentResponse
from computer import Computer

from agent import ComputerAgent, AgentLoop, LLM, LLMProvider

# Global variables
global_agent = None
global_computer = None

# Map model names to specific provider model names
MODEL_MAPPINGS = {
    "openai": {
        # Default to operator CUA model
        "default": "computer_use_preview",
        # Map standard OpenAI model names to CUA-specific model names
        "gpt-4-turbo": "computer_use_preview",
        "gpt-4o": "computer_use_preview",
        "gpt-4": "computer_use_preview",
        "gpt-4.5-preview": "computer_use_preview",
    },
    "anthropic": {
        # Default to newest model
        "default": "claude-3-7-sonnet-20250219",
        # Specific Claude models for CUA
        "claude-3-5-sonnet-20240620": "claude-3-5-sonnet-20240620",
        "claude-3-7-sonnet-20250219": "claude-3-7-sonnet-20250219",
        # Map standard model names to CUA-specific model names
        "claude-3-opus": "claude-3-7-sonnet-20250219",
        "claude-3-sonnet": "claude-3-5-sonnet-20240620",
        "claude-3-5-sonnet": "claude-3-5-sonnet-20240620",
        "claude-3-7-sonnet": "claude-3-7-sonnet-20250219",
    },
    "omni": {
        # OMNI works with any of these models
        "default": "gpt-4o",
        "gpt-4o": "gpt-4o",
        "gpt-4": "gpt-4",
        "gpt-4.5-preview": "gpt-4.5-preview",
        "claude-3-5-sonnet-20240620": "claude-3-5-sonnet-20240620",
        "claude-3-7-sonnet-20250219": "claude-3-7-sonnet-20250219",
    },
    "ollama": {
        # For Ollama models, we keep the original name
        "default": "llama3",  # A common default model
        # Don't map other models - we'll use the original name
    },
    "oaicompat": {
        # Default for OpenAI-compatible providers like VLLM
        "default": "Qwen2.5-VL-7B-Instruct",
    },
}


def get_provider_and_model(model_name: str, loop_provider: str) -> tuple:
    """
    Determine the provider and actual model name to use based on the input.

    Args:
        model_name: The requested model name
        loop_provider: The requested agent loop provider

    Returns:
        tuple: (provider, model_name_to_use, agent_loop)
    """
    # Get the agent loop
    loop_provider_map = {
        "OPENAI": AgentLoop.OPENAI,
        "ANTHROPIC": AgentLoop.ANTHROPIC,
        "OMNI": AgentLoop.OMNI,
        "OMNI-OLLAMA": AgentLoop.OMNI,  # Special case for Ollama models with OMNI parser
    }
    agent_loop = loop_provider_map.get(loop_provider, AgentLoop.OPENAI)

    # Set up the provider and model based on the loop and model_name
    if agent_loop == AgentLoop.OPENAI:
        provider = LLMProvider.OPENAI
        model_name_to_use = MODEL_MAPPINGS["openai"].get(
            model_name.lower(), MODEL_MAPPINGS["openai"]["default"]
        )
    elif agent_loop == AgentLoop.ANTHROPIC:
        provider = LLMProvider.ANTHROPIC
        model_name_to_use = MODEL_MAPPINGS["anthropic"].get(
            model_name.lower(), MODEL_MAPPINGS["anthropic"]["default"]
        )
    elif agent_loop == AgentLoop.OMNI:
        # For OMNI, select provider based on model name or loop_provider
        if loop_provider == "OMNI-OLLAMA":
            provider = LLMProvider.OLLAMA

            # For Ollama models from the UI dropdown, we use the model name as is
            # No need to parse it - it's already the correct Ollama model name
            model_name_to_use = model_name
        elif "claude" in model_name.lower():
            provider = LLMProvider.ANTHROPIC
            model_name_to_use = MODEL_MAPPINGS["omni"].get(
                model_name.lower(), MODEL_MAPPINGS["omni"]["default"]
            )
        elif "gpt" in model_name.lower():
            provider = LLMProvider.OPENAI
            model_name_to_use = MODEL_MAPPINGS["omni"].get(
                model_name.lower(), MODEL_MAPPINGS["omni"]["default"]
            )
        else:
            # Handle custom model names - use the OAICOMPAT provider
            provider = LLMProvider.OAICOMPAT
            # Use the model name as is without mapping, or use default if empty
            model_name_to_use = (
                model_name if model_name.strip() else MODEL_MAPPINGS["oaicompat"]["default"]
            )
    else:
        # Default to OpenAI if unrecognized loop
        provider = LLMProvider.OPENAI
        model_name_to_use = MODEL_MAPPINGS["openai"]["default"]
        agent_loop = AgentLoop.OPENAI

    return provider, model_name_to_use, agent_loop


def get_ollama_models() -> List[str]:
    """Get available models from Ollama if installed."""
    try:
        import subprocess

        result = subprocess.run(["ollama", "list"], capture_output=True, text=True)
        if result.returncode == 0:
            lines = result.stdout.strip().split("\n")
            if len(lines) < 2:  # No models or just header
                return []

            models = []
            # Skip header line
            for line in lines[1:]:
                parts = line.split()
                if parts:
                    model_name = parts[0]
                    models.append(f"OMNI: Ollama {model_name}")
            return models
        return []
    except Exception as e:
        logging.error(f"Error getting Ollama models: {e}")
        return []


def extract_synthesized_text(result: Union[AgentResponse, Dict[str, Any]]) -> str:
    """Extract synthesized text from the agent result."""
    synthesized_text = ""

    if "output" in result and result["output"]:
        for output in result["output"]:
            if output.get("type") == "reasoning":
                content = output.get("content", "")
                if content:
                    synthesized_text += f"{content}\n"

            elif output.get("type") == "message":
                # Handle message type outputs - can contain rich content
                content = output.get("content", [])

                # Content is usually an array of content blocks
                if isinstance(content, list):
                    for block in content:
                        if isinstance(block, dict) and block.get("type") == "output_text":
                            text_value = block.get("text", "")
                            if text_value:
                                synthesized_text += f"{text_value}\n"

            elif output.get("type") == "computer_call":
                action = output.get("action", {})
                action_type = action.get("type", "")

                # Create a descriptive text about the action
                if action_type == "click":
                    button = action.get("button", "")
                    x = action.get("x", "")
                    y = action.get("y", "")
                    synthesized_text += f"Clicked {button} at position ({x}, {y}).\n"
                elif action_type == "type":
                    text = action.get("text", "")
                    synthesized_text += f"Typed: {text}.\n"
                elif action_type == "keypress":
                    # Extract key correctly from either keys array or key field
                    if isinstance(action.get("keys"), list):
                        key = ", ".join(action.get("keys"))
                    else:
                        key = action.get("key", "")

                    synthesized_text += f"Pressed key: {key}\n"
                else:
                    synthesized_text += f"Performed {action_type} action.\n"

    return synthesized_text.strip()


def create_computer_instance(verbosity: int = logging.INFO) -> Computer:
    """Create or get the global Computer instance."""
    global global_computer

    if global_computer is None:
        global_computer = Computer(verbosity=verbosity)

    return global_computer


def create_agent(
    provider: LLMProvider,
    agent_loop: AgentLoop,
    model_name: str,
    api_key: Optional[str] = None,
    save_trajectory: bool = True,
    only_n_most_recent_images: int = 3,
    verbosity: int = logging.INFO,
    use_ollama: bool = False,
    use_oaicompat: bool = False,
) -> ComputerAgent:
    """Create or update the global agent with the specified parameters."""
    global global_agent

    # Create the computer if not already done
    computer = create_computer_instance(verbosity=verbosity)

    # Extra configuration to pass to the agent
    extra_config = {}

    # For Ollama models, we'll pass use_ollama and the model_name directly
    if use_ollama:
        extra_config["use_ollama"] = True
        extra_config["ollama_model"] = model_name
        print(f"DEBUG - Using Ollama with model: {model_name}")

    # Get API key from environment if not provided
    if api_key is None:
        if provider == LLMProvider.OPENAI:
            api_key = os.environ.get("OPENAI_API_KEY", "")
        elif provider == LLMProvider.ANTHROPIC:
            api_key = os.environ.get("ANTHROPIC_API_KEY", "")

    # Create LLM model object with appropriate parameters
    provider_base_url = "http://localhost:8000/v1" if use_oaicompat else None

    if use_oaicompat:
        # Special handling for OAICOMPAT - use OPENAI provider with custom base URL
        print(f"DEBUG - Creating OAICOMPAT agent with model: {model_name}")
        llm = LLM(
            provider=provider,  # Already set to OPENAI
            name=model_name,
            provider_base_url=provider_base_url,
        )
        # Note: Don't pass use_oaicompat to the agent, as it doesn't accept this parameter
    elif provider == LLMProvider.OAICOMPAT:
        # This path is unlikely to be taken with our current approach
        llm = LLM(provider=provider, name=model_name, provider_base_url=provider_base_url)
    else:
        # For other providers, just use standard parameters
        llm = LLM(provider=provider, name=model_name)

    # Create or update the agent
    if global_agent is None:
        global_agent = ComputerAgent(
            computer=computer,
            loop=agent_loop,
            model=llm,
            api_key=api_key,
            save_trajectory=save_trajectory,
            only_n_most_recent_images=only_n_most_recent_images,
            verbosity=verbosity,
            **extra_config,
        )
    else:
        # Update the existing agent's parameters
        global_agent._loop = None  # Force recreation of the loop
        global_agent.provider = provider
        global_agent.loop = agent_loop
        global_agent.model = llm
        global_agent.api_key = api_key

        # Explicitly update these settings to ensure they take effect
        global_agent.save_trajectory = save_trajectory
        global_agent.only_n_most_recent_images = only_n_most_recent_images

        # Update Ollama settings if applicable
        if use_ollama:
            global_agent.use_ollama = True
            global_agent.ollama_model = model_name
        else:
            global_agent.use_ollama = False
            global_agent.ollama_model = None

        # Log the updated settings
        logging.info(
            f"Updated agent settings: save_trajectory={save_trajectory}, recent_images={only_n_most_recent_images}"
        )

    return global_agent


def process_agent_result(result: Union[AgentResponse, Dict[str, Any]]) -> str:
    """Process agent results for the Gradio UI."""
    # Extract text content
    text_obj = result.get("text", {})

    # For OpenAI's Computer-Use Agent, text field is an object with format property
    if (
        text_obj
        and isinstance(text_obj, dict)
        and "format" in text_obj
        and not text_obj.get("value", "")
    ):
        content = extract_synthesized_text(result)
    else:
        # For other types of results, try to get text directly
        if isinstance(text_obj, dict):
            if "value" in text_obj:
                content = text_obj["value"]
            elif "text" in text_obj:
                content = text_obj["text"]
            elif "content" in text_obj:
                content = text_obj["content"]
            else:
                content = ""
        else:
            content = str(text_obj) if text_obj else ""

    # If still no content but we have outputs, create a summary
    if not content and "output" in result and result["output"]:
        output = result["output"]
        for out in output:
            if out.get("type") == "reasoning":
                content = out.get("content", "")
                if content:
                    break
            elif out.get("type") == "computer_call":
                action = out.get("action", {})
                action_type = action.get("type", "")
                if action_type:
                    content = f"Performing action: {action_type}"
                    break

    # Clean up the text - ensure content is a string
    if not isinstance(content, str):
        content = str(content) if content else ""

    return content


def respond(
    message: str,
    history: List[Tuple[str, str]],
    model_choice,  # Accept Gradio Dropdown component
    agent_loop,  # Accept Gradio Dropdown component
    save_trajectory,  # Accept Gradio Checkbox component
    recent_images,  # Accept Gradio Slider component
    openai_api_key: Optional[str] = None,
    anthropic_api_key: Optional[str] = None,
) -> str:
    """Process a message with the Computer-Use Agent and return the response."""
    import asyncio

    # Get actual values from Gradio components
    model_choice_value = model_choice.value if hasattr(model_choice, "value") else model_choice
    agent_loop_value = agent_loop.value if hasattr(agent_loop, "value") else agent_loop
    save_trajectory_value = (
        save_trajectory.value if hasattr(save_trajectory, "value") else save_trajectory
    )
    recent_images_value = int(
        recent_images.value if hasattr(recent_images, "value") else recent_images
    )

    # Debug logging
    print(f"DEBUG - Model choice object: {type(model_choice)}")
    print(f"DEBUG - Model choice value: {model_choice_value}")
    print(f"DEBUG - Agent loop value: {agent_loop_value}")

    # Create a new event loop for this function call
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    async def _async_respond():
        # Extract the loop type and model from the selection
        loop_provider = "OPENAI"
        if isinstance(model_choice_value, str):
            # This is the case for a custom text input from textbox
            if agent_loop_value == "OMNI":
                loop_provider = "OMNI"
                # Use the custom model name as is
                model_id = model_choice_value
                print(f"DEBUG - Using custom model: {model_id}")
            else:
                # Handle regular dropdown value as string
                if model_choice_value.startswith("OpenAI:"):
                    loop_provider = "OPENAI"
                    model_id = model_choice_value.replace("OpenAI: ", "").lower()
                elif model_choice_value.startswith("Anthropic:"):
                    loop_provider = "ANTHROPIC"
                    model_id = model_choice_value.replace("Anthropic: ", "").lower()
                elif model_choice_value.startswith("OMNI:"):
                    loop_provider = "OMNI"
                    if "GPT" in model_choice_value:
                        model_id = model_choice_value.replace("OMNI: OpenAI ", "").lower()
                    elif "Claude" in model_choice_value:
                        model_id = model_choice_value.replace("OMNI: ", "").lower()
                    elif "Ollama" in model_choice_value:
                        loop_provider = "OMNI-OLLAMA"
                        # Extract everything after "OMNI: Ollama " which is the full model name (e.g., phi3:latest)
                        model_id = model_choice_value.replace("OMNI: Ollama ", "")
                        print(f"DEBUG - Ollama model ID: {model_id}")
                    else:
                        model_id = "default"
                else:
                    # Default case
                    loop_provider = agent_loop_value
                    model_id = "default"
        else:
            # Model choice is not a string (shouldn't happen, but handle anyway)
            loop_provider = agent_loop_value
            model_id = "default"

        print(f"DEBUG - Using loop provider: {loop_provider}, model_id: {model_id}")

        # Use the mapping function to get provider, model name and agent loop
        provider, model_name, agent_loop_type = get_provider_and_model(model_id, loop_provider)
        print(
            f"DEBUG - After mapping: provider={provider}, model_name={model_name}, agent_loop={agent_loop_type}"
        )

        # Special handling for OAICOMPAT to bypass provider-specific errors
        # Creates the agent with OPENAI provider but using custom model name and provider base URL
        is_oaicompat = str(provider) == "oaicompat"
        if is_oaicompat:
            provider = LLMProvider.OPENAI

        # Get API key based on provider
        if provider == LLMProvider.OPENAI:
            api_key = openai_api_key or os.environ.get("OPENAI_API_KEY", "")
        elif provider == LLMProvider.ANTHROPIC:
            api_key = anthropic_api_key or os.environ.get("ANTHROPIC_API_KEY", "")
        else:
            api_key = ""

        # Check for settings changes if agent already exists
        settings_changed = False
        settings_message = ""
        if global_agent is not None:
            # Safely check if save_trajectory setting changed
            current_save_traj = getattr(global_agent, "save_trajectory", None)
            if current_save_traj is not None and current_save_traj != save_trajectory_value:
                settings_changed = True
                settings_message += f"Save trajectory set to: {save_trajectory_value}. "

            # Safely check if recent_images setting changed
            current_recent_images = getattr(global_agent, "only_n_most_recent_images", None)
            if current_recent_images is not None and current_recent_images != recent_images_value:
                settings_changed = True
                settings_message += f"Recent images set to: {recent_images_value}. "

        # Create or update the agent
        try:
            create_agent(
                provider=provider,
                agent_loop=agent_loop_type,
                model_name=model_name,
                api_key=api_key,
                save_trajectory=save_trajectory_value,
                only_n_most_recent_images=recent_images_value,
                use_ollama=loop_provider == "OMNI-OLLAMA",
                use_oaicompat=is_oaicompat,
            )

            if global_agent is None:
                return "Failed to create agent. Check API keys and configuration."
        except Exception as e:
            return f"Error creating agent: {str(e)}"

        # Notify about settings changes if needed
        if settings_changed:
            return f"Settings updated: {settings_message}"

        # Collect all responses
        response_text = []

        # Run the agent
        try:
            async for result in global_agent.run(message):
                # Process result
                content = process_agent_result(result)

                # Skip empty content
                if not content:
                    continue

                # Add content to response list
                response_text.append(content)

            # Return the full response as a single string
            return "\n".join(response_text) if response_text else "Task completed."

        except Exception as e:
            import traceback

            traceback.print_exc()
            return f"Error: {str(e)}"

    # Run the async function and get the result
    try:
        result = loop.run_until_complete(_async_respond())
        loop.close()
        return result
    except Exception as e:
        loop.close()
        import traceback

        traceback.print_exc()
        return f"Error executing async operation: {str(e)}"


def create_gradio_ui(
    provider_name: str = "openai",
    model_name: str = "gpt-4o",
) -> gr.Blocks:
    """Create a Gradio UI for the Computer-Use Agent.

    Args:
        provider_name: The provider to use (e.g., "openai", "anthropic")
        model_name: The model to use (e.g., "gpt-4o", "claude-3-7-sonnet")

    Returns:
        A Gradio Blocks application
    """
    # Check for API keys
    openai_api_key = os.environ.get("OPENAI_API_KEY", "")
    anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY", "")

    # Prepare model choices based on available API keys
    openai_models = []
    anthropic_models = []
    omni_models = []

    if openai_api_key:
        openai_models = ["OpenAI: Computer-Use Preview"]
        omni_models += [
            "OMNI: OpenAI GPT-4o",
            "OMNI: OpenAI GPT-4.5-preview",
        ]

    if anthropic_api_key:
        anthropic_models = [
            "Anthropic: Claude 3.7 Sonnet (20250219)",
            "Anthropic: Claude 3.5 Sonnet (20240620)",
        ]
        omni_models += ["OMNI: Claude 3.7 Sonnet (20250219)", "OMNI: Claude 3.5 Sonnet (20240620)"]

    # Get Ollama models for OMNI
    ollama_models = get_ollama_models()
    if ollama_models:
        omni_models += ollama_models

    # Format model choices
    provider_to_models = {
        "OPENAI": openai_models,
        "ANTHROPIC": anthropic_models,
        "OMNI": omni_models + ["Custom model..."],  # Add custom model option
    }

    # Get initial agent loop and model based on provided parameters
    if provider_name.lower() == "openai":
        initial_loop = "OPENAI"
        initial_model = "OpenAI: Computer-Use Preview" if openai_models else "No models available"
    elif provider_name.lower() == "anthropic":
        initial_loop = "ANTHROPIC"
        initial_model = anthropic_models[0] if anthropic_models else "No models available"
    else:
        initial_loop = "OMNI"
        if model_name == "gpt-4o" and "OMNI: OpenAI GPT-4o" in omni_models:
            initial_model = "OMNI: OpenAI GPT-4o"
        elif "claude" in model_name.lower() and omni_models:
            initial_model = next((m for m in omni_models if "Claude" in m), omni_models[0])
        else:
            initial_model = omni_models[0] if omni_models else "No models available"

    # Example prompts
    example_messages = [
        "Create a Python virtual environment, install pandas and matplotlib, then plot stock data",
        "Open a PDF in Preview, add annotations, and save it as a compressed version",
        "Open Safari, search for 'macOS automation tools', and save the first three results as bookmarks",
        "Configure SSH keys and set up a connection to a remote server",
    ]

    # Function to update model choices based on agent loop selection
    def update_model_choices(loop):
        models = provider_to_models.get(loop, [])
        if loop == "OMNI":
            # For OMNI, include the custom model option
            if not models:
                models = ["Custom model..."]
            elif "Custom model..." not in models:
                models.append("Custom model...")

            return gr.update(
                choices=models, value=models[0] if models else "Custom model...", interactive=True
            )
        else:
            # For other providers, use standard dropdown without custom option
            if not models:
                return gr.update(
                    choices=["No models available"], value="No models available", interactive=True
                )
            return gr.update(choices=models, value=models[0] if models else None, interactive=True)

    # Create the Gradio interface with advanced UI
    with gr.Blocks(title="Computer-Use Agent") as demo:
        with gr.Row():
            # Left column for settings
            with gr.Column(scale=1):
                # Logo with theme-aware styling
                gr.HTML(
                    """
                    <style>
                    .light-logo, .dark-logo {
                        display: block;
                        margin: 0 auto;
                        width: 80px;
                    }
                    /* Hide dark logo in light mode */
                    .dark-logo {
                        display: none;
                    }
                    /* In dark mode, hide light logo and show dark logo */
                    .dark .light-logo {
                        display: none;
                    }
                    .dark .dark-logo {
                        display: block;
                    }
                    </style>
                    <div style="display: flex; justify-content: center; margin-bottom: 0.5em">
                        <img class="light-logo" alt="CUA Logo"
                            src="https://github.com/trycua/cua/blob/main/img/logo_black.png?raw=true" />
                        <img class="dark-logo" alt="CUA Logo"
                            src="https://github.com/trycua/cua/blob/main/img/logo_white.png?raw=true" />
                    </div>
                    """
                )

                # Add installation prerequisites as a collapsible section
                with gr.Accordion("Prerequisites & Installation", open=False):
                    gr.Markdown(
                        """
                        ## Prerequisites

                        Before using the Computer-Use Agent, you need to set up the Lume daemon and pull the macOS VM image.

                        ### 1. Install Lume daemon

                        While a lume binary is included with Computer, we recommend installing the standalone version with brew, and starting the lume daemon service:

                        ```bash
                        sudo /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)"
                        ```

                        ### 2. Start the Lume daemon service

                        In a separate terminal:

                        ```bash
                        lume serve
                        ```

                        ### 3. Pull the pre-built macOS image

                        ```bash
                        lume pull macos-sequoia-cua:latest --no-cache
                        ```

                        Initial download requires 80GB storage, but reduces to ~30GB after first run due to macOS's sparse file system.

                        VMs are stored in `~/.lume`, and locally cached images are stored in `~/.lume/cache`.

                        ### 4. Test the sandbox

                        ```bash
                        lume run macos-sequoia-cua:latest
                        ```

                        For more detailed instructions, visit the [CUA GitHub repository](https://github.com/trycua/cua).
                        """
                    )

                # Configuration options
                agent_loop = gr.Dropdown(
                    choices=["OPENAI", "ANTHROPIC", "OMNI"],
                    label="Agent Loop",
                    value=initial_loop,
                    info="Select the agent loop provider",
                )

                # Create model selection dropdown with custom value support for OMNI
                model_choice = gr.Dropdown(
                    choices=provider_to_models.get(initial_loop, ["No models available"]),
                    label="LLM Provider and Model",
                    value=initial_model,
                    info="Select model or choose 'Custom model...' to enter a custom name",
                    interactive=True,
                )

                # Add custom model textbox (only visible when "Custom model..." is selected)
                custom_model = gr.Textbox(
                    label="Custom Model Name",
                    placeholder="Enter custom model name (e.g., Qwen2.5-VL-7B-Instruct)",
                    value="Qwen2.5-VL-7B-Instruct",  # Default value
                    visible=False,  # Initially hidden
                    interactive=True,
                )

                save_trajectory = gr.Checkbox(
                    label="Save Trajectory",
                    value=True,
                    info="Save the agent's trajectory for debugging",
                    interactive=True,
                )

                recent_images = gr.Slider(
                    label="Recent Images",
                    minimum=1,
                    maximum=10,
                    value=3,
                    step=1,
                    info="Number of recent images to keep in context",
                    interactive=True,
                )

            # Right column for chat interface
            with gr.Column(scale=2):
                # Add instruction text before the chat interface
                gr.Markdown(
                    "Ask me to perform tasks in a virtual macOS environment.<br>Built with <a href='https://github.com/trycua/cua' target='_blank'>github.com/trycua/cua</a>."
                )

                chatbot = gr.Chatbot()
                msg = gr.Textbox(
                    placeholder="Ask me to perform tasks in a virtual macOS environment"
                )
                clear = gr.Button("Clear")

                # Add examples
                example_group = gr.Examples(examples=example_messages, inputs=msg)

                # Function to handle chat submission
                def chat_submit(message, history):
                    # Add user message to history
                    history = history + [(message, None)]
                    return "", history

                # Function to process agent response after user input
                def process_response(
                    history,
                    model_choice_value,
                    custom_model_value,
                    agent_loop_choice,
                    save_traj,
                    recent_imgs,
                ):
                    if not history:
                        return history

                    # Get the last user message
                    last_user_message = history[-1][0]

                    # Use custom model value if "Custom model..." is selected
                    model_to_use = (
                        custom_model_value
                        if model_choice_value == "Custom model..."
                        else model_choice_value
                    )

                    # Process with agent
                    response = respond(
                        last_user_message,
                        history[:-1],  # History without the last message
                        model_to_use,
                        agent_loop_choice,
                        save_traj,
                        recent_imgs,
                        openai_api_key,
                        anthropic_api_key,
                    )

                    # Update the last assistant message
                    history[-1] = (last_user_message, response)
                    return history

                # Connect the components
                msg.submit(chat_submit, [msg, chatbot], [msg, chatbot]).then(
                    process_response,
                    [
                        chatbot,
                        model_choice,
                        custom_model,
                        agent_loop,
                        save_trajectory,
                        recent_images,
                    ],
                    [chatbot],
                )

                # Clear button functionality
                clear.click(lambda: None, None, chatbot, queue=False)

                # Connect agent_loop changes to model selection
                agent_loop.change(
                    fn=update_model_choices,
                    inputs=[agent_loop],
                    outputs=[model_choice],
                    queue=False,  # Process immediately without queueing
                )

                # Show/hide custom model textbox based on dropdown selection
                def update_custom_model_visibility(model_value):
                    return gr.update(visible=model_value == "Custom model...")

                model_choice.change(
                    fn=update_custom_model_visibility,
                    inputs=[model_choice],
                    outputs=[custom_model],
                    queue=False,  # Process immediately without queueing
                )

    return demo


def test_cua():
    """Standalone function to launch the Gradio app."""
    demo = create_gradio_ui()
    demo.launch(share=False)  # Don't create a public link


if __name__ == "__main__":
    test_cua()
````
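For readers tracing the model-routing logic, the following sketch exercises `get_provider_and_model` as defined above. The expected tuples are read off the mapping code, not captured output, and the snippet assumes the package's `AgentLoop` and `LLMProvider` enums import cleanly outside the UI. Note that `respond` later rewrites the OAICOMPAT case to the OPENAI provider with the base URL `http://localhost:8000/v1` hardcoded in `create_agent`.

```python
# Sketch: how the new UI maps a model id and loop choice to a provider.
# Expected results are derived from MODEL_MAPPINGS above, not observed output.
from agent.ui.gradio.app import get_provider_and_model

# Ollama entries keep their name verbatim (e.g. from "OMNI: Ollama phi3:latest")
print(get_provider_and_model("phi3:latest", "OMNI-OLLAMA"))
# -> (LLMProvider.OLLAMA, "phi3:latest", AgentLoop.OMNI)

# Dated Claude names pass through the "omni" table unchanged
print(get_provider_and_model("claude-3-7-sonnet-20250219", "OMNI"))
# -> (LLMProvider.ANTHROPIC, "claude-3-7-sonnet-20250219", AgentLoop.OMNI)

# Anything that is neither Claude nor GPT falls back to OAICOMPAT
print(get_provider_and_model("Qwen2.5-VL-7B-Instruct", "OMNI"))
# -> (LLMProvider.OAICOMPAT, "Qwen2.5-VL-7B-Instruct", AgentLoop.OMNI)
```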