cua-agent 0.1.21__py3-none-any.whl → 0.1.23__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic.

agent/ui/gradio/app.py ADDED
@@ -0,0 +1,871 @@
+ """
+ Advanced Gradio UI for Computer-Use Agent
+
+ This is a Gradio interface for the Computer-Use Agent
+ with an advanced UI for model selection and configuration.
+
+ Supported Agent Loops and Models:
+ - AgentLoop.OPENAI: Uses OpenAI Operator CUA model
+   • computer_use_preview
+
+ - AgentLoop.ANTHROPIC: Uses Anthropic Computer-Use models
+   • claude-3-5-sonnet-20240620
+   • claude-3-7-sonnet-20250219
+
+ - AgentLoop.OMNI (experimental): Uses OmniParser for element pixel-detection
+   • claude-3-5-sonnet-20240620
+   • claude-3-7-sonnet-20250219
+   • gpt-4.5-preview
+   • gpt-4o
+   • gpt-4
+
+ Requirements:
+ - Mac with Apple Silicon (M1/M2/M3/M4)
+ - macOS 14 (Sonoma) or newer
+ - Python 3.10+
+ - Lume CLI installed (https://github.com/trycua/cua)
+ - OpenAI or Anthropic API key
+ """
+
+ import os
+ import asyncio
+ import logging
+ from typing import Dict, List, Optional, AsyncGenerator, Any, Tuple, Union
+ import gradio as gr
+
+ # Import from agent package
+ from agent.core.types import AgentResponse
+ from computer import Computer
+
+ from agent import ComputerAgent, AgentLoop, LLM, LLMProvider
+
+ # Global variables
+ global_agent = None
+ global_computer = None
+
+ # Map model names to specific provider model names
+ MODEL_MAPPINGS = {
+     "openai": {
+         # Default to operator CUA model
+         "default": "computer_use_preview",
+         # Map standard OpenAI model names to CUA-specific model names
+         "gpt-4-turbo": "computer_use_preview",
+         "gpt-4o": "computer_use_preview",
+         "gpt-4": "computer_use_preview",
+         "gpt-4.5-preview": "computer_use_preview",
+     },
+     "anthropic": {
+         # Default to newest model
+         "default": "claude-3-7-sonnet-20250219",
+         # Specific Claude models for CUA
+         "claude-3-5-sonnet-20240620": "claude-3-5-sonnet-20240620",
+         "claude-3-7-sonnet-20250219": "claude-3-7-sonnet-20250219",
+         # Map standard model names to CUA-specific model names
+         "claude-3-opus": "claude-3-7-sonnet-20250219",
+         "claude-3-sonnet": "claude-3-5-sonnet-20240620",
+         "claude-3-5-sonnet": "claude-3-5-sonnet-20240620",
+         "claude-3-7-sonnet": "claude-3-7-sonnet-20250219",
+     },
+     "omni": {
+         # OMNI works with any of these models
+         "default": "gpt-4o",
+         "gpt-4o": "gpt-4o",
+         "gpt-4": "gpt-4",
+         "gpt-4.5-preview": "gpt-4.5-preview",
+         "claude-3-5-sonnet-20240620": "claude-3-5-sonnet-20240620",
+         "claude-3-7-sonnet-20250219": "claude-3-7-sonnet-20250219",
+     },
+     "ollama": {
+         # For Ollama models, we keep the original name
+         "default": "llama3",  # A common default model
+         # Don't map other models - we'll use the original name
+     },
+     "oaicompat": {
+         # Default for OpenAI-compatible providers like VLLM
+         "default": "Qwen2.5-VL-7B-Instruct",
+     },
+ }
+
+
+ def get_provider_and_model(model_name: str, loop_provider: str) -> tuple:
+     """
+     Determine the provider and actual model name to use based on the input.
+
+     Args:
+         model_name: The requested model name
+         loop_provider: The requested agent loop provider
+
+     Returns:
+         tuple: (provider, model_name_to_use, agent_loop)
+     """
+     # Get the agent loop
+     loop_provider_map = {
+         "OPENAI": AgentLoop.OPENAI,
+         "ANTHROPIC": AgentLoop.ANTHROPIC,
+         "OMNI": AgentLoop.OMNI,
+         "OMNI-OLLAMA": AgentLoop.OMNI,  # Special case for Ollama models with OMNI parser
+     }
+     agent_loop = loop_provider_map.get(loop_provider, AgentLoop.OPENAI)
+
+     # Set up the provider and model based on the loop and model_name
+     if agent_loop == AgentLoop.OPENAI:
+         provider = LLMProvider.OPENAI
+         model_name_to_use = MODEL_MAPPINGS["openai"].get(
+             model_name.lower(), MODEL_MAPPINGS["openai"]["default"]
+         )
+     elif agent_loop == AgentLoop.ANTHROPIC:
+         provider = LLMProvider.ANTHROPIC
+         model_name_to_use = MODEL_MAPPINGS["anthropic"].get(
+             model_name.lower(), MODEL_MAPPINGS["anthropic"]["default"]
+         )
+     elif agent_loop == AgentLoop.OMNI:
+         # For OMNI, select provider based on model name or loop_provider
+         if loop_provider == "OMNI-OLLAMA":
+             provider = LLMProvider.OLLAMA
+
+             # For Ollama models from the UI dropdown, we use the model name as is
+             # No need to parse it - it's already the correct Ollama model name
+             model_name_to_use = model_name
+         elif "claude" in model_name.lower():
+             provider = LLMProvider.ANTHROPIC
+             # Fall back to the Anthropic default rather than the OMNI default ("gpt-4o")
+             model_name_to_use = MODEL_MAPPINGS["omni"].get(
+                 model_name.lower(), MODEL_MAPPINGS["anthropic"]["default"]
+             )
+         elif "gpt" in model_name.lower():
+             provider = LLMProvider.OPENAI
+             model_name_to_use = MODEL_MAPPINGS["omni"].get(
+                 model_name.lower(), MODEL_MAPPINGS["omni"]["default"]
+             )
+         else:
+             # Handle custom model names - use the OAICOMPAT provider
+             provider = LLMProvider.OAICOMPAT
+             # Use the model name as is without mapping, or use default if empty
+             model_name_to_use = (
+                 model_name if model_name.strip() else MODEL_MAPPINGS["oaicompat"]["default"]
+             )
+     else:
+         # Default to OpenAI if unrecognized loop
+         provider = LLMProvider.OPENAI
+         model_name_to_use = MODEL_MAPPINGS["openai"]["default"]
+         agent_loop = AgentLoop.OPENAI
+
+     return provider, model_name_to_use, agent_loop
+
+
+ def get_ollama_models() -> List[str]:
+     """Get available models from Ollama if installed."""
+     try:
+         import subprocess
+
+         result = subprocess.run(["ollama", "list"], capture_output=True, text=True)
+         if result.returncode == 0:
+             lines = result.stdout.strip().split("\n")
+             if len(lines) < 2:  # No models or just header
+                 return []
+
+             models = []
+             # Skip header line
+             for line in lines[1:]:
+                 parts = line.split()
+                 if parts:
+                     model_name = parts[0]
+                     models.append(f"OMNI: Ollama {model_name}")
+             return models
+         return []
+     except Exception as e:
+         logging.error(f"Error getting Ollama models: {e}")
+         return []
+
+
+ def extract_synthesized_text(result: Union[AgentResponse, Dict[str, Any]]) -> str:
+     """Extract synthesized text from the agent result."""
+     synthesized_text = ""
+
+     if "output" in result and result["output"]:
+         for output in result["output"]:
+             if output.get("type") == "reasoning":
+                 content = output.get("content", "")
+                 if content:
+                     synthesized_text += f"{content}\n"
+
+             elif output.get("type") == "message":
+                 # Handle message type outputs - can contain rich content
+                 content = output.get("content", [])
+
+                 # Content is usually an array of content blocks
+                 if isinstance(content, list):
+                     for block in content:
+                         if isinstance(block, dict) and block.get("type") == "output_text":
+                             text_value = block.get("text", "")
+                             if text_value:
+                                 synthesized_text += f"{text_value}\n"
+
+             elif output.get("type") == "computer_call":
+                 action = output.get("action", {})
+                 action_type = action.get("type", "")
+
+                 # Create a descriptive text about the action
+                 if action_type == "click":
+                     button = action.get("button", "")
+                     x = action.get("x", "")
+                     y = action.get("y", "")
+                     synthesized_text += f"Clicked {button} at position ({x}, {y}).\n"
+                 elif action_type == "type":
+                     text = action.get("text", "")
+                     synthesized_text += f"Typed: {text}.\n"
+                 elif action_type == "keypress":
+                     # Extract key correctly from either keys array or key field
+                     if isinstance(action.get("keys"), list):
+                         key = ", ".join(action.get("keys"))
+                     else:
+                         key = action.get("key", "")
+
+                     synthesized_text += f"Pressed key: {key}\n"
+                 else:
+                     synthesized_text += f"Performed {action_type} action.\n"
+
+     return synthesized_text.strip()
+
+
+ def create_computer_instance(verbosity: int = logging.INFO) -> Computer:
+     """Create or get the global Computer instance."""
+     global global_computer
+
+     if global_computer is None:
+         global_computer = Computer(verbosity=verbosity)
+
+     return global_computer
+
+
+ def create_agent(
+     provider: LLMProvider,
+     agent_loop: AgentLoop,
+     model_name: str,
+     api_key: Optional[str] = None,
+     save_trajectory: bool = True,
+     only_n_most_recent_images: int = 3,
+     verbosity: int = logging.INFO,
+     use_ollama: bool = False,
+     use_oaicompat: bool = False,
+ ) -> ComputerAgent:
+     """Create or update the global agent with the specified parameters."""
+     global global_agent
+
+     # Create the computer if not already done
+     computer = create_computer_instance(verbosity=verbosity)
+
+     # Extra configuration to pass to the agent
+     extra_config = {}
+
+     # For Ollama models, we'll pass use_ollama and the model_name directly
+     if use_ollama:
+         extra_config["use_ollama"] = True
+         extra_config["ollama_model"] = model_name
+         print(f"DEBUG - Using Ollama with model: {model_name}")
+
+     # Get API key from environment if not provided
+     if api_key is None:
+         if provider == LLMProvider.OPENAI:
+             api_key = os.environ.get("OPENAI_API_KEY", "")
+         elif provider == LLMProvider.ANTHROPIC:
+             api_key = os.environ.get("ANTHROPIC_API_KEY", "")
+
+     # Create LLM model object with appropriate parameters
+     provider_base_url = "http://localhost:8000/v1" if use_oaicompat else None
+
+     if use_oaicompat:
+         # Special handling for OAICOMPAT - use OPENAI provider with custom base URL
+         print(f"DEBUG - Creating OAICOMPAT agent with model: {model_name}")
+         llm = LLM(
+             provider=provider,  # Already set to OPENAI
+             name=model_name,
+             provider_base_url=provider_base_url,
+         )
+         # Note: Don't pass use_oaicompat to the agent, as it doesn't accept this parameter
+     elif provider == LLMProvider.OAICOMPAT:
+         # This path is unlikely to be taken with our current approach
+         llm = LLM(provider=provider, name=model_name, provider_base_url=provider_base_url)
+     else:
+         # For other providers, just use standard parameters
+         llm = LLM(provider=provider, name=model_name)
+
+     # Create or update the agent
+     if global_agent is None:
+         global_agent = ComputerAgent(
+             computer=computer,
+             loop=agent_loop,
+             model=llm,
+             api_key=api_key,
+             save_trajectory=save_trajectory,
+             only_n_most_recent_images=only_n_most_recent_images,
+             verbosity=verbosity,
+             **extra_config,
+         )
+     else:
+         # Update the existing agent's parameters
+         global_agent._loop = None  # Force recreation of the loop
+         global_agent.provider = provider
+         global_agent.loop = agent_loop
+         global_agent.model = llm
+         global_agent.api_key = api_key
+
+         # Explicitly update these settings to ensure they take effect
+         global_agent.save_trajectory = save_trajectory
+         global_agent.only_n_most_recent_images = only_n_most_recent_images
+
+         # Update Ollama settings if applicable
+         if use_ollama:
+             global_agent.use_ollama = True
+             global_agent.ollama_model = model_name
+         else:
+             global_agent.use_ollama = False
+             global_agent.ollama_model = None
+
+         # Log the updated settings
+         logging.info(
+             f"Updated agent settings: save_trajectory={save_trajectory}, recent_images={only_n_most_recent_images}"
+         )
+
+     return global_agent
+
+
+ def process_agent_result(result: Union[AgentResponse, Dict[str, Any]]) -> str:
+     """Process agent results for the Gradio UI."""
+     # Extract text content
+     text_obj = result.get("text", {})
+
+     # For OpenAI's Computer-Use Agent, text field is an object with format property
+     if (
+         text_obj
+         and isinstance(text_obj, dict)
+         and "format" in text_obj
+         and not text_obj.get("value", "")
+     ):
+         content = extract_synthesized_text(result)
+     else:
+         # For other types of results, try to get text directly
+         if isinstance(text_obj, dict):
+             if "value" in text_obj:
+                 content = text_obj["value"]
+             elif "text" in text_obj:
+                 content = text_obj["text"]
+             elif "content" in text_obj:
+                 content = text_obj["content"]
+             else:
+                 content = ""
+         else:
+             content = str(text_obj) if text_obj else ""
+
+     # If still no content but we have outputs, create a summary
+     if not content and "output" in result and result["output"]:
+         output = result["output"]
+         for out in output:
+             if out.get("type") == "reasoning":
+                 content = out.get("content", "")
+                 if content:
+                     break
+             elif out.get("type") == "computer_call":
+                 action = out.get("action", {})
+                 action_type = action.get("type", "")
+                 if action_type:
+                     content = f"Performing action: {action_type}"
+                     break
+
+     # Clean up the text - ensure content is a string
+     if not isinstance(content, str):
+         content = str(content) if content else ""
+
+     return content
+
+
+ def respond(
+     message: str,
+     history: List[Tuple[str, str]],
+     model_choice,  # Accept Gradio Dropdown component
+     agent_loop,  # Accept Gradio Dropdown component
+     save_trajectory,  # Accept Gradio Checkbox component
+     recent_images,  # Accept Gradio Slider component
+     openai_api_key: Optional[str] = None,
+     anthropic_api_key: Optional[str] = None,
+ ) -> str:
+     """Process a message with the Computer-Use Agent and return the response."""
+
+     # Get actual values from Gradio components
+     model_choice_value = model_choice.value if hasattr(model_choice, "value") else model_choice
+     agent_loop_value = agent_loop.value if hasattr(agent_loop, "value") else agent_loop
+     save_trajectory_value = (
+         save_trajectory.value if hasattr(save_trajectory, "value") else save_trajectory
+     )
+     recent_images_value = int(
+         recent_images.value if hasattr(recent_images, "value") else recent_images
+     )
+
+     # Debug logging
+     print(f"DEBUG - Model choice object: {type(model_choice)}")
+     print(f"DEBUG - Model choice value: {model_choice_value}")
+     print(f"DEBUG - Agent loop value: {agent_loop_value}")
+
+     # Create a new event loop for this function call
+     loop = asyncio.new_event_loop()
+     asyncio.set_event_loop(loop)
+
+     async def _async_respond():
+         # Extract the loop type and model from the selection.
+         # Known dropdown prefixes are parsed first so that OMNI dropdown picks
+         # (including "OMNI: Ollama ...") are not mistaken for custom model names.
+         loop_provider = "OPENAI"
+         if isinstance(model_choice_value, str):
+             if model_choice_value.startswith("OpenAI:"):
+                 loop_provider = "OPENAI"
+                 model_id = model_choice_value.replace("OpenAI: ", "").lower()
+             elif model_choice_value.startswith("Anthropic:"):
+                 loop_provider = "ANTHROPIC"
+                 model_id = model_choice_value.replace("Anthropic: ", "").lower()
+             elif model_choice_value.startswith("OMNI:"):
+                 loop_provider = "OMNI"
+                 if "GPT" in model_choice_value:
+                     model_id = model_choice_value.replace("OMNI: OpenAI ", "").lower()
+                 elif "Claude" in model_choice_value:
+                     model_id = model_choice_value.replace("OMNI: ", "").lower()
+                 elif "Ollama" in model_choice_value:
+                     loop_provider = "OMNI-OLLAMA"
+                     # Everything after "OMNI: Ollama " is the full model name (e.g., phi3:latest)
+                     model_id = model_choice_value.replace("OMNI: Ollama ", "")
+                     print(f"DEBUG - Ollama model ID: {model_id}")
+                 else:
+                     model_id = "default"
+             elif agent_loop_value == "OMNI":
+                 # Custom text input from the textbox - use the model name as is
+                 loop_provider = "OMNI"
+                 model_id = model_choice_value
+                 print(f"DEBUG - Using custom model: {model_id}")
+             else:
+                 # Unrecognized string - fall back to the selected agent loop
+                 loop_provider = agent_loop_value
+                 model_id = "default"
+         else:
+             # Model choice is not a string (shouldn't happen, but handle anyway)
+             loop_provider = agent_loop_value
+             model_id = "default"
+
+         print(f"DEBUG - Using loop provider: {loop_provider}, model_id: {model_id}")
+
+         # Use the mapping function to get provider, model name and agent loop
+         provider, model_name, agent_loop_type = get_provider_and_model(model_id, loop_provider)
+         print(
+             f"DEBUG - After mapping: provider={provider}, model_name={model_name}, agent_loop={agent_loop_type}"
+         )
+
+         # Special handling for OAICOMPAT to bypass provider-specific errors
+         # Creates the agent with OPENAI provider but using custom model name and provider base URL
+         is_oaicompat = provider == LLMProvider.OAICOMPAT
+         if is_oaicompat:
+             provider = LLMProvider.OPENAI
+
+         # Get API key based on provider
+         if provider == LLMProvider.OPENAI:
+             api_key = openai_api_key or os.environ.get("OPENAI_API_KEY", "")
+         elif provider == LLMProvider.ANTHROPIC:
+             api_key = anthropic_api_key or os.environ.get("ANTHROPIC_API_KEY", "")
+         else:
+             api_key = ""
+
+         # Check for settings changes if agent already exists
+         settings_changed = False
+         settings_message = ""
+         if global_agent is not None:
+             # Safely check if save_trajectory setting changed
+             current_save_traj = getattr(global_agent, "save_trajectory", None)
+             if current_save_traj is not None and current_save_traj != save_trajectory_value:
+                 settings_changed = True
+                 settings_message += f"Save trajectory set to: {save_trajectory_value}. "
+
+             # Safely check if recent_images setting changed
+             current_recent_images = getattr(global_agent, "only_n_most_recent_images", None)
+             if current_recent_images is not None and current_recent_images != recent_images_value:
+                 settings_changed = True
+                 settings_message += f"Recent images set to: {recent_images_value}. "
+
+         # Create or update the agent
+         try:
+             create_agent(
+                 provider=provider,
+                 agent_loop=agent_loop_type,
+                 model_name=model_name,
+                 api_key=api_key,
+                 save_trajectory=save_trajectory_value,
+                 only_n_most_recent_images=recent_images_value,
+                 use_ollama=loop_provider == "OMNI-OLLAMA",
+                 use_oaicompat=is_oaicompat,
+             )
+
+             if global_agent is None:
+                 return "Failed to create agent. Check API keys and configuration."
+         except Exception as e:
+             return f"Error creating agent: {str(e)}"
+
+         # Notify about settings changes if needed
+         if settings_changed:
+             return f"Settings updated: {settings_message}"
+
+         # Collect all responses
+         response_text = []
+
+         # Run the agent
+         try:
+             async for result in global_agent.run(message):
+                 # Process result
+                 content = process_agent_result(result)
+
+                 # Skip empty content
+                 if not content:
+                     continue
+
+                 # Add content to response list
+                 response_text.append(content)
+
+             # Return the full response as a single string
+             return "\n".join(response_text) if response_text else "Task completed."
+
+         except Exception as e:
+             import traceback
+
+             traceback.print_exc()
+             return f"Error: {str(e)}"
+
+     # Run the async function, making sure the event loop is always closed
+     try:
+         result = loop.run_until_complete(_async_respond())
+         return result
+     except Exception as e:
+         import traceback
+
+         traceback.print_exc()
+         return f"Error executing async operation: {str(e)}"
+     finally:
+         loop.close()
+
+
+ def create_gradio_ui(
+     provider_name: str = "openai",
+     model_name: str = "gpt-4o",
+ ) -> gr.Blocks:
+     """Create a Gradio UI for the Computer-Use Agent.
+
+     Args:
+         provider_name: The provider to use (e.g., "openai", "anthropic")
+         model_name: The model to use (e.g., "gpt-4o", "claude-3-7-sonnet")
+
+     Returns:
+         A Gradio Blocks application
+     """
+     # Check for API keys
+     openai_api_key = os.environ.get("OPENAI_API_KEY", "")
+     anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY", "")
+
+     # Prepare model choices based on available API keys
+     openai_models = []
+     anthropic_models = []
+     omni_models = []
+
+     if openai_api_key:
+         openai_models = ["OpenAI: Computer-Use Preview"]
+         omni_models += [
+             "OMNI: OpenAI GPT-4o",
+             "OMNI: OpenAI GPT-4.5-preview",
+         ]
+
+     if anthropic_api_key:
+         anthropic_models = [
+             "Anthropic: Claude 3.7 Sonnet (20250219)",
+             "Anthropic: Claude 3.5 Sonnet (20240620)",
+         ]
+         omni_models += ["OMNI: Claude 3.7 Sonnet (20250219)", "OMNI: Claude 3.5 Sonnet (20240620)"]
+
+     # Get Ollama models for OMNI
+     ollama_models = get_ollama_models()
+     if ollama_models:
+         omni_models += ollama_models
+
+     # Format model choices
+     provider_to_models = {
+         "OPENAI": openai_models,
+         "ANTHROPIC": anthropic_models,
+         "OMNI": omni_models + ["Custom model..."],  # Add custom model option
+     }
+
+     # Get initial agent loop and model based on provided parameters
+     if provider_name.lower() == "openai":
+         initial_loop = "OPENAI"
+         initial_model = "OpenAI: Computer-Use Preview" if openai_models else "No models available"
+     elif provider_name.lower() == "anthropic":
+         initial_loop = "ANTHROPIC"
+         initial_model = anthropic_models[0] if anthropic_models else "No models available"
+     else:
+         initial_loop = "OMNI"
+         if model_name == "gpt-4o" and "OMNI: OpenAI GPT-4o" in omni_models:
+             initial_model = "OMNI: OpenAI GPT-4o"
+         elif "claude" in model_name.lower() and omni_models:
+             initial_model = next((m for m in omni_models if "Claude" in m), omni_models[0])
+         else:
+             initial_model = omni_models[0] if omni_models else "No models available"
+
+     # Example prompts
+     example_messages = [
+         "Create a Python virtual environment, install pandas and matplotlib, then plot stock data",
+         "Open a PDF in Preview, add annotations, and save it as a compressed version",
+         "Open Safari, search for 'macOS automation tools', and save the first three results as bookmarks",
+         "Configure SSH keys and set up a connection to a remote server",
+     ]
+
+     # Function to update model choices based on agent loop selection
+     def update_model_choices(loop):
+         models = provider_to_models.get(loop, [])
+         if loop == "OMNI":
+             # For OMNI, include the custom model option
+             if not models:
+                 models = ["Custom model..."]
+             elif "Custom model..." not in models:
+                 models.append("Custom model...")
+
+             return gr.update(
+                 choices=models, value=models[0] if models else "Custom model...", interactive=True
+             )
+         else:
+             # For other providers, use standard dropdown without custom option
+             if not models:
+                 return gr.update(
+                     choices=["No models available"], value="No models available", interactive=True
+                 )
+             return gr.update(choices=models, value=models[0] if models else None, interactive=True)
+
+     # Create the Gradio interface with advanced UI
+     with gr.Blocks(title="Computer-Use Agent") as demo:
+         with gr.Row():
+             # Left column for settings
+             with gr.Column(scale=1):
+                 # Logo with theme-aware styling
+                 gr.HTML(
+                     """
+                     <style>
+                     .light-logo, .dark-logo {
+                         display: block;
+                         margin: 0 auto;
+                         width: 80px;
+                     }
+                     /* Hide dark logo in light mode */
+                     .dark-logo {
+                         display: none;
+                     }
+                     /* In dark mode, hide light logo and show dark logo */
+                     .dark .light-logo {
+                         display: none;
+                     }
+                     .dark .dark-logo {
+                         display: block;
+                     }
+                     </style>
+                     <div style="display: flex; justify-content: center; margin-bottom: 0.5em">
+                         <img class="light-logo" alt="CUA Logo"
+                              src="https://github.com/trycua/cua/blob/main/img/logo_black.png?raw=true" />
+                         <img class="dark-logo" alt="CUA Logo"
+                              src="https://github.com/trycua/cua/blob/main/img/logo_white.png?raw=true" />
+                     </div>
+                     """
+                 )
+
+                 # Add installation prerequisites as a collapsible section
+                 with gr.Accordion("Prerequisites & Installation", open=False):
+                     gr.Markdown(
+                         """
+                         ## Prerequisites
+
+                         Before using the Computer-Use Agent, you need to set up the Lume daemon and pull the macOS VM image.
+
+                         ### 1. Install Lume daemon
+
+                         While a lume binary is included with Computer, we recommend installing the standalone version with the install script below, then starting the lume daemon service:
+
+                         ```bash
+                         sudo /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)"
+                         ```
+
+                         ### 2. Start the Lume daemon service
+
+                         In a separate terminal:
+
+                         ```bash
+                         lume serve
+                         ```
+
+                         ### 3. Pull the pre-built macOS image
+
+                         ```bash
+                         lume pull macos-sequoia-cua:latest --no-cache
+                         ```
+
+                         The initial download requires 80GB of storage, which drops to ~30GB after the first run thanks to macOS's sparse file system.
+
+                         VMs are stored in `~/.lume`, and locally cached images are stored in `~/.lume/cache`.
+
+                         ### 4. Test the sandbox
+
+                         ```bash
+                         lume run macos-sequoia-cua:latest
+                         ```
+
+                         For more detailed instructions, visit the [CUA GitHub repository](https://github.com/trycua/cua).
+                         """
+                     )
+
+                 # Configuration options
+                 agent_loop = gr.Dropdown(
+                     choices=["OPENAI", "ANTHROPIC", "OMNI"],
+                     label="Agent Loop",
+                     value=initial_loop,
+                     info="Select the agent loop provider",
+                 )
+
+                 # Create model selection dropdown with custom value support for OMNI
+                 model_choice = gr.Dropdown(
+                     choices=provider_to_models.get(initial_loop, ["No models available"]),
+                     label="LLM Provider and Model",
+                     value=initial_model,
+                     info="Select model or choose 'Custom model...' to enter a custom name",
+                     interactive=True,
+                 )
+
+                 # Add custom model textbox (only visible when "Custom model..." is selected)
+                 custom_model = gr.Textbox(
+                     label="Custom Model Name",
+                     placeholder="Enter custom model name (e.g., Qwen2.5-VL-7B-Instruct)",
+                     value="Qwen2.5-VL-7B-Instruct",  # Default value
+                     visible=False,  # Initially hidden
+                     interactive=True,
+                 )
+
+                 save_trajectory = gr.Checkbox(
+                     label="Save Trajectory",
+                     value=True,
+                     info="Save the agent's trajectory for debugging",
+                     interactive=True,
+                 )
+
+                 recent_images = gr.Slider(
+                     label="Recent Images",
+                     minimum=1,
+                     maximum=10,
+                     value=3,
+                     step=1,
+                     info="Number of recent images to keep in context",
+                     interactive=True,
+                 )
+
+             # Right column for chat interface
+             with gr.Column(scale=2):
+                 # Add instruction text before the chat interface
+                 gr.Markdown(
+                     "Ask me to perform tasks in a virtual macOS environment.<br>Built with <a href='https://github.com/trycua/cua' target='_blank'>github.com/trycua/cua</a>."
+                 )
+
+                 chatbot = gr.Chatbot()
+                 msg = gr.Textbox(
+                     placeholder="Ask me to perform tasks in a virtual macOS environment"
+                 )
+                 clear = gr.Button("Clear")
+
+                 # Add examples
+                 example_group = gr.Examples(examples=example_messages, inputs=msg)
+
+                 # Function to handle chat submission
+                 def chat_submit(message, history):
+                     # Add user message to history
+                     history = history + [(message, None)]
+                     return "", history
+
+                 # Function to process agent response after user input
+                 def process_response(
+                     history,
+                     model_choice_value,
+                     custom_model_value,
+                     agent_loop_choice,
+                     save_traj,
+                     recent_imgs,
+                 ):
+                     if not history:
+                         return history
+
+                     # Get the last user message
+                     last_user_message = history[-1][0]
+
+                     # Use custom model value if "Custom model..." is selected
+                     model_to_use = (
+                         custom_model_value
+                         if model_choice_value == "Custom model..."
+                         else model_choice_value
+                     )
+
+                     # Process with agent
+                     response = respond(
+                         last_user_message,
+                         history[:-1],  # History without the last message
+                         model_to_use,
+                         agent_loop_choice,
+                         save_traj,
+                         recent_imgs,
+                         openai_api_key,
+                         anthropic_api_key,
+                     )
+
+                     # Update the last assistant message
+                     history[-1] = (last_user_message, response)
+                     return history
+
+                 # Connect the components
+                 msg.submit(chat_submit, [msg, chatbot], [msg, chatbot]).then(
+                     process_response,
+                     [
+                         chatbot,
+                         model_choice,
+                         custom_model,
+                         agent_loop,
+                         save_trajectory,
+                         recent_images,
+                     ],
+                     [chatbot],
+                 )
+
+                 # Clear button functionality
+                 clear.click(lambda: None, None, chatbot, queue=False)
+
+                 # Connect agent_loop changes to model selection
+                 agent_loop.change(
+                     fn=update_model_choices,
+                     inputs=[agent_loop],
+                     outputs=[model_choice],
+                     queue=False,  # Process immediately without queueing
+                 )
+
+                 # Show/hide custom model textbox based on dropdown selection
+                 def update_custom_model_visibility(model_value):
+                     return gr.update(visible=model_value == "Custom model...")
+
+                 model_choice.change(
+                     fn=update_custom_model_visibility,
+                     inputs=[model_choice],
+                     outputs=[custom_model],
+                     queue=False,  # Process immediately without queueing
+                 )
+
+     return demo
+
+
+ def test_cua():
+     """Standalone function to launch the Gradio app."""
+     demo = create_gradio_ui()
+     demo.launch(share=False)  # Don't create a public link
+
+
+ if __name__ == "__main__":
+     test_cua()
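
For reference, a minimal usage sketch of the module added in this release. This is not part of the package diff itself: it simply mirrors the `test_cua()` entry point above and assumes the wheel is installed, the Lume daemon is running, and an `OPENAI_API_KEY` or `ANTHROPIC_API_KEY` is exported.

```python
# Sketch: launch the new advanced Gradio UI from an installed cua-agent wheel.
# Assumes the prerequisites from the module docstring (Lume daemon, API key).
from agent.ui.gradio.app import create_gradio_ui

# Both arguments are optional; "openai" / "gpt-4o" are the defaults in the diff.
demo = create_gradio_ui(provider_name="openai", model_name="gpt-4o")
demo.launch(share=False)  # same launch settings as test_cua() above
```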