cua-agent 0.1.24__py3-none-any.whl → 0.1.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

agent/ui/gradio/app.py CHANGED
@@ -30,11 +30,16 @@ Requirements:
30
30
  import os
31
31
  import asyncio
32
32
  import logging
33
+ import json
34
+ from pathlib import Path
33
35
  from typing import Dict, List, Optional, AsyncGenerator, Any, Tuple, Union
34
36
  import gradio as gr
37
+ from gradio.components.chatbot import MetadataDict
35
38
 
36
39
  # Import from agent package
37
40
  from agent.core.types import AgentResponse
41
+ from agent.core.callbacks import DefaultCallbackHandler
42
+ from agent.providers.omni.parser import ParseResult
38
43
  from computer import Computer
39
44
 
40
45
  from agent import ComputerAgent, AgentLoop, LLM, LLMProvider
@@ -42,6 +47,86 @@ from agent import ComputerAgent, AgentLoop, LLM, LLMProvider
42
47
  # Global variables
43
48
  global_agent = None
44
49
  global_computer = None
50
+ SETTINGS_FILE = Path(".gradio_settings.json")
51
+
52
+ # We'll use asyncio.run() instead of a persistent event loop
53
+
54
+
55
+ # --- Settings Load/Save Functions ---
56
def load_settings() -> Dict[str, Any]:
    """Load the persisted UI settings from ``SETTINGS_FILE``.

    The file is read as UTF-8 encoded JSON. A missing file is handled
    silently; a file that cannot be read, decoded or parsed produces a
    warning on stdout. Only a top-level JSON object is accepted.

    Returns:
        The decoded settings mapping, or an empty dict when the file is
        missing, unreadable, malformed, or does not hold a JSON object.
    """
    try:
        # Open directly instead of testing exists() first: the file can
        # disappear between the check and the open.
        with open(SETTINGS_FILE, "r", encoding="utf-8") as f:
            settings = json.load(f)
    except FileNotFoundError:
        # Nothing has been saved yet; not worth a warning.
        return {}
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError (malformed JSON) and
        # UnicodeDecodeError (bytes that are not valid UTF-8).
        print(f"Warning: Could not load settings from {SETTINGS_FILE}: {e}")
        return {}

    # Basic validation (can be expanded)
    if isinstance(settings, dict):
        print(f"DEBUG - Loaded settings from {SETTINGS_FILE}")
        return settings

    print(
        f"Warning: Ignoring settings in {SETTINGS_FILE}: "
        f"expected a JSON object, got {type(settings).__name__}"
    )
    return {}
69
+
70
+
71
def save_settings(settings: Dict[str, Any]):
    """Persist UI settings to ``SETTINGS_FILE`` as indented JSON.

    The ``"provider_api_key"`` entry is never written to disk. The mapping
    passed in is left untouched, so the caller can keep using the key after
    saving.

    Args:
        settings: Settings to persist. Values must be JSON-serializable.

    Raises:
        TypeError: If a value cannot be serialized to JSON. In that case the
            existing settings file is not modified.
        ValueError: If the mapping contains a circular reference. The
            existing settings file is likewise not modified.
    """
    # Ensure sensitive keys are not saved. Filter into a new dict rather than
    # pop() so the caller's mapping is not mutated as a side effect.
    persistable = {
        key: value for key, value in settings.items() if key != "provider_api_key"
    }

    # Serialize before opening the file: opening with "w" truncates it, so a
    # serialization failure afterwards would destroy the previous settings.
    payload = json.dumps(persistable, indent=4)

    try:
        with open(SETTINGS_FILE, "w", encoding="utf-8") as f:
            f.write(payload)
        print(f"DEBUG - Saved settings to {SETTINGS_FILE}")
    except OSError as e:
        print(f"Warning: Could not save settings to {SETTINGS_FILE}: {e}")
81
+
82
+
83
+ # --- End Settings Load/Save ---
84
+
85
+
86
+ # Custom Screenshot Handler for Gradio chat
87
class GradioChatScreenshotHandler(DefaultCallbackHandler):
    """Callback handler that appends each agent screenshot to the Gradio chat history."""

    def __init__(self, chatbot_history: List[gr.ChatMessage]):
        """Keep a reference to the chat history that screenshots are appended to.

        Args:
            chatbot_history: The chat message list to append to. It is stored
                as-is (not copied), so appended messages are visible to
                whoever else holds the same list.
        """
        # NOTE(review): the base-class initializer is not invoked here;
        # confirm DefaultCallbackHandler needs no setup of its own.
        self.chatbot_history = chatbot_history
        print("GradioChatScreenshotHandler initialized")

    async def on_screenshot(
        self,
        screenshot_base64: str,
        action_type: str = "",
        parsed_screen: Optional[ParseResult] = None,
    ) -> None:
        """Append a screenshot to the chat history as an assistant message.

        The screenshot is embedded as a markdown image using a
        ``data:image/png;base64`` URI. Nothing is appended when the stored
        history is ``None``.

        Args:
            screenshot_base64: Base64 encoded screenshot.
            action_type: Type of action that triggered the screenshot; used
                in the image alt text and the message title.
            parsed_screen: Accepted but not used by this handler.
        """
        if self.chatbot_history is None:
            return

        # The message body is a single markdown image element.
        screenshot_message = gr.ChatMessage(
            role="assistant",
            content=f"![Screenshot after {action_type}](data:image/png;base64,{screenshot_base64})",
            metadata={"title": f"🖥️ Screenshot - {action_type}", "status": "done"},
        )
        self.chatbot_history.append(screenshot_message)
129
+
45
130
 
46
131
  # Map model names to specific provider model names
47
132
  MODEL_MAPPINGS = {
@@ -53,6 +138,7 @@ MODEL_MAPPINGS = {
53
138
  "gpt-4o": "computer_use_preview",
54
139
  "gpt-4": "computer_use_preview",
55
140
  "gpt-4.5-preview": "computer_use_preview",
141
+ "gpt-4o-mini": "gpt-4o-mini",
56
142
  },
57
143
  "anthropic": {
58
144
  # Default to newest model
@@ -70,6 +156,7 @@ MODEL_MAPPINGS = {
70
156
  # OMNI works with any of these models
71
157
  "default": "gpt-4o",
72
158
  "gpt-4o": "gpt-4o",
159
+ "gpt-4o-mini": "gpt-4o-mini",
73
160
  "gpt-4": "gpt-4",
74
161
  "gpt-4.5-preview": "gpt-4.5-preview",
75
162
  "claude-3-5-sonnet-20240620": "claude-3-5-sonnet-20240620",
@@ -119,30 +206,82 @@ def get_provider_and_model(model_name: str, loop_provider: str) -> tuple:
119
206
  model_name.lower(), MODEL_MAPPINGS["anthropic"]["default"]
120
207
  )
121
208
  elif agent_loop == AgentLoop.OMNI:
122
- # For OMNI, select provider based on model name or loop_provider
123
- if loop_provider == "OMNI-OLLAMA":
209
+ # Determine provider and clean model name based on the full string from UI
210
+ cleaned_model_name = model_name # Default to using the name as-is (for custom)
211
+
212
+ if model_name == "Custom model...":
213
+ # Actual model name comes from custom_model_value via model_to_use.
214
+ # Assume OAICOMPAT for custom models unless overridden by URL/key later?
215
+ # get_provider_and_model determines the *initial* provider/model.
216
+ # The custom URL/key in process_response ultimately dictates the OAICOMPAT setup.
217
+ provider = LLMProvider.OAICOMPAT
218
+ # We set cleaned_model_name below outside the checks based on model_to_use
219
+ cleaned_model_name = "" # Placeholder, will be set by custom value later
220
+ elif model_name.startswith("OMNI: Ollama "):
124
221
  provider = LLMProvider.OLLAMA
125
-
126
- # For Ollama models from the UI dropdown, we use the model name as is
127
- # No need to parse it - it's already the correct Ollama model name
128
- model_name_to_use = model_name
129
- elif "claude" in model_name.lower():
222
+ # Extract the part after "OMNI: Ollama "
223
+ cleaned_model_name = model_name.split("OMNI: Ollama ", 1)[1]
224
+ elif model_name.startswith("OMNI: Claude "):
130
225
  provider = LLMProvider.ANTHROPIC
131
- model_name_to_use = MODEL_MAPPINGS["omni"].get(
132
- model_name.lower(), MODEL_MAPPINGS["omni"]["default"]
133
- )
134
- elif "gpt" in model_name.lower():
226
+ # Extract the canonical model name based on the UI string
227
+ # e.g., "OMNI: Claude 3.7 Sonnet (20250219)" -> "3.7 Sonnet" and "20250219"
228
+ parts = model_name.split(" (")
229
+ model_key_part = parts[0].replace("OMNI: Claude ", "")
230
+ date_part = parts[1].replace(")", "") if len(parts) > 1 else ""
231
+
232
+ # Normalize the extracted key part for comparison
233
+ # "3.7 Sonnet" -> "37sonnet"
234
+ model_key_part_norm = model_key_part.lower().replace(".", "").replace(" ", "")
235
+
236
+ cleaned_model_name = MODEL_MAPPINGS["omni"]["default"] # Default if not found
237
+ # Find the canonical name in the main Anthropic map
238
+ for key_anthropic, val_anthropic in MODEL_MAPPINGS["anthropic"].items():
239
+ # Normalize the canonical key for comparison
240
+ # "claude-3-7-sonnet-20250219" -> "claude37sonnet20250219"
241
+ key_anthropic_norm = key_anthropic.lower().replace("-", "")
242
+
243
+ # Check if the normalized canonical key starts with "claude" + normalized extracted part
244
+ # AND contains the date part.
245
+ if (
246
+ key_anthropic_norm.startswith("claude" + model_key_part_norm)
247
+ and date_part in key_anthropic_norm
248
+ ):
249
+ cleaned_model_name = (
250
+ val_anthropic # Use the canonical name like "claude-3-7-sonnet-20250219"
251
+ )
252
+ break
253
+ elif model_name.startswith("OMNI: OpenAI "):
135
254
  provider = LLMProvider.OPENAI
136
- model_name_to_use = MODEL_MAPPINGS["omni"].get(
137
- model_name.lower(), MODEL_MAPPINGS["omni"]["default"]
138
- )
139
- else:
140
- # Handle custom model names - use the OAICOMPAT provider
255
+ # Extract the model part, e.g., "GPT-4o mini"
256
+ model_key_part = model_name.replace("OMNI: OpenAI ", "")
257
+ # Normalize the extracted part: "gpt4omini"
258
+ model_key_part_norm = model_key_part.lower().replace("-", "").replace(" ", "")
259
+
260
+ cleaned_model_name = MODEL_MAPPINGS["omni"]["default"] # Default if not found
261
+ # Find the canonical name in the main OMNI map for OpenAI models
262
+ for key_omni, val_omni in MODEL_MAPPINGS["omni"].items():
263
+ # Normalize the omni map key: "gpt-4o-mini" -> "gpt4omini"
264
+ key_omni_norm = key_omni.lower().replace("-", "").replace(" ", "")
265
+ # Check if the normalized omni key matches the normalized extracted part
266
+ if key_omni_norm == model_key_part_norm:
267
+ cleaned_model_name = (
268
+ val_omni # Use the value from the OMNI map (e.g., gpt-4o-mini)
269
+ )
270
+ break
271
+ # Note: No fallback needed here as we explicitly check against omni keys
272
+
273
+ else: # Handles unexpected formats or the raw custom name if "Custom model..." selected
274
+ # Should only happen if user selected "Custom model..."
275
+ # Or if a model name format isn't caught above
141
276
  provider = LLMProvider.OAICOMPAT
142
- # Use the model name as is without mapping, or use default if empty
143
- model_name_to_use = (
144
- model_name if model_name.strip() else MODEL_MAPPINGS["oaicompat"]["default"]
277
+ cleaned_model_name = (
278
+ model_name.strip() if model_name.strip() else MODEL_MAPPINGS["oaicompat"]["default"]
145
279
  )
280
+
281
+ # Assign the determined model name
282
+ model_name_to_use = cleaned_model_name
283
+ # agent_loop remains AgentLoop.OMNI
284
+
146
285
  else:
147
286
  # Default to OpenAI if unrecognized loop
148
287
  provider = LLMProvider.OPENAI
@@ -177,17 +316,20 @@ def get_ollama_models() -> List[str]:
177
316
  return []
178
317
 
179
318
 
180
- def extract_synthesized_text(result: Union[AgentResponse, Dict[str, Any]]) -> str:
319
+ def extract_synthesized_text(
320
+ result: Union[AgentResponse, Dict[str, Any]],
321
+ ) -> Tuple[str, MetadataDict]:
181
322
  """Extract synthesized text from the agent result."""
182
323
  synthesized_text = ""
324
+ metadata = MetadataDict()
183
325
 
184
326
  if "output" in result and result["output"]:
185
327
  for output in result["output"]:
186
328
  if output.get("type") == "reasoning":
329
+ metadata["title"] = "🧠 Reasoning"
187
330
  content = output.get("content", "")
188
331
  if content:
189
332
  synthesized_text += f"{content}\n"
190
-
191
333
  elif output.get("type") == "message":
192
334
  # Handle message type outputs - can contain rich content
193
335
  content = output.get("content", [])
@@ -224,7 +366,10 @@ def extract_synthesized_text(result: Union[AgentResponse, Dict[str, Any]]) -> st
224
366
  else:
225
367
  synthesized_text += f"Performed {action_type} action.\n"
226
368
 
227
- return synthesized_text.strip()
369
+ metadata["status"] = "done"
370
+ metadata["title"] = f"🛠️ {synthesized_text.strip().splitlines()[-1]}"
371
+
372
+ return synthesized_text.strip(), metadata
228
373
 
229
374
 
230
375
  def create_computer_instance(verbosity: int = logging.INFO) -> Computer:
@@ -245,8 +390,8 @@ def create_agent(
245
390
  save_trajectory: bool = True,
246
391
  only_n_most_recent_images: int = 3,
247
392
  verbosity: int = logging.INFO,
248
- use_ollama: bool = False,
249
393
  use_oaicompat: bool = False,
394
+ provider_base_url: Optional[str] = None,
250
395
  ) -> ComputerAgent:
251
396
  """Create or update the global agent with the specified parameters."""
252
397
  global global_agent
@@ -254,15 +399,6 @@ def create_agent(
254
399
  # Create the computer if not already done
255
400
  computer = create_computer_instance(verbosity=verbosity)
256
401
 
257
- # Extra configuration to pass to the agent
258
- extra_config = {}
259
-
260
- # For Ollama models, we'll pass use_ollama and the model_name directly
261
- if use_ollama:
262
- extra_config["use_ollama"] = True
263
- extra_config["ollama_model"] = model_name
264
- print(f"DEBUG - Using Ollama with model: {model_name}")
265
-
266
402
  # Get API key from environment if not provided
267
403
  if api_key is None:
268
404
  if provider == LLMProvider.OPENAI:
@@ -270,69 +406,52 @@ def create_agent(
270
406
  elif provider == LLMProvider.ANTHROPIC:
271
407
  api_key = os.environ.get("ANTHROPIC_API_KEY", "")
272
408
 
273
- # Create LLM model object with appropriate parameters
274
- provider_base_url = "http://localhost:8000/v1" if use_oaicompat else None
409
+ # Use provided provider_base_url if available, otherwise use default
410
+ default_base_url = "http://localhost:1234/v1" if use_oaicompat else None
411
+ custom_base_url = provider_base_url or default_base_url
275
412
 
276
413
  if use_oaicompat:
277
- # Special handling for OAICOMPAT - use OPENAI provider with custom base URL
278
- print(f"DEBUG - Creating OAICOMPAT agent with model: {model_name}")
414
+ # Special handling for OAICOMPAT - use OAICOMPAT provider with custom base URL
415
+ print(f"DEBUG - Creating OAICOMPAT agent with model: {model_name}, URL: {custom_base_url}")
279
416
  llm = LLM(
280
- provider=provider, # Already set to OPENAI
417
+ provider=LLMProvider.OAICOMPAT, # Set to OAICOMPAT instead of using original provider
281
418
  name=model_name,
282
- provider_base_url=provider_base_url,
419
+ provider_base_url=custom_base_url,
283
420
  )
421
+ print(f"DEBUG - LLM provider is now: {llm.provider}, base URL: {llm.provider_base_url}")
284
422
  # Note: Don't pass use_oaicompat to the agent, as it doesn't accept this parameter
285
423
  elif provider == LLMProvider.OAICOMPAT:
286
424
  # This path is unlikely to be taken with our current approach
287
- llm = LLM(provider=provider, name=model_name, provider_base_url=provider_base_url)
425
+ llm = LLM(provider=provider, name=model_name, provider_base_url=custom_base_url)
288
426
  else:
289
427
  # For other providers, just use standard parameters
290
428
  llm = LLM(provider=provider, name=model_name)
291
429
 
292
430
  # Create or update the agent
293
- if global_agent is None:
294
- global_agent = ComputerAgent(
295
- computer=computer,
296
- loop=agent_loop,
297
- model=llm,
298
- api_key=api_key,
299
- save_trajectory=save_trajectory,
300
- only_n_most_recent_images=only_n_most_recent_images,
301
- verbosity=verbosity,
302
- **extra_config,
303
- )
304
- else:
305
- # Update the existing agent's parameters
306
- global_agent._loop = None # Force recreation of the loop
307
- global_agent.provider = provider
308
- global_agent.loop = agent_loop
309
- global_agent.model = llm
310
- global_agent.api_key = api_key
311
-
312
- # Explicitly update these settings to ensure they take effect
313
- global_agent.save_trajectory = save_trajectory
314
- global_agent.only_n_most_recent_images = only_n_most_recent_images
315
-
316
- # Update Ollama settings if applicable
317
- if use_ollama:
318
- global_agent.use_ollama = True
319
- global_agent.ollama_model = model_name
320
- else:
321
- global_agent.use_ollama = False
322
- global_agent.ollama_model = None
323
-
324
- # Log the updated settings
325
- logging.info(
326
- f"Updated agent settings: save_trajectory={save_trajectory}, recent_images={only_n_most_recent_images}"
327
- )
431
+ global_agent = ComputerAgent(
432
+ computer=computer,
433
+ loop=agent_loop,
434
+ model=llm,
435
+ api_key=api_key,
436
+ save_trajectory=save_trajectory,
437
+ only_n_most_recent_images=only_n_most_recent_images,
438
+ verbosity=verbosity,
439
+ )
328
440
 
329
441
  return global_agent
330
442
 
331
443
 
332
- def process_agent_result(result: Union[AgentResponse, Dict[str, Any]]) -> str:
444
+ def process_agent_result(result: Union[AgentResponse, Dict[str, Any]]) -> Tuple[str, MetadataDict]:
333
445
  """Process agent results for the Gradio UI."""
334
446
  # Extract text content
335
447
  text_obj = result.get("text", {})
448
+ metadata = result.get("metadata", {})
449
+
450
+ # Create a properly typed MetadataDict
451
+ metadata_dict = MetadataDict()
452
+ metadata_dict["title"] = metadata.get("title", "")
453
+ metadata_dict["status"] = "done"
454
+ metadata = metadata_dict
336
455
 
337
456
  # For OpenAI's Computer-Use Agent, text field is an object with format property
338
457
  if (
@@ -341,8 +460,11 @@ def process_agent_result(result: Union[AgentResponse, Dict[str, Any]]) -> str:
341
460
  and "format" in text_obj
342
461
  and not text_obj.get("value", "")
343
462
  ):
344
- content = extract_synthesized_text(result)
463
+ content, metadata = extract_synthesized_text(result)
345
464
  else:
465
+ if not text_obj:
466
+ text_obj = result
467
+
346
468
  # For other types of results, try to get text directly
347
469
  if isinstance(text_obj, dict):
348
470
  if "value" in text_obj:
@@ -375,177 +497,7 @@ def process_agent_result(result: Union[AgentResponse, Dict[str, Any]]) -> str:
375
497
  if not isinstance(content, str):
376
498
  content = str(content) if content else ""
377
499
 
378
- return content
379
-
380
-
381
- def respond(
382
- message: str,
383
- history: List[Tuple[str, str]],
384
- model_choice, # Accept Gradio Dropdown component
385
- agent_loop, # Accept Gradio Dropdown component
386
- save_trajectory, # Accept Gradio Checkbox component
387
- recent_images, # Accept Gradio Slider component
388
- openai_api_key: Optional[str] = None,
389
- anthropic_api_key: Optional[str] = None,
390
- ) -> str:
391
- """Process a message with the Computer-Use Agent and return the response."""
392
- import asyncio
393
-
394
- # Get actual values from Gradio components
395
- model_choice_value = model_choice.value if hasattr(model_choice, "value") else model_choice
396
- agent_loop_value = agent_loop.value if hasattr(agent_loop, "value") else agent_loop
397
- save_trajectory_value = (
398
- save_trajectory.value if hasattr(save_trajectory, "value") else save_trajectory
399
- )
400
- recent_images_value = int(
401
- recent_images.value if hasattr(recent_images, "value") else recent_images
402
- )
403
-
404
- # Debug logging
405
- print(f"DEBUG - Model choice object: {type(model_choice)}")
406
- print(f"DEBUG - Model choice value: {model_choice_value}")
407
- print(f"DEBUG - Agent loop value: {agent_loop_value}")
408
-
409
- # Create a new event loop for this function call
410
- loop = asyncio.new_event_loop()
411
- asyncio.set_event_loop(loop)
412
-
413
- async def _async_respond():
414
- # Extract the loop type and model from the selection
415
- loop_provider = "OPENAI"
416
- if isinstance(model_choice_value, str):
417
- # This is the case for a custom text input from textbox
418
- if agent_loop_value == "OMNI":
419
- loop_provider = "OMNI"
420
- # Use the custom model name as is
421
- model_id = model_choice_value
422
- print(f"DEBUG - Using custom model: {model_id}")
423
- else:
424
- # Handle regular dropdown value as string
425
- if model_choice_value.startswith("OpenAI:"):
426
- loop_provider = "OPENAI"
427
- model_id = model_choice_value.replace("OpenAI: ", "").lower()
428
- elif model_choice_value.startswith("Anthropic:"):
429
- loop_provider = "ANTHROPIC"
430
- model_id = model_choice_value.replace("Anthropic: ", "").lower()
431
- elif model_choice_value.startswith("OMNI:"):
432
- loop_provider = "OMNI"
433
- if "GPT" in model_choice_value:
434
- model_id = model_choice_value.replace("OMNI: OpenAI ", "").lower()
435
- elif "Claude" in model_choice_value:
436
- model_id = model_choice_value.replace("OMNI: ", "").lower()
437
- elif "Ollama" in model_choice_value:
438
- loop_provider = "OMNI-OLLAMA"
439
- # Extract everything after "OMNI: Ollama " which is the full model name (e.g., phi3:latest)
440
- model_id = model_choice_value.replace("OMNI: Ollama ", "")
441
- print(f"DEBUG - Ollama model ID: {model_id}")
442
- else:
443
- model_id = "default"
444
- else:
445
- # Default case
446
- loop_provider = agent_loop_value
447
- model_id = "default"
448
- else:
449
- # Model choice is not a string (shouldn't happen, but handle anyway)
450
- loop_provider = agent_loop_value
451
- model_id = "default"
452
-
453
- print(f"DEBUG - Using loop provider: {loop_provider}, model_id: {model_id}")
454
-
455
- # Use the mapping function to get provider, model name and agent loop
456
- provider, model_name, agent_loop_type = get_provider_and_model(model_id, loop_provider)
457
- print(
458
- f"DEBUG - After mapping: provider={provider}, model_name={model_name}, agent_loop={agent_loop_type}"
459
- )
460
-
461
- # Special handling for OAICOMPAT to bypass provider-specific errors
462
- # Creates the agent with OPENAI provider but using custom model name and provider base URL
463
- is_oaicompat = str(provider) == "oaicompat"
464
- if is_oaicompat:
465
- provider = LLMProvider.OPENAI
466
-
467
- # Get API key based on provider
468
- if provider == LLMProvider.OPENAI:
469
- api_key = openai_api_key or os.environ.get("OPENAI_API_KEY", "")
470
- elif provider == LLMProvider.ANTHROPIC:
471
- api_key = anthropic_api_key or os.environ.get("ANTHROPIC_API_KEY", "")
472
- else:
473
- api_key = ""
474
-
475
- # Check for settings changes if agent already exists
476
- settings_changed = False
477
- settings_message = ""
478
- if global_agent is not None:
479
- # Safely check if save_trajectory setting changed
480
- current_save_traj = getattr(global_agent, "save_trajectory", None)
481
- if current_save_traj is not None and current_save_traj != save_trajectory_value:
482
- settings_changed = True
483
- settings_message += f"Save trajectory set to: {save_trajectory_value}. "
484
-
485
- # Safely check if recent_images setting changed
486
- current_recent_images = getattr(global_agent, "only_n_most_recent_images", None)
487
- if current_recent_images is not None and current_recent_images != recent_images_value:
488
- settings_changed = True
489
- settings_message += f"Recent images set to: {recent_images_value}. "
490
-
491
- # Create or update the agent
492
- try:
493
- create_agent(
494
- provider=provider,
495
- agent_loop=agent_loop_type,
496
- model_name=model_name,
497
- api_key=api_key,
498
- save_trajectory=save_trajectory_value,
499
- only_n_most_recent_images=recent_images_value,
500
- use_ollama=loop_provider == "OMNI-OLLAMA",
501
- use_oaicompat=is_oaicompat,
502
- )
503
-
504
- if global_agent is None:
505
- return "Failed to create agent. Check API keys and configuration."
506
- except Exception as e:
507
- return f"Error creating agent: {str(e)}"
508
-
509
- # Notify about settings changes if needed
510
- if settings_changed:
511
- return f"Settings updated: {settings_message}"
512
-
513
- # Collect all responses
514
- response_text = []
515
-
516
- # Run the agent
517
- try:
518
- async for result in global_agent.run(message):
519
- # Process result
520
- content = process_agent_result(result)
521
-
522
- # Skip empty content
523
- if not content:
524
- continue
525
-
526
- # Add content to response list
527
- response_text.append(content)
528
-
529
- # Return the full response as a single string
530
- return "\n".join(response_text) if response_text else "Task completed."
531
-
532
- except Exception as e:
533
- import traceback
534
-
535
- traceback.print_exc()
536
- return f"Error: {str(e)}"
537
-
538
- # Run the async function and get the result
539
- try:
540
- result = loop.run_until_complete(_async_respond())
541
- loop.close()
542
- return result
543
- except Exception as e:
544
- loop.close()
545
- import traceback
546
-
547
- traceback.print_exc()
548
- return f"Error executing async operation: {str(e)}"
500
+ return content, metadata
549
501
 
550
502
 
551
503
  def create_gradio_ui(
@@ -561,6 +513,10 @@ def create_gradio_ui(
561
513
  Returns:
562
514
  A Gradio Blocks application
563
515
  """
516
+ # --- Load Settings ---
517
+ saved_settings = load_settings()
518
+ # --- End Load Settings ---
519
+
564
520
  # Check for API keys
565
521
  openai_api_key = os.environ.get("OPENAI_API_KEY", "")
566
522
  anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY", "")
@@ -574,6 +530,7 @@ def create_gradio_ui(
574
530
  openai_models = ["OpenAI: Computer-Use Preview"]
575
531
  omni_models += [
576
532
  "OMNI: OpenAI GPT-4o",
533
+ "OMNI: OpenAI GPT-4o mini",
577
534
  "OMNI: OpenAI GPT-4.5-preview",
578
535
  ]
579
536
 
@@ -596,21 +553,33 @@ def create_gradio_ui(
596
553
  "OMNI": omni_models + ["Custom model..."], # Add custom model option
597
554
  }
598
555
 
599
- # Get initial agent loop and model based on provided parameters
600
- if provider_name.lower() == "openai":
601
- initial_loop = "OPENAI"
602
- initial_model = "OpenAI: Computer-Use Preview" if openai_models else "No models available"
603
- elif provider_name.lower() == "anthropic":
604
- initial_loop = "ANTHROPIC"
605
- initial_model = anthropic_models[0] if anthropic_models else "No models available"
556
+ # --- Apply Saved Settings (override defaults if available) ---
557
+ initial_loop = saved_settings.get("agent_loop", "OMNI")
558
+ # Ensure the saved model is actually available in the choices for the loaded loop
559
+ available_models_for_loop = provider_to_models.get(initial_loop, [])
560
+ saved_model_choice = saved_settings.get("model_choice")
561
+ if saved_model_choice and saved_model_choice in available_models_for_loop:
562
+ initial_model = saved_model_choice
606
563
  else:
607
- initial_loop = "OMNI"
608
- if model_name == "gpt-4o" and "OMNI: OpenAI GPT-4o" in omni_models:
609
- initial_model = "OMNI: OpenAI GPT-4o"
610
- elif "claude" in model_name.lower() and omni_models:
611
- initial_model = next((m for m in omni_models if "Claude" in m), omni_models[0])
612
- else:
564
+ # If saved model isn't valid for the loop, reset to default for that loop
565
+ if initial_loop == "OPENAI":
566
+ initial_model = (
567
+ "OpenAI: Computer-Use Preview" if openai_models else "No models available"
568
+ )
569
+ elif initial_loop == "ANTHROPIC":
570
+ initial_model = anthropic_models[0] if anthropic_models else "No models available"
571
+ else: # OMNI
613
572
  initial_model = omni_models[0] if omni_models else "No models available"
573
+ if "Custom model..." in available_models_for_loop:
574
+ initial_model = (
575
+ "Custom model..." # Default to custom if available and no other default fits
576
+ )
577
+
578
+ initial_custom_model = saved_settings.get("custom_model", "Qwen2.5-VL-7B-Instruct")
579
+ initial_provider_base_url = saved_settings.get("provider_base_url", "http://localhost:1234/v1")
580
+ initial_save_trajectory = saved_settings.get("save_trajectory", True)
581
+ initial_recent_images = saved_settings.get("recent_images", 3)
582
+ # --- End Apply Saved Settings ---
614
583
 
615
584
  # Example prompts
616
585
  example_messages = [
@@ -703,7 +672,7 @@ def create_gradio_ui(
703
672
  ### 3. Pull the pre-built macOS image
704
673
 
705
674
  ```bash
706
- lume pull macos-sequoia-cua:latest --no-cache
675
+ lume pull macos-sequoia-cua:latest
707
676
  ```
708
677
 
709
678
  Initial download requires 80GB storage, but reduces to ~30GB after first run due to macOS's sparse file system.
@@ -720,48 +689,68 @@ def create_gradio_ui(
720
689
  """
721
690
  )
722
691
 
723
- # Configuration options
724
- agent_loop = gr.Dropdown(
725
- choices=["OPENAI", "ANTHROPIC", "OMNI"],
726
- label="Agent Loop",
727
- value=initial_loop,
728
- info="Select the agent loop provider",
729
- )
692
+ with gr.Accordion("Configuration", open=True):
693
+ # Configuration options
694
+ agent_loop = gr.Dropdown(
695
+ choices=["OPENAI", "ANTHROPIC", "OMNI"],
696
+ label="Agent Loop",
697
+ value=initial_loop,
698
+ info="Select the agent loop provider",
699
+ )
730
700
 
731
- # Create model selection dropdown with custom value support for OMNI
732
- model_choice = gr.Dropdown(
733
- choices=provider_to_models.get(initial_loop, ["No models available"]),
734
- label="LLM Provider and Model",
735
- value=initial_model,
736
- info="Select model or choose 'Custom model...' to enter a custom name",
737
- interactive=True,
738
- )
701
+ # Create model selection dropdown with custom value support for OMNI
702
+ model_choice = gr.Dropdown(
703
+ choices=provider_to_models.get(initial_loop, ["No models available"]),
704
+ label="LLM Provider and Model",
705
+ value=initial_model,
706
+ info="Select model or choose 'Custom model...' to enter a custom name",
707
+ interactive=True,
708
+ )
739
709
 
740
- # Add custom model textbox (only visible when "Custom model..." is selected)
741
- custom_model = gr.Textbox(
742
- label="Custom Model Name",
743
- placeholder="Enter custom model name (e.g., Qwen2.5-VL-7B-Instruct)",
744
- value="Qwen2.5-VL-7B-Instruct", # Default value
745
- visible=False, # Initially hidden
746
- interactive=True,
747
- )
710
+ # Add custom model textbox (only visible when "Custom model..." is selected)
711
+ custom_model = gr.Textbox(
712
+ label="Custom Model Name",
713
+ placeholder="Enter custom model name (e.g., Qwen2.5-VL-7B-Instruct)",
714
+ value=initial_custom_model,
715
+ visible=(initial_model == "Custom model..."),
716
+ interactive=True,
717
+ )
748
718
 
749
- save_trajectory = gr.Checkbox(
750
- label="Save Trajectory",
751
- value=True,
752
- info="Save the agent's trajectory for debugging",
753
- interactive=True,
754
- )
719
+ # Add custom provider base URL textbox (only visible when "Custom model..." is selected)
720
+ provider_base_url = gr.Textbox(
721
+ label="Provider Base URL",
722
+ placeholder="Enter provider base URL (e.g., http://localhost:1234/v1)",
723
+ value=initial_provider_base_url,
724
+ visible=(initial_model == "Custom model..."),
725
+ interactive=True,
726
+ )
755
727
 
756
- recent_images = gr.Slider(
757
- label="Recent Images",
758
- minimum=1,
759
- maximum=10,
760
- value=3,
761
- step=1,
762
- info="Number of recent images to keep in context",
763
- interactive=True,
764
- )
728
+ # Add custom API key textbox (only visible when "Custom model..." is selected)
729
+ provider_api_key = gr.Textbox(
730
+ label="Provider API Key",
731
+ placeholder="Enter provider API key (if required)",
732
+ value="",
733
+ visible=(initial_model == "Custom model..."),
734
+ interactive=True,
735
+ type="password",
736
+ )
737
+
738
+ save_trajectory = gr.Checkbox(
739
+ label="Save Trajectory",
740
+ value=initial_save_trajectory,
741
+ info="Save the agent's trajectory for debugging",
742
+ interactive=True,
743
+ )
744
+
745
+ recent_images = gr.Slider(
746
+ label="Recent Images",
747
+ minimum=1,
748
+ maximum=10,
749
+ value=initial_recent_images,
750
+ step=1,
751
+ info="Number of recent images to keep in context",
752
+ interactive=True,
753
+ )
765
754
 
766
755
  # Right column for chat interface
767
756
  with gr.Column(scale=2):
@@ -770,7 +759,7 @@ def create_gradio_ui(
770
759
  "Ask me to perform tasks in a virtual macOS environment.<br>Built with <a href='https://github.com/trycua/cua' target='_blank'>github.com/trycua/cua</a>."
771
760
  )
772
761
 
773
- chatbot = gr.Chatbot()
762
+ chatbot_history = gr.Chatbot(type="messages")
774
763
  msg = gr.Textbox(
775
764
  placeholder="Ask me to perform tasks in a virtual macOS environment"
776
765
  )
@@ -782,63 +771,169 @@ def create_gradio_ui(
782
771
  # Function to handle chat submission
783
772
  def chat_submit(message, history):
784
773
  # Add user message to history
785
- history = history + [(message, None)]
774
+ history.append(gr.ChatMessage(role="user", content=message))
786
775
  return "", history
787
776
 
788
777
  # Function to process agent response after user input
789
- def process_response(
778
+ async def process_response(
790
779
  history,
791
780
  model_choice_value,
792
781
  custom_model_value,
793
782
  agent_loop_choice,
794
783
  save_traj,
795
784
  recent_imgs,
785
+ custom_url_value=None,
786
+ custom_api_key=None,
796
787
  ):
797
788
  if not history:
798
- return history
789
+ yield history
790
+ return
799
791
 
800
792
  # Get the last user message
801
- last_user_message = history[-1][0]
793
+ last_user_message = history[-1]["content"]
802
794
 
803
- # Use custom model value if "Custom model..." is selected
804
- model_to_use = (
795
+ # Determine the model name string to analyze: custom or from dropdown
796
+ model_string_to_analyze = (
805
797
  custom_model_value
806
798
  if model_choice_value == "Custom model..."
807
- else model_choice_value
799
+ else model_choice_value # Use the full UI string initially
808
800
  )
809
801
 
810
- # Process with agent
811
- response = respond(
812
- last_user_message,
813
- history[:-1], # History without the last message
814
- model_to_use,
815
- agent_loop_choice,
816
- save_traj,
817
- recent_imgs,
818
- openai_api_key,
819
- anthropic_api_key,
820
- )
821
-
822
- # Update the last assistant message
823
- history[-1] = (last_user_message, response)
824
- return history
802
+ # Determine if this is a custom model selection
803
+ is_custom_model_selected = model_choice_value == "Custom model..."
804
+
805
+ try:
806
+ # Get the provider, *cleaned* model name, and agent loop type
807
+ provider, cleaned_model_name_from_func, agent_loop_type = (
808
+ get_provider_and_model(model_string_to_analyze, agent_loop_choice)
809
+ )
810
+
811
+ # Determine the final model name to send to the agent
812
+ # If custom selected, use the custom text box value, otherwise use the cleaned name
813
+ final_model_name_to_send = (
814
+ custom_model_value
815
+ if is_custom_model_selected
816
+ else cleaned_model_name_from_func
817
+ )
818
+
819
+ # Determine if OAICOMPAT should be used (only if custom model explicitly selected)
820
+ is_oaicompat = is_custom_model_selected
821
+
822
+ # Get API key based on provider determined by get_provider_and_model
823
+ if is_oaicompat and custom_api_key:
824
+ # Use custom API key if provided for custom model
825
+ api_key = custom_api_key
826
+ print(
827
+ f"DEBUG - Using custom API key for model: {final_model_name_to_send}"
828
+ )
829
+ elif provider == LLMProvider.OPENAI:
830
+ api_key = openai_api_key or os.environ.get("OPENAI_API_KEY", "")
831
+ elif provider == LLMProvider.ANTHROPIC:
832
+ api_key = anthropic_api_key or os.environ.get("ANTHROPIC_API_KEY", "")
833
+ else:
834
+ # For Ollama or default OAICOMPAT (without custom key), no key needed/expected
835
+ api_key = ""
836
+
837
+ # --- Save Settings Before Running Agent ---
838
+ current_settings = {
839
+ "agent_loop": agent_loop_choice,
840
+ "model_choice": model_choice_value,
841
+ "custom_model": custom_model_value,
842
+ "provider_base_url": custom_url_value,
843
+ "save_trajectory": save_traj,
844
+ "recent_images": recent_imgs,
845
+ }
846
+ save_settings(current_settings)
847
+ # --- End Save Settings ---
848
+
849
+ # Create or update the agent
850
+ create_agent(
851
+ # Provider determined by get_provider_and_model unless custom model selected
852
+ provider=LLMProvider.OAICOMPAT if is_oaicompat else provider,
853
+ agent_loop=agent_loop_type,
854
+ # Pass the FINAL determined model name (cleaned or custom)
855
+ model_name=final_model_name_to_send,
856
+ api_key=api_key,
857
+ save_trajectory=save_traj,
858
+ only_n_most_recent_images=recent_imgs,
859
+ use_oaicompat=is_oaicompat, # Set flag if custom model was selected
860
+ # Pass custom URL only if custom model was selected
861
+ provider_base_url=custom_url_value if is_oaicompat else None,
862
+ verbosity=logging.DEBUG, # Added verbosity here
863
+ )
864
+
865
+ if global_agent is None:
866
+ # Add initial empty assistant message
867
+ history.append(
868
+ gr.ChatMessage(
869
+ role="assistant",
870
+ content="Failed to create agent. Check API keys and configuration.",
871
+ )
872
+ )
873
+ yield history
874
+ return
875
+
876
+ # Add the screenshot handler to the agent's loop if available
877
+ if global_agent and hasattr(global_agent, "_loop"):
878
+ print("DEBUG - Adding screenshot handler to agent loop")
879
+
880
+ # Create the screenshot handler with references to UI components
881
+ screenshot_handler = GradioChatScreenshotHandler(history)
882
+
883
+ # Add the handler to the callback manager if it exists AND is not None
884
+ if (
885
+ hasattr(global_agent._loop, "callback_manager")
886
+ and global_agent._loop.callback_manager is not None
887
+ ):
888
+ global_agent._loop.callback_manager.add_handler(screenshot_handler)
889
+ print(
890
+ f"DEBUG - Screenshot handler added to callback manager with history: {id(history)}"
891
+ )
892
+ else:
893
+ # Optional: Log a warning if the callback manager is missing/None for a specific loop
894
+ print(
895
+ f"WARNING - Callback manager not found or is None for loop type: {type(global_agent._loop)}. Screenshot handler not added."
896
+ )
897
+
898
+ # Stream responses from the agent
899
+ async for result in global_agent.run(last_user_message):
900
+ # Process result
901
+ content, metadata = process_agent_result(result)
902
+
903
+ # Skip empty content
904
+ if content or metadata.get("title"):
905
+ history.append(
906
+ gr.ChatMessage(
907
+ role="assistant", content=content, metadata=metadata
908
+ )
909
+ )
910
+ yield history
911
+ except Exception as e:
912
+ import traceback
913
+
914
+ traceback.print_exc()
915
+ # Update with error message
916
+ history.append(gr.ChatMessage(role="assistant", content=f"Error: {str(e)}"))
917
+ yield history
825
918
 
826
919
  # Connect the components
827
- msg.submit(chat_submit, [msg, chatbot], [msg, chatbot]).then(
920
+ msg.submit(chat_submit, [msg, chatbot_history], [msg, chatbot_history]).then(
828
921
  process_response,
829
922
  [
830
- chatbot,
923
+ chatbot_history,
831
924
  model_choice,
832
925
  custom_model,
833
926
  agent_loop,
834
927
  save_trajectory,
835
928
  recent_images,
929
+ provider_base_url,
930
+ provider_api_key,
836
931
  ],
837
- [chatbot],
932
+ [chatbot_history],
838
933
  )
839
934
 
840
935
  # Clear button functionality
841
- clear.click(lambda: None, None, chatbot, queue=False)
936
+ clear.click(lambda: None, None, chatbot_history, queue=False)
842
937
 
843
938
  # Connect agent_loop changes to model selection
844
939
  agent_loop.change(
@@ -848,14 +943,19 @@ def create_gradio_ui(
848
943
  queue=False, # Process immediately without queueing
849
944
  )
850
945
 
851
- # Show/hide custom model textbox based on dropdown selection
946
+ # Show/hide custom model, provider base URL, and API key textboxes based on dropdown selection
852
947
  def update_custom_model_visibility(model_value):
853
- return gr.update(visible=model_value == "Custom model...")
948
+ is_custom = model_value == "Custom model..."
949
+ return (
950
+ gr.update(visible=is_custom),
951
+ gr.update(visible=is_custom),
952
+ gr.update(visible=is_custom),
953
+ )
854
954
 
855
955
  model_choice.change(
856
956
  fn=update_custom_model_visibility,
857
957
  inputs=[model_choice],
858
- outputs=[custom_model],
958
+ outputs=[custom_model, provider_base_url, provider_api_key],
859
959
  queue=False, # Process immediately without queueing
860
960
  )
861
961