cua-agent 0.1.25__py3-none-any.whl → 0.1.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

agent/ui/gradio/app.py CHANGED
@@ -30,11 +30,16 @@ Requirements:
30
30
  import os
31
31
  import asyncio
32
32
  import logging
33
+ import json
34
+ from pathlib import Path
33
35
  from typing import Dict, List, Optional, AsyncGenerator, Any, Tuple, Union
34
36
  import gradio as gr
37
+ from gradio.components.chatbot import MetadataDict
35
38
 
36
39
  # Import from agent package
37
40
  from agent.core.types import AgentResponse
41
+ from agent.core.callbacks import DefaultCallbackHandler
42
+ from agent.providers.omni.parser import ParseResult
38
43
  from computer import Computer
39
44
 
40
45
  from agent import ComputerAgent, AgentLoop, LLM, LLMProvider
@@ -42,6 +47,86 @@ from agent import ComputerAgent, AgentLoop, LLM, LLMProvider
42
47
  # Global variables
43
48
  global_agent = None
44
49
  global_computer = None
50
+ SETTINGS_FILE = Path(".gradio_settings.json")
51
+
52
+ # We'll use asyncio.run() instead of a persistent event loop
53
+
54
+
55
+ # --- Settings Load/Save Functions ---
56
def load_settings() -> Dict[str, Any]:
    """Load persisted UI settings from ``SETTINGS_FILE``.

    Loading is best-effort: a missing, unreadable, or malformed settings file
    must never stop the UI from starting, so every failure falls back to an
    empty dict and the caller applies its own defaults.

    Returns:
        The settings dict stored in the file, or ``{}`` when the file does not
        exist, cannot be read or decoded, or does not hold a JSON object.
    """
    try:
        # Open directly instead of checking exists() first: the file could
        # disappear between the check and the open.
        # Explicit UTF-8 so the result does not depend on the locale encoding.
        with open(SETTINGS_FILE, "r", encoding="utf-8") as f:
            settings = json.load(f)
    except FileNotFoundError:
        # Nothing saved yet (normal first run) -- not worth a warning.
        return {}
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError,
        # so a file containing undecodable bytes no longer crashes start-up.
        print(f"Warning: Could not load settings from {SETTINGS_FILE}: {e}")
        return {}

    # Basic validation (can be expanded): callers use dict lookups on the result.
    if not isinstance(settings, dict):
        print(
            f"Warning: Ignoring settings in {SETTINGS_FILE}: "
            f"expected a JSON object, got {type(settings).__name__}"
        )
        return {}

    print(f"DEBUG - Loaded settings from {SETTINGS_FILE}")
    return settings
69
+
70
+
71
def save_settings(settings: Dict[str, Any]):
    """Persist UI settings to ``SETTINGS_FILE`` as indented JSON.

    Saving is best-effort: failures are reported on stdout and never raised.

    Args:
        settings: Mapping of setting names to JSON-serialisable values. The
            mapping itself is not modified; its ``"provider_api_key"`` entry,
            if present, is left out of what gets written.
    """
    # Ensure sensitive keys are not saved. Filter into a new dict rather than
    # popping, so the caller's dict keeps its API key.
    to_save = {key: value for key, value in settings.items() if key != "provider_api_key"}
    try:
        # Serialise before opening: mode "w" truncates the file immediately,
        # so a value json cannot encode would otherwise wipe the saved settings.
        payload = json.dumps(to_save, indent=4)
        with open(SETTINGS_FILE, "w", encoding="utf-8") as f:
            f.write(payload)
    except (OSError, TypeError, ValueError) as e:
        # TypeError/ValueError: non-serialisable or circular values.
        print(f"Warning: Could not save settings to {SETTINGS_FILE}: {e}")
        return
    print(f"DEBUG - Saved settings to {SETTINGS_FILE}")
81
+
82
+
83
+ # --- End Settings Load/Save ---
84
+
85
+
86
+ # Custom Screenshot Handler for Gradio chat
87
class GradioChatScreenshotHandler(DefaultCallbackHandler):
    """Callback handler that appends every screenshot to the Gradio chat history."""

    def __init__(self, chatbot_history: List[gr.ChatMessage]):
        """Keep a reference to the chat history that screenshots are appended to.

        Args:
            chatbot_history: The Gradio chatbot history list. It is stored by
                reference and appended to in place by ``on_screenshot``.
        """
        self.chatbot_history = chatbot_history
        print("GradioChatScreenshotHandler initialized")

    async def on_screenshot(
        self,
        screenshot_base64: str,
        action_type: str = "",
        parsed_screen: Optional[ParseResult] = None,
    ) -> None:
        """Append the screenshot to the chat history as an assistant message.

        The image is embedded as a markdown ``data:`` URI. Nothing is appended
        when the handler holds no chat history.

        Args:
            screenshot_base64: Base64 encoded screenshot.
            action_type: Type of action that triggered the screenshot. When
                empty (the default), a generic label is used so the title and
                alt text do not end in a dangling separator.
            parsed_screen: Accepted as part of the callback signature; not
                used by this handler.
        """
        if self.chatbot_history is None:
            return

        if action_type:
            alt_text = f"Screenshot after {action_type}"
            title = f"🖥️ Screenshot - {action_type}"
        else:
            alt_text = "Screenshot"
            title = "🖥️ Screenshot"

        # Create a markdown image element for the screenshot.
        # NOTE(review): the MIME type assumes the screenshot is PNG -- confirm
        # against whatever produces screenshot_base64.
        image_markdown = f"![{alt_text}](data:image/png;base64,{screenshot_base64})"

        # Simply append the screenshot as a new message
        self.chatbot_history.append(
            gr.ChatMessage(
                role="assistant",
                content=image_markdown,
                metadata={"title": title, "status": "done"},
            )
        )
129
+
45
130
 
46
131
  # Map model names to specific provider model names
47
132
  MODEL_MAPPINGS = {
@@ -53,6 +138,7 @@ MODEL_MAPPINGS = {
53
138
  "gpt-4o": "computer_use_preview",
54
139
  "gpt-4": "computer_use_preview",
55
140
  "gpt-4.5-preview": "computer_use_preview",
141
+ "gpt-4o-mini": "gpt-4o-mini",
56
142
  },
57
143
  "anthropic": {
58
144
  # Default to newest model
@@ -70,6 +156,7 @@ MODEL_MAPPINGS = {
70
156
  # OMNI works with any of these models
71
157
  "default": "gpt-4o",
72
158
  "gpt-4o": "gpt-4o",
159
+ "gpt-4o-mini": "gpt-4o-mini",
73
160
  "gpt-4": "gpt-4",
74
161
  "gpt-4.5-preview": "gpt-4.5-preview",
75
162
  "claude-3-5-sonnet-20240620": "claude-3-5-sonnet-20240620",
@@ -119,30 +206,82 @@ def get_provider_and_model(model_name: str, loop_provider: str) -> tuple:
119
206
  model_name.lower(), MODEL_MAPPINGS["anthropic"]["default"]
120
207
  )
121
208
  elif agent_loop == AgentLoop.OMNI:
122
- # For OMNI, select provider based on model name or loop_provider
123
- if loop_provider == "OMNI-OLLAMA":
209
+ # Determine provider and clean model name based on the full string from UI
210
+ cleaned_model_name = model_name # Default to using the name as-is (for custom)
211
+
212
+ if model_name == "Custom model...":
213
+ # Actual model name comes from custom_model_value via model_to_use.
214
+ # Assume OAICOMPAT for custom models unless overridden by URL/key later?
215
+ # get_provider_and_model determines the *initial* provider/model.
216
+ # The custom URL/key in process_response ultimately dictates the OAICOMPAT setup.
217
+ provider = LLMProvider.OAICOMPAT
218
+ # We set cleaned_model_name below outside the checks based on model_to_use
219
+ cleaned_model_name = "" # Placeholder, will be set by custom value later
220
+ elif model_name.startswith("OMNI: Ollama "):
124
221
  provider = LLMProvider.OLLAMA
125
-
126
- # For Ollama models from the UI dropdown, we use the model name as is
127
- # No need to parse it - it's already the correct Ollama model name
128
- model_name_to_use = model_name
129
- elif "claude" in model_name.lower():
222
+ # Extract the part after "OMNI: Ollama "
223
+ cleaned_model_name = model_name.split("OMNI: Ollama ", 1)[1]
224
+ elif model_name.startswith("OMNI: Claude "):
130
225
  provider = LLMProvider.ANTHROPIC
131
- model_name_to_use = MODEL_MAPPINGS["omni"].get(
132
- model_name.lower(), MODEL_MAPPINGS["omni"]["default"]
133
- )
134
- elif "gpt" in model_name.lower():
226
+ # Extract the canonical model name based on the UI string
227
+ # e.g., "OMNI: Claude 3.7 Sonnet (20250219)" -> "3.7 Sonnet" and "20250219"
228
+ parts = model_name.split(" (")
229
+ model_key_part = parts[0].replace("OMNI: Claude ", "")
230
+ date_part = parts[1].replace(")", "") if len(parts) > 1 else ""
231
+
232
+ # Normalize the extracted key part for comparison
233
+ # "3.7 Sonnet" -> "37sonnet"
234
+ model_key_part_norm = model_key_part.lower().replace(".", "").replace(" ", "")
235
+
236
+ cleaned_model_name = MODEL_MAPPINGS["omni"]["default"] # Default if not found
237
+ # Find the canonical name in the main Anthropic map
238
+ for key_anthropic, val_anthropic in MODEL_MAPPINGS["anthropic"].items():
239
+ # Normalize the canonical key for comparison
240
+ # "claude-3-7-sonnet-20250219" -> "claude37sonnet20250219"
241
+ key_anthropic_norm = key_anthropic.lower().replace("-", "")
242
+
243
+ # Check if the normalized canonical key starts with "claude" + normalized extracted part
244
+ # AND contains the date part.
245
+ if (
246
+ key_anthropic_norm.startswith("claude" + model_key_part_norm)
247
+ and date_part in key_anthropic_norm
248
+ ):
249
+ cleaned_model_name = (
250
+ val_anthropic # Use the canonical name like "claude-3-7-sonnet-20250219"
251
+ )
252
+ break
253
+ elif model_name.startswith("OMNI: OpenAI "):
135
254
  provider = LLMProvider.OPENAI
136
- model_name_to_use = MODEL_MAPPINGS["omni"].get(
137
- model_name.lower(), MODEL_MAPPINGS["omni"]["default"]
138
- )
139
- else:
140
- # Handle custom model names - use the OAICOMPAT provider
255
+ # Extract the model part, e.g., "GPT-4o mini"
256
+ model_key_part = model_name.replace("OMNI: OpenAI ", "")
257
+ # Normalize the extracted part: "gpt4omini"
258
+ model_key_part_norm = model_key_part.lower().replace("-", "").replace(" ", "")
259
+
260
+ cleaned_model_name = MODEL_MAPPINGS["omni"]["default"] # Default if not found
261
+ # Find the canonical name in the main OMNI map for OpenAI models
262
+ for key_omni, val_omni in MODEL_MAPPINGS["omni"].items():
263
+ # Normalize the omni map key: "gpt-4o-mini" -> "gpt4omini"
264
+ key_omni_norm = key_omni.lower().replace("-", "").replace(" ", "")
265
+ # Check if the normalized omni key matches the normalized extracted part
266
+ if key_omni_norm == model_key_part_norm:
267
+ cleaned_model_name = (
268
+ val_omni # Use the value from the OMNI map (e.g., gpt-4o-mini)
269
+ )
270
+ break
271
+ # Note: No fallback needed here as we explicitly check against omni keys
272
+
273
+ else: # Handles unexpected formats or the raw custom name if "Custom model..." selected
274
+ # Should only happen if user selected "Custom model..."
275
+ # Or if a model name format isn't caught above
141
276
  provider = LLMProvider.OAICOMPAT
142
- # Use the model name as is without mapping, or use default if empty
143
- model_name_to_use = (
144
- model_name if model_name.strip() else MODEL_MAPPINGS["oaicompat"]["default"]
277
+ cleaned_model_name = (
278
+ model_name.strip() if model_name.strip() else MODEL_MAPPINGS["oaicompat"]["default"]
145
279
  )
280
+
281
+ # Assign the determined model name
282
+ model_name_to_use = cleaned_model_name
283
+ # agent_loop remains AgentLoop.OMNI
284
+
146
285
  else:
147
286
  # Default to OpenAI if unrecognized loop
148
287
  provider = LLMProvider.OPENAI
@@ -177,17 +316,20 @@ def get_ollama_models() -> List[str]:
177
316
  return []
178
317
 
179
318
 
180
- def extract_synthesized_text(result: Union[AgentResponse, Dict[str, Any]]) -> str:
319
+ def extract_synthesized_text(
320
+ result: Union[AgentResponse, Dict[str, Any]],
321
+ ) -> Tuple[str, MetadataDict]:
181
322
  """Extract synthesized text from the agent result."""
182
323
  synthesized_text = ""
324
+ metadata = MetadataDict()
183
325
 
184
326
  if "output" in result and result["output"]:
185
327
  for output in result["output"]:
186
328
  if output.get("type") == "reasoning":
329
+ metadata["title"] = "🧠 Reasoning"
187
330
  content = output.get("content", "")
188
331
  if content:
189
332
  synthesized_text += f"{content}\n"
190
-
191
333
  elif output.get("type") == "message":
192
334
  # Handle message type outputs - can contain rich content
193
335
  content = output.get("content", [])
@@ -224,7 +366,10 @@ def extract_synthesized_text(result: Union[AgentResponse, Dict[str, Any]]) -> st
224
366
  else:
225
367
  synthesized_text += f"Performed {action_type} action.\n"
226
368
 
227
- return synthesized_text.strip()
369
+ metadata["status"] = "done"
370
+ metadata["title"] = f"🛠️ {synthesized_text.strip().splitlines()[-1]}"
371
+
372
+ return synthesized_text.strip(), metadata
228
373
 
229
374
 
230
375
  def create_computer_instance(verbosity: int = logging.INFO) -> Computer:
@@ -245,8 +390,8 @@ def create_agent(
245
390
  save_trajectory: bool = True,
246
391
  only_n_most_recent_images: int = 3,
247
392
  verbosity: int = logging.INFO,
248
- use_ollama: bool = False,
249
393
  use_oaicompat: bool = False,
394
+ provider_base_url: Optional[str] = None,
250
395
  ) -> ComputerAgent:
251
396
  """Create or update the global agent with the specified parameters."""
252
397
  global global_agent
@@ -254,15 +399,6 @@ def create_agent(
254
399
  # Create the computer if not already done
255
400
  computer = create_computer_instance(verbosity=verbosity)
256
401
 
257
- # Extra configuration to pass to the agent
258
- extra_config = {}
259
-
260
- # For Ollama models, we'll pass use_ollama and the model_name directly
261
- if use_ollama:
262
- extra_config["use_ollama"] = True
263
- extra_config["ollama_model"] = model_name
264
- print(f"DEBUG - Using Ollama with model: {model_name}")
265
-
266
402
  # Get API key from environment if not provided
267
403
  if api_key is None:
268
404
  if provider == LLMProvider.OPENAI:
@@ -270,72 +406,52 @@ def create_agent(
270
406
  elif provider == LLMProvider.ANTHROPIC:
271
407
  api_key = os.environ.get("ANTHROPIC_API_KEY", "")
272
408
 
273
- # Create LLM model object with appropriate parameters
274
- provider_base_url = "http://localhost:1234/v1" if use_oaicompat else None
409
+ # Use provided provider_base_url if available, otherwise use default
410
+ default_base_url = "http://localhost:1234/v1" if use_oaicompat else None
411
+ custom_base_url = provider_base_url or default_base_url
275
412
 
276
413
  if use_oaicompat:
277
414
  # Special handling for OAICOMPAT - use OAICOMPAT provider with custom base URL
278
- print(
279
- f"DEBUG - Creating OAICOMPAT agent with model: {model_name}, URL: {provider_base_url}"
280
- )
415
+ print(f"DEBUG - Creating OAICOMPAT agent with model: {model_name}, URL: {custom_base_url}")
281
416
  llm = LLM(
282
417
  provider=LLMProvider.OAICOMPAT, # Set to OAICOMPAT instead of using original provider
283
418
  name=model_name,
284
- provider_base_url=provider_base_url,
419
+ provider_base_url=custom_base_url,
285
420
  )
286
421
  print(f"DEBUG - LLM provider is now: {llm.provider}, base URL: {llm.provider_base_url}")
287
422
  # Note: Don't pass use_oaicompat to the agent, as it doesn't accept this parameter
288
423
  elif provider == LLMProvider.OAICOMPAT:
289
424
  # This path is unlikely to be taken with our current approach
290
- llm = LLM(provider=provider, name=model_name, provider_base_url=provider_base_url)
425
+ llm = LLM(provider=provider, name=model_name, provider_base_url=custom_base_url)
291
426
  else:
292
427
  # For other providers, just use standard parameters
293
428
  llm = LLM(provider=provider, name=model_name)
294
429
 
295
430
  # Create or update the agent
296
- if global_agent is None:
297
- global_agent = ComputerAgent(
298
- computer=computer,
299
- loop=agent_loop,
300
- model=llm,
301
- api_key=api_key,
302
- save_trajectory=save_trajectory,
303
- only_n_most_recent_images=only_n_most_recent_images,
304
- verbosity=verbosity,
305
- **extra_config,
306
- )
307
- else:
308
- # Update the existing agent's parameters
309
- global_agent._loop = None # Force recreation of the loop
310
- global_agent.provider = provider
311
- global_agent.loop = agent_loop
312
- global_agent.model = llm
313
- global_agent.api_key = api_key
314
-
315
- # Explicitly update these settings to ensure they take effect
316
- global_agent.save_trajectory = save_trajectory
317
- global_agent.only_n_most_recent_images = only_n_most_recent_images
318
-
319
- # Update Ollama settings if applicable
320
- if use_ollama:
321
- global_agent.use_ollama = True
322
- global_agent.ollama_model = model_name
323
- else:
324
- global_agent.use_ollama = False
325
- global_agent.ollama_model = None
326
-
327
- # Log the updated settings
328
- logging.info(
329
- f"Updated agent settings: save_trajectory={save_trajectory}, recent_images={only_n_most_recent_images}"
330
- )
431
+ global_agent = ComputerAgent(
432
+ computer=computer,
433
+ loop=agent_loop,
434
+ model=llm,
435
+ api_key=api_key,
436
+ save_trajectory=save_trajectory,
437
+ only_n_most_recent_images=only_n_most_recent_images,
438
+ verbosity=verbosity,
439
+ )
331
440
 
332
441
  return global_agent
333
442
 
334
443
 
335
- def process_agent_result(result: Union[AgentResponse, Dict[str, Any]]) -> str:
444
+ def process_agent_result(result: Union[AgentResponse, Dict[str, Any]]) -> Tuple[str, MetadataDict]:
336
445
  """Process agent results for the Gradio UI."""
337
446
  # Extract text content
338
447
  text_obj = result.get("text", {})
448
+ metadata = result.get("metadata", {})
449
+
450
+ # Create a properly typed MetadataDict
451
+ metadata_dict = MetadataDict()
452
+ metadata_dict["title"] = metadata.get("title", "")
453
+ metadata_dict["status"] = "done"
454
+ metadata = metadata_dict
339
455
 
340
456
  # For OpenAI's Computer-Use Agent, text field is an object with format property
341
457
  if (
@@ -344,8 +460,11 @@ def process_agent_result(result: Union[AgentResponse, Dict[str, Any]]) -> str:
344
460
  and "format" in text_obj
345
461
  and not text_obj.get("value", "")
346
462
  ):
347
- content = extract_synthesized_text(result)
463
+ content, metadata = extract_synthesized_text(result)
348
464
  else:
465
+ if not text_obj:
466
+ text_obj = result
467
+
349
468
  # For other types of results, try to get text directly
350
469
  if isinstance(text_obj, dict):
351
470
  if "value" in text_obj:
@@ -378,179 +497,7 @@ def process_agent_result(result: Union[AgentResponse, Dict[str, Any]]) -> str:
378
497
  if not isinstance(content, str):
379
498
  content = str(content) if content else ""
380
499
 
381
- return content
382
-
383
-
384
- def respond(
385
- message: str,
386
- history: List[Tuple[str, str]],
387
- model_choice, # Accept Gradio Dropdown component
388
- agent_loop, # Accept Gradio Dropdown component
389
- save_trajectory, # Accept Gradio Checkbox component
390
- recent_images, # Accept Gradio Slider component
391
- openai_api_key: Optional[str] = None,
392
- anthropic_api_key: Optional[str] = None,
393
- ) -> str:
394
- """Process a message with the Computer-Use Agent and return the response."""
395
- import asyncio
396
-
397
- # Get actual values from Gradio components
398
- model_choice_value = model_choice.value if hasattr(model_choice, "value") else model_choice
399
- agent_loop_value = agent_loop.value if hasattr(agent_loop, "value") else agent_loop
400
- save_trajectory_value = (
401
- save_trajectory.value if hasattr(save_trajectory, "value") else save_trajectory
402
- )
403
- recent_images_value = int(
404
- recent_images.value if hasattr(recent_images, "value") else recent_images
405
- )
406
-
407
- # Debug logging
408
- print(f"DEBUG - Model choice object: {type(model_choice)}")
409
- print(f"DEBUG - Model choice value: {model_choice_value}")
410
- print(f"DEBUG - Agent loop value: {agent_loop_value}")
411
-
412
- # Create a new event loop for this function call
413
- loop = asyncio.new_event_loop()
414
- asyncio.set_event_loop(loop)
415
-
416
- async def _async_respond():
417
- # Extract the loop type and model from the selection
418
- loop_provider = "OPENAI"
419
- if isinstance(model_choice_value, str):
420
- # This is the case for a custom text input from textbox
421
- if agent_loop_value == "OMNI":
422
- loop_provider = "OMNI"
423
- # Use the custom model name as is
424
- model_id = model_choice_value
425
- print(f"DEBUG - Using custom model: {model_id}")
426
- else:
427
- # Handle regular dropdown value as string
428
- if model_choice_value.startswith("OpenAI:"):
429
- loop_provider = "OPENAI"
430
- model_id = model_choice_value.replace("OpenAI: ", "").lower()
431
- elif model_choice_value.startswith("Anthropic:"):
432
- loop_provider = "ANTHROPIC"
433
- model_id = model_choice_value.replace("Anthropic: ", "").lower()
434
- elif model_choice_value.startswith("OMNI:"):
435
- loop_provider = "OMNI"
436
- if "GPT" in model_choice_value:
437
- model_id = model_choice_value.replace("OMNI: OpenAI ", "").lower()
438
- elif "Claude" in model_choice_value:
439
- model_id = model_choice_value.replace("OMNI: ", "").lower()
440
- elif "Ollama" in model_choice_value:
441
- loop_provider = "OMNI-OLLAMA"
442
- # Extract everything after "OMNI: Ollama " which is the full model name (e.g., phi3:latest)
443
- model_id = model_choice_value.replace("OMNI: Ollama ", "")
444
- print(f"DEBUG - Ollama model ID: {model_id}")
445
- else:
446
- model_id = "default"
447
- else:
448
- # Default case
449
- loop_provider = agent_loop_value
450
- model_id = "default"
451
- else:
452
- # Model choice is not a string (shouldn't happen, but handle anyway)
453
- loop_provider = agent_loop_value
454
- model_id = "default"
455
-
456
- print(f"DEBUG - Using loop provider: {loop_provider}, model_id: {model_id}")
457
-
458
- # Use the mapping function to get provider, model name and agent loop
459
- provider, model_name, agent_loop_type = get_provider_and_model(model_id, loop_provider)
460
- print(
461
- f"DEBUG - After mapping: provider={provider}, model_name={model_name}, agent_loop={agent_loop_type}"
462
- )
463
-
464
- # Special handling for OAICOMPAT to bypass provider-specific errors
465
- # Creates the agent with OPENAI provider but using custom model name and provider base URL
466
- is_oaicompat = str(provider) == "oaicompat"
467
-
468
- # Don't override the provider for OAICOMPAT - instead pass it through
469
- # if is_oaicompat:
470
- # provider = LLMProvider.OPENAI
471
-
472
- # Get API key based on provider
473
- if provider == LLMProvider.OPENAI:
474
- api_key = openai_api_key or os.environ.get("OPENAI_API_KEY", "")
475
- elif provider == LLMProvider.ANTHROPIC:
476
- api_key = anthropic_api_key or os.environ.get("ANTHROPIC_API_KEY", "")
477
- else:
478
- api_key = ""
479
-
480
- # Check for settings changes if agent already exists
481
- settings_changed = False
482
- settings_message = ""
483
- if global_agent is not None:
484
- # Safely check if save_trajectory setting changed
485
- current_save_traj = getattr(global_agent, "save_trajectory", None)
486
- if current_save_traj is not None and current_save_traj != save_trajectory_value:
487
- settings_changed = True
488
- settings_message += f"Save trajectory set to: {save_trajectory_value}. "
489
-
490
- # Safely check if recent_images setting changed
491
- current_recent_images = getattr(global_agent, "only_n_most_recent_images", None)
492
- if current_recent_images is not None and current_recent_images != recent_images_value:
493
- settings_changed = True
494
- settings_message += f"Recent images set to: {recent_images_value}. "
495
-
496
- # Create or update the agent
497
- try:
498
- create_agent(
499
- provider=provider,
500
- agent_loop=agent_loop_type,
501
- model_name=model_name,
502
- api_key=api_key,
503
- save_trajectory=save_trajectory_value,
504
- only_n_most_recent_images=recent_images_value,
505
- use_ollama=loop_provider == "OMNI-OLLAMA",
506
- use_oaicompat=is_oaicompat,
507
- )
508
-
509
- if global_agent is None:
510
- return "Failed to create agent. Check API keys and configuration."
511
- except Exception as e:
512
- return f"Error creating agent: {str(e)}"
513
-
514
- # Notify about settings changes if needed
515
- if settings_changed:
516
- return f"Settings updated: {settings_message}"
517
-
518
- # Collect all responses
519
- response_text = []
520
-
521
- # Run the agent
522
- try:
523
- async for result in global_agent.run(message):
524
- # Process result
525
- content = process_agent_result(result)
526
-
527
- # Skip empty content
528
- if not content:
529
- continue
530
-
531
- # Add content to response list
532
- response_text.append(content)
533
-
534
- # Return the full response as a single string
535
- return "\n".join(response_text) if response_text else "Task completed."
536
-
537
- except Exception as e:
538
- import traceback
539
-
540
- traceback.print_exc()
541
- return f"Error: {str(e)}"
542
-
543
- # Run the async function and get the result
544
- try:
545
- result = loop.run_until_complete(_async_respond())
546
- loop.close()
547
- return result
548
- except Exception as e:
549
- loop.close()
550
- import traceback
551
-
552
- traceback.print_exc()
553
- return f"Error executing async operation: {str(e)}"
500
+ return content, metadata
554
501
 
555
502
 
556
503
  def create_gradio_ui(
@@ -566,6 +513,10 @@ def create_gradio_ui(
566
513
  Returns:
567
514
  A Gradio Blocks application
568
515
  """
516
+ # --- Load Settings ---
517
+ saved_settings = load_settings()
518
+ # --- End Load Settings ---
519
+
569
520
  # Check for API keys
570
521
  openai_api_key = os.environ.get("OPENAI_API_KEY", "")
571
522
  anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY", "")
@@ -579,6 +530,7 @@ def create_gradio_ui(
579
530
  openai_models = ["OpenAI: Computer-Use Preview"]
580
531
  omni_models += [
581
532
  "OMNI: OpenAI GPT-4o",
533
+ "OMNI: OpenAI GPT-4o mini",
582
534
  "OMNI: OpenAI GPT-4.5-preview",
583
535
  ]
584
536
 
@@ -601,21 +553,33 @@ def create_gradio_ui(
601
553
  "OMNI": omni_models + ["Custom model..."], # Add custom model option
602
554
  }
603
555
 
604
- # Get initial agent loop and model based on provided parameters
605
- if provider_name.lower() == "openai":
606
- initial_loop = "OPENAI"
607
- initial_model = "OpenAI: Computer-Use Preview" if openai_models else "No models available"
608
- elif provider_name.lower() == "anthropic":
609
- initial_loop = "ANTHROPIC"
610
- initial_model = anthropic_models[0] if anthropic_models else "No models available"
556
+ # --- Apply Saved Settings (override defaults if available) ---
557
+ initial_loop = saved_settings.get("agent_loop", "OMNI")
558
+ # Ensure the saved model is actually available in the choices for the loaded loop
559
+ available_models_for_loop = provider_to_models.get(initial_loop, [])
560
+ saved_model_choice = saved_settings.get("model_choice")
561
+ if saved_model_choice and saved_model_choice in available_models_for_loop:
562
+ initial_model = saved_model_choice
611
563
  else:
612
- initial_loop = "OMNI"
613
- if model_name == "gpt-4o" and "OMNI: OpenAI GPT-4o" in omni_models:
614
- initial_model = "OMNI: OpenAI GPT-4o"
615
- elif "claude" in model_name.lower() and omni_models:
616
- initial_model = next((m for m in omni_models if "Claude" in m), omni_models[0])
617
- else:
564
+ # If saved model isn't valid for the loop, reset to default for that loop
565
+ if initial_loop == "OPENAI":
566
+ initial_model = (
567
+ "OpenAI: Computer-Use Preview" if openai_models else "No models available"
568
+ )
569
+ elif initial_loop == "ANTHROPIC":
570
+ initial_model = anthropic_models[0] if anthropic_models else "No models available"
571
+ else: # OMNI
618
572
  initial_model = omni_models[0] if omni_models else "No models available"
573
+ if "Custom model..." in available_models_for_loop:
574
+ initial_model = (
575
+ "Custom model..." # Default to custom if available and no other default fits
576
+ )
577
+
578
+ initial_custom_model = saved_settings.get("custom_model", "Qwen2.5-VL-7B-Instruct")
579
+ initial_provider_base_url = saved_settings.get("provider_base_url", "http://localhost:1234/v1")
580
+ initial_save_trajectory = saved_settings.get("save_trajectory", True)
581
+ initial_recent_images = saved_settings.get("recent_images", 3)
582
+ # --- End Apply Saved Settings ---
619
583
 
620
584
  # Example prompts
621
585
  example_messages = [
@@ -708,7 +672,7 @@ def create_gradio_ui(
708
672
  ### 3. Pull the pre-built macOS image
709
673
 
710
674
  ```bash
711
- lume pull macos-sequoia-cua:latest --no-cache
675
+ lume pull macos-sequoia-cua:latest
712
676
  ```
713
677
 
714
678
  Initial download requires 80GB storage, but reduces to ~30GB after first run due to macOS's sparse file system.
@@ -725,48 +689,68 @@ def create_gradio_ui(
725
689
  """
726
690
  )
727
691
 
728
- # Configuration options
729
- agent_loop = gr.Dropdown(
730
- choices=["OPENAI", "ANTHROPIC", "OMNI"],
731
- label="Agent Loop",
732
- value=initial_loop,
733
- info="Select the agent loop provider",
734
- )
692
+ with gr.Accordion("Configuration", open=True):
693
+ # Configuration options
694
+ agent_loop = gr.Dropdown(
695
+ choices=["OPENAI", "ANTHROPIC", "OMNI"],
696
+ label="Agent Loop",
697
+ value=initial_loop,
698
+ info="Select the agent loop provider",
699
+ )
735
700
 
736
- # Create model selection dropdown with custom value support for OMNI
737
- model_choice = gr.Dropdown(
738
- choices=provider_to_models.get(initial_loop, ["No models available"]),
739
- label="LLM Provider and Model",
740
- value=initial_model,
741
- info="Select model or choose 'Custom model...' to enter a custom name",
742
- interactive=True,
743
- )
701
+ # Create model selection dropdown with custom value support for OMNI
702
+ model_choice = gr.Dropdown(
703
+ choices=provider_to_models.get(initial_loop, ["No models available"]),
704
+ label="LLM Provider and Model",
705
+ value=initial_model,
706
+ info="Select model or choose 'Custom model...' to enter a custom name",
707
+ interactive=True,
708
+ )
744
709
 
745
- # Add custom model textbox (only visible when "Custom model..." is selected)
746
- custom_model = gr.Textbox(
747
- label="Custom Model Name",
748
- placeholder="Enter custom model name (e.g., Qwen2.5-VL-7B-Instruct)",
749
- value="Qwen2.5-VL-7B-Instruct", # Default value
750
- visible=False, # Initially hidden
751
- interactive=True,
752
- )
710
+ # Add custom model textbox (only visible when "Custom model..." is selected)
711
+ custom_model = gr.Textbox(
712
+ label="Custom Model Name",
713
+ placeholder="Enter custom model name (e.g., Qwen2.5-VL-7B-Instruct)",
714
+ value=initial_custom_model,
715
+ visible=(initial_model == "Custom model..."),
716
+ interactive=True,
717
+ )
753
718
 
754
- save_trajectory = gr.Checkbox(
755
- label="Save Trajectory",
756
- value=True,
757
- info="Save the agent's trajectory for debugging",
758
- interactive=True,
759
- )
719
+ # Add custom provider base URL textbox (only visible when "Custom model..." is selected)
720
+ provider_base_url = gr.Textbox(
721
+ label="Provider Base URL",
722
+ placeholder="Enter provider base URL (e.g., http://localhost:1234/v1)",
723
+ value=initial_provider_base_url,
724
+ visible=(initial_model == "Custom model..."),
725
+ interactive=True,
726
+ )
760
727
 
761
- recent_images = gr.Slider(
762
- label="Recent Images",
763
- minimum=1,
764
- maximum=10,
765
- value=3,
766
- step=1,
767
- info="Number of recent images to keep in context",
768
- interactive=True,
769
- )
728
+ # Add custom API key textbox (only visible when "Custom model..." is selected)
729
+ provider_api_key = gr.Textbox(
730
+ label="Provider API Key",
731
+ placeholder="Enter provider API key (if required)",
732
+ value="",
733
+ visible=(initial_model == "Custom model..."),
734
+ interactive=True,
735
+ type="password",
736
+ )
737
+
738
+ save_trajectory = gr.Checkbox(
739
+ label="Save Trajectory",
740
+ value=initial_save_trajectory,
741
+ info="Save the agent's trajectory for debugging",
742
+ interactive=True,
743
+ )
744
+
745
+ recent_images = gr.Slider(
746
+ label="Recent Images",
747
+ minimum=1,
748
+ maximum=10,
749
+ value=initial_recent_images,
750
+ step=1,
751
+ info="Number of recent images to keep in context",
752
+ interactive=True,
753
+ )
770
754
 
771
755
  # Right column for chat interface
772
756
  with gr.Column(scale=2):
@@ -775,7 +759,7 @@ def create_gradio_ui(
775
759
  "Ask me to perform tasks in a virtual macOS environment.<br>Built with <a href='https://github.com/trycua/cua' target='_blank'>github.com/trycua/cua</a>."
776
760
  )
777
761
 
778
- chatbot = gr.Chatbot()
762
+ chatbot_history = gr.Chatbot(type="messages")
779
763
  msg = gr.Textbox(
780
764
  placeholder="Ask me to perform tasks in a virtual macOS environment"
781
765
  )
@@ -787,63 +771,169 @@ def create_gradio_ui(
787
771
  # Function to handle chat submission
788
772
  def chat_submit(message, history):
789
773
  # Add user message to history
790
- history = history + [(message, None)]
774
+ history.append(gr.ChatMessage(role="user", content=message))
791
775
  return "", history
792
776
 
793
777
  # Function to process agent response after user input
794
- def process_response(
778
+ async def process_response(
795
779
  history,
796
780
  model_choice_value,
797
781
  custom_model_value,
798
782
  agent_loop_choice,
799
783
  save_traj,
800
784
  recent_imgs,
785
+ custom_url_value=None,
786
+ custom_api_key=None,
801
787
  ):
802
788
  if not history:
803
- return history
789
+ yield history
790
+ return
804
791
 
805
792
  # Get the last user message
806
- last_user_message = history[-1][0]
793
+ last_user_message = history[-1]["content"]
807
794
 
808
- # Use custom model value if "Custom model..." is selected
809
- model_to_use = (
795
+ # Determine the model name string to analyze: custom or from dropdown
796
+ model_string_to_analyze = (
810
797
  custom_model_value
811
798
  if model_choice_value == "Custom model..."
812
- else model_choice_value
799
+ else model_choice_value # Use the full UI string initially
813
800
  )
814
801
 
815
- # Process with agent
816
- response = respond(
817
- last_user_message,
818
- history[:-1], # History without the last message
819
- model_to_use,
820
- agent_loop_choice,
821
- save_traj,
822
- recent_imgs,
823
- openai_api_key,
824
- anthropic_api_key,
825
- )
826
-
827
- # Update the last assistant message
828
- history[-1] = (last_user_message, response)
829
- return history
802
+ # Determine if this is a custom model selection
803
+ is_custom_model_selected = model_choice_value == "Custom model..."
804
+
805
+ try:
806
+ # Get the provider, *cleaned* model name, and agent loop type
807
+ provider, cleaned_model_name_from_func, agent_loop_type = (
808
+ get_provider_and_model(model_string_to_analyze, agent_loop_choice)
809
+ )
810
+
811
+ # Determine the final model name to send to the agent
812
+ # If custom selected, use the custom text box value, otherwise use the cleaned name
813
+ final_model_name_to_send = (
814
+ custom_model_value
815
+ if is_custom_model_selected
816
+ else cleaned_model_name_from_func
817
+ )
818
+
819
+ # Determine if OAICOMPAT should be used (only if custom model explicitly selected)
820
+ is_oaicompat = is_custom_model_selected
821
+
822
+ # Get API key based on provider determined by get_provider_and_model
823
+ if is_oaicompat and custom_api_key:
824
+ # Use custom API key if provided for custom model
825
+ api_key = custom_api_key
826
+ print(
827
+ f"DEBUG - Using custom API key for model: {final_model_name_to_send}"
828
+ )
829
+ elif provider == LLMProvider.OPENAI:
830
+ api_key = openai_api_key or os.environ.get("OPENAI_API_KEY", "")
831
+ elif provider == LLMProvider.ANTHROPIC:
832
+ api_key = anthropic_api_key or os.environ.get("ANTHROPIC_API_KEY", "")
833
+ else:
834
+ # For Ollama or default OAICOMPAT (without custom key), no key needed/expected
835
+ api_key = ""
836
+
837
+ # --- Save Settings Before Running Agent ---
838
+ current_settings = {
839
+ "agent_loop": agent_loop_choice,
840
+ "model_choice": model_choice_value,
841
+ "custom_model": custom_model_value,
842
+ "provider_base_url": custom_url_value,
843
+ "save_trajectory": save_traj,
844
+ "recent_images": recent_imgs,
845
+ }
846
+ save_settings(current_settings)
847
+ # --- End Save Settings ---
848
+
849
+ # Create or update the agent
850
+ create_agent(
851
+ # Provider determined by get_provider_and_model unless custom model selected
852
+ provider=LLMProvider.OAICOMPAT if is_oaicompat else provider,
853
+ agent_loop=agent_loop_type,
854
+ # Pass the FINAL determined model name (cleaned or custom)
855
+ model_name=final_model_name_to_send,
856
+ api_key=api_key,
857
+ save_trajectory=save_traj,
858
+ only_n_most_recent_images=recent_imgs,
859
+ use_oaicompat=is_oaicompat, # Set flag if custom model was selected
860
+ # Pass custom URL only if custom model was selected
861
+ provider_base_url=custom_url_value if is_oaicompat else None,
862
+ verbosity=logging.DEBUG, # Added verbosity here
863
+ )
864
+
865
+ if global_agent is None:
866
+ # Add initial empty assistant message
867
+ history.append(
868
+ gr.ChatMessage(
869
+ role="assistant",
870
+ content="Failed to create agent. Check API keys and configuration.",
871
+ )
872
+ )
873
+ yield history
874
+ return
875
+
876
+ # Add the screenshot handler to the agent's loop if available
877
+ if global_agent and hasattr(global_agent, "_loop"):
878
+ print("DEBUG - Adding screenshot handler to agent loop")
879
+
880
+ # Create the screenshot handler with references to UI components
881
+ screenshot_handler = GradioChatScreenshotHandler(history)
882
+
883
+ # Add the handler to the callback manager if it exists AND is not None
884
+ if (
885
+ hasattr(global_agent._loop, "callback_manager")
886
+ and global_agent._loop.callback_manager is not None
887
+ ):
888
+ global_agent._loop.callback_manager.add_handler(screenshot_handler)
889
+ print(
890
+ f"DEBUG - Screenshot handler added to callback manager with history: {id(history)}"
891
+ )
892
+ else:
893
+ # Optional: Log a warning if the callback manager is missing/None for a specific loop
894
+ print(
895
+ f"WARNING - Callback manager not found or is None for loop type: {type(global_agent._loop)}. Screenshot handler not added."
896
+ )
897
+
898
+ # Stream responses from the agent
899
+ async for result in global_agent.run(last_user_message):
900
+ # Process result
901
+ content, metadata = process_agent_result(result)
902
+
903
+ # Skip empty content
904
+ if content or metadata.get("title"):
905
+ history.append(
906
+ gr.ChatMessage(
907
+ role="assistant", content=content, metadata=metadata
908
+ )
909
+ )
910
+ yield history
911
+ except Exception as e:
912
+ import traceback
913
+
914
+ traceback.print_exc()
915
+ # Update with error message
916
+ history.append(gr.ChatMessage(role="assistant", content=f"Error: {str(e)}"))
917
+ yield history
830
918
 
831
919
  # Connect the components
832
- msg.submit(chat_submit, [msg, chatbot], [msg, chatbot]).then(
920
+ msg.submit(chat_submit, [msg, chatbot_history], [msg, chatbot_history]).then(
833
921
  process_response,
834
922
  [
835
- chatbot,
923
+ chatbot_history,
836
924
  model_choice,
837
925
  custom_model,
838
926
  agent_loop,
839
927
  save_trajectory,
840
928
  recent_images,
929
+ provider_base_url,
930
+ provider_api_key,
841
931
  ],
842
- [chatbot],
932
+ [chatbot_history],
843
933
  )
844
934
 
845
935
  # Clear button functionality
846
- clear.click(lambda: None, None, chatbot, queue=False)
936
+ clear.click(lambda: None, None, chatbot_history, queue=False)
847
937
 
848
938
  # Connect agent_loop changes to model selection
849
939
  agent_loop.change(
@@ -853,14 +943,19 @@ def create_gradio_ui(
853
943
  queue=False, # Process immediately without queueing
854
944
  )
855
945
 
856
- # Show/hide custom model textbox based on dropdown selection
946
+ # Show/hide custom model, provider base URL, and API key textboxes based on dropdown selection
857
947
  def update_custom_model_visibility(model_value):
858
- return gr.update(visible=model_value == "Custom model...")
948
+ is_custom = model_value == "Custom model..."
949
+ return (
950
+ gr.update(visible=is_custom),
951
+ gr.update(visible=is_custom),
952
+ gr.update(visible=is_custom),
953
+ )
859
954
 
860
955
  model_choice.change(
861
956
  fn=update_custom_model_visibility,
862
957
  inputs=[model_choice],
863
- outputs=[custom_model],
958
+ outputs=[custom_model, provider_base_url, provider_api_key],
864
959
  queue=False, # Process immediately without queueing
865
960
  )
866
961