PyPI - cua-agent - Versions diffs - 0.1.24__py3-none-any.whl → 0.1.26__py3-none-any.whl - Mend

cua-agent 0.1.24py3-none-any.whl → 0.1.26py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cua-agent might be problematic. Click here for more details.

Files changed (11) hide show

agent/core/base.py +20 -0
agent/core/callbacks.py +57 -2
agent/providers/anthropic/callbacks/manager.py +20 -10
agent/providers/omni/clients/oaicompat.py +11 -3
agent/providers/omni/loop.py +24 -4
agent/providers/openai/loop.py +13 -4
agent/ui/gradio/app.py +429 -329
{cua_agent-0.1.24.dist-info → cua_agent-0.1.26.dist-info}/METADATA +37 -23
{cua_agent-0.1.24.dist-info → cua_agent-0.1.26.dist-info}/RECORD +11 -11
{cua_agent-0.1.24.dist-info → cua_agent-0.1.26.dist-info}/WHEEL +1 -1
{cua_agent-0.1.24.dist-info → cua_agent-0.1.26.dist-info}/entry_points.txt +0 -0

agent/ui/gradio/app.py CHANGED Viewed

@@ -30,11 +30,16 @@ Requirements:
 import os
 import asyncio
 import logging
+import json
+from pathlib import Path
 from typing import Dict, List, Optional, AsyncGenerator, Any, Tuple, Union
 import gradio as gr
+from gradio.components.chatbot import MetadataDict
 # Import from agent package
 from agent.core.types import AgentResponse
+from agent.core.callbacks import DefaultCallbackHandler
+from agent.providers.omni.parser import ParseResult
 from computer import Computer
 from agent import ComputerAgent, AgentLoop, LLM, LLMProvider
@@ -42,6 +47,86 @@ from agent import ComputerAgent, AgentLoop, LLM, LLMProvider
 # Global variables
 global_agent = None
 global_computer = None
+SETTINGS_FILE = Path(".gradio_settings.json")
+# We'll use asyncio.run() instead of a persistent event loop
+# --- Settings Load/Save Functions ---
+def load_settings() -> Dict[str, Any]:
+    """Loads settings from the JSON file."""
+    if SETTINGS_FILE.exists():
+        try:
+            with open(SETTINGS_FILE, "r") as f:
+                settings = json.load(f)
+                # Basic validation (can be expanded)
+                if isinstance(settings, dict):
+                    print(f"DEBUG - Loaded settings from {SETTINGS_FILE}")
+                    return settings
+        except (json.JSONDecodeError, IOError) as e:
+            print(f"Warning: Could not load settings from {SETTINGS_FILE}: {e}")
+    return {}
+def save_settings(settings: Dict[str, Any]):
+    """Saves settings to the JSON file."""
+    # Ensure sensitive keys are not saved
+    settings.pop("provider_api_key", None)
+    try:
+        with open(SETTINGS_FILE, "w") as f:
+            json.dump(settings, f, indent=4)
+        print(f"DEBUG - Saved settings to {SETTINGS_FILE}")
+    except IOError as e:
+        print(f"Warning: Could not save settings to {SETTINGS_FILE}: {e}")
+# --- End Settings Load/Save ---
+# Custom Screenshot Handler for Gradio chat
+class GradioChatScreenshotHandler(DefaultCallbackHandler):
+    """Custom handler that adds screenshots to the Gradio chatbot and updates annotated image."""
+    def __init__(self, chatbot_history: List[gr.ChatMessage]):
+        """Initialize with reference to chat history and annotated image component.
+        Args:
+            chatbot_history: Reference to the Gradio chatbot history list
+            annotated_image: Reference to the annotated image component
+        """
+        self.chatbot_history = chatbot_history
+        print("GradioChatScreenshotHandler initialized")
+    async def on_screenshot(
+        self,
+        screenshot_base64: str,
+        action_type: str = "",
+        parsed_screen: Optional[ParseResult] = None,
+    ) -> None:
+        """Add screenshot to chatbot when a screenshot is taken and update the annotated image.
+        Args:
+            screenshot_base64: Base64 encoded screenshot
+            action_type: Type of action that triggered the screenshot
+        Returns:
+            Original screenshot (does not modify it)
+        """
+        # Create a markdown image element for the screenshot
+        image_markdown = (
+            f"![Screenshot after {action_type}](data:image/png;base64,{screenshot_base64})"
+        )
+        # Simply append the screenshot as a new message
+        if self.chatbot_history is not None:
+            self.chatbot_history.append(
+                gr.ChatMessage(
+                    role="assistant",
+                    content=image_markdown,
+                    metadata={"title": f"🖥️ Screenshot - {action_type}", "status": "done"},
+                )
+            )
 # Map model names to specific provider model names
 MODEL_MAPPINGS = {
@@ -53,6 +138,7 @@ MODEL_MAPPINGS = {
         "gpt-4o": "computer_use_preview",
         "gpt-4": "computer_use_preview",
         "gpt-4.5-preview": "computer_use_preview",
+        "gpt-4o-mini": "gpt-4o-mini",
     },
     "anthropic": {
         # Default to newest model
@@ -70,6 +156,7 @@ MODEL_MAPPINGS = {
         # OMNI works with any of these models
         "default": "gpt-4o",
         "gpt-4o": "gpt-4o",
+        "gpt-4o-mini": "gpt-4o-mini",
         "gpt-4": "gpt-4",
         "gpt-4.5-preview": "gpt-4.5-preview",
         "claude-3-5-sonnet-20240620": "claude-3-5-sonnet-20240620",
@@ -119,30 +206,82 @@ def get_provider_and_model(model_name: str, loop_provider: str) -> tuple:
             model_name.lower(), MODEL_MAPPINGS["anthropic"]["default"]
         )
     elif agent_loop == AgentLoop.OMNI:
-        # For OMNI, select provider based on model name or loop_provider
-        if loop_provider == "OMNI-OLLAMA":
+        # Determine provider and clean model name based on the full string from UI
+        cleaned_model_name = model_name  # Default to using the name as-is (for custom)
+        if model_name == "Custom model...":
+            # Actual model name comes from custom_model_value via model_to_use.
+            # Assume OAICOMPAT for custom models unless overridden by URL/key later?
+            # get_provider_and_model determines the *initial* provider/model.
+            # The custom URL/key in process_response ultimately dictates the OAICOMPAT setup.
+            provider = LLMProvider.OAICOMPAT
+            # We set cleaned_model_name below outside the checks based on model_to_use
+            cleaned_model_name = ""  # Placeholder, will be set by custom value later
+        elif model_name.startswith("OMNI: Ollama "):
             provider = LLMProvider.OLLAMA
-            # For Ollama models from the UI dropdown, we use the model name as is
-            # No need to parse it - it's already the correct Ollama model name
-            model_name_to_use = model_name
-        elif "claude" in model_name.lower():
+            # Extract the part after "OMNI: Ollama "
+            cleaned_model_name = model_name.split("OMNI: Ollama ", 1)[1]
+        elif model_name.startswith("OMNI: Claude "):
             provider = LLMProvider.ANTHROPIC
-            model_name_to_use = MODEL_MAPPINGS["omni"].get(
-                model_name.lower(), MODEL_MAPPINGS["omni"]["default"]
-            )
-        elif "gpt" in model_name.lower():
+            # Extract the canonical model name based on the UI string
+            # e.g., "OMNI: Claude 3.7 Sonnet (20250219)" -> "3.7 Sonnet" and "20250219"
+            parts = model_name.split(" (")
+            model_key_part = parts[0].replace("OMNI: Claude ", "")
+            date_part = parts[1].replace(")", "") if len(parts) > 1 else ""
+            # Normalize the extracted key part for comparison
+            # "3.7 Sonnet" -> "37sonnet"
+            model_key_part_norm = model_key_part.lower().replace(".", "").replace(" ", "")
+            cleaned_model_name = MODEL_MAPPINGS["omni"]["default"]  # Default if not found
+            # Find the canonical name in the main Anthropic map
+            for key_anthropic, val_anthropic in MODEL_MAPPINGS["anthropic"].items():
+                # Normalize the canonical key for comparison
+                # "claude-3-7-sonnet-20250219" -> "claude37sonnet20250219"
+                key_anthropic_norm = key_anthropic.lower().replace("-", "")
+                # Check if the normalized canonical key starts with "claude" + normalized extracted part
+                # AND contains the date part.
+                if (
+                    key_anthropic_norm.startswith("claude" + model_key_part_norm)
+                    and date_part in key_anthropic_norm
+                ):
+                    cleaned_model_name = (
+                        val_anthropic  # Use the canonical name like "claude-3-7-sonnet-20250219"
+                    )
+                    break
+        elif model_name.startswith("OMNI: OpenAI "):
             provider = LLMProvider.OPENAI
-            model_name_to_use = MODEL_MAPPINGS["omni"].get(
-                model_name.lower(), MODEL_MAPPINGS["omni"]["default"]
-            )
-        else:
-            # Handle custom model names - use the OAICOMPAT provider
+            # Extract the model part, e.g., "GPT-4o mini"
+            model_key_part = model_name.replace("OMNI: OpenAI ", "")
+            # Normalize the extracted part: "gpt4omini"
+            model_key_part_norm = model_key_part.lower().replace("-", "").replace(" ", "")
+            cleaned_model_name = MODEL_MAPPINGS["omni"]["default"]  # Default if not found
+            # Find the canonical name in the main OMNI map for OpenAI models
+            for key_omni, val_omni in MODEL_MAPPINGS["omni"].items():
+                # Normalize the omni map key: "gpt-4o-mini" -> "gpt4omini"
+                key_omni_norm = key_omni.lower().replace("-", "").replace(" ", "")
+                # Check if the normalized omni key matches the normalized extracted part
+                if key_omni_norm == model_key_part_norm:
+                    cleaned_model_name = (
+                        val_omni  # Use the value from the OMNI map (e.g., gpt-4o-mini)
+                    )
+                    break
+            # Note: No fallback needed here as we explicitly check against omni keys
+        else:  # Handles unexpected formats or the raw custom name if "Custom model..." selected
+            # Should only happen if user selected "Custom model..."
+            # Or if a model name format isn't caught above
             provider = LLMProvider.OAICOMPAT
-            # Use the model name as is without mapping, or use default if empty
-            model_name_to_use = (
-                model_name if model_name.strip() else MODEL_MAPPINGS["oaicompat"]["default"]
+            cleaned_model_name = (
+                model_name.strip() if model_name.strip() else MODEL_MAPPINGS["oaicompat"]["default"]
             )
+        # Assign the determined model name
+        model_name_to_use = cleaned_model_name
+        # agent_loop remains AgentLoop.OMNI
     else:
         # Default to OpenAI if unrecognized loop
         provider = LLMProvider.OPENAI
@@ -177,17 +316,20 @@ def get_ollama_models() -> List[str]:
         return []
-def extract_synthesized_text(result: Union[AgentResponse, Dict[str, Any]]) -> str:
+def extract_synthesized_text(
+    result: Union[AgentResponse, Dict[str, Any]],
+) -> Tuple[str, MetadataDict]:
     """Extract synthesized text from the agent result."""
     synthesized_text = ""
+    metadata = MetadataDict()
     if "output" in result and result["output"]:
         for output in result["output"]:
             if output.get("type") == "reasoning":
+                metadata["title"] = "🧠 Reasoning"
                 content = output.get("content", "")
                 if content:
                     synthesized_text += f"{content}\n"
             elif output.get("type") == "message":
                 # Handle message type outputs - can contain rich content
                 content = output.get("content", [])
@@ -224,7 +366,10 @@ def extract_synthesized_text(result: Union[AgentResponse, Dict[str, Any]]) -> st
                 else:
                     synthesized_text += f"Performed {action_type} action.\n"
-    return synthesized_text.strip()
+                metadata["status"] = "done"
+                metadata["title"] = f"🛠️ {synthesized_text.strip().splitlines()[-1]}"
+    return synthesized_text.strip(), metadata
 def create_computer_instance(verbosity: int = logging.INFO) -> Computer:
@@ -245,8 +390,8 @@ def create_agent(
     save_trajectory: bool = True,
     only_n_most_recent_images: int = 3,
     verbosity: int = logging.INFO,
-    use_ollama: bool = False,
     use_oaicompat: bool = False,
+    provider_base_url: Optional[str] = None,
 ) -> ComputerAgent:
     """Create or update the global agent with the specified parameters."""
     global global_agent
@@ -254,15 +399,6 @@ def create_agent(
     # Create the computer if not already done
     computer = create_computer_instance(verbosity=verbosity)
-    # Extra configuration to pass to the agent
-    extra_config = {}
-    # For Ollama models, we'll pass use_ollama and the model_name directly
-    if use_ollama:
-        extra_config["use_ollama"] = True
-        extra_config["ollama_model"] = model_name
-        print(f"DEBUG - Using Ollama with model: {model_name}")
     # Get API key from environment if not provided
     if api_key is None:
         if provider == LLMProvider.OPENAI:
@@ -270,69 +406,52 @@ def create_agent(
         elif provider == LLMProvider.ANTHROPIC:
             api_key = os.environ.get("ANTHROPIC_API_KEY", "")
-    # Create LLM model object with appropriate parameters
-    provider_base_url = "http://localhost:8000/v1" if use_oaicompat else None
+    # Use provided provider_base_url if available, otherwise use default
+    default_base_url = "http://localhost:1234/v1" if use_oaicompat else None
+    custom_base_url = provider_base_url or default_base_url
     if use_oaicompat:
-        # Special handling for OAICOMPAT - use OPENAI provider with custom base URL
-        print(f"DEBUG - Creating OAICOMPAT agent with model: {model_name}")
+        # Special handling for OAICOMPAT - use OAICOMPAT provider with custom base URL
+        print(f"DEBUG - Creating OAICOMPAT agent with model: {model_name}, URL: {custom_base_url}")
         llm = LLM(
-            provider=provider,  # Already set to OPENAI
+            provider=LLMProvider.OAICOMPAT,  # Set to OAICOMPAT instead of using original provider
             name=model_name,
-            provider_base_url=provider_base_url,
+            provider_base_url=custom_base_url,
         )
+        print(f"DEBUG - LLM provider is now: {llm.provider}, base URL: {llm.provider_base_url}")
         # Note: Don't pass use_oaicompat to the agent, as it doesn't accept this parameter
     elif provider == LLMProvider.OAICOMPAT:
         # This path is unlikely to be taken with our current approach
-        llm = LLM(provider=provider, name=model_name, provider_base_url=provider_base_url)
+        llm = LLM(provider=provider, name=model_name, provider_base_url=custom_base_url)
     else:
         # For other providers, just use standard parameters
         llm = LLM(provider=provider, name=model_name)
     # Create or update the agent
-    if global_agent is None:
-        global_agent = ComputerAgent(
-            computer=computer,
-            loop=agent_loop,
-            model=llm,
-            api_key=api_key,
-            save_trajectory=save_trajectory,
-            only_n_most_recent_images=only_n_most_recent_images,
-            verbosity=verbosity,
-            **extra_config,
-        )
-    else:
-        # Update the existing agent's parameters
-        global_agent._loop = None  # Force recreation of the loop
-        global_agent.provider = provider
-        global_agent.loop = agent_loop
-        global_agent.model = llm
-        global_agent.api_key = api_key
-        # Explicitly update these settings to ensure they take effect
-        global_agent.save_trajectory = save_trajectory
-        global_agent.only_n_most_recent_images = only_n_most_recent_images
-        # Update Ollama settings if applicable
-        if use_ollama:
-            global_agent.use_ollama = True
-            global_agent.ollama_model = model_name
-        else:
-            global_agent.use_ollama = False
-            global_agent.ollama_model = None
-        # Log the updated settings
-        logging.info(
-            f"Updated agent settings: save_trajectory={save_trajectory}, recent_images={only_n_most_recent_images}"
-        )
+    global_agent = ComputerAgent(
+        computer=computer,
+        loop=agent_loop,
+        model=llm,
+        api_key=api_key,
+        save_trajectory=save_trajectory,
+        only_n_most_recent_images=only_n_most_recent_images,
+        verbosity=verbosity,
+    )
     return global_agent
-def process_agent_result(result: Union[AgentResponse, Dict[str, Any]]) -> str:
+def process_agent_result(result: Union[AgentResponse, Dict[str, Any]]) -> Tuple[str, MetadataDict]:
     """Process agent results for the Gradio UI."""
     # Extract text content
     text_obj = result.get("text", {})
+    metadata = result.get("metadata", {})
+    # Create a properly typed MetadataDict
+    metadata_dict = MetadataDict()
+    metadata_dict["title"] = metadata.get("title", "")
+    metadata_dict["status"] = "done"
+    metadata = metadata_dict
     # For OpenAI's Computer-Use Agent, text field is an object with format property
     if (
@@ -341,8 +460,11 @@ def process_agent_result(result: Union[AgentResponse, Dict[str, Any]]) -> str:
         and "format" in text_obj
         and not text_obj.get("value", "")
     ):
-        content = extract_synthesized_text(result)
+        content, metadata = extract_synthesized_text(result)
     else:
+        if not text_obj:
+            text_obj = result
         # For other types of results, try to get text directly
         if isinstance(text_obj, dict):
             if "value" in text_obj:
@@ -375,177 +497,7 @@ def process_agent_result(result: Union[AgentResponse, Dict[str, Any]]) -> str:
     if not isinstance(content, str):
         content = str(content) if content else ""
-    return content
-def respond(
-    message: str,
-    history: List[Tuple[str, str]],
-    model_choice,  # Accept Gradio Dropdown component
-    agent_loop,  # Accept Gradio Dropdown component
-    save_trajectory,  # Accept Gradio Checkbox component
-    recent_images,  # Accept Gradio Slider component
-    openai_api_key: Optional[str] = None,
-    anthropic_api_key: Optional[str] = None,
-) -> str:
-    """Process a message with the Computer-Use Agent and return the response."""
-    import asyncio
-    # Get actual values from Gradio components
-    model_choice_value = model_choice.value if hasattr(model_choice, "value") else model_choice
-    agent_loop_value = agent_loop.value if hasattr(agent_loop, "value") else agent_loop
-    save_trajectory_value = (
-        save_trajectory.value if hasattr(save_trajectory, "value") else save_trajectory
-    )
-    recent_images_value = int(
-        recent_images.value if hasattr(recent_images, "value") else recent_images
-    )
-    # Debug logging
-    print(f"DEBUG - Model choice object: {type(model_choice)}")
-    print(f"DEBUG - Model choice value: {model_choice_value}")
-    print(f"DEBUG - Agent loop value: {agent_loop_value}")
-    # Create a new event loop for this function call
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-    async def _async_respond():
-        # Extract the loop type and model from the selection
-        loop_provider = "OPENAI"
-        if isinstance(model_choice_value, str):
-            # This is the case for a custom text input from textbox
-            if agent_loop_value == "OMNI":
-                loop_provider = "OMNI"
-                # Use the custom model name as is
-                model_id = model_choice_value
-                print(f"DEBUG - Using custom model: {model_id}")
-            else:
-                # Handle regular dropdown value as string
-                if model_choice_value.startswith("OpenAI:"):
-                    loop_provider = "OPENAI"
-                    model_id = model_choice_value.replace("OpenAI: ", "").lower()
-                elif model_choice_value.startswith("Anthropic:"):
-                    loop_provider = "ANTHROPIC"
-                    model_id = model_choice_value.replace("Anthropic: ", "").lower()
-                elif model_choice_value.startswith("OMNI:"):
-                    loop_provider = "OMNI"
-                    if "GPT" in model_choice_value:
-                        model_id = model_choice_value.replace("OMNI: OpenAI ", "").lower()
-                    elif "Claude" in model_choice_value:
-                        model_id = model_choice_value.replace("OMNI: ", "").lower()
-                    elif "Ollama" in model_choice_value:
-                        loop_provider = "OMNI-OLLAMA"
-                        # Extract everything after "OMNI: Ollama " which is the full model name (e.g., phi3:latest)
-                        model_id = model_choice_value.replace("OMNI: Ollama ", "")
-                        print(f"DEBUG - Ollama model ID: {model_id}")
-                    else:
-                        model_id = "default"
-                else:
-                    # Default case
-                    loop_provider = agent_loop_value
-                    model_id = "default"
-        else:
-            # Model choice is not a string (shouldn't happen, but handle anyway)
-            loop_provider = agent_loop_value
-            model_id = "default"
-        print(f"DEBUG - Using loop provider: {loop_provider}, model_id: {model_id}")
-        # Use the mapping function to get provider, model name and agent loop
-        provider, model_name, agent_loop_type = get_provider_and_model(model_id, loop_provider)
-        print(
-            f"DEBUG - After mapping: provider={provider}, model_name={model_name}, agent_loop={agent_loop_type}"
-        )
-        # Special handling for OAICOMPAT to bypass provider-specific errors
-        # Creates the agent with OPENAI provider but using custom model name and provider base URL
-        is_oaicompat = str(provider) == "oaicompat"
-        if is_oaicompat:
-            provider = LLMProvider.OPENAI
-        # Get API key based on provider
-        if provider == LLMProvider.OPENAI:
-            api_key = openai_api_key or os.environ.get("OPENAI_API_KEY", "")
-        elif provider == LLMProvider.ANTHROPIC:
-            api_key = anthropic_api_key or os.environ.get("ANTHROPIC_API_KEY", "")
-        else:
-            api_key = ""
-        # Check for settings changes if agent already exists
-        settings_changed = False
-        settings_message = ""
-        if global_agent is not None:
-            # Safely check if save_trajectory setting changed
-            current_save_traj = getattr(global_agent, "save_trajectory", None)
-            if current_save_traj is not None and current_save_traj != save_trajectory_value:
-                settings_changed = True
-                settings_message += f"Save trajectory set to: {save_trajectory_value}. "
-            # Safely check if recent_images setting changed
-            current_recent_images = getattr(global_agent, "only_n_most_recent_images", None)
-            if current_recent_images is not None and current_recent_images != recent_images_value:
-                settings_changed = True
-                settings_message += f"Recent images set to: {recent_images_value}. "
-        # Create or update the agent
-        try:
-            create_agent(
-                provider=provider,
-                agent_loop=agent_loop_type,
-                model_name=model_name,
-                api_key=api_key,
-                save_trajectory=save_trajectory_value,
-                only_n_most_recent_images=recent_images_value,
-                use_ollama=loop_provider == "OMNI-OLLAMA",
-                use_oaicompat=is_oaicompat,
-            )
-            if global_agent is None:
-                return "Failed to create agent. Check API keys and configuration."
-        except Exception as e:
-            return f"Error creating agent: {str(e)}"
-        # Notify about settings changes if needed
-        if settings_changed:
-            return f"Settings updated: {settings_message}"
-        # Collect all responses
-        response_text = []
-        # Run the agent
-        try:
-            async for result in global_agent.run(message):
-                # Process result
-                content = process_agent_result(result)
-                # Skip empty content
-                if not content:
-                    continue
-                # Add content to response list
-                response_text.append(content)
-            # Return the full response as a single string
-            return "\n".join(response_text) if response_text else "Task completed."
-        except Exception as e:
-            import traceback
-            traceback.print_exc()
-            return f"Error: {str(e)}"
-    # Run the async function and get the result
-    try:
-        result = loop.run_until_complete(_async_respond())
-        loop.close()
-        return result
-    except Exception as e:
-        loop.close()
-        import traceback
-        traceback.print_exc()
-        return f"Error executing async operation: {str(e)}"
+    return content, metadata
 def create_gradio_ui(
@@ -561,6 +513,10 @@ def create_gradio_ui(
     Returns:
         A Gradio Blocks application
     """
+    # --- Load Settings ---
+    saved_settings = load_settings()
+    # --- End Load Settings ---
     # Check for API keys
     openai_api_key = os.environ.get("OPENAI_API_KEY", "")
     anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY", "")
@@ -574,6 +530,7 @@ def create_gradio_ui(
         openai_models = ["OpenAI: Computer-Use Preview"]
         omni_models += [
             "OMNI: OpenAI GPT-4o",
+            "OMNI: OpenAI GPT-4o mini",
             "OMNI: OpenAI GPT-4.5-preview",
         ]
@@ -596,21 +553,33 @@ def create_gradio_ui(
         "OMNI": omni_models + ["Custom model..."],  # Add custom model option
     }
-    # Get initial agent loop and model based on provided parameters
-    if provider_name.lower() == "openai":
-        initial_loop = "OPENAI"
-        initial_model = "OpenAI: Computer-Use Preview" if openai_models else "No models available"
-    elif provider_name.lower() == "anthropic":
-        initial_loop = "ANTHROPIC"
-        initial_model = anthropic_models[0] if anthropic_models else "No models available"
+    # --- Apply Saved Settings (override defaults if available) ---
+    initial_loop = saved_settings.get("agent_loop", "OMNI")
+    # Ensure the saved model is actually available in the choices for the loaded loop
+    available_models_for_loop = provider_to_models.get(initial_loop, [])
+    saved_model_choice = saved_settings.get("model_choice")
+    if saved_model_choice and saved_model_choice in available_models_for_loop:
+        initial_model = saved_model_choice
     else:
-        initial_loop = "OMNI"
-        if model_name == "gpt-4o" and "OMNI: OpenAI GPT-4o" in omni_models:
-            initial_model = "OMNI: OpenAI GPT-4o"
-        elif "claude" in model_name.lower() and omni_models:
-            initial_model = next((m for m in omni_models if "Claude" in m), omni_models[0])
-        else:
+        # If saved model isn't valid for the loop, reset to default for that loop
+        if initial_loop == "OPENAI":
+            initial_model = (
+                "OpenAI: Computer-Use Preview" if openai_models else "No models available"
+            )
+        elif initial_loop == "ANTHROPIC":
+            initial_model = anthropic_models[0] if anthropic_models else "No models available"
+        else:  # OMNI
             initial_model = omni_models[0] if omni_models else "No models available"
+            if "Custom model..." in available_models_for_loop:
+                initial_model = (
+                    "Custom model..."  # Default to custom if available and no other default fits
+                )
+    initial_custom_model = saved_settings.get("custom_model", "Qwen2.5-VL-7B-Instruct")
+    initial_provider_base_url = saved_settings.get("provider_base_url", "http://localhost:1234/v1")
+    initial_save_trajectory = saved_settings.get("save_trajectory", True)
+    initial_recent_images = saved_settings.get("recent_images", 3)
+    # --- End Apply Saved Settings ---
     # Example prompts
     example_messages = [
@@ -703,7 +672,7 @@ def create_gradio_ui(
                     ### 3. Pull the pre-built macOS image
                     ```bash
-                    lume pull macos-sequoia-cua:latest --no-cache
+                    lume pull macos-sequoia-cua:latest
                     ```
                     Initial download requires 80GB storage, but reduces to ~30GB after first run due to macOS's sparse file system.
@@ -720,48 +689,68 @@ def create_gradio_ui(
                     """
                     )
-                # Configuration options
-                agent_loop = gr.Dropdown(
-                    choices=["OPENAI", "ANTHROPIC", "OMNI"],
-                    label="Agent Loop",
-                    value=initial_loop,
-                    info="Select the agent loop provider",
-                )
+                with gr.Accordion("Configuration", open=True):
+                    # Configuration options
+                    agent_loop = gr.Dropdown(
+                        choices=["OPENAI", "ANTHROPIC", "OMNI"],
+                        label="Agent Loop",
+                        value=initial_loop,
+                        info="Select the agent loop provider",
+                    )
-                # Create model selection dropdown with custom value support for OMNI
-                model_choice = gr.Dropdown(
-                    choices=provider_to_models.get(initial_loop, ["No models available"]),
-                    label="LLM Provider and Model",
-                    value=initial_model,
-                    info="Select model or choose 'Custom model...' to enter a custom name",
-                    interactive=True,
-                )
+                    # Create model selection dropdown with custom value support for OMNI
+                    model_choice = gr.Dropdown(
+                        choices=provider_to_models.get(initial_loop, ["No models available"]),
+                        label="LLM Provider and Model",
+                        value=initial_model,
+                        info="Select model or choose 'Custom model...' to enter a custom name",
+                        interactive=True,
+                    )
-                # Add custom model textbox (only visible when "Custom model..." is selected)
-                custom_model = gr.Textbox(
-                    label="Custom Model Name",
-                    placeholder="Enter custom model name (e.g., Qwen2.5-VL-7B-Instruct)",
-                    value="Qwen2.5-VL-7B-Instruct",  # Default value
-                    visible=False,  # Initially hidden
-                    interactive=True,
-                )
+                    # Add custom model textbox (only visible when "Custom model..." is selected)
+                    custom_model = gr.Textbox(
+                        label="Custom Model Name",
+                        placeholder="Enter custom model name (e.g., Qwen2.5-VL-7B-Instruct)",
+                        value=initial_custom_model,
+                        visible=(initial_model == "Custom model..."),
+                        interactive=True,
+                    )
-                save_trajectory = gr.Checkbox(
-                    label="Save Trajectory",
-                    value=True,
-                    info="Save the agent's trajectory for debugging",
-                    interactive=True,
-                )
+                    # Add custom provider base URL textbox (only visible when "Custom model..." is selected)
+                    provider_base_url = gr.Textbox(
+                        label="Provider Base URL",
+                        placeholder="Enter provider base URL (e.g., http://localhost:1234/v1)",
+                        value=initial_provider_base_url,
+                        visible=(initial_model == "Custom model..."),
+                        interactive=True,
+                    )
-                recent_images = gr.Slider(
-                    label="Recent Images",
-                    minimum=1,
-                    maximum=10,
-                    value=3,
-                    step=1,
-                    info="Number of recent images to keep in context",
-                    interactive=True,
-                )
+                    # Add custom API key textbox (only visible when "Custom model..." is selected)
+                    provider_api_key = gr.Textbox(
+                        label="Provider API Key",
+                        placeholder="Enter provider API key (if required)",
+                        value="",
+                        visible=(initial_model == "Custom model..."),
+                        interactive=True,
+                        type="password",
+                    )
+                    save_trajectory = gr.Checkbox(
+                        label="Save Trajectory",
+                        value=initial_save_trajectory,
+                        info="Save the agent's trajectory for debugging",
+                        interactive=True,
+                    )
+                    recent_images = gr.Slider(
+                        label="Recent Images",
+                        minimum=1,
+                        maximum=10,
+                        value=initial_recent_images,
+                        step=1,
+                        info="Number of recent images to keep in context",
+                        interactive=True,
+                    )
             # Right column for chat interface
             with gr.Column(scale=2):
@@ -770,7 +759,7 @@ def create_gradio_ui(
                     "Ask me to perform tasks in a virtual macOS environment.<br>Built with <a href='https://github.com/trycua/cua' target='_blank'>github.com/trycua/cua</a>."
                 )
-                chatbot = gr.Chatbot()
+                chatbot_history = gr.Chatbot(type="messages")
                 msg = gr.Textbox(
                     placeholder="Ask me to perform tasks in a virtual macOS environment"
                 )
@@ -782,63 +771,169 @@ def create_gradio_ui(
                 # Function to handle chat submission
                 def chat_submit(message, history):
                     # Add user message to history
-                    history = history + [(message, None)]
+                    history.append(gr.ChatMessage(role="user", content=message))
                     return "", history
                 # Function to process agent response after user input
-                def process_response(
+                async def process_response(
                     history,
                     model_choice_value,
                     custom_model_value,
                     agent_loop_choice,
                     save_traj,
                     recent_imgs,
+                    custom_url_value=None,
+                    custom_api_key=None,
                 ):
                     if not history:
-                        return history
+                        yield history
+                        return
                     # Get the last user message
-                    last_user_message = history[-1][0]
+                    last_user_message = history[-1]["content"]
-                    # Use custom model value if "Custom model..." is selected
-                    model_to_use = (
+                    # Determine the model name string to analyze: custom or from dropdown
+                    model_string_to_analyze = (
                         custom_model_value
                         if model_choice_value == "Custom model..."
-                        else model_choice_value
+                        else model_choice_value  # Use the full UI string initially
                     )
-                    # Process with agent
-                    response = respond(
-                        last_user_message,
-                        history[:-1],  # History without the last message
-                        model_to_use,
-                        agent_loop_choice,
-                        save_traj,
-                        recent_imgs,
-                        openai_api_key,
-                        anthropic_api_key,
-                    )
-                    # Update the last assistant message
-                    history[-1] = (last_user_message, response)
-                    return history
+                    # Determine if this is a custom model selection
+                    is_custom_model_selected = model_choice_value == "Custom model..."
+                    try:
+                        # Get the provider, *cleaned* model name, and agent loop type
+                        provider, cleaned_model_name_from_func, agent_loop_type = (
+                            get_provider_and_model(model_string_to_analyze, agent_loop_choice)
+                        )
+                        # Determine the final model name to send to the agent
+                        # If custom selected, use the custom text box value, otherwise use the cleaned name
+                        final_model_name_to_send = (
+                            custom_model_value
+                            if is_custom_model_selected
+                            else cleaned_model_name_from_func
+                        )
+                        # Determine if OAICOMPAT should be used (only if custom model explicitly selected)
+                        is_oaicompat = is_custom_model_selected
+                        # Get API key based on provider determined by get_provider_and_model
+                        if is_oaicompat and custom_api_key:
+                            # Use custom API key if provided for custom model
+                            api_key = custom_api_key
+                            print(
+                                f"DEBUG - Using custom API key for model: {final_model_name_to_send}"
+                            )
+                        elif provider == LLMProvider.OPENAI:
+                            api_key = openai_api_key or os.environ.get("OPENAI_API_KEY", "")
+                        elif provider == LLMProvider.ANTHROPIC:
+                            api_key = anthropic_api_key or os.environ.get("ANTHROPIC_API_KEY", "")
+                        else:
+                            # For Ollama or default OAICOMPAT (without custom key), no key needed/expected
+                            api_key = ""
+                        # --- Save Settings Before Running Agent ---
+                        current_settings = {
+                            "agent_loop": agent_loop_choice,
+                            "model_choice": model_choice_value,
+                            "custom_model": custom_model_value,
+                            "provider_base_url": custom_url_value,
+                            "save_trajectory": save_traj,
+                            "recent_images": recent_imgs,
+                        }
+                        save_settings(current_settings)
+                        # --- End Save Settings ---
+                        # Create or update the agent
+                        create_agent(
+                            # Provider determined by get_provider_and_model unless custom model selected
+                            provider=LLMProvider.OAICOMPAT if is_oaicompat else provider,
+                            agent_loop=agent_loop_type,
+                            # Pass the FINAL determined model name (cleaned or custom)
+                            model_name=final_model_name_to_send,
+                            api_key=api_key,
+                            save_trajectory=save_traj,
+                            only_n_most_recent_images=recent_imgs,
+                            use_oaicompat=is_oaicompat,  # Set flag if custom model was selected
+                            # Pass custom URL only if custom model was selected
+                            provider_base_url=custom_url_value if is_oaicompat else None,
+                            verbosity=logging.DEBUG,  # Added verbosity here
+                        )
+                        if global_agent is None:
+                            # Add initial empty assistant message
+                            history.append(
+                                gr.ChatMessage(
+                                    role="assistant",
+                                    content="Failed to create agent. Check API keys and configuration.",
+                                )
+                            )
+                            yield history
+                            return
+                            # Add the screenshot handler to the agent's loop if available
+                        if global_agent and hasattr(global_agent, "_loop"):
+                            print("DEBUG - Adding screenshot handler to agent loop")
+                            # Create the screenshot handler with references to UI components
+                            screenshot_handler = GradioChatScreenshotHandler(history)
+                            # Add the handler to the callback manager if it exists AND is not None
+                            if (
+                                hasattr(global_agent._loop, "callback_manager")
+                                and global_agent._loop.callback_manager is not None
+                            ):
+                                global_agent._loop.callback_manager.add_handler(screenshot_handler)
+                                print(
+                                    f"DEBUG - Screenshot handler added to callback manager with history: {id(history)}"
+                                )
+                            else:
+                                # Optional: Log a warning if the callback manager is missing/None for a specific loop
+                                print(
+                                    f"WARNING - Callback manager not found or is None for loop type: {type(global_agent._loop)}. Screenshot handler not added."
+                                )
+                        # Stream responses from the agent
+                        async for result in global_agent.run(last_user_message):
+                            # Process result
+                            content, metadata = process_agent_result(result)
+                            # Skip empty content
+                            if content or metadata.get("title"):
+                                history.append(
+                                    gr.ChatMessage(
+                                        role="assistant", content=content, metadata=metadata
+                                    )
+                                )
+                            yield history
+                    except Exception as e:
+                        import traceback
+                        traceback.print_exc()
+                        # Update with error message
+                        history.append(gr.ChatMessage(role="assistant", content=f"Error: {str(e)}"))
+                        yield history
                 # Connect the components
-                msg.submit(chat_submit, [msg, chatbot], [msg, chatbot]).then(
+                msg.submit(chat_submit, [msg, chatbot_history], [msg, chatbot_history]).then(
                     process_response,
                     [
-                        chatbot,
+                        chatbot_history,
                         model_choice,
                         custom_model,
                         agent_loop,
                         save_trajectory,
                         recent_images,
+                        provider_base_url,
+                        provider_api_key,
                     ],
-                    [chatbot],
+                    [chatbot_history],
                 )
                 # Clear button functionality
-                clear.click(lambda: None, None, chatbot, queue=False)
+                clear.click(lambda: None, None, chatbot_history, queue=False)
                 # Connect agent_loop changes to model selection
                 agent_loop.change(
@@ -848,14 +943,19 @@ def create_gradio_ui(
                     queue=False,  # Process immediately without queueing
                 )
-                # Show/hide custom model textbox based on dropdown selection
+                # Show/hide custom model, provider base URL, and API key textboxes based on dropdown selection
                 def update_custom_model_visibility(model_value):
-                    return gr.update(visible=model_value == "Custom model...")
+                    is_custom = model_value == "Custom model..."
+                    return (
+                        gr.update(visible=is_custom),
+                        gr.update(visible=is_custom),
+                        gr.update(visible=is_custom),
+                    )
                 model_choice.change(
                     fn=update_custom_model_visibility,
                     inputs=[model_choice],
-                    outputs=[custom_model],
+                    outputs=[custom_model, provider_base_url, provider_api_key],
                     queue=False,  # Process immediately without queueing
                 )

cua-agent 0.1.24__py3-none-any.whl → 0.1.26__py3-none-any.whl

Potentially problematic release.

cua-agent 0.1.24py3-none-any.whl → 0.1.26py3-none-any.whl