PyPI - cua-agent - Versions diffs - 0.2.9__tar.gz → 0.2.11__tar.gz - Mend

cua-agent 0.2.9 (tar.gz) → 0.2.11 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cua-agent might be problematic. Click here for more details.

Files changed (86)

{cua_agent-0.2.9 → cua_agent-0.2.11}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cua-agent
-Version: 0.2.9
+Version: 0.2.11
 Summary: CUA (Computer Use) Agent for AI-driven computer interaction
 Author-Email: TryCua <gh@trycua.com>
 Requires-Python: >=3.11

{cua_agent-0.2.9 → cua_agent-0.2.11}/agent/core/base.py RENAMED Viewed

@@ -5,7 +5,6 @@ import asyncio
 from abc import ABC, abstractmethod
 from typing import Any, AsyncGenerator, Dict, List, Optional
-from agent.providers.omni.parser import ParseResult
 from computer import Computer
 from .messages import StandardMessageManager, ImageRetentionConfig
 from .types import AgentResponse
@@ -207,7 +206,7 @@ class BaseLoop(ABC):
     # EVENT HOOKS / CALLBACKS
     ###########################################
-    async def handle_screenshot(self, screenshot_base64: str, action_type: str = "", parsed_screen: Optional[ParseResult] = None) -> None:
+    async def handle_screenshot(self, screenshot_base64: str, action_type: str = "", parsed_screen: Optional[dict] = None) -> None:
         """Process a screenshot through callback managers
         Args:

{cua_agent-0.2.9 → cua_agent-0.2.11}/agent/core/callbacks.py RENAMED Viewed

@@ -6,8 +6,6 @@ from abc import ABC, abstractmethod
 from datetime import datetime
 from typing import Any, Dict, List, Optional, Protocol
-from agent.providers.omni.parser import ParseResult
 logger = logging.getLogger(__name__)
 class ContentCallback(Protocol):
@@ -117,7 +115,7 @@ class CallbackManager:
         for handler in self.handlers:
             await handler.on_error(error, **kwargs)
-    async def on_screenshot(self, screenshot_base64: str, action_type: str = "", parsed_screen: Optional[ParseResult] = None) -> None:
+    async def on_screenshot(self, screenshot_base64: str, action_type: str = "", parsed_screen: Optional[dict] = None) -> None:
         """Called when a screenshot is taken.
         Args:
@@ -166,7 +164,7 @@ class CallbackHandler(ABC):
         pass
     @abstractmethod
-    async def on_screenshot(self, screenshot_base64: str, action_type: str = "", parsed_screen: Optional[ParseResult] = None) -> None:
+    async def on_screenshot(self, screenshot_base64: str, action_type: str = "", parsed_screen: Optional[dict] = None) -> None:
         """Called when a screenshot is taken.
         Args:

{cua_agent-0.2.9 → cua_agent-0.2.11}/agent/core/messages.py RENAMED Viewed

@@ -5,7 +5,6 @@ import json
 from typing import Any, Dict, List, Optional, Union, Tuple
 from dataclasses import dataclass
 import re
-from ..providers.omni.parser import ParseResult
 logger = logging.getLogger(__name__)
@@ -82,16 +81,27 @@ class StandardMessageManager:
         if not self.config.num_images_to_keep:
             return messages
-        # Find user messages with images
+        # Find messages with images (both user messages and tool call outputs)
         image_messages = []
         for msg in messages:
+            has_image = False
+            # Check user messages with images
             if msg["role"] == "user" and isinstance(msg["content"], list):
                 has_image = any(
                     item.get("type") == "image_url" or item.get("type") == "image"
                     for item in msg["content"]
                 )
-                if has_image:
-                    image_messages.append(msg)
+            # Check assistant messages with tool calls that have images
+            elif msg["role"] == "assistant" and isinstance(msg["content"], list):
+                for item in msg["content"]:
+                    if item.get("type") == "tool_result" and "base64_image" in item:
+                        has_image = True
+                        break
+            if has_image:
+                image_messages.append(msg)
         # If we don't have more images than the limit, return all messages
         if len(image_messages) <= self.config.num_images_to_keep:
@@ -101,13 +111,35 @@ class StandardMessageManager:
         images_to_keep = image_messages[-self.config.num_images_to_keep :]
         images_to_remove = image_messages[: -self.config.num_images_to_keep]
-        # Create a new message list without the older images
+        # Create a new message list, removing images from older messages
         result = []
         for msg in messages:
             if msg in images_to_remove:
-                # Skip this message
-                continue
-            result.append(msg)
+                # Remove images from this message but keep the text content
+                if msg["role"] == "user" and isinstance(msg["content"], list):
+                    # Keep only text content, remove images
+                    new_content = [
+                        item for item in msg["content"]
+                        if item.get("type") not in ["image_url", "image"]
+                    ]
+                    if new_content:  # Only add if there's still content
+                        result.append({"role": msg["role"], "content": new_content})
+                elif msg["role"] == "assistant" and isinstance(msg["content"], list):
+                    # Remove base64_image from tool_result items
+                    new_content = []
+                    for item in msg["content"]:
+                        if item.get("type") == "tool_result" and "base64_image" in item:
+                            # Create a copy without the base64_image
+                            new_item = {k: v for k, v in item.items() if k != "base64_image"}
+                            new_content.append(new_item)
+                        else:
+                            new_content.append(item)
+                    result.append({"role": msg["role"], "content": new_content})
+                else:
+                    # For other message types, keep as is
+                    result.append(msg)
+            else:
+                result.append(msg)
         return result

{cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/anthropic/tools/computer.py RENAMED Viewed

@@ -205,26 +205,6 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
                 self.logger.info(f"  Coordinates: ({x}, {y})")
                 try:
-                    # Take pre-action screenshot to get current dimensions
-                    pre_screenshot = await self.computer.interface.screenshot()
-                    pre_img = Image.open(io.BytesIO(pre_screenshot))
-                    # Scale image to match screen dimensions if needed
-                    if pre_img.size != (self.width, self.height):
-                        self.logger.info(
-                            f"Scaling image from {pre_img.size} to {self.width}x{self.height} to match screen dimensions"
-                        )
-                        if not isinstance(self.width, int) or not isinstance(self.height, int):
-                            raise ToolError("Screen dimensions must be integers")
-                        size = (int(self.width), int(self.height))
-                        pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
-                        # Save the scaled image back to bytes
-                        buffer = io.BytesIO()
-                        pre_img.save(buffer, format="PNG")
-                        pre_screenshot = buffer.getvalue()
-                    self.logger.info(f"  Current dimensions: {pre_img.width}x{pre_img.height}")
                     # Perform the click action
                     if action == "left_click":
                         self.logger.info(f"Clicking at ({x}, {y})")
@@ -242,45 +222,14 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
                     # Wait briefly for any UI changes
                     await asyncio.sleep(0.5)
-                    # Take and save post-action screenshot
-                    post_screenshot = await self.computer.interface.screenshot()
-                    post_img = Image.open(io.BytesIO(post_screenshot))
-                    # Scale post-action image if needed
-                    if post_img.size != (self.width, self.height):
-                        self.logger.info(
-                            f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
-                        )
-                        post_img = post_img.resize(
-                            (self.width, self.height), Image.Resampling.LANCZOS
-                        )
-                        buffer = io.BytesIO()
-                        post_img.save(buffer, format="PNG")
-                        post_screenshot = buffer.getvalue()
                     return ToolResult(
                         output=f"Performed {action} at ({x}, {y})",
-                        base64_image=base64.b64encode(post_screenshot).decode(),
                     )
                 except Exception as e:
                     self.logger.error(f"Error during {action} action: {str(e)}")
                     raise ToolError(f"Failed to perform {action}: {str(e)}")
             else:
                 try:
-                    # Take pre-action screenshot
-                    pre_screenshot = await self.computer.interface.screenshot()
-                    pre_img = Image.open(io.BytesIO(pre_screenshot))
-                    # Scale image if needed
-                    if pre_img.size != (self.width, self.height):
-                        self.logger.info(
-                            f"Scaling image from {pre_img.size} to {self.width}x{self.height}"
-                        )
-                        if not isinstance(self.width, int) or not isinstance(self.height, int):
-                            raise ToolError("Screen dimensions must be integers")
-                        size = (int(self.width), int(self.height))
-                        pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
                     # Perform the click action
                     if action == "left_click":
                         self.logger.info("Performing left click at current position")
@@ -295,25 +244,8 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
                     # Wait briefly for any UI changes
                     await asyncio.sleep(0.5)
-                    # Take post-action screenshot
-                    post_screenshot = await self.computer.interface.screenshot()
-                    post_img = Image.open(io.BytesIO(post_screenshot))
-                    # Scale post-action image if needed
-                    if post_img.size != (self.width, self.height):
-                        self.logger.info(
-                            f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
-                        )
-                        post_img = post_img.resize(
-                            (self.width, self.height), Image.Resampling.LANCZOS
-                        )
-                        buffer = io.BytesIO()
-                        post_img.save(buffer, format="PNG")
-                        post_screenshot = buffer.getvalue()
                     return ToolResult(
                         output=f"Performed {action} at current position",
-                        base64_image=base64.b64encode(post_screenshot).decode(),
                     )
                 except Exception as e:
                     self.logger.error(f"Error during {action} action: {str(e)}")
@@ -328,20 +260,6 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
                 raise ToolError(f"{text} must be a string")
             try:
-                # Take pre-action screenshot
-                pre_screenshot = await self.computer.interface.screenshot()
-                pre_img = Image.open(io.BytesIO(pre_screenshot))
-                # Scale image if needed
-                if pre_img.size != (self.width, self.height):
-                    self.logger.info(
-                        f"Scaling image from {pre_img.size} to {self.width}x{self.height}"
-                    )
-                    if not isinstance(self.width, int) or not isinstance(self.height, int):
-                        raise ToolError("Screen dimensions must be integers")
-                    size = (int(self.width), int(self.height))
-                    pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
                 if action == "key":
                     # Special handling for page up/down on macOS
                     if text.lower() in ["pagedown", "page_down", "page down"]:
@@ -378,25 +296,8 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
                     # Wait briefly for UI changes
                     await asyncio.sleep(0.5)
-                    # Take post-action screenshot
-                    post_screenshot = await self.computer.interface.screenshot()
-                    post_img = Image.open(io.BytesIO(post_screenshot))
-                    # Scale post-action image if needed
-                    if post_img.size != (self.width, self.height):
-                        self.logger.info(
-                            f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
-                        )
-                        post_img = post_img.resize(
-                            (self.width, self.height), Image.Resampling.LANCZOS
-                        )
-                        buffer = io.BytesIO()
-                        post_img.save(buffer, format="PNG")
-                        post_screenshot = buffer.getvalue()
                     return ToolResult(
                         output=f"Pressed key: {output_text}",
-                        base64_image=base64.b64encode(post_screenshot).decode(),
                     )
                 elif action == "type":
@@ -406,66 +307,13 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
                     # Wait briefly for UI changes
                     await asyncio.sleep(0.5)
-                    # Take post-action screenshot
-                    post_screenshot = await self.computer.interface.screenshot()
-                    post_img = Image.open(io.BytesIO(post_screenshot))
-                    # Scale post-action image if needed
-                    if post_img.size != (self.width, self.height):
-                        self.logger.info(
-                            f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
-                        )
-                        post_img = post_img.resize(
-                            (self.width, self.height), Image.Resampling.LANCZOS
-                        )
-                        buffer = io.BytesIO()
-                        post_img.save(buffer, format="PNG")
-                        post_screenshot = buffer.getvalue()
                     return ToolResult(
                         output=f"Typed text: {text}",
-                        base64_image=base64.b64encode(post_screenshot).decode(),
                     )
             except Exception as e:
                 self.logger.error(f"Error during {action} action: {str(e)}")
                 raise ToolError(f"Failed to perform {action}: {str(e)}")
-        elif action in ("screenshot", "cursor_position"):
-            if text is not None:
-                raise ToolError(f"text is not accepted for {action}")
-            if coordinate is not None:
-                raise ToolError(f"coordinate is not accepted for {action}")
-            try:
-                if action == "screenshot":
-                    # Take screenshot
-                    screenshot = await self.computer.interface.screenshot()
-                    img = Image.open(io.BytesIO(screenshot))
-                    # Scale image if needed
-                    if img.size != (self.width, self.height):
-                        self.logger.info(
-                            f"Scaling image from {img.size} to {self.width}x{self.height}"
-                        )
-                        if not isinstance(self.width, int) or not isinstance(self.height, int):
-                            raise ToolError("Screen dimensions must be integers")
-                        size = (int(self.width), int(self.height))
-                        img = img.resize(size, Image.Resampling.LANCZOS)
-                        buffer = io.BytesIO()
-                        img.save(buffer, format="PNG")
-                        screenshot = buffer.getvalue()
-                    return ToolResult(base64_image=base64.b64encode(screenshot).decode())
-                elif action == "cursor_position":
-                    pos = await self.computer.interface.get_cursor_position()
-                    x, y = pos  # Unpack the tuple
-                    return ToolResult(output=f"X={int(x)},Y={int(y)}")
-            except Exception as e:
-                self.logger.error(f"Error during {action} action: {str(e)}")
-                raise ToolError(f"Failed to perform {action}: {str(e)}")
         elif action == "scroll":
             # Implement scroll action
             direction = kwargs.get("direction", "down")
@@ -487,28 +335,20 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
                 # Wait briefly for UI changes
                 await asyncio.sleep(0.5)
-                # Take post-action screenshot
-                post_screenshot = await self.computer.interface.screenshot()
-                post_img = Image.open(io.BytesIO(post_screenshot))
-                # Scale post-action image if needed
-                if post_img.size != (self.width, self.height):
-                    self.logger.info(
-                        f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
-                    )
-                    post_img = post_img.resize((self.width, self.height), Image.Resampling.LANCZOS)
-                    buffer = io.BytesIO()
-                    post_img.save(buffer, format="PNG")
-                    post_screenshot = buffer.getvalue()
                 return ToolResult(
                     output=f"Scrolled {direction} by {amount} steps",
-                    base64_image=base64.b64encode(post_screenshot).decode(),
                 )
             except Exception as e:
                 self.logger.error(f"Error during scroll action: {str(e)}")
                 raise ToolError(f"Failed to perform scroll: {str(e)}")
+        elif action == "screenshot":
+            # Take screenshot
+            return await self.screenshot()
+        elif action == "cursor_position":
+            pos = await self.computer.interface.get_cursor_position()
+            x, y = pos  # Unpack the tuple
+            return ToolResult(output=f"X={int(x)},Y={int(y)}")
         raise ToolError(f"Invalid action: {action}")
     async def screenshot(self):

{cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/anthropic/utils.py RENAMED Viewed

@@ -4,7 +4,6 @@ import logging
 import re
 from typing import Any, Dict, List, Optional, Tuple, cast
 from anthropic.types.beta import BetaMessage
-from ..omni.parser import ParseResult
 from ...core.types import AgentResponse
 from datetime import datetime
@@ -188,7 +187,7 @@ def from_anthropic_format(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]
 async def to_agent_response_format(
     response: BetaMessage,
     messages: List[Dict[str, Any]],
-    parsed_screen: Optional[ParseResult] = None,
+    parsed_screen: Optional[dict] = None,
     parser: Optional[Any] = None,
     model: Optional[str] = None,
 ) -> AgentResponse:

{cua_agent-0.2.9 → cua_agent-0.2.11}/agent/providers/openai/tools/computer.py RENAMED Viewed

@@ -61,9 +61,6 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
     computer: Computer  # The CUA Computer instance
     logger = logging.getLogger(__name__)
-    _screenshot_delay = 1.0  # macOS is generally faster than X11
-    _scaling_enabled = True
     def __init__(self, computer: Computer):
         """Initialize the computer tool.
@@ -185,26 +182,23 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
             raise ToolError(f"Failed to execute {type}: {str(e)}")
     async def handle_click(self, button: str, x: int, y: int) -> ToolResult:
-        """Handle different click actions."""
+        """Handle mouse clicks."""
         try:
-            # Perform requested click action
+            # Perform the click based on button type
             if button == "left":
                 await self.computer.interface.left_click(x, y)
             elif button == "right":
                 await self.computer.interface.right_click(x, y)
             elif button == "double":
                 await self.computer.interface.double_click(x, y)
+            else:
+                raise ToolError(f"Unsupported button type: {button}")
-            # Wait for UI to update
-            await asyncio.sleep(0.5)
-            # Take screenshot after action
-            screenshot = await self.computer.interface.screenshot()
-            base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
+            # Wait briefly for UI to update
+            await asyncio.sleep(0.3)
             return ToolResult(
                 output=f"Performed {button} click at ({x}, {y})",
-                base64_image=base64_screenshot,
             )
         except Exception as e:
             self.logger.error(f"Error in handle_click: {str(e)}")
@@ -218,11 +212,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
             await asyncio.sleep(0.3)
-            # Take screenshot after typing
-            screenshot = await self.computer.interface.screenshot()
-            base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
-            return ToolResult(output=f"Typed: {text}", base64_image=base64_screenshot)
+            return ToolResult(output=f"Typed: {text}")
         except Exception as e:
             self.logger.error(f"Error in handle_typing: {str(e)}")
             raise ToolError(f"Failed to type '{text}': {str(e)}")
@@ -254,11 +244,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
             # Wait briefly
             await asyncio.sleep(0.3)
-            # Take screenshot after action
-            screenshot = await self.computer.interface.screenshot()
-            base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
-            return ToolResult(output=f"Pressed key: {key}", base64_image=base64_screenshot)
+            return ToolResult(output=f"Pressed key: {key}")
         except Exception as e:
             self.logger.error(f"Error in handle_key: {str(e)}")
             raise ToolError(f"Failed to press key '{key}': {str(e)}")
@@ -272,11 +258,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
             # Wait briefly
             await asyncio.sleep(0.2)
-            # Take screenshot after action
-            screenshot = await self.computer.interface.screenshot()
-            base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
-            return ToolResult(output=f"Moved cursor to ({x}, {y})", base64_image=base64_screenshot)
+            return ToolResult(output=f"Moved cursor to ({x}, {y})")
         except Exception as e:
             self.logger.error(f"Error in handle_mouse_move: {str(e)}")
             raise ToolError(f"Failed to move cursor to ({x}, {y}): {str(e)}")
@@ -296,14 +278,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
             # Wait for UI to update
             await asyncio.sleep(0.5)
-            # Take screenshot after action
-            screenshot = await self.computer.interface.screenshot()
-            base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
-            return ToolResult(
-                output=f"Scrolled at ({x}, {y}) with delta ({scroll_x}, {scroll_y})",
-                base64_image=base64_screenshot,
-            )
+            return ToolResult(output=f"Scrolled at ({x}, {y}) by ({scroll_x}, {scroll_y})")
         except Exception as e:
             self.logger.error(f"Error in handle_scroll: {str(e)}")
             raise ToolError(f"Failed to scroll at ({x}, {y}): {str(e)}")
@@ -331,13 +306,8 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
             # Wait for UI to update
             await asyncio.sleep(0.5)
-            # Take screenshot after action
-            screenshot = await self.computer.interface.screenshot()
-            base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
             return ToolResult(
                 output=f"Dragged from ({path[0]['x']}, {path[0]['y']}) to ({path[-1]['x']}, {path[-1]['y']})",
-                base64_image=base64_screenshot,
             )
         except Exception as e:
             self.logger.error(f"Error in handle_drag: {str(e)}")

cua_agent-0.2.11/agent/ui/__main__.py ADDED Viewed

@@ -0,0 +1,15 @@
+"""
+Main entry point for agent.ui module.
+This allows running the agent UI with:
+    python -m agent.ui
+Instead of:
+    python -m agent.ui.gradio.app
+"""
+from .gradio.app import create_gradio_ui
+if __name__ == "__main__":
+    app = create_gradio_ui()
+    app.launch(share=False, inbrowser=True)

{cua_agent-0.2.9 → cua_agent-0.2.11}/agent/ui/gradio/app.py RENAMED Viewed

@@ -41,7 +41,6 @@ from typing import cast
 # Import from agent package
 from agent.core.types import AgentResponse
 from agent.core.callbacks import DefaultCallbackHandler
-from agent.providers.omni.parser import ParseResult
 from computer import Computer
 from agent import ComputerAgent, AgentLoop, LLM, LLMProvider
@@ -103,7 +102,7 @@ class GradioChatScreenshotHandler(DefaultCallbackHandler):
         self,
         screenshot_base64: str,
         action_type: str = "",
-        parsed_screen: Optional[ParseResult] = None,
+        parsed_screen: Optional[dict] = None,
     ) -> None:
         """Add screenshot to chatbot when a screenshot is taken and update the annotated image.
@@ -138,6 +137,7 @@ MODEL_MAPPINGS = {
     "openai": {
         # Default to operator CUA model
         "default": "computer-use-preview",
+        "OpenAI: Computer-Use Preview": "computer-use-preview",
         # Map standard OpenAI model names to CUA-specific model names
         "gpt-4-turbo": "computer-use-preview",
         "gpt-4o": "computer-use-preview",
@@ -148,9 +148,17 @@ MODEL_MAPPINGS = {
     "anthropic": {
         # Default to newest model
         "default": "claude-3-7-sonnet-20250219",
+        # New Claude 4 models
+        "Anthropic: Claude 4 Opus (20250514)": "claude-opus-4-20250514",
+        "Anthropic: Claude 4 Sonnet (20250514)": "claude-sonnet-4-20250514",
+        "claude-opus-4-20250514": "claude-opus-4-20250514",
+        "claude-sonnet-4-20250514": "claude-sonnet-4-20250514",
         # Specific Claude models for CUA
-        "claude-3-5-sonnet-20240620": "claude-3-5-sonnet-20240620",
+        "Anthropic: Claude 3.7 Sonnet (20250219)": "claude-3-5-sonnet-20240620",
+        "Anthropic: Claude 3.5 Sonnet (20240620)": "claude-3-7-sonnet-20250219",
         "claude-3-7-sonnet-20250219": "claude-3-7-sonnet-20250219",
+        "claude-3-5-sonnet-20240620": "claude-3-5-sonnet-20240620",
         # Map standard model names to CUA-specific model names
         "claude-3-opus": "claude-3-7-sonnet-20250219",
         "claude-3-sonnet": "claude-3-5-sonnet-20240620",
@@ -210,12 +218,12 @@ def get_provider_and_model(model_name: str, loop_provider: str) -> tuple:
     if agent_loop == AgentLoop.OPENAI:
         provider = LLMProvider.OPENAI
         model_name_to_use = MODEL_MAPPINGS["openai"].get(
-            model_name.lower(), MODEL_MAPPINGS["openai"]["default"]
+            model_name, MODEL_MAPPINGS["openai"]["default"]
         )
     elif agent_loop == AgentLoop.ANTHROPIC:
         provider = LLMProvider.ANTHROPIC
         model_name_to_use = MODEL_MAPPINGS["anthropic"].get(
-            model_name.lower(), MODEL_MAPPINGS["anthropic"]["default"]
+            model_name, MODEL_MAPPINGS["anthropic"]["default"]
         )
     elif agent_loop == AgentLoop.OMNI:
         # Determine provider and clean model name based on the full string from UI
@@ -235,33 +243,11 @@ def get_provider_and_model(model_name: str, loop_provider: str) -> tuple:
             cleaned_model_name = model_name.split("OMNI: Ollama ", 1)[1]
         elif model_name.startswith("OMNI: Claude "):
             provider = LLMProvider.ANTHROPIC
-            # Extract the canonical model name based on the UI string
-            # e.g., "OMNI: Claude 3.7 Sonnet (20250219)" -> "3.7 Sonnet" and "20250219"
-            parts = model_name.split(" (")
-            model_key_part = parts[0].replace("OMNI: Claude ", "")
-            date_part = parts[1].replace(")", "") if len(parts) > 1 else ""
-            # Normalize the extracted key part for comparison
-            # "3.7 Sonnet" -> "37sonnet"
-            model_key_part_norm = model_key_part.lower().replace(".", "").replace(" ", "")
-            cleaned_model_name = MODEL_MAPPINGS["omni"]["default"]  # Default if not found
-            # Find the canonical name in the main Anthropic map
-            for key_anthropic, val_anthropic in MODEL_MAPPINGS["anthropic"].items():
-                # Normalize the canonical key for comparison
-                # "claude-3-7-sonnet-20250219" -> "claude37sonnet20250219"
-                key_anthropic_norm = key_anthropic.lower().replace("-", "")
-                # Check if the normalized canonical key starts with "claude" + normalized extracted part
-                # AND contains the date part.
-                if (
-                    key_anthropic_norm.startswith("claude" + model_key_part_norm)
-                    and date_part in key_anthropic_norm
-                ):
-                    cleaned_model_name = (
-                        val_anthropic  # Use the canonical name like "claude-3-7-sonnet-20250219"
-                    )
-                    break
+            model_name = model_name.replace("OMNI: ", "Anthropic: ")
+            cleaned_model_name = MODEL_MAPPINGS["anthropic"].get(
+                model_name, MODEL_MAPPINGS["anthropic"]["default"]
+            )
         elif model_name.startswith("OMNI: OpenAI "):
             provider = LLMProvider.OPENAI
             # Extract the model part, e.g., "GPT-4o mini"
@@ -310,6 +296,8 @@ def get_provider_and_model(model_name: str, loop_provider: str) -> tuple:
         model_name_to_use = MODEL_MAPPINGS["openai"]["default"]
         agent_loop = AgentLoop.OPENAI
+    print(f"Mapping {model_name} and {loop_provider} to {provider}, {model_name_to_use}, {agent_loop}")
     return provider, model_name_to_use, agent_loop
@@ -454,6 +442,9 @@ def create_gradio_ui(
     # Always show models regardless of API key availability
     openai_models = ["OpenAI: Computer-Use Preview"]
     anthropic_models = [
+        "Anthropic: Claude 4 Opus (20250514)",
+        "Anthropic: Claude 4 Sonnet (20250514)",
         "Anthropic: Claude 3.7 Sonnet (20250219)",
         "Anthropic: Claude 3.5 Sonnet (20240620)",
     ]
@@ -461,6 +452,8 @@ def create_gradio_ui(
         "OMNI: OpenAI GPT-4o",
         "OMNI: OpenAI GPT-4o mini",
         "OMNI: OpenAI GPT-4.5-preview",
+        "OMNI: Claude 4 Opus (20250514)",
+        "OMNI: Claude 4 Sonnet (20250514)",
         "OMNI: Claude 3.7 Sonnet (20250219)",
         "OMNI: Claude 3.5 Sonnet (20240620)"
     ]
@@ -730,20 +723,25 @@ if __name__ == "__main__":
                 with gr.Accordion("Computer Configuration", open=True):
                     # Computer configuration options
                     computer_os = gr.Radio(
-                        choices=["macos", "linux"],
+                        choices=["macos", "linux", "windows"],
                         label="Operating System",
                         value="macos",
                         info="Select the operating system for the computer",
                     )
-                    # Detect if current device is MacOS
+                    is_windows = platform.system().lower() == "windows"
                     is_mac = platform.system().lower() == "darwin"
+                    providers = ["cloud"]
+                    if is_mac:
+                        providers += ["lume"]
+                    elif is_windows:
+                        providers += ["winsandbox"]
                     computer_provider = gr.Radio(
-                        choices=["cloud", "lume"],
+                        choices=providers,
                         label="Provider",
                         value="lume" if is_mac else "cloud",
-                        visible=is_mac,
                         info="Select the computer provider",
                     )

{cua_agent-0.2.9 → cua_agent-0.2.11}/pyproject.toml RENAMED Viewed

@@ -6,7 +6,7 @@ build-backend = "pdm.backend"
 [project]
 name = "cua-agent"
-version = "0.2.9"
+version = "0.2.11"
 description = "CUA (Computer Use) Agent for AI-driven computer interaction"
 readme = "README.md"
 authors = [
@@ -109,7 +109,7 @@ target-version = [
 [tool.ruff]
 line-length = 100
-target-version = "0.2.9"
+target-version = "0.2.11"
 select = [
     "E",
     "F",
@@ -123,7 +123,7 @@ docstring-code-format = true
 [tool.mypy]
 strict = true
-python_version = "0.2.9"
+python_version = "0.2.11"
 ignore_missing_imports = true
 disallow_untyped_defs = true
 check_untyped_defs = true