PyPI - cua-agent - Versions diffs - 0.1.0__py3-none-any.whl - Mend

cua-agent 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cua-agent might be problematic. Click here for more details.

Files changed (65) hide show

agent/README.md +63 -0
agent/__init__.py +10 -0
agent/core/README.md +101 -0
agent/core/__init__.py +34 -0
agent/core/agent.py +284 -0
agent/core/base_agent.py +164 -0
agent/core/callbacks.py +147 -0
agent/core/computer_agent.py +69 -0
agent/core/experiment.py +222 -0
agent/core/factory.py +102 -0
agent/core/loop.py +244 -0
agent/core/messages.py +230 -0
agent/core/tools/__init__.py +21 -0
agent/core/tools/base.py +74 -0
agent/core/tools/bash.py +52 -0
agent/core/tools/collection.py +46 -0
agent/core/tools/computer.py +113 -0
agent/core/tools/edit.py +67 -0
agent/core/tools/manager.py +56 -0
agent/providers/__init__.py +4 -0
agent/providers/anthropic/__init__.py +6 -0
agent/providers/anthropic/api/client.py +222 -0
agent/providers/anthropic/api/logging.py +150 -0
agent/providers/anthropic/callbacks/manager.py +55 -0
agent/providers/anthropic/loop.py +521 -0
agent/providers/anthropic/messages/manager.py +110 -0
agent/providers/anthropic/prompts.py +20 -0
agent/providers/anthropic/tools/__init__.py +33 -0
agent/providers/anthropic/tools/base.py +88 -0
agent/providers/anthropic/tools/bash.py +163 -0
agent/providers/anthropic/tools/collection.py +34 -0
agent/providers/anthropic/tools/computer.py +550 -0
agent/providers/anthropic/tools/edit.py +326 -0
agent/providers/anthropic/tools/manager.py +54 -0
agent/providers/anthropic/tools/run.py +42 -0
agent/providers/anthropic/types.py +16 -0
agent/providers/omni/__init__.py +27 -0
agent/providers/omni/callbacks.py +78 -0
agent/providers/omni/clients/anthropic.py +99 -0
agent/providers/omni/clients/base.py +44 -0
agent/providers/omni/clients/groq.py +101 -0
agent/providers/omni/clients/openai.py +159 -0
agent/providers/omni/clients/utils.py +25 -0
agent/providers/omni/experiment.py +273 -0
agent/providers/omni/image_utils.py +106 -0
agent/providers/omni/loop.py +961 -0
agent/providers/omni/messages.py +168 -0
agent/providers/omni/parser.py +252 -0
agent/providers/omni/prompts.py +78 -0
agent/providers/omni/tool_manager.py +91 -0
agent/providers/omni/tools/__init__.py +13 -0
agent/providers/omni/tools/bash.py +69 -0
agent/providers/omni/tools/computer.py +216 -0
agent/providers/omni/tools/manager.py +83 -0
agent/providers/omni/types.py +30 -0
agent/providers/omni/utils.py +155 -0
agent/providers/omni/visualization.py +130 -0
agent/types/__init__.py +26 -0
agent/types/base.py +52 -0
agent/types/messages.py +36 -0
agent/types/tools.py +32 -0
cua_agent-0.1.0.dist-info/METADATA +44 -0
cua_agent-0.1.0.dist-info/RECORD +65 -0
cua_agent-0.1.0.dist-info/WHEEL +4 -0
cua_agent-0.1.0.dist-info/entry_points.txt +4 -0

agent/providers/omni/clients/groq.py ADDED Viewed

@@ -0,0 +1,101 @@
+"""Groq client implementation."""
+import os
+import logging
+from typing import Dict, List, Optional, Any, Tuple
+from groq import Groq
+import re
+from .utils import is_image_path
+from .base import BaseOmniClient
+logger = logging.getLogger(__name__)
+class GroqClient(BaseOmniClient):
+    """Client for making Groq API calls.
+    Only text is forwarded: string parts that look like image paths are dropped
+    and every message is sent with the "user" role.
+    """
+    def __init__(
+        self,
+        api_key: Optional[str] = None,
+        model: str = "deepseek-r1-distill-llama-70b",
+        max_tokens: int = 4096,
+        temperature: float = 0.6,
+    ):
+        """Initialize Groq client.
+        Args:
+            api_key: Groq API key (if not provided, falls back to the GROQ_API_KEY env var)
+            model: Model name to use
+            max_tokens: Default maximum tokens to generate
+            temperature: Temperature for sampling
+        Raises:
+            ValueError: If no key is passed and GROQ_API_KEY is unset or empty
+        """
+        super().__init__(api_key=api_key, model=model)
+        self.api_key = api_key or os.getenv("GROQ_API_KEY")
+        if not self.api_key:
+            raise ValueError("No Groq API key provided")
+        self.max_tokens = max_tokens
+        self.temperature = temperature
+        self.client = Groq(api_key=self.api_key)
+        self.model: str = model  # Add explicit type annotation
+    @staticmethod
+    def _text_parts(content: Any) -> List[str]:
+        """Collect the text pieces of one message's content, skipping image paths.
+        Args:
+            content: A string, an iterable of parts, or None
+        Returns:
+            List of text pieces; non-string parts are converted with str()
+        """
+        if content is None:
+            return []
+        if isinstance(content, str):
+            # A bare string is one piece of text; iterating it directly would
+            # split it into single characters.
+            content = [content]
+        text_parts = []
+        for part in content:
+            if isinstance(part, str):
+                if not is_image_path(part):  # Skip image paths
+                    text_parts.append(part)
+            else:
+                # NOTE(review): non-string parts (e.g. dicts) are stringified as-is;
+                # confirm callers never pass inline image payloads here.
+                text_parts.append(str(part))
+        return text_parts
+    @staticmethod
+    def _strip_reasoning(response: str) -> str:
+        """Drop everything up to the last </think> tag and remove <output> tags."""
+        if "</think>" in response:
+            response = response.rpartition("</think>")[2]
+            # The tag is normally followed by a newline; drop that one newline only.
+            if response.startswith("\n"):
+                response = response[1:]
+        return response.replace("<output>", "").replace("</output>", "")
+    def run_interleaved(
+        self, messages: List[Dict[str, Any]], system: str, max_tokens: Optional[int] = None
+    ) -> tuple[str, int]:
+        """Run interleaved chat completion.
+        Args:
+            messages: List of message dicts (plain strings and a single string are also accepted)
+            system: System prompt
+            max_tokens: Optional max tokens override
+        Returns:
+            Tuple of (response text, token usage)
+        Raises:
+            Exception: Any error from the Groq call is logged and re-raised
+        """
+        # Avoid using system messages for R1
+        final_messages = [{"role": "user", "content": system}]
+        # Process messages
+        if isinstance(messages, str):
+            final_messages.append({"role": "user", "content": messages})
+        elif isinstance(messages, list):
+            for item in messages:
+                if isinstance(item, dict):
+                    # For dict items, concatenate all text content, ignoring images
+                    text_parts = self._text_parts(item.get("content"))
+                    if text_parts:  # Only add if there's text content
+                        final_messages.append({"role": "user", "content": " ".join(text_parts)})
+                else:  # str
+                    final_messages.append({"role": "user", "content": item})
+        try:
+            completion = self.client.chat.completions.create(  # type: ignore
+                model=self.model,
+                messages=final_messages,  # type: ignore
+                temperature=self.temperature,
+                max_tokens=max_tokens or self.max_tokens,
+                top_p=0.95,
+                stream=False,
+            )
+            # The content field may be None; treat that as an empty answer.
+            response = completion.choices[0].message.content or ""
+            final_answer = self._strip_reasoning(response)
+            token_usage = completion.usage.total_tokens if completion.usage else 0
+            return final_answer, token_usage
+        except Exception as e:
+            logger.error(f"Error in Groq API call: {e}")
+            raise

agent/providers/omni/clients/openai.py ADDED Viewed

@@ -0,0 +1,159 @@
+"""OpenAI client implementation."""
+import os
+import logging
+from typing import Dict, List, Optional, Any
+import aiohttp
+import base64
+import re
+import json
+import ssl
+import certifi
+from datetime import datetime
+from .base import BaseOmniClient
+logger = logging.getLogger(__name__)
+# OpenAI specific client for the OmniLoop
+class OpenAIClient(BaseOmniClient):
+    """OpenAI vision API client implementation."""
+    def __init__(
+        self,
+        api_key: Optional[str] = None,
+        model: str = "gpt-4o",
+        provider_base_url: str = "https://api.openai.com/v1",
+        max_tokens: int = 4096,
+        temperature: float = 0.0,
+    ):
+        """Initialize the OpenAI client.
+        Args:
+            api_key: OpenAI API key (falls back to the OPENAI_API_KEY env var)
+            model: Model to use
+            provider_base_url: API endpoint
+            max_tokens: Maximum tokens to generate
+            temperature: Generation temperature
+        Raises:
+            ValueError: If no key is passed and OPENAI_API_KEY is unset or empty
+        """
+        super().__init__(api_key=api_key, model=model)
+        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
+        if not self.api_key:
+            raise ValueError("No OpenAI API key provided")
+        self.model = model
+        self.provider_base_url = provider_base_url
+        self.max_tokens = max_tokens
+        self.temperature = temperature
+    def _extract_base64_image(self, text: str) -> Optional[str]:
+        """Extract the base64 payload of the first image data URL found in text.
+        Returns:
+            The text after "base64," up to the next double quote, or None if no data URL is present
+        """
+        pattern = r'data:image/[^;]+;base64,([^"]+)'
+        match = re.search(pattern, text)
+        return match.group(1) if match else None
+    def _get_loggable_messages(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Create a loggable version of messages with image data truncated.
+        Parts typed "image" or "image_url" are replaced by a placeholder; every
+        other part, and any message whose content is not a list, is passed
+        through unchanged.
+        """
+        loggable_messages = []
+        for msg in messages:
+            content = msg.get("content")
+            if not isinstance(content, list):
+                loggable_messages.append(msg)
+                continue
+            new_content = []
+            for part in content:
+                # Content lists may hold plain strings; only dict parts carry a type.
+                part_type = part.get("type") if isinstance(part, dict) else None
+                if part_type in ("image", "image_url"):
+                    new_content.append(
+                        {"type": part_type, "image_url": {"url": "[BASE64_IMAGE_DATA]"}}
+                    )
+                else:
+                    new_content.append(part)
+            loggable_messages.append({"role": msg["role"], "content": new_content})
+        return loggable_messages
+    def _build_message(self, role: str, text: str) -> Dict[str, Any]:
+        """Wrap a string as a single-part message: image_url if it holds a data URL, else text."""
+        base64_img = self._extract_base64_image(text)
+        if base64_img:
+            # NOTE(review): the payload is always relabelled image/jpeg regardless
+            # of the MIME type in the source data URL.
+            part = {
+                "type": "image_url",
+                "image_url": {"url": f"data:image/jpeg;base64,{base64_img}"},
+            }
+        else:
+            part = {"type": "text", "text": text}
+        return {"role": role, "content": [part]}
+    async def run_interleaved(
+        self, messages: List[Dict[str, Any]], system: str, max_tokens: Optional[int] = None
+    ) -> Dict[str, Any]:
+        """Run interleaved chat completion.
+        Args:
+            messages: List of message dicts (bare strings are sent as user messages)
+            system: System prompt
+            max_tokens: Optional max tokens override
+        Returns:
+            Response dict (the decoded JSON body of the chat/completions call)
+        Raises:
+            Exception: On a non-200 status, or any transport/decoding error (logged, then re-raised)
+        """
+        headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}
+        final_messages = [{"role": "system", "content": system}]
+        # Process messages
+        for item in messages:
+            if not isinstance(item, dict):
+                # String content, check for image
+                final_messages.append(self._build_message("user", item))
+            elif isinstance(item["content"], list):
+                # Content is already in the correct format
+                final_messages.append(item)
+            else:
+                # Single string content, check for image
+                final_messages.append(self._build_message(item["role"], item["content"]))
+        payload = {"model": self.model, "messages": final_messages, "temperature": self.temperature}
+        # NOTE(review): temperature is sent for every model, including the o1/o3-mini
+        # branch below; confirm those models accept it.
+        if "o1" in self.model or "o3-mini" in self.model:
+            payload["reasoning_effort"] = "low"
+            payload["max_completion_tokens"] = max_tokens or self.max_tokens
+        else:
+            payload["max_tokens"] = max_tokens or self.max_tokens
+        try:
+            async with aiohttp.ClientSession() as session:
+                async with session.post(
+                    f"{self.provider_base_url}/chat/completions", headers=headers, json=payload
+                ) as response:
+                    status = response.status
+                    response_json = await response.json()
+        except Exception as e:
+            logger.error(f"Error in OpenAI API call: {str(e)}")
+            raise
+        # The status check sits outside the try block so an API error is logged once.
+        if status != 200:
+            error = response_json.get("error") if isinstance(response_json, dict) else None
+            if isinstance(error, dict):
+                error_msg = error.get("message", str(response_json))
+            else:
+                error_msg = str(response_json)
+            logger.error(f"Error in OpenAI API call: {error_msg}")
+            raise Exception(f"OpenAI API error: {error_msg}")
+        return response_json

agent/providers/omni/clients/utils.py ADDED Viewed

@@ -0,0 +1,25 @@
+import base64
+def is_image_path(text: str) -> bool:
+    """Check if a text string is an image file path.
+    The comparison ignores case, so "shot.PNG" is recognised as well as "shot.png".
+    Args:
+        text: Text string to check
+    Returns:
+        True if text ends with image extension, False otherwise
+    """
+    image_extensions = (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif")
+    return text.lower().endswith(image_extensions)
+def encode_image(image_path: str) -> str:
+    """Read an image file and return its contents as base64 text.
+    Args:
+        image_path: Path to image file
+    Returns:
+        Base64 encoded image string
+    """
+    with open(image_path, "rb") as image_file:
+        raw_bytes = image_file.read()
+    encoded = base64.b64encode(raw_bytes)
+    return encoded.decode("utf-8")

agent/providers/omni/experiment.py ADDED Viewed

@@ -0,0 +1,273 @@
+"""Experiment management for the Cua provider."""
+import os
+import logging
+import copy
+import base64
+from io import BytesIO
+from datetime import datetime
+from typing import Any, Dict, List, Optional
+from PIL import Image
+import json
+import time
+logger = logging.getLogger(__name__)
+class ExperimentManager:
+    """Manages experiment directories and logging for the agent.
+    When no base_dir is given, no directory is created and every save/log
+    method returns without writing anything.
+    """
+    def __init__(
+        self,
+        base_dir: Optional[str] = None,
+        only_n_most_recent_images: Optional[int] = None,
+    ):
+        """Initialize the experiment manager.
+        Args:
+            base_dir: Base directory for saving experiment data
+            only_n_most_recent_images: Maximum number of recent screenshots to include in API requests
+                (stored only; no method of this class reads it)
+        """
+        self.base_dir = base_dir
+        self.only_n_most_recent_images = only_n_most_recent_images
+        self.run_dir = None
+        self.current_turn_dir = None
+        self.turn_count = 0
+        self.screenshot_count = 0
+        # Track all screenshots for potential API request inclusion
+        self.screenshot_paths = []
+        # Set up experiment directories if base_dir is provided
+        if self.base_dir:
+            self.setup_experiment_dirs()
+    def setup_experiment_dirs(self) -> None:
+        """Setup the experiment directory structure and create the first turn directory."""
+        if not self.base_dir:
+            return
+        # Create base experiments directory if it doesn't exist
+        os.makedirs(self.base_dir, exist_ok=True)
+        # Use the base_dir directly as the run_dir
+        self.run_dir = self.base_dir
+        logger.info(f"Using directory for experiment: {self.run_dir}")
+        # Create first turn directory
+        self.create_turn_dir()
+    def create_turn_dir(self) -> None:
+        """Create a new directory (turn_NNN) for the current turn inside the run directory."""
+        if not self.run_dir:
+            return
+        self.turn_count += 1
+        self.current_turn_dir = os.path.join(self.run_dir, f"turn_{self.turn_count:03d}")
+        os.makedirs(self.current_turn_dir, exist_ok=True)
+        logger.info(f"Created turn directory: {self.current_turn_dir}")
+    @staticmethod
+    def _redact_image_part(part: Dict[str, Any]) -> None:
+        """Replace the base64 payload of one content part, in place, with a length placeholder."""
+        part_type = part.get("type")
+        # Handle Anthropic format
+        if part_type == "image" and isinstance(part.get("source"), dict):
+            source = part["source"]
+            if isinstance(source.get("data"), str):
+                source["data"] = f"[BASE64_IMAGE_DATA_LENGTH_{len(source['data'])}]"
+        # Handle OpenAI format
+        elif part_type == "image_url" and isinstance(part.get("image_url"), dict):
+            url_dict = part["image_url"]
+            url = url_dict.get("url")
+            if isinstance(url, str) and url.startswith("data:"):
+                url_dict["url"] = f"[BASE64_IMAGE_URL_LENGTH_{len(url)}]"
+    def _redact_in_place(self, node: Any) -> None:
+        """Walk nested dicts/lists and redact image parts of every "content" list, in place."""
+        if isinstance(node, dict):
+            content = node.get("content")
+            if isinstance(content, list):
+                for part in content:
+                    if isinstance(part, dict):
+                        self._redact_image_part(part)
+            for value in node.values():
+                self._redact_in_place(value)
+        elif isinstance(node, list):
+            for item in node:
+                self._redact_in_place(item)
+    def sanitize_log_data(self, data: Any) -> Any:
+        """Sanitize data for logging by removing large base64 strings.
+        Args:
+            data: Data to sanitize (dict, list, or primitive)
+        Returns:
+            Sanitized copy of the data; values that are neither dict nor list are returned as-is
+        """
+        if isinstance(data, dict):
+            # Copy once at the top, then redact the copy in place, so the
+            # caller's data is untouched and nested levels are not re-copied.
+            sanitized = copy.deepcopy(data)
+            self._redact_in_place(sanitized)
+            return sanitized
+        if isinstance(data, list):
+            return [self.sanitize_log_data(item) for item in data]
+        return data
+    def save_debug_image(self, image_data: str, filename: str) -> None:
+        """Save a debug image to the experiment directory (currently a no-op).
+        Args:
+            image_data: Base64 encoded image data
+            filename: Filename to save the image as
+        """
+        # Since we no longer want to use the images/ folder, we'll skip this functionality
+        return
+    def save_screenshot(self, img_base64: str, action_type: str = "") -> Optional[str]:
+        """Save a screenshot to the experiment directory.
+        Args:
+            img_base64: Base64 encoded screenshot
+            action_type: Type of action that triggered the screenshot
+        Returns:
+            Path of the written file, or None if there is no turn directory or saving failed
+        """
+        if not self.current_turn_dir:
+            return None
+        try:
+            # Decode first so invalid input does not consume a sequence number
+            img_data = base64.b64decode(img_base64)
+            next_index = self.screenshot_count + 1
+            # Create a descriptive filename
+            timestamp = int(time.time() * 1000)
+            action_suffix = f"_{action_type}" if action_type else ""
+            filename = f"screenshot_{next_index:03d}{action_suffix}_{timestamp}.png"
+            # Save directly to the turn directory (no screenshots subdirectory)
+            filepath = os.path.join(self.current_turn_dir, filename)
+            # Save the screenshot
+            with open(filepath, "wb") as f:
+                f.write(img_data)
+            # Commit the counter only after the file was written
+            self.screenshot_count = next_index
+            # Keep track of the file path for reference
+            self.screenshot_paths.append(filepath)
+            return filepath
+        except Exception as e:
+            logger.error(f"Error saving screenshot: {str(e)}")
+            return None
+    def should_save_debug_image(self) -> bool:
+        """Determine if debug images should be saved.
+        Returns:
+            Boolean indicating if debug images should be saved (always False)
+        """
+        # We no longer need to save debug images, so always return False
+        return False
+    def save_action_visualization(
+        self, img: Image.Image, action_name: str, details: str = ""
+    ) -> str:
+        """Save a visualization of an action.
+        Args:
+            img: Image to save
+            action_name: Name of the action
+            details: Additional details about the action
+        Returns:
+            Path to the saved image, or "" if there is no turn directory or saving failed
+        """
+        if not self.current_turn_dir:
+            return ""
+        try:
+            # Create a descriptive filename
+            timestamp = int(time.time() * 1000)
+            details_suffix = f"_{details}" if details else ""
+            filename = f"vis_{action_name}{details_suffix}_{timestamp}.png"
+            # Save directly to the turn directory (no visualizations subdirectory)
+            filepath = os.path.join(self.current_turn_dir, filename)
+            # Save the image
+            img.save(filepath)
+            # Keep track of the file path for cleanup
+            self.screenshot_paths.append(filepath)
+            return filepath
+        except Exception as e:
+            logger.error(f"Error saving action visualization: {str(e)}")
+            return ""
+    def extract_and_save_images(self, data: Any, prefix: str) -> None:
+        """Extract and save images from response data (currently a no-op).
+        Args:
+            data: Response data to extract images from
+            prefix: Prefix for saved image filenames
+        """
+        # Since we no longer want to save extracted images separately,
+        # we'll skip this functionality entirely
+        return
+    def log_api_call(
+        self,
+        call_type: str,
+        request: Any,
+        provider: str,
+        model: str,
+        response: Any = None,
+        error: Optional[Exception] = None,
+    ) -> None:
+        """Log API call details to a JSON file in the current turn directory.
+        Failures while writing are logged and swallowed so logging never breaks the caller.
+        Args:
+            call_type: Type of API call (e.g., 'request', 'response', 'error')
+            request: The API request data
+            provider: The AI provider used
+            model: The AI model used
+            response: Optional API response data
+            error: Optional error information
+        """
+        if not self.current_turn_dir:
+            return
+        try:
+            # Create a unique filename with timestamp
+            # NOTE(review): one-second resolution; two calls of the same type
+            # within a second in the same turn map to the same file name.
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            filename = f"api_call_{timestamp}_{call_type}.json"
+            filepath = os.path.join(self.current_turn_dir, filename)
+            # Sanitize data to remove large base64 strings
+            sanitized_request = self.sanitize_log_data(request)
+            sanitized_response = self.sanitize_log_data(response) if response is not None else None
+            # Prepare log data
+            log_data = {
+                "timestamp": timestamp,
+                "provider": provider,
+                "model": model,
+                "type": call_type,
+                "request": sanitized_request,
+            }
+            if sanitized_response is not None:
+                log_data["response"] = sanitized_response
+            if error is not None:
+                log_data["error"] = str(error)
+            # Write to file
+            with open(filepath, "w") as f:
+                json.dump(log_data, f, indent=2, default=str)
+            logger.info(f"Logged API {call_type} to {filepath}")
+        except Exception as e:
+            logger.error(f"Error logging API call: {str(e)}")

agent/providers/omni/image_utils.py ADDED Viewed

@@ -0,0 +1,106 @@
+"""Image processing utilities for the Cua provider."""
+import base64
+import logging
+import re
+from io import BytesIO
+from typing import Optional, Tuple
+from PIL import Image
+logger = logging.getLogger(__name__)
+def decode_base64_image(img_base64: str) -> Optional[Image.Image]:
+    """Turn base64 image text into a PIL Image.
+    Args:
+        img_base64: Base64 encoded image, may include data URL prefix
+    Returns:
+        PIL Image, or None (after logging the error) if anything fails
+    """
+    try:
+        payload = img_base64
+        # A data URL carries the base64 text after the first comma
+        if payload.startswith("data:image"):
+            payload = payload.split(",")[1]
+        raw_bytes = base64.b64decode(payload)
+        stream = BytesIO(raw_bytes)
+        return Image.open(stream)
+    except Exception as e:
+        logger.error(f"Error decoding base64 image: {str(e)}")
+        return None
+def encode_image_base64(img: Image.Image, format: str = "PNG") -> str:
+    """Serialize a PIL Image into base64 text.
+    Args:
+        img: PIL Image to encode
+        format: Image format (PNG, JPEG, etc.)
+    Returns:
+        Base64 encoded image string, or "" (after logging the error) if encoding fails
+    """
+    try:
+        stream = BytesIO()
+        img.save(stream, format=format)
+        raw_bytes = stream.getvalue()
+        encoded = base64.b64encode(raw_bytes)
+        return encoded.decode("utf-8")
+    except Exception as e:
+        logger.error(f"Error encoding image to base64: {str(e)}")
+        return ""
+def clean_base64_data(img_base64: str) -> str:
+    """Clean base64 image data by removing data URL prefix.
+    Args:
+        img_base64: Base64 encoded image, may include data URL prefix
+    Returns:
+        Clean base64 string without prefix; the input is returned unchanged when it
+        has no "data:image" prefix or the prefix is not followed by a comma
+    """
+    if img_base64.startswith("data:image"):
+        # partition never raises, unlike indexing the result of split(",")
+        _, comma, payload = img_base64.partition(",")
+        if comma:
+            # Keep only the segment between the first and second comma
+            return payload.split(",")[0]
+    return img_base64
+def extract_base64_from_text(text: str) -> Optional[str]:
+    """Find base64 image data embedded in a text string.
+    Args:
+        text: Text potentially containing base64 image data
+    Returns:
+        Base64 string or None if not found
+    """
+    candidate_patterns = (
+        # Data URL pattern takes priority
+        r"data:image/[^;]+;base64,([a-zA-Z0-9+/=]+)",
+        # Plain base64 pattern (basic heuristic: a run of 100+ base64 characters)
+        r"([a-zA-Z0-9+/=]{100,})",
+    )
+    for pattern in candidate_patterns:
+        found = re.search(pattern, text)
+        if found:
+            return found.group(1)
+    return None
+def get_image_dimensions(img_base64: str) -> Tuple[int, int]:
+    """Report the size of a base64 encoded image.
+    Args:
+        img_base64: Base64 encoded image
+    Returns:
+        Tuple of (width, height) or (0, 0) if decoding fails
+    """
+    decoded = decode_base64_image(img_base64)
+    # decode_base64_image yields None on failure
+    return decoded.size if decoded else (0, 0)