PyPI - cua-agent - Versions diffs - 0.3.1__py3-none-any.whl → 0.4.0b1__py3-none-any.whl - Mend

cua-agent 0.3.1__py3-none-any.whl → 0.4.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cua-agent might be problematic. Click here for more details.

Files changed (111) hide show

agent/__init__.py +15 -51
agent/__main__.py +21 -0
agent/adapters/__init__.py +9 -0
agent/adapters/huggingfacelocal_adapter.py +216 -0
agent/agent.py +577 -0
agent/callbacks/__init__.py +17 -0
agent/callbacks/base.py +153 -0
agent/callbacks/budget_manager.py +44 -0
agent/callbacks/image_retention.py +139 -0
agent/callbacks/logging.py +247 -0
agent/callbacks/pii_anonymization.py +259 -0
agent/callbacks/trajectory_saver.py +305 -0
agent/cli.py +290 -0
agent/computer_handler.py +107 -0
agent/decorators.py +90 -0
agent/loops/__init__.py +11 -0
agent/loops/anthropic.py +728 -0
agent/loops/omniparser.py +339 -0
agent/loops/openai.py +95 -0
agent/loops/uitars.py +688 -0
agent/responses.py +207 -0
agent/types.py +79 -0
agent/ui/__init__.py +7 -1
agent/ui/gradio/__init__.py +6 -19
agent/ui/gradio/app.py +80 -1299
agent/ui/gradio/ui_components.py +703 -0
cua_agent-0.4.0b1.dist-info/METADATA +424 -0
cua_agent-0.4.0b1.dist-info/RECORD +30 -0
{cua_agent-0.3.1.dist-info → cua_agent-0.4.0b1.dist-info}/WHEEL +1 -1
agent/core/__init__.py +0 -27
agent/core/agent.py +0 -210
agent/core/base.py +0 -217
agent/core/callbacks.py +0 -200
agent/core/experiment.py +0 -249
agent/core/factory.py +0 -122
agent/core/messages.py +0 -332
agent/core/provider_config.py +0 -21
agent/core/telemetry.py +0 -142
agent/core/tools/__init__.py +0 -21
agent/core/tools/base.py +0 -74
agent/core/tools/bash.py +0 -52
agent/core/tools/collection.py +0 -46
agent/core/tools/computer.py +0 -113
agent/core/tools/edit.py +0 -67
agent/core/tools/manager.py +0 -56
agent/core/tools.py +0 -32
agent/core/types.py +0 -88
agent/core/visualization.py +0 -197
agent/providers/__init__.py +0 -4
agent/providers/anthropic/__init__.py +0 -6
agent/providers/anthropic/api/client.py +0 -360
agent/providers/anthropic/api/logging.py +0 -150
agent/providers/anthropic/api_handler.py +0 -140
agent/providers/anthropic/callbacks/__init__.py +0 -5
agent/providers/anthropic/callbacks/manager.py +0 -65
agent/providers/anthropic/loop.py +0 -568
agent/providers/anthropic/prompts.py +0 -23
agent/providers/anthropic/response_handler.py +0 -226
agent/providers/anthropic/tools/__init__.py +0 -33
agent/providers/anthropic/tools/base.py +0 -88
agent/providers/anthropic/tools/bash.py +0 -66
agent/providers/anthropic/tools/collection.py +0 -34
agent/providers/anthropic/tools/computer.py +0 -396
agent/providers/anthropic/tools/edit.py +0 -326
agent/providers/anthropic/tools/manager.py +0 -54
agent/providers/anthropic/tools/run.py +0 -42
agent/providers/anthropic/types.py +0 -16
agent/providers/anthropic/utils.py +0 -367
agent/providers/omni/__init__.py +0 -8
agent/providers/omni/api_handler.py +0 -42
agent/providers/omni/clients/anthropic.py +0 -103
agent/providers/omni/clients/base.py +0 -35
agent/providers/omni/clients/oaicompat.py +0 -195
agent/providers/omni/clients/ollama.py +0 -122
agent/providers/omni/clients/openai.py +0 -155
agent/providers/omni/clients/utils.py +0 -25
agent/providers/omni/image_utils.py +0 -34
agent/providers/omni/loop.py +0 -990
agent/providers/omni/parser.py +0 -307
agent/providers/omni/prompts.py +0 -64
agent/providers/omni/tools/__init__.py +0 -30
agent/providers/omni/tools/base.py +0 -29
agent/providers/omni/tools/bash.py +0 -74
agent/providers/omni/tools/computer.py +0 -179
agent/providers/omni/tools/manager.py +0 -61
agent/providers/omni/utils.py +0 -236
agent/providers/openai/__init__.py +0 -6
agent/providers/openai/api_handler.py +0 -456
agent/providers/openai/loop.py +0 -472
agent/providers/openai/response_handler.py +0 -205
agent/providers/openai/tools/__init__.py +0 -15
agent/providers/openai/tools/base.py +0 -79
agent/providers/openai/tools/computer.py +0 -326
agent/providers/openai/tools/manager.py +0 -106
agent/providers/openai/types.py +0 -36
agent/providers/openai/utils.py +0 -98
agent/providers/uitars/__init__.py +0 -1
agent/providers/uitars/clients/base.py +0 -35
agent/providers/uitars/clients/mlxvlm.py +0 -263
agent/providers/uitars/clients/oaicompat.py +0 -214
agent/providers/uitars/loop.py +0 -660
agent/providers/uitars/prompts.py +0 -63
agent/providers/uitars/tools/__init__.py +0 -1
agent/providers/uitars/tools/computer.py +0 -283
agent/providers/uitars/tools/manager.py +0 -60
agent/providers/uitars/utils.py +0 -264
agent/telemetry.py +0 -21
agent/ui/__main__.py +0 -15
cua_agent-0.3.1.dist-info/METADATA +0 -295
cua_agent-0.3.1.dist-info/RECORD +0 -87
{cua_agent-0.3.1.dist-info → cua_agent-0.4.0b1.dist-info}/entry_points.txt +0 -0

agent/providers/omni/clients/oaicompat.py DELETED Viewed

@@ -1,195 +0,0 @@
-"""OpenAI-compatible client implementation."""
-import os
-import logging
-from typing import Dict, List, Optional, Any
-import aiohttp
-import re
-from .base import BaseOmniClient
-logger = logging.getLogger(__name__)
-# OpenAI-compatible client for the OmniLoop
-class OAICompatClient(BaseOmniClient):
-    """OpenAI-compatible API client implementation.
-    This client can be used with any service that implements the OpenAI API protocol, including:
-    - vLLM
-    - LM Studio
-    - LocalAI
-    - Ollama (with OpenAI compatibility)
-    - Text Generation WebUI
-    - Any other service with OpenAI API compatibility
-    """
-    def __init__(
-        self,
-        api_key: Optional[str] = None,
-        model: str = "Qwen2.5-VL-7B-Instruct",
-        provider_base_url: Optional[str] = "http://localhost:8000/v1",
-        max_tokens: int = 4096,
-        temperature: float = 0.0,
-    ):
-        """Initialize the OpenAI-compatible client.
-        Args:
-            api_key: Not used for local endpoints, usually set to "EMPTY"
-            model: Model name to use
-            provider_base_url: API base URL. Typically in the format "http://localhost:PORT/v1"
-                Examples:
-                - vLLM: "http://localhost:8000/v1"
-                - LM Studio: "http://localhost:1234/v1"
-                - LocalAI: "http://localhost:8080/v1"
-                - Ollama: "http://localhost:11434/v1"
-            max_tokens: Maximum tokens to generate
-            temperature: Generation temperature
-        """
-        super().__init__(api_key=api_key or "EMPTY", model=model)
-        self.api_key = api_key or "EMPTY" # Local endpoints typically don't require an API key
-        self.model = model
-        self.provider_base_url = (
-            provider_base_url or "http://localhost:8000/v1"
-        )  # Use default if None
-        self.max_tokens = max_tokens
-        self.temperature = temperature
-    def _extract_base64_image(self, text: str) -> Optional[str]:
-        """Extract base64 image data from an HTML img tag."""
-        pattern = r'data:image/[^;]+;base64,([^"]+)'
-        match = re.search(pattern, text)
-        return match.group(1) if match else None
-    def _get_loggable_messages(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-        """Create a loggable version of messages with image data truncated."""
-        loggable_messages = []
-        for msg in messages:
-            if isinstance(msg.get("content"), list):
-                new_content = []
-                for content in msg["content"]:
-                    if content.get("type") == "image":
-                        new_content.append(
-                            {"type": "image", "image_url": {"url": "[BASE64_IMAGE_DATA]"}}
-                        )
-                    else:
-                        new_content.append(content)
-                loggable_messages.append({"role": msg["role"], "content": new_content})
-            else:
-                loggable_messages.append(msg)
-        return loggable_messages
-    async def run_interleaved(
-        self, messages: List[Dict[str, Any]], system: str, max_tokens: Optional[int] = None
-    ) -> Dict[str, Any]:
-        """Run interleaved chat completion.
-        Args:
-            messages: List of message dicts
-            system: System prompt
-            max_tokens: Optional max tokens override
-        Returns:
-            Response dict
-        """
-        headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}
-        final_messages = [
-            {
-                "role": "system",
-                "content": [
-                    { "type": "text", "text": system }
-                ]
-            }
-        ]
-        # Process messages
-        for item in messages:
-            if isinstance(item, dict):
-                if isinstance(item["content"], list):
-                    # Content is already in the correct format
-                    final_messages.append(item)
-                else:
-                    # Single string content, check for image
-                    base64_img = self._extract_base64_image(item["content"])
-                    if base64_img:
-                        message = {
-                            "role": item["role"],
-                            "content": [
-                                {
-                                    "type": "image_url",
-                                    "image_url": {"url": f"data:image/jpeg;base64,{base64_img}"},
-                                }
-                            ],
-                        }
-                    else:
-                        message = {
-                            "role": item["role"],
-                            "content": [{
-                                "type": "text",
-                                "text": item["content"]
-                            }],
-                        }
-                    final_messages.append(message)
-            else:
-                # String content, check for image
-                base64_img = self._extract_base64_image(item)
-                if base64_img:
-                    message = {
-                        "role": "user",
-                        "content": [
-                            {
-                                "type": "image_url",
-                                "image_url": {"url": f"data:image/jpeg;base64,{base64_img}"},
-                            }
-                        ],
-                    }
-                else:
-                    message = {"role": "user", "content": [{"type": "text", "text": item}]}
-                final_messages.append(message)
-        payload = {"model": self.model, "messages": final_messages, "temperature": self.temperature}
-        payload["max_tokens"] = max_tokens or self.max_tokens
-        try:
-            async with aiohttp.ClientSession() as session:
-                # Use default base URL if none provided
-                base_url = self.provider_base_url or "http://localhost:8000/v1"
-                # Check if the base URL already includes the chat/completions endpoint
-                endpoint_url = base_url
-                if not endpoint_url.endswith("/chat/completions"):
-                    # If URL is RunPod format, make it OpenAI compatible
-                    if endpoint_url.startswith("https://api.runpod.ai/v2/"):
-                        # Extract RunPod endpoint ID
-                        parts = endpoint_url.split("/")
-                        if len(parts) >= 5:
-                            runpod_id = parts[4]
-                            endpoint_url = f"https://api.runpod.ai/v2/{runpod_id}/openai/v1/chat/completions"
-                    # If the URL ends with /v1, append /chat/completions
-                    elif endpoint_url.endswith("/v1"):
-                        endpoint_url = f"{endpoint_url}/chat/completions"
-                    # If the URL doesn't end with /v1, make sure it has a proper structure
-                    elif not endpoint_url.endswith("/"):
-                        endpoint_url = f"{endpoint_url}/chat/completions"
-                    else:
-                        endpoint_url = f"{endpoint_url}chat/completions"
-                # Log the endpoint URL for debugging
-                logger.debug(f"Using endpoint URL: {endpoint_url}")
-                async with session.post(endpoint_url, headers=headers, json=payload) as response:
-                    response_json = await response.json()
-                    if response.status != 200:
-                        error_msg = response_json.get("error", {}).get(
-                            "message", str(response_json)
-                        )
-                        logger.error(f"Error in API call: {error_msg}")
-                        raise Exception(f"API error: {error_msg}")
-                    return response_json
-        except Exception as e:
-            logger.error(f"Error in API call: {str(e)}")
-            raise

agent/providers/omni/clients/ollama.py DELETED Viewed

@@ -1,122 +0,0 @@
-"""Ollama API client implementation."""
-import logging
-from typing import Any, Dict, List, Optional, Tuple, cast
-import asyncio
-from httpx import ConnectError, ReadTimeout
-from ollama import AsyncClient, Options
-from ollama import Message
-from .base import BaseOmniClient
-logger = logging.getLogger(__name__)
-class OllamaClient(BaseOmniClient):
-    """Client for making calls to Ollama API."""
-    def __init__(self, api_key: str, model: str, max_retries: int = 3, retry_delay: float = 1.0):
-        """Initialize the Ollama client.
-        Args:
-            api_key: Not used
-            model: Ollama model name (e.g. "gemma3:4b-it-q4_K_M")
-            max_retries: Maximum number of retries for API calls
-            retry_delay: Base delay between retries in seconds
-        """
-        if not model:
-            raise ValueError("Model name must be provided")
-        self.client = AsyncClient(
-            host="http://localhost:11434",
-        )
-        self.model: str = model  # Add explicit type annotation
-        self.max_retries = max_retries
-        self.retry_delay = retry_delay
-    def _convert_message_format(self, system: str, messages: List[Dict[str, Any]]) -> List[Any]:
-        """Convert messages from standard format to Ollama format.
-        Args:
-            messages: Messages in standard format
-        Returns:
-            Messages in Ollama format
-        """
-        ollama_messages = []
-        # Add system message
-        ollama_messages.append(
-            {
-                "role": "system",
-                "content": system,
-            }
-        )
-        for message in messages:
-            # Skip messages with empty content
-            if not message.get("content"):
-                continue
-            content = message.get("content", [{}])[0]
-            isImage = content.get("type", "") == "image_url"
-            isText = content.get("type", "") == "text"
-            if isText:
-                data = content.get("text", "")
-                ollama_messages.append({"role": message["role"], "content": data})
-            if isImage:
-                data = content.get("image_url", {}).get("url", "")
-                # remove header
-                data = data.removeprefix("data:image/png;base64,")
-                ollama_messages.append(
-                    {"role": message["role"], "content": "Use this image", "images": [data]}
-                )
-        # Cast the list to the correct type expected by Ollama
-        return cast(List[Any], ollama_messages)
-    async def run_interleaved(
-        self, messages: List[Dict[str, Any]], system: str, max_tokens: int
-    ) -> Any:
-        """Run model with interleaved conversation format.
-        Args:
-            messages: List of messages to process
-            system: System prompt
-            max_tokens: Not used
-        Returns:
-            Model response
-        """
-        last_error = None
-        for attempt in range(self.max_retries):
-            try:
-                # Convert messages to Ollama format
-                ollama_messages = self._convert_message_format(system, messages)
-                response = await self.client.chat(
-                    model=self.model,
-                    options=Options(
-                        temperature=0,
-                    ),
-                    messages=ollama_messages,
-                    format="json",
-                )
-                return response
-            except (ConnectError, ReadTimeout) as e:
-                last_error = e
-                logger.warning(
-                    f"Connection error on attempt {attempt + 1}/{self.max_retries}: {str(e)}"
-                )
-                if attempt < self.max_retries - 1:
-                    await asyncio.sleep(self.retry_delay * (attempt + 1))  # Exponential backoff
-                continue
-            except Exception as e:
-                logger.error(f"Unexpected error in Ollama API call: {str(e)}")
-                raise RuntimeError(f"Ollama API call failed: {str(e)}")
-        # If we get here, all retries failed
-        raise RuntimeError(f"Connection error after {self.max_retries} retries: {str(last_error)}")

agent/providers/omni/clients/openai.py DELETED Viewed

@@ -1,155 +0,0 @@
-"""OpenAI client implementation."""
-import os
-import logging
-from typing import Dict, List, Optional, Any
-import aiohttp
-import re
-from datetime import datetime
-from .base import BaseOmniClient
-logger = logging.getLogger(__name__)
-# OpenAI specific client for the OmniLoop
-class OpenAIClient(BaseOmniClient):
-    """OpenAI vision API client implementation."""
-    def __init__(
-        self,
-        api_key: Optional[str] = None,
-        model: str = "gpt-4o",
-        provider_base_url: str = "https://api.openai.com/v1",
-        max_tokens: int = 4096,
-        temperature: float = 0.0,
-    ):
-        """Initialize the OpenAI client.
-        Args:
-            api_key: OpenAI API key
-            model: Model to use
-            provider_base_url: API endpoint
-            max_tokens: Maximum tokens to generate
-            temperature: Generation temperature
-        """
-        super().__init__(api_key=api_key, model=model)
-        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
-        if not self.api_key:
-            raise ValueError("No OpenAI API key provided")
-        self.model = model
-        self.provider_base_url = provider_base_url
-        self.max_tokens = max_tokens
-        self.temperature = temperature
-    def _extract_base64_image(self, text: str) -> Optional[str]:
-        """Extract base64 image data from an HTML img tag."""
-        pattern = r'data:image/[^;]+;base64,([^"]+)'
-        match = re.search(pattern, text)
-        return match.group(1) if match else None
-    def _get_loggable_messages(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-        """Create a loggable version of messages with image data truncated."""
-        loggable_messages = []
-        for msg in messages:
-            if isinstance(msg.get("content"), list):
-                new_content = []
-                for content in msg["content"]:
-                    if content.get("type") == "image":
-                        new_content.append(
-                            {"type": "image", "image_url": {"url": "[BASE64_IMAGE_DATA]"}}
-                        )
-                    else:
-                        new_content.append(content)
-                loggable_messages.append({"role": msg["role"], "content": new_content})
-            else:
-                loggable_messages.append(msg)
-        return loggable_messages
-    async def run_interleaved(
-        self, messages: List[Dict[str, Any]], system: str, max_tokens: Optional[int] = None
-    ) -> Dict[str, Any]:
-        """Run interleaved chat completion.
-        Args:
-            messages: List of message dicts
-            system: System prompt
-            max_tokens: Optional max tokens override
-        Returns:
-            Response dict
-        """
-        headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}
-        final_messages = [{"role": "system", "content": system}]
-        # Process messages
-        for item in messages:
-            if isinstance(item, dict):
-                if isinstance(item["content"], list):
-                    # Content is already in the correct format
-                    final_messages.append(item)
-                else:
-                    # Single string content, check for image
-                    base64_img = self._extract_base64_image(item["content"])
-                    if base64_img:
-                        message = {
-                            "role": item["role"],
-                            "content": [
-                                {
-                                    "type": "image_url",
-                                    "image_url": {"url": f"data:image/jpeg;base64,{base64_img}"},
-                                }
-                            ],
-                        }
-                    else:
-                        message = {
-                            "role": item["role"],
-                            "content": [{"type": "text", "text": item["content"]}],
-                        }
-                    final_messages.append(message)
-            else:
-                # String content, check for image
-                base64_img = self._extract_base64_image(item)
-                if base64_img:
-                    message = {
-                        "role": "user",
-                        "content": [
-                            {
-                                "type": "image_url",
-                                "image_url": {"url": f"data:image/jpeg;base64,{base64_img}"},
-                            }
-                        ],
-                    }
-                else:
-                    message = {"role": "user", "content": [{"type": "text", "text": item}]}
-                final_messages.append(message)
-        payload = {"model": self.model, "messages": final_messages, "temperature": self.temperature}
-        if "o1" in self.model or "o3-mini" in self.model:
-            payload["reasoning_effort"] = "low"
-            payload["max_completion_tokens"] = max_tokens or self.max_tokens
-        else:
-            payload["max_tokens"] = max_tokens or self.max_tokens
-        try:
-            async with aiohttp.ClientSession() as session:
-                async with session.post(
-                    f"{self.provider_base_url}/chat/completions", headers=headers, json=payload
-                ) as response:
-                    response_json = await response.json()
-                    if response.status != 200:
-                        error_msg = response_json.get("error", {}).get(
-                            "message", str(response_json)
-                        )
-                        logger.error(f"Error in OpenAI API call: {error_msg}")
-                        raise Exception(f"OpenAI API error: {error_msg}")
-                    return response_json
-        except Exception as e:
-            logger.error(f"Error in OpenAI API call: {str(e)}")
-            raise

agent/providers/omni/clients/utils.py DELETED Viewed

@@ -1,25 +0,0 @@
-import base64
-def is_image_path(text: str) -> bool:
-    """Check if a text string is an image file path.
-    Args:
-        text: Text string to check
-    Returns:
-        True if text ends with image extension, False otherwise
-    """
-    image_extensions = (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif")
-    return text.endswith(image_extensions)
-def encode_image(image_path: str) -> str:
-    """Encode image file to base64.
-    Args:
-        image_path: Path to image file
-    Returns:
-        Base64 encoded image string
-    """
-    with open(image_path, "rb") as image_file:
-        return base64.b64encode(image_file.read()).decode("utf-8")

agent/providers/omni/image_utils.py DELETED Viewed

@@ -1,34 +0,0 @@
-"""Image processing utilities for the Cua provider."""
-import base64
-import logging
-import re
-from io import BytesIO
-from typing import Optional, Tuple
-from PIL import Image
-logger = logging.getLogger(__name__)
-def decode_base64_image(img_base64: str) -> Optional[Image.Image]:
-    """Decode a base64 encoded image to a PIL Image.
-    Args:
-        img_base64: Base64 encoded image, may include data URL prefix
-    Returns:
-        PIL Image or None if decoding fails
-    """
-    try:
-        # Remove data URL prefix if present
-        if img_base64.startswith("data:image"):
-            img_base64 = img_base64.split(",")[1]
-        # Decode base64 to bytes
-        img_data = base64.b64decode(img_base64)
-        # Convert bytes to PIL Image
-        return Image.open(BytesIO(img_data))
-    except Exception as e:
-        logger.error(f"Error decoding base64 image: {str(e)}")
-        return None

cua-agent 0.3.1__py3-none-any.whl → 0.4.0b1__py3-none-any.whl

Potentially problematic release.

cua-agent 0.3.1__py3-none-any.whl → 0.4.0b1__py3-none-any.whl