PyPI - cua-agent - Versions diffs - 0.1.6__py3-none-any.whl → 0.1.17__py3-none-any.whl - Mend

cua-agent 0.1.6__py3-none-any.whl → 0.1.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cua-agent might be problematic. Click here for more details.

Files changed (42)

agent/__init__.py +3 -2
agent/core/__init__.py +0 -5
agent/core/computer_agent.py +21 -28
agent/core/loop.py +78 -124
agent/core/messages.py +279 -125
agent/core/types.py +35 -0
agent/core/visualization.py +197 -0
agent/providers/anthropic/api/client.py +142 -1
agent/providers/anthropic/api_handler.py +140 -0
agent/providers/anthropic/callbacks/__init__.py +5 -0
agent/providers/anthropic/loop.py +206 -220
agent/providers/anthropic/response_handler.py +229 -0
agent/providers/anthropic/tools/bash.py +0 -97
agent/providers/anthropic/utils.py +370 -0
agent/providers/omni/__init__.py +1 -20
agent/providers/omni/api_handler.py +42 -0
agent/providers/omni/clients/anthropic.py +4 -0
agent/providers/omni/image_utils.py +0 -72
agent/providers/omni/loop.py +490 -606
agent/providers/omni/parser.py +58 -4
agent/providers/omni/tools/__init__.py +25 -7
agent/providers/omni/tools/base.py +29 -0
agent/providers/omni/tools/bash.py +43 -38
agent/providers/omni/tools/computer.py +144 -182
agent/providers/omni/tools/manager.py +25 -45
agent/providers/omni/types.py +0 -4
agent/providers/omni/utils.py +224 -145
{cua_agent-0.1.6.dist-info → cua_agent-0.1.17.dist-info}/METADATA +6 -36
cua_agent-0.1.17.dist-info/RECORD +63 -0
agent/providers/omni/callbacks.py +0 -78
agent/providers/omni/clients/groq.py +0 -101
agent/providers/omni/experiment.py +0 -276
agent/providers/omni/messages.py +0 -171
agent/providers/omni/tool_manager.py +0 -91
agent/providers/omni/visualization.py +0 -130
agent/types/__init__.py +0 -23
agent/types/base.py +0 -41
agent/types/messages.py +0 -36
cua_agent-0.1.6.dist-info/RECORD +0 -64
/agent/{types → core}/tools.py +0 -0
{cua_agent-0.1.6.dist-info → cua_agent-0.1.17.dist-info}/WHEEL +0 -0
{cua_agent-0.1.6.dist-info → cua_agent-0.1.17.dist-info}/entry_points.txt +0 -0

agent/providers/omni/callbacks.py DELETED Viewed

@@ -1,78 +0,0 @@
-"""Omni callback manager implementation."""
-import logging
-from typing import Any, Dict, Optional, Set
-from ...core.callbacks import BaseCallbackManager, ContentCallback, ToolCallback, APICallback
-from ...types.tools import ToolResult
-logger = logging.getLogger(__name__)
-class OmniCallbackManager(BaseCallbackManager):
-    """Callback manager for multi-provider support."""
-    def __init__(
-        self,
-        content_callback: ContentCallback,
-        tool_callback: ToolCallback,
-        api_callback: APICallback,
-    ):
-        """Initialize Omni callback manager.
-        Args:
-            content_callback: Callback for content updates
-            tool_callback: Callback for tool execution results
-            api_callback: Callback for API interactions
-        """
-        super().__init__(
-            content_callback=content_callback,
-            tool_callback=tool_callback,
-            api_callback=api_callback
-        )
-        self._active_tools: Set[str] = set()
-    def on_content(self, content: Any) -> None:
-        """Handle content updates.
-        Args:
-            content: Content update data
-        """
-        logger.debug(f"Content update: {content}")
-        self.content_callback(content)
-    def on_tool_result(self, result: ToolResult, tool_id: str) -> None:
-        """Handle tool execution results.
-        Args:
-            result: Tool execution result
-            tool_id: ID of the tool
-        """
-        logger.debug(f"Tool result for {tool_id}: {result}")
-        self.tool_callback(result, tool_id)
-    def on_api_interaction(
-        self,
-        request: Any,
-        response: Any,
-        error: Optional[Exception] = None
-    ) -> None:
-        """Handle API interactions.
-        Args:
-            request: API request data
-            response: API response data
-            error: Optional error that occurred
-        """
-        if error:
-            logger.error(f"API error: {str(error)}")
-        else:
-            logger.debug(f"API interaction - Request: {request}, Response: {response}")
-        self.api_callback(request, response, error)
-    def get_active_tools(self) -> Set[str]:
-        """Get currently active tools.
-        Returns:
-            Set of active tool names
-        """
-        return self._active_tools.copy()

agent/providers/omni/clients/groq.py DELETED Viewed

@@ -1,101 +0,0 @@
-"""Groq client implementation."""
-import os
-import logging
-from typing import Dict, List, Optional, Any, Tuple
-from groq import Groq
-import re
-from .utils import is_image_path
-from .base import BaseOmniClient
-logger = logging.getLogger(__name__)
-class GroqClient(BaseOmniClient):
-    """Client for making Groq API calls."""
-    def __init__(
-        self,
-        api_key: Optional[str] = None,
-        model: str = "deepseek-r1-distill-llama-70b",
-        max_tokens: int = 4096,
-        temperature: float = 0.6,
-    ):
-        """Initialize Groq client.
-        Args:
-            api_key: Groq API key (if not provided, will try to get from env)
-            model: Model name to use
-            max_tokens: Maximum tokens to generate
-            temperature: Temperature for sampling
-        """
-        super().__init__(api_key=api_key, model=model)
-        self.api_key = api_key or os.getenv("GROQ_API_KEY")
-        if not self.api_key:
-            raise ValueError("No Groq API key provided")
-        self.max_tokens = max_tokens
-        self.temperature = temperature
-        self.client = Groq(api_key=self.api_key)
-        self.model: str = model  # Add explicit type annotation
-    def run_interleaved(
-        self, messages: List[Dict[str, Any]], system: str, max_tokens: Optional[int] = None
-    ) -> tuple[str, int]:
-        """Run interleaved chat completion.
-        Args:
-            messages: List of message dicts
-            system: System prompt
-            max_tokens: Optional max tokens override
-        Returns:
-            Tuple of (response text, token usage)
-        """
-        # Avoid using system messages for R1
-        final_messages = [{"role": "user", "content": system}]
-        # Process messages
-        if isinstance(messages, list):
-            for item in messages:
-                if isinstance(item, dict):
-                    # For dict items, concatenate all text content, ignoring images
-                    text_contents = []
-                    for cnt in item["content"]:
-                        if isinstance(cnt, str):
-                            if not is_image_path(cnt):  # Skip image paths
-                                text_contents.append(cnt)
-                        else:
-                            text_contents.append(str(cnt))
-                    if text_contents:  # Only add if there's text content
-                        message = {"role": "user", "content": " ".join(text_contents)}
-                        final_messages.append(message)
-                else:  # str
-                    message = {"role": "user", "content": item}
-                    final_messages.append(message)
-        elif isinstance(messages, str):
-            final_messages.append({"role": "user", "content": messages})
-        try:
-            completion = self.client.chat.completions.create(  # type: ignore
-                model=self.model,
-                messages=final_messages,  # type: ignore
-                temperature=self.temperature,
-                max_tokens=max_tokens or self.max_tokens,
-                top_p=0.95,
-                stream=False,
-            )
-            response = completion.choices[0].message.content
-            final_answer = response.split("</think>\n")[-1] if "</think>" in response else response
-            final_answer = final_answer.replace("<output>", "").replace("</output>", "")
-            token_usage = completion.usage.total_tokens
-            return final_answer, token_usage
-        except Exception as e:
-            logger.error(f"Error in Groq API call: {e}")
-            raise

agent/providers/omni/experiment.py DELETED Viewed

@@ -1,276 +0,0 @@
-"""Experiment management for the Cua provider."""
-import os
-import logging
-import copy
-import base64
-from io import BytesIO
-from datetime import datetime
-from typing import Any, Dict, List, Optional
-from PIL import Image
-import json
-import time
-logger = logging.getLogger(__name__)
-class ExperimentManager:
-    """Manages experiment directories and logging for the agent."""
-    def __init__(
-        self,
-        base_dir: Optional[str] = None,
-        only_n_most_recent_images: Optional[int] = None,
-    ):
-        """Initialize the experiment manager.
-        Args:
-            base_dir: Base directory for saving experiment data
-            only_n_most_recent_images: Maximum number of recent screenshots to include in API requests
-        """
-        self.base_dir = base_dir
-        self.only_n_most_recent_images = only_n_most_recent_images
-        self.run_dir = None
-        self.current_turn_dir = None
-        self.turn_count = 0
-        self.screenshot_count = 0
-        # Track all screenshots for potential API request inclusion
-        self.screenshot_paths = []
-        # Set up experiment directories if base_dir is provided
-        if self.base_dir:
-            self.setup_experiment_dirs()
-    def setup_experiment_dirs(self) -> None:
-        """Setup the experiment directory structure."""
-        if not self.base_dir:
-            return
-        # Create base experiments directory if it doesn't exist
-        os.makedirs(self.base_dir, exist_ok=True)
-        # Use the base_dir directly as the run_dir
-        self.run_dir = self.base_dir
-        logger.info(f"Using directory for experiment: {self.run_dir}")
-        # Create first turn directory
-        self.create_turn_dir()
-    def create_turn_dir(self) -> None:
-        """Create a new directory for the current turn."""
-        if not self.run_dir:
-            return
-        self.turn_count += 1
-        self.current_turn_dir = os.path.join(self.run_dir, f"turn_{self.turn_count:03d}")
-        os.makedirs(self.current_turn_dir, exist_ok=True)
-        logger.info(f"Created turn directory: {self.current_turn_dir}")
-    def sanitize_log_data(self, data: Any) -> Any:
-        """Sanitize data for logging by removing large base64 strings.
-        Args:
-            data: Data to sanitize (dict, list, or primitive)
-        Returns:
-            Sanitized copy of the data
-        """
-        if isinstance(data, dict):
-            result = copy.deepcopy(data)
-            # Handle nested dictionaries and lists
-            for key, value in result.items():
-                # Process content arrays that contain image data
-                if key == "content" and isinstance(value, list):
-                    for i, item in enumerate(value):
-                        if isinstance(item, dict):
-                            # Handle Anthropic format
-                            if item.get("type") == "image" and isinstance(item.get("source"), dict):
-                                source = item["source"]
-                                if "data" in source and isinstance(source["data"], str):
-                                    # Replace base64 data with a placeholder and length info
-                                    data_len = len(source["data"])
-                                    source["data"] = f"[BASE64_IMAGE_DATA_LENGTH_{data_len}]"
-                            # Handle OpenAI format
-                            elif item.get("type") == "image_url" and isinstance(
-                                item.get("image_url"), dict
-                            ):
-                                url_dict = item["image_url"]
-                                if "url" in url_dict and isinstance(url_dict["url"], str):
-                                    url = url_dict["url"]
-                                    if url.startswith("data:"):
-                                        # Replace base64 data with placeholder
-                                        data_len = len(url)
-                                        url_dict["url"] = f"[BASE64_IMAGE_URL_LENGTH_{data_len}]"
-                # Handle other nested structures recursively
-                if isinstance(value, dict):
-                    result[key] = self.sanitize_log_data(value)
-                elif isinstance(value, list):
-                    result[key] = [self.sanitize_log_data(item) for item in value]
-            return result
-        elif isinstance(data, list):
-            return [self.sanitize_log_data(item) for item in data]
-        else:
-            return data
-    def save_debug_image(self, image_data: str, filename: str) -> None:
-        """Save a debug image to the experiment directory.
-        Args:
-            image_data: Base64 encoded image data
-            filename: Filename to save the image as
-        """
-        # Since we no longer want to use the images/ folder, we'll skip this functionality
-        return
-    def save_screenshot(self, img_base64: str, action_type: str = "") -> Optional[str]:
-        """Save a screenshot to the experiment directory.
-        Args:
-            img_base64: Base64 encoded screenshot
-            action_type: Type of action that triggered the screenshot
-        Returns:
-            Optional[str]: Path to the saved screenshot, or None if saving failed
-        """
-        if not self.current_turn_dir:
-            return None
-        try:
-            # Increment screenshot counter
-            self.screenshot_count += 1
-            # Create a descriptive filename
-            timestamp = int(time.time() * 1000)
-            action_suffix = f"_{action_type}" if action_type else ""
-            filename = f"screenshot_{self.screenshot_count:03d}{action_suffix}_{timestamp}.png"
-            # Save directly to the turn directory (no screenshots subdirectory)
-            filepath = os.path.join(self.current_turn_dir, filename)
-            # Save the screenshot
-            img_data = base64.b64decode(img_base64)
-            with open(filepath, "wb") as f:
-                f.write(img_data)
-            # Keep track of the file path for reference
-            self.screenshot_paths.append(filepath)
-            return filepath
-        except Exception as e:
-            logger.error(f"Error saving screenshot: {str(e)}")
-            return None
-    def should_save_debug_image(self) -> bool:
-        """Determine if debug images should be saved.
-        Returns:
-            Boolean indicating if debug images should be saved
-        """
-        # We no longer need to save debug images, so always return False
-        return False
-    def save_action_visualization(
-        self, img: Image.Image, action_name: str, details: str = ""
-    ) -> str:
-        """Save a visualization of an action.
-        Args:
-            img: Image to save
-            action_name: Name of the action
-            details: Additional details about the action
-        Returns:
-            Path to the saved image
-        """
-        if not self.current_turn_dir:
-            return ""
-        try:
-            # Create a descriptive filename
-            timestamp = int(time.time() * 1000)
-            details_suffix = f"_{details}" if details else ""
-            filename = f"vis_{action_name}{details_suffix}_{timestamp}.png"
-            # Save directly to the turn directory (no visualizations subdirectory)
-            filepath = os.path.join(self.current_turn_dir, filename)
-            # Save the image
-            img.save(filepath)
-            # Keep track of the file path for cleanup
-            self.screenshot_paths.append(filepath)
-            return filepath
-        except Exception as e:
-            logger.error(f"Error saving action visualization: {str(e)}")
-            return ""
-    def extract_and_save_images(self, data: Any, prefix: str) -> None:
-        """Extract and save images from response data.
-        Args:
-            data: Response data to extract images from
-            prefix: Prefix for saved image filenames
-        """
-        # Since we no longer want to save extracted images separately,
-        # we'll skip this functionality entirely
-        return
-    def log_api_call(
-        self,
-        call_type: str,
-        request: Any,
-        provider: str,
-        model: str,
-        response: Any = None,
-        error: Optional[Exception] = None,
-    ) -> None:
-        """Log API call details to file.
-        Args:
-            call_type: Type of API call (e.g., 'request', 'response', 'error')
-            request: The API request data
-            provider: The AI provider used
-            model: The AI model used
-            response: Optional API response data
-            error: Optional error information
-        """
-        if not self.current_turn_dir:
-            return
-        try:
-            # Create a unique filename with timestamp
-            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-            filename = f"api_call_{timestamp}_{call_type}.json"
-            filepath = os.path.join(self.current_turn_dir, filename)
-            # Sanitize data to remove large base64 strings
-            sanitized_request = self.sanitize_log_data(request)
-            sanitized_response = self.sanitize_log_data(response) if response is not None else None
-            # Prepare log data
-            log_data = {
-                "timestamp": timestamp,
-                "provider": provider,
-                "model": model,
-                "type": call_type,
-                "request": sanitized_request,
-            }
-            if sanitized_response is not None:
-                log_data["response"] = sanitized_response
-            if error is not None:
-                log_data["error"] = str(error)
-            # Write to file
-            with open(filepath, "w") as f:
-                json.dump(log_data, f, indent=2, default=str)
-            logger.info(f"Logged API {call_type} to {filepath}")
-        except Exception as e:
-            logger.error(f"Error logging API call: {str(e)}")

agent/providers/omni/messages.py DELETED Viewed

@@ -1,171 +0,0 @@
-"""Omni message manager implementation."""
-import base64
-from typing import Any, Dict, List, Optional
-from io import BytesIO
-from PIL import Image
-from ...core.messages import BaseMessageManager, ImageRetentionConfig
-class OmniMessageManager(BaseMessageManager):
-    """Message manager for multi-provider support."""
-    def __init__(self, config: Optional[ImageRetentionConfig] = None):
-        """Initialize the message manager.
-        Args:
-            config: Optional configuration for image retention
-        """
-        super().__init__(config)
-        self.messages: List[Dict[str, Any]] = []
-        self.config = config
-    def add_user_message(self, content: str, images: Optional[List[bytes]] = None) -> None:
-        """Add a user message to the history.
-        Args:
-            content: Message content
-            images: Optional list of image data
-        """
-        # Add images if present
-        if images:
-            # Initialize with proper typing for mixed content
-            message_content: List[Dict[str, Any]] = [{"type": "text", "text": content}]
-            # Add each image
-            for img in images:
-                message_content.append(
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": f"data:image/png;base64,{base64.b64encode(img).decode()}"
-                        },
-                    }
-                )
-            message = {"role": "user", "content": message_content}
-        else:
-            # Simple text message
-            message = {"role": "user", "content": content}
-        self.messages.append(message)
-        # Apply retention policy
-        if self.config and self.config.num_images_to_keep:
-            self._apply_image_retention_policy()
-    def add_assistant_message(self, content: str) -> None:
-        """Add an assistant message to the history.
-        Args:
-            content: Message content
-        """
-        self.messages.append({"role": "assistant", "content": content})
-    def add_system_message(self, content: str) -> None:
-        """Add a system message to the history.
-        Args:
-            content: Message content
-        """
-        self.messages.append({"role": "system", "content": content})
-    def _apply_image_retention_policy(self) -> None:
-        """Apply image retention policy to message history."""
-        if not self.config or not self.config.num_images_to_keep:
-            return
-        # Count images from newest to oldest
-        image_count = 0
-        for message in reversed(self.messages):
-            if message["role"] != "user":
-                continue
-            # Handle multimodal messages
-            if isinstance(message["content"], list):
-                new_content = []
-                for item in message["content"]:
-                    if item["type"] == "text":
-                        new_content.append(item)
-                    elif item["type"] == "image_url":
-                        if image_count < self.config.num_images_to_keep:
-                            new_content.append(item)
-                            image_count += 1
-                message["content"] = new_content
-    def get_formatted_messages(self, provider: str) -> List[Dict[str, Any]]:
-        """Get messages formatted for specific provider.
-        Args:
-            provider: Provider name to format messages for
-        Returns:
-            List of formatted messages
-        """
-        # Set the provider for message formatting
-        self.set_provider(provider)
-        if provider == "anthropic":
-            return self._format_for_anthropic()
-        elif provider == "openai":
-            return self._format_for_openai()
-        elif provider == "groq":
-            return self._format_for_groq()
-        elif provider == "qwen":
-            return self._format_for_qwen()
-        else:
-            raise ValueError(f"Unsupported provider: {provider}")
-    def _format_for_anthropic(self) -> List[Dict[str, Any]]:
-        """Format messages for Anthropic API."""
-        formatted = []
-        for msg in self.messages:
-            formatted_msg = {"role": msg["role"]}
-            # Handle multimodal content
-            if isinstance(msg["content"], list):
-                formatted_msg["content"] = []
-                for item in msg["content"]:
-                    if item["type"] == "text":
-                        formatted_msg["content"].append({"type": "text", "text": item["text"]})
-                    elif item["type"] == "image_url":
-                        formatted_msg["content"].append(
-                            {
-                                "type": "image",
-                                "source": {
-                                    "type": "base64",
-                                    "media_type": "image/png",
-                                    "data": item["image_url"]["url"].split(",")[1],
-                                },
-                            }
-                        )
-            else:
-                formatted_msg["content"] = msg["content"]
-            formatted.append(formatted_msg)
-        return formatted
-    def _format_for_openai(self) -> List[Dict[str, Any]]:
-        """Format messages for OpenAI API."""
-        # OpenAI already uses the same format
-        return self.messages
-    def _format_for_groq(self) -> List[Dict[str, Any]]:
-        """Format messages for Groq API."""
-        # Groq uses OpenAI-compatible format
-        return self.messages
-    def _format_for_qwen(self) -> List[Dict[str, Any]]:
-        """Format messages for Qwen API."""
-        formatted = []
-        for msg in self.messages:
-            if isinstance(msg["content"], list):
-                # Convert multimodal content to text-only
-                text_content = next(
-                    (item["text"] for item in msg["content"] if item["type"] == "text"), ""
-                )
-                formatted.append({"role": msg["role"], "content": text_content})
-            else:
-                formatted.append(msg)
-        return formatted

cua-agent 0.1.6__py3-none-any.whl → 0.1.17__py3-none-any.whl

Potentially problematic release.

cua-agent 0.1.6__py3-none-any.whl → 0.1.17__py3-none-any.whl