PyPI - cua-agent - Versions diffs - 0.1.6__tar.gz → 0.1.17__tar.gz - Mend

cua-agent 0.1.6__tar.gz → 0.1.17__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cua-agent might be problematic. Click here for more details.

Files changed (82) hide show

{cua_agent-0.1.6 → cua_agent-0.1.17}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cua-agent
-Version: 0.1.6
+Version: 0.1.17
 Summary: CUA (Computer Use) Agent for AI-driven computer interaction
 Author-Email: TryCua <gh@trycua.com>
 Requires-Python: <3.13,>=3.10
@@ -63,43 +63,13 @@ Description-Content-Type: text/markdown
 **Agent** is a Computer Use (CUA) framework for running multi-app agentic workflows targeting macOS and Linux sandbox, supporting local (Ollama) and cloud model providers (OpenAI, Anthropic, Groq, DeepSeek, Qwen). The framework integrates with Microsoft's OmniParser for enhanced UI understanding and interaction.
-### Get started with Agent
-```python
-from agent import ComputerAgent, AgentLoop, LLMProvider
-from computer import Computer
-computer = Computer(verbosity=logging.INFO)
-agent = ComputerAgent(
-    computer=computer,
-    loop=AgentLoop.ANTHROPIC,
-    # loop=AgentLoop.OMNI,
-    model=LLM(provider=LLMProvider.ANTHROPIC, name="claude-3-7-sonnet-20250219"),
-    # model=LLM(provider=LLMProvider.OPENAI, name="gpt-4.5-preview"),
-    save_trajectory=True,
-    trajectory_dir=str(Path("trajectories")),
-    only_n_most_recent_images=3,
-    verbosity=logging.INFO,
-)
+> While our north star is to create a 1-click experience, this preview of Agent might be still a bit rough around the edges. We appreciate your patience as we work to improve the experience.
-tasks = [
-"""
-Please help me with the following task:
-1. Open Safari browser
-2. Go to Wikipedia.org
-3. Search for "Claude AI"
-4. Summarize the main points you find about Claude AI
-"""
-]
+### Get started with Agent
-async with agent:
-    for i, task in enumerate(tasks, 1):
-        print(f"\nExecuting task {i}/{len(tasks)}: {task}")
-        async for result in agent.run(task):
-            print(result)
-        print(f"Task {i} completed")
-```
+<div align="center">
+    <img src="../../img/agent.png"/>
+</div>
 ## Install

{cua_agent-0.1.6 → cua_agent-0.1.17}/README.md RENAMED Viewed

@@ -17,43 +17,13 @@
 **Agent** is a Computer Use (CUA) framework for running multi-app agentic workflows targeting macOS and Linux sandbox, supporting local (Ollama) and cloud model providers (OpenAI, Anthropic, Groq, DeepSeek, Qwen). The framework integrates with Microsoft's OmniParser for enhanced UI understanding and interaction.
-### Get started with Agent
-```python
-from agent import ComputerAgent, AgentLoop, LLMProvider
-from computer import Computer
-computer = Computer(verbosity=logging.INFO)
-agent = ComputerAgent(
-    computer=computer,
-    loop=AgentLoop.ANTHROPIC,
-    # loop=AgentLoop.OMNI,
-    model=LLM(provider=LLMProvider.ANTHROPIC, name="claude-3-7-sonnet-20250219"),
-    # model=LLM(provider=LLMProvider.OPENAI, name="gpt-4.5-preview"),
-    save_trajectory=True,
-    trajectory_dir=str(Path("trajectories")),
-    only_n_most_recent_images=3,
-    verbosity=logging.INFO,
-)
+> While our north star is to create a 1-click experience, this preview of Agent might be still a bit rough around the edges. We appreciate your patience as we work to improve the experience.
-tasks = [
-"""
-Please help me with the following task:
-1. Open Safari browser
-2. Go to Wikipedia.org
-3. Search for "Claude AI"
-4. Summarize the main points you find about Claude AI
-"""
-]
+### Get started with Agent
-async with agent:
-    for i, task in enumerate(tasks, 1):
-        print(f"\nExecuting task {i}/{len(tasks)}: {task}")
-        async for result in agent.run(task):
-            print(result)
-        print(f"Task {i} completed")
-```
+<div align="center">
+    <img src="../../img/agent.png"/>
+</div>
 ## Install

{cua_agent-0.1.6 → cua_agent-0.1.17}/agent/__init__.py RENAMED Viewed

@@ -49,6 +49,7 @@ except Exception as e:
     logger.warning(f"Error initializing telemetry: {e}")
 from .providers.omni.types import LLMProvider, LLM
-from .types.base import AgentLoop
+from .core.loop import AgentLoop
+from .core.computer_agent import ComputerAgent
-__all__ = ["AgentLoop", "LLMProvider", "LLM"]
+__all__ = ["AgentLoop", "LLMProvider", "LLM", "ComputerAgent"]

{cua_agent-0.1.6 → cua_agent-0.1.17}/agent/core/__init__.py RENAMED Viewed

@@ -2,11 +2,6 @@
 from .loop import BaseLoop
 from .messages import (
-    create_user_message,
-    create_assistant_message,
-    create_system_message,
-    create_image_message,
-    create_screen_message,
     BaseMessageManager,
     ImageRetentionConfig,
 )

{cua_agent-0.1.6 → cua_agent-0.1.17}/agent/core/computer_agent.py RENAMED Viewed

@@ -3,8 +3,7 @@
 import asyncio
 import logging
 import os
-from typing import Any, AsyncGenerator, Dict, Optional, cast
-from dataclasses import dataclass
+from typing import Any, AsyncGenerator, Dict, Optional, cast, List
 from computer import Computer
 from ..providers.anthropic.loop import AnthropicLoop
@@ -12,6 +11,8 @@ from ..providers.omni.loop import OmniLoop
 from ..providers.omni.parser import OmniParser
 from ..providers.omni.types import LLMProvider, LLM
 from .. import AgentLoop
+from .messages import StandardMessageManager, ImageRetentionConfig
+from .types import AgentResponse
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -44,7 +45,6 @@ class ComputerAgent:
         save_trajectory: bool = True,
         trajectory_dir: str = "trajectories",
         only_n_most_recent_images: Optional[int] = None,
-        parser: Optional[OmniParser] = None,
         verbosity: int = logging.INFO,
     ):
         """Initialize the ComputerAgent.
@@ -61,12 +61,11 @@ class ComputerAgent:
             save_trajectory: Whether to save the trajectory.
             trajectory_dir: Directory to save the trajectory.
             only_n_most_recent_images: Maximum number of recent screenshots to include in API requests.
-            parser: Parser instance for the OmniLoop. Only used if provider is not ANTHROPIC.
             verbosity: Logging level.
         """
         # Basic agent configuration
         self.max_retries = max_retries
-        self.computer = computer or Computer()
+        self.computer = computer
         self.queue = asyncio.Queue()
         self.screenshot_dir = screenshot_dir
         self.log_dir = log_dir
@@ -100,7 +99,7 @@ class ComputerAgent:
                 )
         # Ensure computer is properly cast for typing purposes
-        computer_instance = cast(Computer, self.computer)
+        computer_instance = self.computer
         # Get API key from environment if not provided
         actual_api_key = api_key or os.environ.get(ENV_VARS[self.provider], "")
@@ -118,10 +117,6 @@ class ComputerAgent:
                 only_n_most_recent_images=only_n_most_recent_images,
             )
         else:
-            # Default to OmniLoop for other loop types
-            # Initialize parser if not provided
-            actual_parser = parser or OmniParser()
             self._loop = OmniLoop(
                 provider=self.provider,
                 api_key=actual_api_key,
@@ -130,9 +125,12 @@ class ComputerAgent:
                 save_trajectory=save_trajectory,
                 base_dir=trajectory_dir,
                 only_n_most_recent_images=only_n_most_recent_images,
-                parser=actual_parser,
+                parser=OmniParser(),
             )
+        # Initialize the message manager from the loop
+        self.message_manager = self._loop.message_manager
         logger.info(
             f"ComputerAgent initialized with provider: {self.provider}, model: {actual_model_name}"
         )
@@ -201,36 +199,30 @@ class ComputerAgent:
                 await self.computer.run()
             self._initialized = True
-    async def _init_if_needed(self):
-        """Initialize the computer interface if it hasn't been initialized yet."""
-        if not self.computer._initialized:
-            logger.info("Computer not initialized, initializing now...")
-            try:
-                # Call run directly
-                await self.computer.run()
-                logger.info("Computer interface initialized successfully")
-            except Exception as e:
-                logger.error(f"Error initializing computer interface: {str(e)}")
-                raise
-    async def run(self, task: str) -> AsyncGenerator[Dict[str, Any], None]:
+    async def run(self, task: str) -> AsyncGenerator[AgentResponse, None]:
         """Run a task using the computer agent.
         Args:
             task: Task description
         Yields:
-            Task execution updates
+            Agent response format
         """
         try:
             logger.info(f"Running task: {task}")
+            logger.info(
+                f"Message history before task has {len(self.message_manager.messages)} messages"
+            )
             # Initialize the computer if needed
             if not self._initialized:
                 await self.initialize()
-            # Format task as a message
-            messages = [{"role": "user", "content": task}]
+            # Add task as a user message using the message manager
+            self.message_manager.add_user_message([{"type": "text", "text": task}])
+            logger.info(
+                f"Added task message. Message history now has {len(self.message_manager.messages)} messages"
+            )
             # Pass properly formatted messages to the loop
             if self._loop is None:
@@ -239,7 +231,8 @@ class ComputerAgent:
                 return
             # Execute the task and yield results
-            async for result in self._loop.run(messages):
+            async for result in self._loop.run(self.message_manager.messages):
+                # Yield the result to the caller
                 yield result
         except Exception as e:

{cua_agent-0.1.6 → cua_agent-0.1.17}/agent/core/loop.py RENAMED Viewed

@@ -2,22 +2,34 @@
 import logging
 import asyncio
-import json
-import os
 from abc import ABC, abstractmethod
+from enum import Enum, auto
 from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
 from datetime import datetime
-import base64
 from computer import Computer
 from .experiment import ExperimentManager
+from .messages import StandardMessageManager, ImageRetentionConfig
+from .types import AgentResponse
 logger = logging.getLogger(__name__)
+class AgentLoop(Enum):
+    """Enumeration of available loop types."""
+    ANTHROPIC = auto()  # Anthropic implementation
+    OMNI = auto()  # OmniLoop implementation
+    # Add more loop types as needed
 class BaseLoop(ABC):
     """Base class for agent loops that handle message processing and tool execution."""
+    ###########################################
+    # INITIALIZATION AND CONFIGURATION
+    ###########################################
     def __init__(
         self,
         computer: Computer,
@@ -55,8 +67,6 @@ class BaseLoop(ABC):
         self.save_trajectory = save_trajectory
         self.only_n_most_recent_images = only_n_most_recent_images
         self._kwargs = kwargs
-        self.message_history = []
-        # self.tool_manager = BaseToolManager(computer)
         # Initialize experiment manager
         if self.save_trajectory and self.base_dir:
@@ -75,6 +85,64 @@ class BaseLoop(ABC):
         # Initialize basic tracking
         self.turn_count = 0
+    async def initialize(self) -> None:
+        """Initialize both the API client and computer interface with retries."""
+        for attempt in range(self.max_retries):
+            try:
+                logger.info(
+                    f"Starting initialization (attempt {attempt + 1}/{self.max_retries})..."
+                )
+                # Initialize API client
+                await self.initialize_client()
+                logger.info("Initialization complete.")
+                return
+            except Exception as e:
+                if attempt < self.max_retries - 1:
+                    logger.warning(
+                        f"Initialization failed (attempt {attempt + 1}/{self.max_retries}): {str(e)}. Retrying..."
+                    )
+                    await asyncio.sleep(self.retry_delay)
+                else:
+                    logger.error(
+                        f"Initialization failed after {self.max_retries} attempts: {str(e)}"
+                    )
+                    raise RuntimeError(f"Failed to initialize: {str(e)}")
+        ###########################################
+    # ABSTRACT METHODS TO BE IMPLEMENTED BY SUBCLASSES
+    ###########################################
+    @abstractmethod
+    async def initialize_client(self) -> None:
+        """Initialize the API client and any provider-specific components.
+        This method must be implemented by subclasses to set up
+        provider-specific clients and tools.
+        """
+        raise NotImplementedError
+    @abstractmethod
+    async def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[AgentResponse, None]:
+        """Run the agent loop with provided messages.
+        This method handles the main agent loop including message processing,
+        API calls, response handling, and action execution.
+        Args:
+            messages: List of message objects
+        Yields:
+            Agent response format
+        """
+        raise NotImplementedError
+    ###########################################
+    # EXPERIMENT AND TRAJECTORY MANAGEMENT
+    ###########################################
     def _setup_experiment_dirs(self) -> None:
         """Setup the experiment directory structure."""
         if self.experiment_manager:
@@ -100,10 +168,13 @@ class BaseLoop(ABC):
     ) -> None:
         """Log API call details to file.
+        Preserves provider-specific formats for requests and responses to ensure
+        accurate logging for debugging and analysis purposes.
         Args:
             call_type: Type of API call (e.g., 'request', 'response', 'error')
-            request: The API request data
-            response: Optional API response data
+            request: The API request data in provider-specific format
+            response: Optional API response data in provider-specific format
             error: Optional error information
         """
         if self.experiment_manager:
@@ -129,120 +200,3 @@ class BaseLoop(ABC):
         """
         if self.experiment_manager:
             self.experiment_manager.save_screenshot(img_base64, action_type)
-    async def initialize(self) -> None:
-        """Initialize both the API client and computer interface with retries."""
-        for attempt in range(self.max_retries):
-            try:
-                logger.info(
-                    f"Starting initialization (attempt {attempt + 1}/{self.max_retries})..."
-                )
-                # Initialize API client
-                await self.initialize_client()
-                logger.info("Initialization complete.")
-                return
-            except Exception as e:
-                if attempt < self.max_retries - 1:
-                    logger.warning(
-                        f"Initialization failed (attempt {attempt + 1}/{self.max_retries}): {str(e)}. Retrying..."
-                    )
-                    await asyncio.sleep(self.retry_delay)
-                else:
-                    logger.error(
-                        f"Initialization failed after {self.max_retries} attempts: {str(e)}"
-                    )
-                    raise RuntimeError(f"Failed to initialize: {str(e)}")
-    async def _get_parsed_screen_som(self) -> Dict[str, Any]:
-        """Get parsed screen information.
-        Returns:
-            Dict containing screen information
-        """
-        try:
-            # Take screenshot
-            screenshot = await self.computer.interface.screenshot()
-            # Initialize with default values
-            width, height = 1024, 768
-            base64_image = ""
-            # Handle different types of screenshot returns
-            if isinstance(screenshot, (bytes, bytearray, memoryview)):
-                # Raw bytes screenshot
-                base64_image = base64.b64encode(screenshot).decode("utf-8")
-            elif hasattr(screenshot, "base64_image"):
-                # Object-style screenshot with attributes
-                # Type checking can't infer these attributes, but they exist at runtime
-                # on certain screenshot return types
-                base64_image = getattr(screenshot, "base64_image")
-                width = (
-                    getattr(screenshot, "width", width) if hasattr(screenshot, "width") else width
-                )
-                height = (
-                    getattr(screenshot, "height", height)
-                    if hasattr(screenshot, "height")
-                    else height
-                )
-            # Create parsed screen data
-            parsed_screen = {
-                "width": width,
-                "height": height,
-                "parsed_content_list": [],
-                "timestamp": datetime.now().isoformat(),
-                "screenshot_base64": base64_image,
-            }
-            # Save screenshot if requested
-            if self.save_trajectory and self.experiment_manager:
-                try:
-                    img_data = base64_image
-                    if "," in img_data:
-                        img_data = img_data.split(",")[1]
-                    self._save_screenshot(img_data, action_type="state")
-                except Exception as e:
-                    logger.error(f"Error saving screenshot: {str(e)}")
-            return parsed_screen
-        except Exception as e:
-            logger.error(f"Error taking screenshot: {str(e)}")
-            return {
-                "width": 1024,
-                "height": 768,
-                "parsed_content_list": [],
-                "timestamp": datetime.now().isoformat(),
-                "error": f"Error taking screenshot: {str(e)}",
-                "screenshot_base64": "",
-            }
-    @abstractmethod
-    async def initialize_client(self) -> None:
-        """Initialize the API client and any provider-specific components."""
-        raise NotImplementedError
-    @abstractmethod
-    async def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[Dict[str, Any], None]:
-        """Run the agent loop with provided messages.
-        Args:
-            messages: List of message objects
-        Yields:
-            Dict containing response data
-        """
-        raise NotImplementedError
-    @abstractmethod
-    async def _process_screen(
-        self, parsed_screen: Dict[str, Any], messages: List[Dict[str, Any]]
-    ) -> None:
-        """Process screen information and add to messages.
-        Args:
-            parsed_screen: Dictionary containing parsed screen info
-            messages: List of messages to update
-        """
-        raise NotImplementedError

cua-agent 0.1.6__tar.gz → 0.1.17__tar.gz

Potentially problematic release.

cua-agent 0.1.6__tar.gz → 0.1.17__tar.gz