cua-agent 0.1.6__py3-none-any.whl → 0.1.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent has been flagged as potentially problematic; see the registry's advisory for more details.

Files changed (57)
  1. agent/__init__.py +3 -2
  2. agent/core/__init__.py +1 -6
  3. agent/core/{computer_agent.py → agent.py} +31 -76
  4. agent/core/{loop.py → base.py} +68 -127
  5. agent/core/factory.py +104 -0
  6. agent/core/messages.py +279 -125
  7. agent/core/provider_config.py +15 -0
  8. agent/core/types.py +45 -0
  9. agent/core/visualization.py +197 -0
  10. agent/providers/anthropic/api/client.py +142 -1
  11. agent/providers/anthropic/api_handler.py +140 -0
  12. agent/providers/anthropic/callbacks/__init__.py +5 -0
  13. agent/providers/anthropic/loop.py +207 -221
  14. agent/providers/anthropic/response_handler.py +226 -0
  15. agent/providers/anthropic/tools/bash.py +0 -97
  16. agent/providers/anthropic/utils.py +368 -0
  17. agent/providers/omni/__init__.py +1 -20
  18. agent/providers/omni/api_handler.py +42 -0
  19. agent/providers/omni/clients/anthropic.py +4 -0
  20. agent/providers/omni/image_utils.py +0 -72
  21. agent/providers/omni/loop.py +491 -607
  22. agent/providers/omni/parser.py +58 -4
  23. agent/providers/omni/tools/__init__.py +25 -7
  24. agent/providers/omni/tools/base.py +29 -0
  25. agent/providers/omni/tools/bash.py +43 -38
  26. agent/providers/omni/tools/computer.py +144 -182
  27. agent/providers/omni/tools/manager.py +25 -45
  28. agent/providers/omni/types.py +1 -3
  29. agent/providers/omni/utils.py +224 -145
  30. agent/providers/openai/__init__.py +6 -0
  31. agent/providers/openai/api_handler.py +453 -0
  32. agent/providers/openai/loop.py +440 -0
  33. agent/providers/openai/response_handler.py +205 -0
  34. agent/providers/openai/tools/__init__.py +15 -0
  35. agent/providers/openai/tools/base.py +79 -0
  36. agent/providers/openai/tools/computer.py +319 -0
  37. agent/providers/openai/tools/manager.py +106 -0
  38. agent/providers/openai/types.py +36 -0
  39. agent/providers/openai/utils.py +98 -0
  40. cua_agent-0.1.18.dist-info/METADATA +165 -0
  41. cua_agent-0.1.18.dist-info/RECORD +73 -0
  42. agent/README.md +0 -63
  43. agent/providers/anthropic/messages/manager.py +0 -112
  44. agent/providers/omni/callbacks.py +0 -78
  45. agent/providers/omni/clients/groq.py +0 -101
  46. agent/providers/omni/experiment.py +0 -276
  47. agent/providers/omni/messages.py +0 -171
  48. agent/providers/omni/tool_manager.py +0 -91
  49. agent/providers/omni/visualization.py +0 -130
  50. agent/types/__init__.py +0 -23
  51. agent/types/base.py +0 -41
  52. agent/types/messages.py +0 -36
  53. cua_agent-0.1.6.dist-info/METADATA +0 -120
  54. cua_agent-0.1.6.dist-info/RECORD +0 -64
  55. /agent/{types → core}/tools.py +0 -0
  56. {cua_agent-0.1.6.dist-info → cua_agent-0.1.18.dist-info}/WHEEL +0 -0
  57. {cua_agent-0.1.6.dist-info → cua_agent-0.1.18.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,440 @@
1
+ """OpenAI Agent Response API provider implementation."""
2
+
3
+ import logging
4
+ import asyncio
5
+ import base64
6
+ from typing import Any, Dict, List, Optional, AsyncGenerator, Callable, Awaitable, TYPE_CHECKING
7
+
8
+ from computer import Computer
9
+ from ...core.base import BaseLoop
10
+ from ...core.types import AgentResponse
11
+ from ...core.messages import StandardMessageManager, ImageRetentionConfig
12
+
13
+ from .api_handler import OpenAIAPIHandler
14
+ from .response_handler import OpenAIResponseHandler
15
+ from .tools.manager import ToolManager
16
+ from .types import LLMProvider, ResponseItemType
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
+ class OpenAILoop(BaseLoop):
22
+ """OpenAI-specific implementation of the agent loop.
23
+
24
+ This class extends BaseLoop to provide specialized support for OpenAI's Agent Response API
25
+ with computer control capabilities.
26
+ """
27
+
28
+ ###########################################
29
+ # INITIALIZATION AND CONFIGURATION
30
+ ###########################################
31
+
32
    def __init__(
        self,
        api_key: str,
        computer: Computer,
        model: str = "computer-use-preview",
        only_n_most_recent_images: Optional[int] = 2,
        base_dir: Optional[str] = "trajectories",
        max_retries: int = 3,
        retry_delay: float = 1.0,
        save_trajectory: bool = True,
        acknowledge_safety_check_callback: Optional[Callable[[str], Awaitable[bool]]] = None,
        **kwargs,
    ):
        """Initialize the OpenAI loop.

        Args:
            api_key: OpenAI API key
            computer: Computer instance
            model: Model name (ignored; always overridden to "computer-use-preview")
            only_n_most_recent_images: Maximum number of recent screenshots to include in API requests
            base_dir: Base directory for saving experiment data
            max_retries: Maximum number of retries for API calls
            retry_delay: Delay between retries in seconds
            save_trajectory: Whether to save trajectory data
            acknowledge_safety_check_callback: Optional callback for safety check acknowledgment
            **kwargs: Additional provider-specific arguments
        """
        # Always use computer-use-preview model: any caller-supplied model
        # name is logged and replaced below.
        if model != "computer-use-preview":
            logger.info(
                f"Overriding provided model '{model}' with required model 'computer-use-preview'"
            )

        # Initialize base class with core config
        super().__init__(
            computer=computer,
            model="computer-use-preview",  # Always use computer-use-preview
            api_key=api_key,
            max_retries=max_retries,
            retry_delay=retry_delay,
            base_dir=base_dir,
            save_trajectory=save_trajectory,
            only_n_most_recent_images=only_n_most_recent_images,
            **kwargs,
        )

        # Initialize message manager; image retention trims old screenshots
        # from the history that is sent to the API.
        self.message_manager = StandardMessageManager(
            config=ImageRetentionConfig(num_images_to_keep=only_n_most_recent_images)
        )

        # OpenAI-specific attributes
        self.provider = LLMProvider.OPENAI
        self.client = None  # no persistent client object is created in this module
        self.retry_count = 0
        self.acknowledge_safety_check_callback = acknowledge_safety_check_callback
        # NOTE(review): run() creates its own local queue; this attribute
        # appears unused within this file — confirm external callers before removing.
        self.queue = asyncio.Queue()  # Initialize queue
        self.last_response_id = None  # Store the last response ID across runs

        # Initialize handlers (both hold a back-reference to this loop)
        self.api_handler = OpenAIAPIHandler(self)
        self.response_handler = OpenAIResponseHandler(self)

        # Initialize tool manager with callback
        self.tool_manager = ToolManager(
            computer=computer, acknowledge_safety_check_callback=acknowledge_safety_check_callback
        )
+ )
99
+
100
+ ###########################################
101
+ # CLIENT INITIALIZATION - IMPLEMENTING ABSTRACT METHOD
102
+ ###########################################
103
+
104
+ async def initialize_client(self) -> None:
105
+ """Initialize the OpenAI API client and tools.
106
+
107
+ Implements abstract method from BaseLoop to set up the OpenAI-specific
108
+ client, tool manager, and message manager.
109
+ """
110
+ try:
111
+ # Initialize tool manager
112
+ await self.tool_manager.initialize()
113
+ except Exception as e:
114
+ logger.error(f"Error initializing OpenAI client: {str(e)}")
115
+ self.client = None
116
+ raise RuntimeError(f"Failed to initialize OpenAI client: {str(e)}")
117
+
118
+ ###########################################
119
+ # MAIN LOOP - IMPLEMENTING ABSTRACT METHOD
120
+ ###########################################
121
+
122
+ async def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[AgentResponse, None]:
123
+ """Run the agent loop with provided messages.
124
+
125
+ Args:
126
+ messages: List of message objects in standard format
127
+
128
+ Yields:
129
+ Agent response format
130
+ """
131
+ try:
132
+ logger.info("Starting OpenAI loop run")
133
+
134
+ # Create queue for response streaming
135
+ queue = asyncio.Queue()
136
+
137
+ # Ensure tool manager is initialized
138
+ await self.tool_manager.initialize()
139
+
140
+ # Start loop in background task
141
+ loop_task = asyncio.create_task(self._run_loop(queue, messages))
142
+
143
+ # Process and yield messages as they arrive
144
+ while True:
145
+ try:
146
+ item = await queue.get()
147
+ if item is None: # Stop signal
148
+ break
149
+ yield item
150
+ queue.task_done()
151
+ except Exception as e:
152
+ logger.error(f"Error processing queue item: {str(e)}")
153
+ continue
154
+
155
+ # Wait for loop to complete
156
+ await loop_task
157
+
158
+ # Send completion message
159
+ yield {
160
+ "role": "assistant",
161
+ "content": "Task completed successfully.",
162
+ "metadata": {"title": "✅ Complete"},
163
+ }
164
+
165
+ except Exception as e:
166
+ logger.error(f"Error executing task: {str(e)}")
167
+ yield {
168
+ "role": "assistant",
169
+ "content": f"Error: {str(e)}",
170
+ "metadata": {"title": "❌ Error"},
171
+ }
172
+
173
+ ###########################################
174
+ # AGENT LOOP IMPLEMENTATION
175
+ ###########################################
176
+
177
    async def _run_loop(self, queue: asyncio.Queue, messages: List[Dict[str, Any]]) -> None:
        """Drive one full agent interaction against the OpenAI Agent Response API.

        Captures an initial screenshot, sends the user's query plus screenshot
        to the API, then repeatedly executes returned ``computer_call`` actions
        until the API stops requesting them. Every API response (and any error
        payload) is pushed onto *queue*; a final ``None`` is always pushed as
        the stop signal consumed by ``run()``.

        Args:
            queue: Queue for response streaming
            messages: List of messages in standard format
        """
        try:
            # Use the instance-level last_response_id instead of creating a local
            # variable — this way the conversation thread persists between runs.

            # Capture initial screenshot
            try:
                # Take screenshot
                screenshot = await self.computer.interface.screenshot()
                logger.info("Screenshot captured successfully")

                # Convert to base64 if needed
                if isinstance(screenshot, bytes):
                    screenshot_base64 = base64.b64encode(screenshot).decode("utf-8")
                else:
                    screenshot_base64 = screenshot

                # Save screenshot if requested
                if self.save_trajectory:
                    # Ensure screenshot_base64 is a string
                    if not isinstance(screenshot_base64, str):
                        logger.warning(
                            "Converting non-string screenshot_base64 to string for _save_screenshot"
                        )
                        if isinstance(screenshot_base64, (bytearray, memoryview)):
                            screenshot_base64 = base64.b64encode(screenshot_base64).decode("utf-8")
                    self._save_screenshot(screenshot_base64, action_type="state")
                    logger.info("Screenshot saved to trajectory")

                # First add any existing user messages that were passed to run().
                # Only the first non-empty string user message is used.
                user_query = None
                for msg in messages:
                    if msg.get("role") == "user":
                        user_content = msg.get("content", "")
                        if isinstance(user_content, str) and user_content:
                            user_query = user_content
                            # Add the user's original query to the message manager
                            self.message_manager.add_user_message(
                                [{"type": "text", "text": user_content}]
                            )
                            break

                # Add screenshot to message manager
                message_content = [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": "image/png",
                            "data": screenshot_base64,
                        },
                    }
                ]

                # Add appropriate text with the screenshot.
                # NOTE(review): user_query stays None when no string user message
                # was found, producing a text item with text=None — confirm
                # downstream consumers tolerate that.
                message_content.append(
                    {
                        "type": "text",
                        "text": user_query,
                    }
                )

                # Add the screenshot and text to the message manager
                self.message_manager.add_user_message(message_content)

                # Process user request and convert our standard message format
                # to the one OpenAI expects (rebinds the `messages` parameter)
                messages = self.message_manager.messages
                logger.info(f"Starting agent loop with {len(messages)} messages")

                # Create initial turn directory
                if self.save_trajectory:
                    self._create_turn_dir()

                # Call API (screen size fetched once and reused for follow-ups)
                screen_size = await self.computer.interface.get_screen_size()
                response = await self.api_handler.send_initial_request(
                    messages=messages,
                    display_width=str(screen_size["width"]),
                    display_height=str(screen_size["height"]),
                    previous_response_id=self.last_response_id,
                )

                # Store response ID for next request
                # OpenAI API response structure: the ID is in the response dictionary
                if isinstance(response, dict) and "id" in response:
                    self.last_response_id = response["id"]  # Update instance variable
                    logger.info(f"Received response with ID: {self.last_response_id}")
                else:
                    logger.warning(
                        f"Could not find response ID in OpenAI response: {type(response)}"
                    )
                    # Don't reset last_response_id to None - keep the previous value if available

                # Process API response
                await queue.put(response)

                # Loop to continue processing responses until task is complete
                task_complete = False
                while not task_complete:
                    # Check if there are any computer calls
                    output_items = response.get("output", []) or []
                    computer_calls = [
                        item for item in output_items if item.get("type") == "computer_call"
                    ]

                    if not computer_calls:
                        logger.info("No computer calls in response, task may be complete.")
                        task_complete = True
                        continue

                    # Process the first computer call (any additional calls in the
                    # same response are ignored this iteration)
                    computer_call = computer_calls[0]
                    action = computer_call.get("action", {})
                    call_id = computer_call.get("call_id")

                    # Check for safety checks
                    pending_safety_checks = computer_call.get("pending_safety_checks", [])
                    acknowledged_safety_checks = []

                    if pending_safety_checks:
                        # Log safety checks
                        for check in pending_safety_checks:
                            logger.warning(
                                f"Safety check: {check.get('code')} - {check.get('message')}"
                            )

                        # If we have a callback, use it to acknowledge safety checks
                        if self.acknowledge_safety_check_callback:
                            acknowledged = await self.acknowledge_safety_check_callback(
                                pending_safety_checks
                            )
                            if not acknowledged:
                                logger.warning("Safety check acknowledgment failed")
                                await queue.put(
                                    {
                                        "role": "assistant",
                                        "content": "Safety checks were not acknowledged. Cannot proceed with action.",
                                        "metadata": {"title": "⚠️ Safety Warning"},
                                    }
                                )
                                # NOTE(review): this `continue` re-enters the loop
                                # with the same `response`, so the same
                                # unacknowledged call is examined again — this can
                                # spin forever; consider setting task_complete.
                                continue
                            acknowledged_safety_checks = pending_safety_checks

                    # Execute the action
                    try:
                        # Create a new turn directory for this action if saving trajectories
                        if self.save_trajectory:
                            self._create_turn_dir()

                        # Execute the tool (result is used only for its side
                        # effects; the fresh screenshot below captures the
                        # post-action state)
                        result = await self.tool_manager.execute_tool("computer", action)

                        # Take screenshot after action
                        screenshot = await self.computer.interface.screenshot()
                        if isinstance(screenshot, bytes):
                            screenshot_base64 = base64.b64encode(screenshot).decode("utf-8")
                        else:
                            screenshot_base64 = screenshot

                        # Create computer_call_output
                        computer_call_output = {
                            "type": "computer_call_output",
                            "call_id": call_id,
                            "output": {
                                "type": "input_image",
                                "image_url": f"data:image/png;base64,{screenshot_base64}",
                            },
                        }

                        # Add acknowledged safety checks if any
                        if acknowledged_safety_checks:
                            computer_call_output["acknowledged_safety_checks"] = (
                                acknowledged_safety_checks
                            )

                        # Save to message manager for history
                        self.message_manager.add_system_message(
                            f"[Computer action executed: {action.get('type')}]"
                        )
                        self.message_manager.add_user_message([computer_call_output])

                        # For follow-up requests with previous_response_id, we only need to send
                        # the computer_call_output, not the full message history
                        # The API handler will extract this from the message history.
                        # NOTE(review): if last_response_id is not a str, no follow-up
                        # request is issued, so `response` never changes and the same
                        # computer_call is re-executed on the next iteration —
                        # confirm this branch cannot be reached with a stale response.
                        if isinstance(self.last_response_id, str):
                            response = await self.api_handler.send_computer_call_request(
                                messages=self.message_manager.messages,
                                display_width=str(screen_size["width"]),
                                display_height=str(screen_size["height"]),
                                previous_response_id=self.last_response_id,  # Use instance variable
                            )

                        # Store response ID for next request
                        if isinstance(response, dict) and "id" in response:
                            self.last_response_id = response["id"]  # Update instance variable
                            logger.info(f"Received response with ID: {self.last_response_id}")
                        else:
                            logger.warning(
                                f"Could not find response ID in OpenAI response: {type(response)}"
                            )
                            # Keep using the previous response ID if we can't find a new one

                        # Process the response
                        # await self.response_handler.process_response(response, queue)
                        await queue.put(response)
                    except Exception as e:
                        logger.error(f"Error executing computer action: {str(e)}")
                        await queue.put(
                            {
                                "role": "assistant",
                                "content": f"Error executing action: {str(e)}",
                                "metadata": {"title": "❌ Error"},
                            }
                        )
                        task_complete = True

            except Exception as e:
                logger.error(f"Error capturing initial screenshot: {str(e)}")
                await queue.put(
                    {
                        "role": "assistant",
                        "content": f"Error capturing screenshot: {str(e)}",
                        "metadata": {"title": "❌ Error"},
                    }
                )
                await queue.put(None)  # Signal that we're done
                return

            # Signal that we're done
            await queue.put(None)

        except Exception as e:
            logger.error(f"Error in _run_loop: {str(e)}")
            await queue.put(
                {
                    "role": "assistant",
                    "content": f"Error: {str(e)}",
                    "metadata": {"title": "❌ Error"},
                }
            )
            await queue.put(None)  # Signal that we're done
424
+
425
+ def get_last_response_id(self) -> Optional[str]:
426
+ """Get the last response ID.
427
+
428
+ Returns:
429
+ The last response ID or None if no response has been received
430
+ """
431
+ return self.last_response_id
432
+
433
+ def set_last_response_id(self, response_id: str) -> None:
434
+ """Set the last response ID.
435
+
436
+ Args:
437
+ response_id: OpenAI response ID to set
438
+ """
439
+ self.last_response_id = response_id
440
+ logger.info(f"Manually set response ID to: {self.last_response_id}")
@@ -0,0 +1,205 @@
1
+ """Response handler for the OpenAI provider."""
2
+
3
+ import logging
4
+ import asyncio
5
+ import traceback
6
+ from typing import Any, Dict, List, Optional, TYPE_CHECKING, AsyncGenerator
7
+ import base64
8
+
9
+ from ...core.types import AgentResponse
10
+ from .types import ResponseItemType
11
+
12
+ if TYPE_CHECKING:
13
+ from .loop import OpenAILoop
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
+ class OpenAIResponseHandler:
19
+ """Handler for OpenAI API responses."""
20
+
21
+ def __init__(self, loop: "OpenAILoop"):
22
+ """Initialize the response handler.
23
+
24
+ Args:
25
+ loop: OpenAI loop instance
26
+ """
27
+ self.loop = loop
28
+ logger.info("Initialized OpenAI response handler")
29
+
30
    async def process_response(self, response: Dict[str, Any], queue: asyncio.Queue) -> None:
        """Fan out the output items of an OpenAI API response onto the queue.

        Args:
            response: Response from the API
            queue: Queue for response streaming
        """
        try:
            # Get output items (the API may return None instead of a list)
            output_items = response.get("output", []) or []

            # Process each output item; anything that is not a dict is skipped
            for item in output_items:
                if not isinstance(item, dict):
                    continue

                item_type = item.get("type")

                # For computer_call items, we only need to add to the queue.
                # The loop is now handling executing the action and creating
                # the computer_call_output.
                if item_type == ResponseItemType.COMPUTER_CALL:
                    # Send computer_call to queue so it can be processed
                    await queue.put(item)

                elif item_type == ResponseItemType.MESSAGE:
                    # Send message to queue
                    await queue.put(item)

                elif item_type == ResponseItemType.REASONING:
                    # Process reasoning summary: take the first summary_text entry
                    summary = None
                    if "summary" in item and isinstance(item["summary"], list):
                        for summary_item in item["summary"]:
                            if (
                                isinstance(summary_item, dict)
                                and summary_item.get("type") == "summary_text"
                            ):
                                summary = summary_item.get("text")
                                break

                    if summary:
                        # Log the reasoning summary
                        logger.info(f"Reasoning summary: {summary}")

                        # Send reasoning summary to queue with a special format
                        await queue.put(
                            {
                                "role": "assistant",
                                "content": f"[Reasoning: {summary}]",
                                "metadata": {"title": "💭 Reasoning", "is_summary": True},
                            }
                        )

                    # Also pass the original reasoning item to the queue for complete context
                    await queue.put(item)

        except Exception as e:
            logger.error(f"Error processing response: {str(e)}")
            await queue.put(
                {
                    "role": "assistant",
                    "content": f"Error processing response: {str(e)}",
                    "metadata": {"title": "❌ Error"},
                }
            )
95
+
96
+ def _process_message_item(self, item: Dict[str, Any]) -> AgentResponse:
97
+ """Process a message item from the response.
98
+
99
+ Args:
100
+ item: Message item from the response
101
+
102
+ Returns:
103
+ Processed message in AgentResponse format
104
+ """
105
+ # Extract content items - add null check
106
+ content_items = item.get("content", []) or []
107
+
108
+ # Extract text from content items - use output_text type from OpenAI
109
+ text = ""
110
+ for content_item in content_items:
111
+ # Skip if content_item is None or not a dict
112
+ if content_item is None or not isinstance(content_item, dict):
113
+ continue
114
+
115
+ # In OpenAI Agent Response API, text content is in "output_text" type items
116
+ if content_item.get("type") == "output_text":
117
+ text += content_item.get("text", "")
118
+
119
+ # Create agent response
120
+ return {
121
+ "role": "assistant",
122
+ "content": text
123
+ or "I don't have a response for that right now.", # Provide fallback when text is empty
124
+ "metadata": {"title": "💬 Response"},
125
+ }
126
+
127
    async def _process_computer_call(self, item: Dict[str, Any], queue: asyncio.Queue) -> None:
        """Execute a computer_call item and record the outcome in history.

        On success only the loop's message history is updated (nothing is put
        on the queue); on failure the error is both recorded in history and
        queued as an assistant error message.

        Args:
            item: Computer call item
            queue: Queue to add responses to
        """
        try:
            # Log the computer call; tolerate a missing or malformed action dict
            action = item.get("action", {}) or {}
            if not isinstance(action, dict):
                logger.warning(f"Expected dict for action, got {type(action)}")
                action = {}

            action_type = action.get("type", "unknown")
            logger.info(f"Processing computer call: {action_type}")

            # Execute the tool call
            result = await self.loop.tool_manager.execute_tool("computer", action)

            # Add any message to the conversation history and queue.
            # Only record when the tool returned a screenshot.
            if result and result.base64_image:
                # Update message history with the call output
                self.loop.message_manager.add_user_message(
                    [{"type": "text", "text": f"[Computer action completed: {action_type}]"}]
                )

                # Add image to messages (using correct content types for Agent Response API)
                self.loop.message_manager.add_user_message(
                    [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",
                                "data": result.base64_image,
                            },
                        }
                    ]
                )

            # If browser environment, include URL if available
            # (disabled — kept for reference; remove once confirmed unneeded)
            # if (
            #     hasattr(self.loop.computer, "environment")
            #     and self.loop.computer.environment == "browser"
            # ):
            #     try:
            #         if hasattr(self.loop.computer.interface, "get_current_url"):
            #             current_url = await self.loop.computer.interface.get_current_url()
            #             self.loop.message_manager.add_user_message(
            #                 [
            #                     {
            #                         "type": "text",
            #                         "text": f"Current URL: {current_url}",
            #                     }
            #                 ]
            #             )
            #     except Exception as e:
            #         logger.warning(f"Failed to get current URL: {str(e)}")

            # Log successful completion
            logger.info(f"Computer call {action_type} executed successfully")

        except Exception as e:
            logger.error(f"Error executing computer call: {str(e)}")
            logger.debug(traceback.format_exc())

            # Add error to conversation
            self.loop.message_manager.add_user_message(
                [{"type": "text", "text": f"Error executing computer action: {str(e)}"}]
            )

            # Send error to queue
            error_response = {
                "role": "assistant",
                "content": f"Error executing computer action: {str(e)}",
                "metadata": {"title": "❌ Error"},
            }
            await queue.put(error_response)
@@ -0,0 +1,15 @@
1
+ """OpenAI tools module for computer control."""
2
+
3
+ from .manager import ToolManager
4
+ from .computer import ComputerTool
5
+ from .base import BaseOpenAITool, ToolResult, ToolError, ToolFailure, CLIResult
6
+
7
+ __all__ = [
8
+ "ToolManager",
9
+ "ComputerTool",
10
+ "BaseOpenAITool",
11
+ "ToolResult",
12
+ "ToolError",
13
+ "ToolFailure",
14
+ "CLIResult",
15
+ ]