PyPI - cua-agent - Versions diffs - 0.1.0__tar.gz - Mend

cua-agent 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cua-agent might be problematic. Click here for more details.

Files changed (65) hide show

cua_agent-0.1.0/PKG-INFO +44 -0
cua_agent-0.1.0/README.md +213 -0
cua_agent-0.1.0/agent/README.md +63 -0
cua_agent-0.1.0/agent/__init__.py +10 -0
cua_agent-0.1.0/agent/core/README.md +101 -0
cua_agent-0.1.0/agent/core/__init__.py +34 -0
cua_agent-0.1.0/agent/core/agent.py +284 -0
cua_agent-0.1.0/agent/core/base_agent.py +164 -0
cua_agent-0.1.0/agent/core/callbacks.py +147 -0
cua_agent-0.1.0/agent/core/computer_agent.py +69 -0
cua_agent-0.1.0/agent/core/experiment.py +222 -0
cua_agent-0.1.0/agent/core/factory.py +102 -0
cua_agent-0.1.0/agent/core/loop.py +244 -0
cua_agent-0.1.0/agent/core/messages.py +230 -0
cua_agent-0.1.0/agent/core/tools/__init__.py +21 -0
cua_agent-0.1.0/agent/core/tools/base.py +74 -0
cua_agent-0.1.0/agent/core/tools/bash.py +52 -0
cua_agent-0.1.0/agent/core/tools/collection.py +46 -0
cua_agent-0.1.0/agent/core/tools/computer.py +113 -0
cua_agent-0.1.0/agent/core/tools/edit.py +67 -0
cua_agent-0.1.0/agent/core/tools/manager.py +56 -0
cua_agent-0.1.0/agent/providers/__init__.py +4 -0
cua_agent-0.1.0/agent/providers/anthropic/__init__.py +6 -0
cua_agent-0.1.0/agent/providers/anthropic/api/client.py +222 -0
cua_agent-0.1.0/agent/providers/anthropic/api/logging.py +150 -0
cua_agent-0.1.0/agent/providers/anthropic/callbacks/manager.py +55 -0
cua_agent-0.1.0/agent/providers/anthropic/loop.py +521 -0
cua_agent-0.1.0/agent/providers/anthropic/messages/manager.py +110 -0
cua_agent-0.1.0/agent/providers/anthropic/prompts.py +20 -0
cua_agent-0.1.0/agent/providers/anthropic/tools/__init__.py +33 -0
cua_agent-0.1.0/agent/providers/anthropic/tools/base.py +88 -0
cua_agent-0.1.0/agent/providers/anthropic/tools/bash.py +163 -0
cua_agent-0.1.0/agent/providers/anthropic/tools/collection.py +34 -0
cua_agent-0.1.0/agent/providers/anthropic/tools/computer.py +550 -0
cua_agent-0.1.0/agent/providers/anthropic/tools/edit.py +326 -0
cua_agent-0.1.0/agent/providers/anthropic/tools/manager.py +54 -0
cua_agent-0.1.0/agent/providers/anthropic/tools/run.py +42 -0
cua_agent-0.1.0/agent/providers/anthropic/types.py +16 -0
cua_agent-0.1.0/agent/providers/omni/__init__.py +27 -0
cua_agent-0.1.0/agent/providers/omni/callbacks.py +78 -0
cua_agent-0.1.0/agent/providers/omni/clients/anthropic.py +99 -0
cua_agent-0.1.0/agent/providers/omni/clients/base.py +44 -0
cua_agent-0.1.0/agent/providers/omni/clients/groq.py +101 -0
cua_agent-0.1.0/agent/providers/omni/clients/openai.py +159 -0
cua_agent-0.1.0/agent/providers/omni/clients/utils.py +25 -0
cua_agent-0.1.0/agent/providers/omni/experiment.py +273 -0
cua_agent-0.1.0/agent/providers/omni/image_utils.py +106 -0
cua_agent-0.1.0/agent/providers/omni/loop.py +961 -0
cua_agent-0.1.0/agent/providers/omni/messages.py +168 -0
cua_agent-0.1.0/agent/providers/omni/parser.py +252 -0
cua_agent-0.1.0/agent/providers/omni/prompts.py +78 -0
cua_agent-0.1.0/agent/providers/omni/tool_manager.py +91 -0
cua_agent-0.1.0/agent/providers/omni/tools/__init__.py +13 -0
cua_agent-0.1.0/agent/providers/omni/tools/bash.py +69 -0
cua_agent-0.1.0/agent/providers/omni/tools/computer.py +216 -0
cua_agent-0.1.0/agent/providers/omni/tools/manager.py +83 -0
cua_agent-0.1.0/agent/providers/omni/types.py +30 -0
cua_agent-0.1.0/agent/providers/omni/utils.py +155 -0
cua_agent-0.1.0/agent/providers/omni/visualization.py +130 -0
cua_agent-0.1.0/agent/types/__init__.py +26 -0
cua_agent-0.1.0/agent/types/base.py +52 -0
cua_agent-0.1.0/agent/types/messages.py +36 -0
cua_agent-0.1.0/agent/types/tools.py +32 -0
cua_agent-0.1.0/pyproject.toml +117 -0
cua_agent-0.1.0/tests/test_agent.py +91 -0

cua_agent-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,44 @@
+Metadata-Version: 2.1
+Name: cua-agent
+Version: 0.1.0
+Summary: CUA (Computer Use) Agent for AI-driven computer interaction
+Author-Email: TryCua <gh@trycua.com>
+Requires-Python: <3.13,>=3.10
+Requires-Dist: httpx<0.29.0,>=0.27.0
+Requires-Dist: aiohttp<4.0.0,>=3.9.3
+Requires-Dist: asyncio
+Requires-Dist: anyio<5.0.0,>=4.4.1
+Requires-Dist: typing-extensions<5.0.0,>=4.12.2
+Requires-Dist: pydantic<3.0.0,>=2.6.4
+Requires-Dist: rich<14.0.0,>=13.7.1
+Requires-Dist: python-dotenv<2.0.0,>=1.0.1
+Requires-Dist: cua-computer<0.2.0,>=0.1.0
+Requires-Dist: certifi>=2024.2.2
+Provides-Extra: anthropic
+Requires-Dist: anthropic>=0.49.0; extra == "anthropic"
+Requires-Dist: boto3<2.0.0,>=1.35.81; extra == "anthropic"
+Provides-Extra: som
+Requires-Dist: torch>=2.2.1; extra == "som"
+Requires-Dist: torchvision>=0.17.1; extra == "som"
+Requires-Dist: ultralytics>=8.0.0; extra == "som"
+Requires-Dist: transformers>=4.38.2; extra == "som"
+Requires-Dist: cua-som<0.2.0,>=0.1.0; extra == "som"
+Requires-Dist: anthropic<0.47.0,>=0.46.0; extra == "som"
+Requires-Dist: boto3<2.0.0,>=1.35.81; extra == "som"
+Requires-Dist: openai<2.0.0,>=1.14.0; extra == "som"
+Requires-Dist: groq<0.5.0,>=0.4.0; extra == "som"
+Requires-Dist: dashscope<2.0.0,>=1.13.0; extra == "som"
+Requires-Dist: requests<3.0.0,>=2.31.0; extra == "som"
+Provides-Extra: all
+Requires-Dist: torch>=2.2.1; extra == "all"
+Requires-Dist: torchvision>=0.17.1; extra == "all"
+Requires-Dist: ultralytics>=8.0.0; extra == "all"
+Requires-Dist: transformers>=4.38.2; extra == "all"
+Requires-Dist: cua-som<0.2.0,>=0.1.0; extra == "all"
+Requires-Dist: anthropic<0.47.0,>=0.46.0; extra == "all"
+Requires-Dist: boto3<2.0.0,>=1.35.81; extra == "all"
+Requires-Dist: openai<2.0.0,>=1.14.0; extra == "all"
+Requires-Dist: groq<0.5.0,>=0.4.0; extra == "all"
+Requires-Dist: dashscope<2.0.0,>=1.13.0; extra == "all"
+Requires-Dist: requests<3.0.0,>=2.31.0; extra == "all"

cua_agent-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,213 @@
+<div align="center">
+<h1>
+  <div class="image-wrapper" style="display: inline-block;">
+    <picture>
+      <source media="(prefers-color-scheme: dark)" alt="logo" height="150" srcset="../../img/logo_white.png" style="display: block; margin: auto;">
+      <source media="(prefers-color-scheme: light)" alt="logo" height="150" srcset="../../img/logo_black.png" style="display: block; margin: auto;">
+      <img alt="Cua logo" height="150" src="../../img/logo_black.png" style="display: block; margin: auto;">
+    </picture>
+  </div>
+  [![Python](https://img.shields.io/badge/Python-333333?logo=python&logoColor=white&labelColor=333333)](#)
+  [![macOS](https://img.shields.io/badge/macOS-000000?logo=apple&logoColor=F0F0F0)](#)
+  [![Discord](https://img.shields.io/badge/Discord-%235865F2.svg?&logo=discord&logoColor=white)](https://discord.com/invite/mVnXXpdE85)
+  [![PyPI](https://img.shields.io/pypi/v/cua-agent?color=333333)](https://pypi.org/project/cua-agent/)
+</h1>
+</div>
+**Agent** is a Computer Use (CUA) framework for running multi-app agentic workflows targeting macOS and Linux sandboxes, supporting local (Ollama) and cloud model providers (OpenAI, Anthropic, Groq, DeepSeek, Qwen). The framework integrates with Microsoft's OmniParser for enhanced UI understanding and interaction.
+### Get started with Agent
+There are two ways to use the agent: with OmniParser for enhanced UI understanding (recommended) or with basic computer control.
+#### Option 1: With OmniParser (Recommended)
+<div align="center">
+    <img src="../../img/agent.png"/>
+</div>
+```python
+from agent.providers.omni import OmniComputerAgent, APIProvider
+# Set your API key in your shell before running this script:
+#   export OPENAI_API_KEY="your-openai-api-key"
+# Initialize agent with OmniParser for enhanced UI understanding
+agent = OmniComputerAgent(
+    provider=APIProvider.OPENAI,
+    model="gpt-4o",
+    start_omniparser=True  # Automatically starts OmniParser server
+)
+task = """
+1. Search the ai-gradio repo on GitHub.
+2. Clone it to the desktop.
+3. Open the repo with the Cursor app.
+4. Work with Cursor to add a new provider for Cua.
+"""
+async with agent:  # Ensures proper cleanup
+    async for result in agent.run(task):
+        print(result)
+```
+#### Option 2: Basic Computer Control
+```python
+from agent.core.agent import ComputerAgent
+from agent.types.base import Provider
+# Set your API key in your shell before running this script (supports any provider):
+#   export OPENAI_API_KEY="your-openai-api-key"  # or other provider keys
+# Initialize basic agent
+agent = ComputerAgent(
+    provider=Provider.OPENAI,  # or ANTHROPIC, GROQ, etc.
+)
+task = """
+1. Open Chrome and navigate to github.com
+2. Search for 'trycua/cua'
+3. Star the repository
+"""
+async with agent:
+    async for result in agent.run(task):
+        print(result)
+```
+## Install
+### cua-agent
+```bash
+# Basic installation with Anthropic
+pip install cua-agent[anthropic]
+# Install with OmniParser (recommended)
+# Includes all provider dependencies (OpenAI, Anthropic, etc.)
+pip install cua-agent[som]
+# Install with all features and providers
+pip install cua-agent[all]
+```
+## Features
+### OmniParser Integration
+- Enhanced UI understanding with element detection
+- Automatic bounding box detection for UI elements
+- Improved accuracy for complex UI interactions
+- Support for icon and text element recognition
+### Basic Computer Control
+- Direct keyboard and mouse control
+- Window and application management
+- Screenshot capabilities
+- Basic UI element detection
+### Provider Support
+- OpenAI (GPT-4V) - Recommended for OmniParser integration
+- Anthropic (Claude) - Strong general performance
+- Groq - Fast inference with Llama models
+- DeepSeek - Alternative model provider
+- Qwen - Alibaba's multimodal model
+## Run
+Refer to these notebooks for step-by-step guides on how to use the Computer-Use Agent (CUA):
+- [Getting Started with OmniParser](../../notebooks/omniparser_nb.ipynb) - Enhanced UI understanding
+- [Basic Computer Control](../../notebooks/basic_agent_nb.ipynb) - Simple computer interactions
+- [Advanced Usage](../../notebooks/agent_nb.ipynb) - Complete examples and workflows
+# Computer Agent Library
+A Python library for controlling computer interactions with AI agents.
+## Introduction
+This library provides a unified interface for AI-powered computer interaction agents, allowing applications to automate UI interactions through various AI providers.
+## Key Features
+- **Unified Agent**: Single `ComputerAgent` class with configurable loop types
+- **Multiple AI providers**: Support for OpenAI, Anthropic, Groq, and other providers
+- **Screen analysis**: Intelligent screen parsing and element identification
+- **Tool execution**: Execute tools and commands to interact with the computer
+- **Trajectory saving**: Option to save screenshots and logs for debugging and analysis
+## Installation
+To install the library along with its dependencies:
+```bash
+pip install -e .
+```
+## Usage
+Here's a simple example of how to use the ComputerAgent:
+```python
+import asyncio
+from computer import Computer
+from agent.core.agent import ComputerAgent
+from agent.types.base import AgenticLoop
+from agent.providers.omni.types import APIProvider
+async def main():
+    # Initialize the computer interface
+    computer = Computer()
+    # Create a computer agent
+    agent = ComputerAgent(
+        computer=computer,
+        loop_type=AgenticLoop.OMNI,  # Use OMNI loop
+        provider=APIProvider.OPENAI,  # With OpenAI provider
+        model="gpt-4o",               # Specify the model
+        save_trajectory=True,         # Save logs and screenshots
+    )
+    # Use the agent with a context manager
+    async with agent:
+        # Run a task
+        async for result in agent.run("Open Safari and navigate to github.com"):
+            # Process the result
+            title = result["metadata"].get("title", "Screen Analysis")
+            content = result["content"]
+            print(f"\n{title}")
+            print(content)
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+## Components
+The library consists of several components:
+- **Core**
+  - `ComputerAgent`: Unified agent class supporting multiple loop types
+  - `BaseComputerAgent`: Abstract base class for computer agents
+- **Providers**
+  - `Anthropic`: Implementation for Anthropic Claude models
+  - `Omni`: Implementation for multiple providers (OpenAI, Groq, etc.)
+- **Loops**
+  - `AnthropicLoop`: Loop implementation for Anthropic
+  - `OmniLoop`: Generic loop supporting multiple providers
+## Configuration
+The agent can be configured with various parameters:
+- **loop_type**: The type of loop to use (ANTHROPIC or OMNI)
+- **provider**: AI provider to use with the loop
+- **model**: The AI model to use
+- **save_trajectory**: Whether to save screenshots and logs
+- **only_n_most_recent_images**: Only keep a specific number of recent images
+See the [Core README](./agent/core/README.md) for more details on the unified agent.

cua_agent-0.1.0/agent/README.md ADDED Viewed

@@ -0,0 +1,63 @@
+# Agent Package Structure
+## Overview
+The agent package provides a modular and extensible framework for AI-powered computer agents.
+## Directory Structure
+```
+agent/
+├── __init__.py           # Package exports
+├── core/                 # Core functionality
+│   ├── __init__.py
+│   ├── computer_agent.py # Main entry point
+│   └── factory.py        # Provider factory
+├── base/                 # Base implementations
+│   ├── __init__.py
+│   ├── agent.py         # Base agent class
+│   ├── core/            # Core components
+│   │   ├── callbacks.py
+│   │   ├── loop.py
+│   │   └── messages.py
+│   └── tools/           # Tool implementations
+├── providers/           # Provider implementations
+│   ├── __init__.py
+│   ├── anthropic/      # Anthropic provider
+│   │   ├── agent.py
+│   │   ├── loop.py
+│   │   └── tool_manager.py
+│   └── omni/           # Omni provider
+│       ├── agent.py
+│       ├── loop.py
+│       └── tool_manager.py
+└── types/              # Type definitions
+    ├── __init__.py
+    ├── base.py        # Core types
+    ├── messages.py    # Message types
+    ├── tools.py       # Tool types
+    └── providers/     # Provider-specific types
+        ├── anthropic.py
+        └── omni.py
+```
+## Key Components
+### Core
+- `computer_agent.py`: Main entry point for creating and using agents
+- `factory.py`: Factory for creating provider-specific implementations
+### Base
+- `agent.py`: Base agent implementation with shared functionality
+- `core/`: Core components used across providers
+- `tools/`: Shared tool implementations
+### Providers
+Each provider follows the same structure:
+- `agent.py`: Provider-specific agent implementation
+- `loop.py`: Provider-specific message loop
+- `tool_manager.py`: Tool management for provider
+### Types
+- `base.py`: Core type definitions
+- `messages.py`: Message-related types
+- `tools.py`: Tool-related types
+- `providers/`: Provider-specific type definitions

cua_agent-0.1.0/agent/__init__.py ADDED Viewed

@@ -0,0 +1,10 @@
+"""CUA (Computer Use) Agent for AI-driven computer interaction."""
+__version__ = "0.1.0"
+from .core.factory import AgentFactory
+from .core.agent import ComputerAgent
+from .types.base import Provider, AgenticLoop
+from .providers.omni.types import APIProvider
+__all__ = ["AgentFactory", "Provider", "ComputerAgent", "AgenticLoop", "APIProvider"]

cua_agent-0.1.0/agent/core/README.md ADDED Viewed

@@ -0,0 +1,101 @@
+# Unified ComputerAgent
+The `ComputerAgent` class provides a unified implementation that consolidates the previously separate agent implementations (AnthropicComputerAgent and OmniComputerAgent) into a single, configurable class.
+## Features
+- **Multiple Loop Types**: Switch between different agentic loop implementations using the `loop_type` parameter (Anthropic or Omni).
+- **Provider Support**: Use different AI providers (OpenAI, Anthropic, etc.) with the appropriate loop.
+- **Trajectory Saving**: Control whether to save screenshots and logs with the `save_trajectory` parameter.
+- **Consistent Interface**: Maintains a consistent interface regardless of the underlying loop implementation.
+## API Key Requirements
+To use the ComputerAgent, you'll need API keys for the providers you want to use:
+- For **OpenAI**: Set the `OPENAI_API_KEY` environment variable or pass it directly as `api_key`.
+- For **Anthropic**: Set the `ANTHROPIC_API_KEY` environment variable or pass it directly as `api_key`.
+- For **Groq**: Set the `GROQ_API_KEY` environment variable or pass it directly as `api_key`.
+You can set environment variables in several ways:
+```bash
+# In your terminal before running the code
+export OPENAI_API_KEY=your_api_key_here
+# Or in a .env file
+OPENAI_API_KEY=your_api_key_here
+```
+## Usage
+Here's how to use the unified ComputerAgent:
+```python
+from agent.core.agent import ComputerAgent
+from agent.types.base import AgenticLoop
+from agent.providers.omni.types import APIProvider
+from computer import Computer
+# Create a Computer instance
+computer = Computer()
+# Create an agent with the OMNI loop and OpenAI provider
+agent = ComputerAgent(
+    computer=computer,
+    loop_type=AgenticLoop.OMNI,
+    provider=APIProvider.OPENAI,
+    model="gpt-4o",
+    api_key="your_api_key_here",  # Can also use OPENAI_API_KEY environment variable
+    save_trajectory=True,
+    only_n_most_recent_images=5
+)
+# Create an agent with the ANTHROPIC loop
+agent = ComputerAgent(
+    computer=computer,
+    loop_type=AgenticLoop.ANTHROPIC,
+    model="claude-3-7-sonnet-20250219",
+    api_key="your_api_key_here",  # Can also use ANTHROPIC_API_KEY environment variable
+    save_trajectory=True,
+    only_n_most_recent_images=5
+)
+# Use the agent
+async with agent:
+    async for result in agent.run("Your task description here"):
+        # Process the result
+        title = result["metadata"].get("title", "Screen Analysis")
+        content = result["content"]
+        print(f"\n{title}")
+        print(content)
+```
+## Parameters
+- `computer`: Computer instance to control
+- `loop_type`: The type of loop to use (AgenticLoop.ANTHROPIC or AgenticLoop.OMNI)
+- `provider`: AI provider to use (required for Omni loop)
+- `api_key`: Optional API key (will use environment variable if not provided)
+- `model`: Optional model name (will use provider default if not specified)
+- `save_trajectory`: Whether to save screenshots and logs
+- `only_n_most_recent_images`: Only keep N most recent images
+- `max_retries`: Maximum number of retry attempts
+## Directory Structure
+When `save_trajectory` is enabled, the agent will create the following directory structure:
+```
+experiments/
+  ├── screenshots/   # Screenshots captured during agent execution
+  └── logs/          # API call logs and other logging information
+```
+## Extending with New Loop Types
+To add a new loop type:
+1. Implement a new loop class
+2. Add a new value to the `AgenticLoop` enum
+3. Update the `_initialize_loop` method in `ComputerAgent` to handle the new loop type

cua_agent-0.1.0/agent/core/__init__.py ADDED Viewed

@@ -0,0 +1,34 @@
+"""Core agent components."""
+from .base_agent import BaseComputerAgent
+from .loop import BaseLoop
+from .messages import (
+    create_user_message,
+    create_assistant_message,
+    create_system_message,
+    create_image_message,
+    create_screen_message,
+    BaseMessageManager,
+    ImageRetentionConfig,
+)
+from .callbacks import (
+    CallbackManager,
+    CallbackHandler,
+    BaseCallbackManager,
+    ContentCallback,
+    ToolCallback,
+    APICallback,
+)
+__all__ = [
+    "BaseComputerAgent",
+    "BaseLoop",
+    "CallbackManager",
+    "CallbackHandler",
+    "BaseMessageManager",
+    "ImageRetentionConfig",
+    "BaseCallbackManager",
+    "ContentCallback",
+    "ToolCallback",
+    "APICallback",
+]