cua-agent 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- agent/README.md +63 -0
- agent/__init__.py +10 -0
- agent/core/README.md +101 -0
- agent/core/__init__.py +34 -0
- agent/core/agent.py +284 -0
- agent/core/base_agent.py +164 -0
- agent/core/callbacks.py +147 -0
- agent/core/computer_agent.py +69 -0
- agent/core/experiment.py +222 -0
- agent/core/factory.py +102 -0
- agent/core/loop.py +244 -0
- agent/core/messages.py +230 -0
- agent/core/tools/__init__.py +21 -0
- agent/core/tools/base.py +74 -0
- agent/core/tools/bash.py +52 -0
- agent/core/tools/collection.py +46 -0
- agent/core/tools/computer.py +113 -0
- agent/core/tools/edit.py +67 -0
- agent/core/tools/manager.py +56 -0
- agent/providers/__init__.py +4 -0
- agent/providers/anthropic/__init__.py +6 -0
- agent/providers/anthropic/api/client.py +222 -0
- agent/providers/anthropic/api/logging.py +150 -0
- agent/providers/anthropic/callbacks/manager.py +55 -0
- agent/providers/anthropic/loop.py +521 -0
- agent/providers/anthropic/messages/manager.py +110 -0
- agent/providers/anthropic/prompts.py +20 -0
- agent/providers/anthropic/tools/__init__.py +33 -0
- agent/providers/anthropic/tools/base.py +88 -0
- agent/providers/anthropic/tools/bash.py +163 -0
- agent/providers/anthropic/tools/collection.py +34 -0
- agent/providers/anthropic/tools/computer.py +550 -0
- agent/providers/anthropic/tools/edit.py +326 -0
- agent/providers/anthropic/tools/manager.py +54 -0
- agent/providers/anthropic/tools/run.py +42 -0
- agent/providers/anthropic/types.py +16 -0
- agent/providers/omni/__init__.py +27 -0
- agent/providers/omni/callbacks.py +78 -0
- agent/providers/omni/clients/anthropic.py +99 -0
- agent/providers/omni/clients/base.py +44 -0
- agent/providers/omni/clients/groq.py +101 -0
- agent/providers/omni/clients/openai.py +159 -0
- agent/providers/omni/clients/utils.py +25 -0
- agent/providers/omni/experiment.py +273 -0
- agent/providers/omni/image_utils.py +106 -0
- agent/providers/omni/loop.py +961 -0
- agent/providers/omni/messages.py +168 -0
- agent/providers/omni/parser.py +252 -0
- agent/providers/omni/prompts.py +78 -0
- agent/providers/omni/tool_manager.py +91 -0
- agent/providers/omni/tools/__init__.py +13 -0
- agent/providers/omni/tools/bash.py +69 -0
- agent/providers/omni/tools/computer.py +216 -0
- agent/providers/omni/tools/manager.py +83 -0
- agent/providers/omni/types.py +30 -0
- agent/providers/omni/utils.py +155 -0
- agent/providers/omni/visualization.py +130 -0
- agent/types/__init__.py +26 -0
- agent/types/base.py +52 -0
- agent/types/messages.py +36 -0
- agent/types/tools.py +32 -0
- cua_agent-0.1.0.dist-info/METADATA +44 -0
- cua_agent-0.1.0.dist-info/RECORD +65 -0
- cua_agent-0.1.0.dist-info/WHEEL +4 -0
- cua_agent-0.1.0.dist-info/entry_points.txt +4 -0
agent/README.md
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# Agent Package Structure
|
|
2
|
+
|
|
3
|
+
## Overview
|
|
4
|
+
The agent package provides a modular and extensible framework for AI-powered computer agents.
|
|
5
|
+
|
|
6
|
+
## Directory Structure
|
|
7
|
+
```
|
|
8
|
+
agent/
|
|
9
|
+
├── __init__.py # Package exports
|
|
10
|
+
├── core/ # Core functionality
|
|
11
|
+
│ ├── __init__.py
|
|
12
|
+
│ ├── computer_agent.py # Main entry point
|
|
13
|
+
│ └── factory.py # Provider factory
|
|
14
|
+
├── base/ # Base implementations
|
|
15
|
+
│ ├── __init__.py
|
|
16
|
+
│ ├── agent.py # Base agent class
|
|
17
|
+
│ ├── core/ # Core components
|
|
18
|
+
│ │ ├── callbacks.py
|
|
19
|
+
│ │ ├── loop.py
|
|
20
|
+
│ │ └── messages.py
|
|
21
|
+
│ └── tools/ # Tool implementations
|
|
22
|
+
├── providers/ # Provider implementations
|
|
23
|
+
│ ├── __init__.py
|
|
24
|
+
│ ├── anthropic/ # Anthropic provider
|
|
25
|
+
│ │ ├── agent.py
|
|
26
|
+
│ │ ├── loop.py
|
|
27
|
+
│ │ └── tool_manager.py
|
|
28
|
+
│ └── omni/ # Omni provider
|
|
29
|
+
│ ├── agent.py
|
|
30
|
+
│ ├── loop.py
|
|
31
|
+
│ └── tool_manager.py
|
|
32
|
+
└── types/ # Type definitions
|
|
33
|
+
├── __init__.py
|
|
34
|
+
├── base.py # Core types
|
|
35
|
+
├── messages.py # Message types
|
|
36
|
+
├── tools.py # Tool types
|
|
37
|
+
└── providers/ # Provider-specific types
|
|
38
|
+
├── anthropic.py
|
|
39
|
+
└── omni.py
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Key Components
|
|
43
|
+
|
|
44
|
+
### Core
|
|
45
|
+
- `computer_agent.py`: Main entry point for creating and using agents
|
|
46
|
+
- `factory.py`: Factory for creating provider-specific implementations
|
|
47
|
+
|
|
48
|
+
### Base
|
|
49
|
+
- `agent.py`: Base agent implementation with shared functionality
|
|
50
|
+
- `core/`: Core components used across providers
|
|
51
|
+
- `tools/`: Shared tool implementations
|
|
52
|
+
|
|
53
|
+
### Providers
|
|
54
|
+
Each provider follows the same structure:
|
|
55
|
+
- `agent.py`: Provider-specific agent implementation
|
|
56
|
+
- `loop.py`: Provider-specific message loop
|
|
57
|
+
- `tool_manager.py`: Tool management for provider
|
|
58
|
+
|
|
59
|
+
### Types
|
|
60
|
+
- `base.py`: Core type definitions
|
|
61
|
+
- `messages.py`: Message-related types
|
|
62
|
+
- `tools.py`: Tool-related types
|
|
63
|
+
- `providers/`: Provider-specific type definitions
|
agent/__init__.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"""CUA (Computer Use) Agent for AI-driven computer interaction."""
|
|
2
|
+
|
|
3
|
+
__version__ = "0.1.0"
|
|
4
|
+
|
|
5
|
+
from .core.factory import AgentFactory
|
|
6
|
+
from .core.agent import ComputerAgent
|
|
7
|
+
from .types.base import Provider, AgenticLoop
|
|
8
|
+
from .providers.omni.types import APIProvider
|
|
9
|
+
|
|
10
|
+
__all__ = ["AgentFactory", "Provider", "ComputerAgent", "AgenticLoop", "APIProvider"]
|
agent/core/README.md
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
# Unified ComputerAgent
|
|
2
|
+
|
|
3
|
+
The `ComputerAgent` class provides a unified implementation that consolidates the previously separate agent implementations (AnthropicComputerAgent and OmniComputerAgent) into a single, configurable class.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Multiple Loop Types**: Switch between different agentic loop implementations using the `loop_type` parameter (Anthropic or Omni).
|
|
8
|
+
- **Provider Support**: Use different AI providers (OpenAI, Anthropic, etc.) with the appropriate loop.
|
|
9
|
+
- **Trajectory Saving**: Control whether to save screenshots and logs with the `save_trajectory` parameter.
|
|
10
|
+
- **Consistent Interface**: Maintains a consistent interface regardless of the underlying loop implementation.
|
|
11
|
+
|
|
12
|
+
## API Key Requirements
|
|
13
|
+
|
|
14
|
+
To use the ComputerAgent, you'll need API keys for the providers you want to use:
|
|
15
|
+
|
|
16
|
+
- For **OpenAI**: Set the `OPENAI_API_KEY` environment variable or pass it directly as `api_key`.
|
|
17
|
+
- For **Anthropic**: Set the `ANTHROPIC_API_KEY` environment variable or pass it directly as `api_key`.
|
|
18
|
+
- For **Groq**: Set the `GROQ_API_KEY` environment variable or pass it directly as `api_key`.
|
|
19
|
+
|
|
20
|
+
You can set environment variables in several ways:
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
# In your terminal before running the code
|
|
24
|
+
export OPENAI_API_KEY=your_api_key_here
|
|
25
|
+
|
|
26
|
+
# Or in a .env file
|
|
27
|
+
OPENAI_API_KEY=your_api_key_here
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Usage
|
|
31
|
+
|
|
32
|
+
Here's how to use the unified ComputerAgent:
|
|
33
|
+
|
|
34
|
+
```python
|
|
35
|
+
from agent.core.agent import ComputerAgent
|
|
36
|
+
from agent.types.base import AgenticLoop
|
|
37
|
+
from agent.providers.omni.types import APIProvider
|
|
38
|
+
from computer import Computer
|
|
39
|
+
|
|
40
|
+
# Create a Computer instance
|
|
41
|
+
computer = Computer()
|
|
42
|
+
|
|
43
|
+
# Create an agent with the OMNI loop and OpenAI provider
|
|
44
|
+
agent = ComputerAgent(
|
|
45
|
+
computer=computer,
|
|
46
|
+
loop_type=AgenticLoop.OMNI,
|
|
47
|
+
ai_provider=APIProvider.OPENAI,
|
|
48
|
+
model="gpt-4o",
|
|
49
|
+
api_key="your_api_key_here", # Can also use OPENAI_API_KEY environment variable
|
|
50
|
+
save_trajectory=True,
|
|
51
|
+
only_n_most_recent_images=5
|
|
52
|
+
)
|
|
53
|
+
|
|
54
|
+
# Create an agent with the ANTHROPIC loop
|
|
55
|
+
agent = ComputerAgent(
|
|
56
|
+
computer=computer,
|
|
57
|
+
loop_type=AgenticLoop.ANTHROPIC,
|
|
58
|
+
model="claude-3-7-sonnet-20250219",
|
|
59
|
+
api_key="your_api_key_here", # Can also use ANTHROPIC_API_KEY environment variable
|
|
60
|
+
save_trajectory=True,
|
|
61
|
+
only_n_most_recent_images=5
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
# Use the agent
|
|
65
|
+
async with agent:
|
|
66
|
+
async for result in agent.run("Your task description here"):
|
|
67
|
+
# Process the result
|
|
68
|
+
title = result["metadata"].get("title", "Screen Analysis")
|
|
69
|
+
content = result["content"]
|
|
70
|
+
print(f"\n{title}")
|
|
71
|
+
print(content)
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## Parameters
|
|
75
|
+
|
|
76
|
+
- `computer`: Computer instance to control
|
|
77
|
+
- `loop_type`: The type of loop to use (AgenticLoop.ANTHROPIC or AgenticLoop.OMNI)
|
|
78
|
+
- `ai_provider`: AI provider to use with the Omni loop (defaults to `APIProvider.OPENAI`)
|
|
79
|
+
- `api_key`: Optional API key (will use environment variable if not provided)
|
|
80
|
+
- `model`: Optional model name (will use provider default if not specified)
|
|
81
|
+
- `save_trajectory`: Whether to save screenshots and logs
|
|
82
|
+
- `only_n_most_recent_images`: Only keep N most recent images
|
|
83
|
+
- `max_retries`: Maximum number of retry attempts
|
|
84
|
+
|
|
85
|
+
## Directory Structure
|
|
86
|
+
|
|
87
|
+
When `save_trajectory` is enabled, the agent will create the following directory structure:
|
|
88
|
+
|
|
89
|
+
```
|
|
90
|
+
experiments/
|
|
91
|
+
├── screenshots/ # Screenshots captured during agent execution
|
|
92
|
+
└── logs/ # API call logs and other logging information
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
## Extending with New Loop Types
|
|
96
|
+
|
|
97
|
+
To add a new loop type:
|
|
98
|
+
|
|
99
|
+
1. Implement a new loop class
|
|
100
|
+
2. Add a new value to the `AgenticLoop` enum
|
|
101
|
+
3. Update the `_init_loop` method in `ComputerAgent` to handle the new loop type
|
agent/core/__init__.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Core agent components."""
|
|
2
|
+
|
|
3
|
+
from .base_agent import BaseComputerAgent
|
|
4
|
+
from .loop import BaseLoop
|
|
5
|
+
from .messages import (
|
|
6
|
+
create_user_message,
|
|
7
|
+
create_assistant_message,
|
|
8
|
+
create_system_message,
|
|
9
|
+
create_image_message,
|
|
10
|
+
create_screen_message,
|
|
11
|
+
BaseMessageManager,
|
|
12
|
+
ImageRetentionConfig,
|
|
13
|
+
)
|
|
14
|
+
from .callbacks import (
|
|
15
|
+
CallbackManager,
|
|
16
|
+
CallbackHandler,
|
|
17
|
+
BaseCallbackManager,
|
|
18
|
+
ContentCallback,
|
|
19
|
+
ToolCallback,
|
|
20
|
+
APICallback,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
__all__ = [
|
|
24
|
+
"BaseComputerAgent",
|
|
25
|
+
"BaseLoop",
|
|
26
|
+
"CallbackManager",
|
|
27
|
+
"CallbackHandler",
|
|
28
|
+
"BaseMessageManager",
|
|
29
|
+
"ImageRetentionConfig",
|
|
30
|
+
"BaseCallbackManager",
|
|
31
|
+
"ContentCallback",
|
|
32
|
+
"ToolCallback",
|
|
33
|
+
"APICallback",
|
|
34
|
+
]
|
agent/core/agent.py
ADDED
|
@@ -0,0 +1,284 @@
|
|
|
1
|
+
"""Unified computer agent implementation that supports multiple loops."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import logging
|
|
5
|
+
import asyncio
|
|
6
|
+
from typing import Any, AsyncGenerator, Dict, List, Optional, TYPE_CHECKING
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
|
|
9
|
+
from computer import Computer
|
|
10
|
+
|
|
11
|
+
from ..types.base import Provider, AgenticLoop
|
|
12
|
+
from .base_agent import BaseComputerAgent
|
|
13
|
+
|
|
14
|
+
# Only import types for type checking to avoid circular imports
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from ..providers.anthropic.loop import AnthropicLoop
|
|
17
|
+
from ..providers.omni.loop import OmniLoop
|
|
18
|
+
from ..providers.omni.parser import OmniParser
|
|
19
|
+
|
|
20
|
+
# Import the APIProvider enum without importing the whole module
|
|
21
|
+
from ..providers.omni.types import APIProvider
|
|
22
|
+
|
|
23
|
+
logger = logging.getLogger(__name__)
|
|
24
|
+
|
|
25
|
+
# Model identifier used for each provider when the caller does not pass `model`.
DEFAULT_MODELS: Dict[APIProvider, str] = {
    APIProvider.OPENAI: "gpt-4o",
    APIProvider.ANTHROPIC: "claude-3-7-sonnet-20250219",
    APIProvider.GROQ: "llama3-70b-8192",
}

# Environment variable consulted for each provider's API key when the caller
# does not pass `api_key`.
ENV_VARS: Dict[APIProvider, str] = {
    APIProvider.OPENAI: "OPENAI_API_KEY",
    APIProvider.GROQ: "GROQ_API_KEY",
    APIProvider.ANTHROPIC: "ANTHROPIC_API_KEY",
}
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class ComputerAgent(BaseComputerAgent):
    """Unified computer agent that drives either the Anthropic or the Omni loop.

    This class consolidates the previous AnthropicComputerAgent and
    OmniComputerAgent into a single implementation; the loop is chosen with
    ``loop_type`` when the agent is constructed.
    """

    def __init__(
        self,
        computer: Computer,
        loop_type: AgenticLoop = AgenticLoop.OMNI,
        ai_provider: APIProvider = APIProvider.OPENAI,
        api_key: Optional[str] = None,
        model: Optional[str] = None,
        save_trajectory: bool = True,
        trajectory_dir: Optional[str] = "trajectories",
        only_n_most_recent_images: Optional[int] = None,
        max_retries: int = 3,
        verbosity: int = logging.INFO,
        **kwargs,
    ):
        """Initialize the computer agent.

        Args:
            computer: Computer instance to control
            loop_type: The type of loop to use (Anthropic or Omni)
            ai_provider: AI provider to use (only consulted for the Omni loop)
            api_key: Optional API key (will use environment variable if not provided)
            model: Optional model name (will use provider default if not specified)
            save_trajectory: Whether to save screenshots and logs
            trajectory_dir: Directory to save trajectories (defaults to "trajectories")
            only_n_most_recent_images: Limit history to N most recent images
            max_retries: Maximum number of retry attempts for failed operations
            verbosity: Logging level (standard Python logging levels: logging.DEBUG,
                logging.INFO, etc.)
            **kwargs: Additional keyword arguments; forwarded to the base class
                and to the loop constructor

        Raises:
            ValueError: If ``api_key`` is None and no key can be found in the
                environment, or if the Omni provider has no known environment
                variable.
        """
        # The base class gets no screenshot/log directories: the trajectory
        # location is handed to the loop (as ``base_dir``) instead.
        super().__init__(
            max_retries=max_retries,
            computer=computer,
            screenshot_dir=None,
            log_dir=None,
            **kwargs,
        )

        self.loop_type = loop_type
        self.provider = ai_provider  # overrides the base class placeholder
        self.save_trajectory = save_trajectory
        self.trajectory_dir = trajectory_dir
        self.only_n_most_recent_images = only_n_most_recent_images
        self.verbosity = verbosity
        self._kwargs = kwargs  # Keep this for loop initialization

        # Configure logging based on verbosity
        self._configure_logging(verbosity)

        self.api_key = self._resolve_api_key(api_key, loop_type, ai_provider)

        # Fall back to the provider default when no model was requested; the
        # Anthropic loop always uses the Anthropic default.
        if model is None:
            default_for = ai_provider if loop_type == AgenticLoop.OMNI else APIProvider.ANTHROPIC
            model = DEFAULT_MODELS[default_for]
        self.model = model

        # Initialize the appropriate loop based on loop_type
        self.loop = self._init_loop()

    @staticmethod
    def _resolve_api_key(
        api_key: Optional[str], loop_type: AgenticLoop, ai_provider: APIProvider
    ) -> str:
        """Return ``api_key`` if given, otherwise read it from the environment.

        Raises:
            ValueError: If the provider has no known environment variable or
                that variable is unset/empty.
        """
        if api_key is not None:
            return api_key

        env_var = (
            ENV_VARS.get(ai_provider) if loop_type == AgenticLoop.OMNI else "ANTHROPIC_API_KEY"
        )
        if not env_var:
            raise ValueError(
                f"Unsupported provider: {ai_provider}. Please use one of: {list(ENV_VARS.keys())}"
            )

        api_key = os.environ.get(env_var)
        if not api_key:
            raise ValueError(
                f"No API key provided and {env_var} environment variable is not set.\n"
                f"Please set the {env_var} environment variable or pass the api_key directly:\n"
                f"  - Export in terminal: export {env_var}=your_api_key_here\n"
                f"  - Add to .env file: {env_var}=your_api_key_here\n"
                f"  - Pass directly: api_key='your_api_key_here'"
            )
        return api_key

    def _configure_logging(self, verbosity: int):
        """Apply ``verbosity`` to this module's logger and the "agent" logger."""
        # Use the logging level directly without mapping
        logger.setLevel(verbosity)
        logging.getLogger("agent").setLevel(verbosity)

        # Log the verbosity level that was set
        if verbosity <= logging.DEBUG:
            logger.info("Agent logging set to DEBUG level (full debug information)")
        elif verbosity <= logging.INFO:
            logger.info("Agent logging set to INFO level (standard output)")
        elif verbosity <= logging.WARNING:
            logger.warning("Agent logging set to WARNING level (warnings and errors only)")
        elif verbosity <= logging.ERROR:
            logger.warning("Agent logging set to ERROR level (errors only)")
        elif verbosity <= logging.CRITICAL:
            logger.warning("Agent logging set to CRITICAL level (critical errors only)")

    def _init_loop(self) -> Any:
        """Initialize the loop based on the loop_type.

        Returns:
            Initialized loop instance
        """
        if self.loop_type == AgenticLoop.ANTHROPIC:
            from ..providers.anthropic.loop import AnthropicLoop

            return AnthropicLoop(
                api_key=self.api_key,
                model=self.model,
                computer=self.computer,
                save_trajectory=self.save_trajectory,
                base_dir=self.trajectory_dir,
                only_n_most_recent_images=self.only_n_most_recent_images,
                **self._kwargs,
            )

        # Imported lazily (avoids circular imports) and only on the Omni path,
        # so selecting the Anthropic loop does not load the Omni modules here.
        from ..providers.omni.loop import OmniLoop
        from ..providers.omni.parser import OmniParser

        # Supply a default parser unless the caller passed one via kwargs
        if "parser" not in self._kwargs:
            self._kwargs["parser"] = OmniParser()

        return OmniLoop(
            provider=self.provider,
            api_key=self.api_key,
            model=self.model,
            computer=self.computer,
            save_trajectory=self.save_trajectory,
            base_dir=self.trajectory_dir,
            only_n_most_recent_images=self.only_n_most_recent_images,
            **self._kwargs,
        )

    @staticmethod
    def _format_anthropic_result(result: Any) -> Dict[str, Any]:
        """Convert one Anthropic loop result into the agent's output dict."""
        if "content" not in result:
            return {
                "role": "assistant",
                "content": str(result),
                "metadata": {"title": "Screen Analysis"},
            }

        blocks = result["content"]
        if isinstance(blocks, str):
            # Plain-string content is passed through instead of being iterated
            # character by character (which would yield an empty message).
            content_text = blocks
        else:
            content_text = ""
            for content_block in blocks:
                try:
                    # Try to access the text attribute directly
                    content_text += content_block.text
                except (AttributeError, TypeError):
                    # If it's a dictionary instead of an object
                    if isinstance(content_block, dict) and "text" in content_block:
                        content_text += content_block["text"]

        return {
            "role": "assistant",
            "content": content_text,
            "metadata": result.get("parsed_screen", {}),
        }

    @staticmethod
    def _format_omni_result(result: Any) -> Dict[str, Any]:
        """Convert one Omni loop result into the agent's output dict."""
        content = result["content"] if "content" in result else ""
        metadata = {"title": "Screen Analysis"}

        if isinstance(content, dict):
            nested_metadata = content.get("metadata")
            if isinstance(nested_metadata, dict):
                # Merge into a fresh dict so the default title is added without
                # mutating the dict owned by the loop.
                metadata = {**metadata, **nested_metadata}
            if "content" in content:
                content = content["content"]

        return {"role": "assistant", "content": content, "metadata": metadata}

    async def _execute_task(self, task: str) -> AsyncGenerator[Dict[str, Any], None]:
        """Execute a task using the appropriate loop.

        Args:
            task: Task description to execute

        Yields:
            Dict containing response content and metadata
        """
        uses_anthropic = self.loop_type == AgenticLoop.ANTHROPIC

        # Format the messages based on loop type
        if uses_anthropic:
            # Anthropic format
            messages = [{"role": "user", "content": [{"type": "text", "text": task}]}]
        else:
            # Omni format
            messages = [{"role": "user", "content": task}]

        # Failures are reported to the consumer as a message, not raised.
        try:
            async for result in self.loop.run(messages):
                if result is None:
                    break

                # Handle error case
                if "error" in result:
                    yield {
                        "role": "assistant",
                        "content": result["error"],
                        "metadata": {"title": "❌ Error"},
                    }
                    continue

                if uses_anthropic:
                    yield self._format_anthropic_result(result)
                else:
                    yield self._format_omni_result(result)
        except Exception as e:
            logger.error(f"Error running the loop: {str(e)}")
            yield {
                "role": "assistant",
                "content": f"Error running the agent loop: {str(e)}",
                "metadata": {"title": "❌ Loop Error"},
            }
|
agent/core/base_agent.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
"""Base computer agent implementation."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import logging
|
|
5
|
+
import os
|
|
6
|
+
from abc import ABC, abstractmethod
|
|
7
|
+
from typing import Any, AsyncGenerator, Dict, Optional
|
|
8
|
+
|
|
9
|
+
from computer import Computer
|
|
10
|
+
|
|
11
|
+
from ..types.base import Provider
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class BaseComputerAgent(ABC):
    """Abstract foundation shared by computer agents.

    Holds the controlled ``Computer``, prepares the optional output
    directories, and offers the public ``run`` generator plus async
    context-manager support. Subclasses provide ``_execute_task``.
    """

    def __init__(
        self,
        max_retries: int = 3,
        computer: Optional[Computer] = None,
        screenshot_dir: Optional[str] = None,
        log_dir: Optional[str] = None,
        **kwargs,
    ):
        """Set up agent state and create any requested output directories.

        Args:
            max_retries: Maximum number of retry attempts
            computer: Computer to control; a new ``Computer()`` is created
                when this is omitted
            screenshot_dir: Directory to save screenshots (skipped when falsy)
            log_dir: Directory to save logs (skipped when falsy)
            **kwargs: Additional provider-specific arguments (not used here)
        """
        self.max_retries = max_retries
        self.computer = computer or Computer()
        self.queue = asyncio.Queue()
        self.screenshot_dir = screenshot_dir
        self.log_dir = log_dir
        self._retry_count = 0
        self.provider = Provider.UNKNOWN

        # Logs directory first, then screenshots; each only when configured.
        requested_dirs = (("logs", self.log_dir), ("screenshots", self.screenshot_dir))
        for label, directory in requested_dirs:
            if directory:
                os.makedirs(directory, exist_ok=True)
                logger.info(f"Created {label} directory: {directory}")

        logger.info("BaseComputerAgent initialized")

    async def run(self, task: str) -> AsyncGenerator[Dict[str, Any], None]:
        """Run a task and stream its updates.

        Args:
            task: Task description

        Yields:
            Updates produced by ``_execute_task``; if anything raises, a
            single error message dict is yielded instead of propagating.
        """
        try:
            logger.info(f"Running task: {task}")

            # Make sure the computer is ready before delegating to the subclass.
            await self._init_if_needed()

            async for update in self._execute_task(task):
                yield update

        except Exception as exc:
            logger.error(f"Error in agent run method: {exc!s}")
            yield {
                "role": "assistant",
                "content": f"Error: {exc!s}",
                "metadata": {"title": "❌ Error"},
            }

    async def _init_if_needed(self):
        """Start the computer via ``run()`` unless it is already initialized."""
        if self.computer._initialized:
            return

        logger.info("Computer not initialized, initializing now...")
        try:
            # Call run directly without setting the flag first
            await self.computer.run()
        except Exception as exc:
            logger.error(f"Error initializing computer interface: {exc!s}")
            raise
        else:
            logger.info("Computer interface initialized successfully")

    async def _log_screenshot_probe(self):
        """Take one screenshot and log its size; failures are logged, not raised."""
        logger.info("Testing computer with a screenshot...")
        try:
            shot = await self.computer.screenshot()
            if isinstance(shot, bytes):
                size = len(shot)
            else:
                # Otherwise expect an object exposing ``base64_image``
                try:
                    size = len(shot.base64_image)
                except AttributeError:
                    size = "unknown"
            logger.info(f"Screenshot test successful, size: {size}")
        except Exception as exc:
            # Deliberately non-fatal: some uses of the agent need no screenshots.
            logger.error(f"Screenshot test failed: {exc!s}")

    async def __aenter__(self):
        """Enter the agent context, initializing the computer if necessary.

        Returns:
            This agent instance.
        """
        logger.info("Entering BaseComputerAgent context")

        try:
            logger.info("Checking if computer is already initialized...")
            if self.computer._initialized:
                logger.info("Computer already initialized, skipping initialization")
            else:
                logger.info("Initializing computer in __aenter__...")
                # Enter the computer's own context rather than calling run();
                # this avoids the circular dependency.
                await self.computer.__aenter__()
                logger.info("Computer initialized in __aenter__")

            await self._log_screenshot_probe()
        except Exception as exc:
            logger.error(f"Error initializing computer in __aenter__: {exc!s}")
            raise

        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Log how the context ended and signal completion on the queue.

        The computer is intentionally left running because it may be shared.
        """
        logger.info("Cleaning up agent resources")

        if not exc_type:
            logger.info("Exiting agent context normally")
        else:
            logger.error(f"Exiting agent context with error: {exc_type.__name__}: {exc_val}")

        # A ``None`` item on the queue marks the end of the session.
        queue = getattr(self, "queue", None)
        if queue:
            await queue.put(None)

    @abstractmethod
    async def _execute_task(self, task: str) -> AsyncGenerator[Dict[str, Any], None]:
        """Execute a task. Must be implemented by subclasses.

        Implementations are async generators: they ``yield`` result dicts as
        the task progresses. This base version yields a placeholder message
        and then raises ``NotImplementedError``.
        """
        yield {
            "role": "assistant",
            "content": "Base class method called",
            "metadata": {"title": "Error"},
        }
        raise NotImplementedError("Subclasses must implement _execute_task")
|