cua-agent 0.1.0__tar.gz → 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- {cua_agent-0.1.0 → cua_agent-0.1.1}/PKG-INFO +1 -1
- cua_agent-0.1.1/README.md +126 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/__init__.py +2 -2
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/core/agent.py +74 -28
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/core/messages.py +15 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/loop.py +5 -1
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/messages.py +3 -0
- cua_agent-0.1.1/agent/providers/omni/types.py +53 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/pyproject.toml +3 -3
- cua_agent-0.1.0/README.md +0 -213
- cua_agent-0.1.0/agent/providers/omni/types.py +0 -30
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/README.md +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/core/README.md +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/core/__init__.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/core/base_agent.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/core/callbacks.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/core/computer_agent.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/core/experiment.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/core/factory.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/core/loop.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/core/tools/__init__.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/core/tools/base.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/core/tools/bash.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/core/tools/collection.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/core/tools/computer.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/core/tools/edit.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/core/tools/manager.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/__init__.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/anthropic/__init__.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/anthropic/api/client.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/anthropic/api/logging.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/anthropic/callbacks/manager.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/anthropic/loop.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/anthropic/messages/manager.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/anthropic/prompts.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/anthropic/tools/__init__.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/anthropic/tools/base.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/anthropic/tools/bash.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/anthropic/tools/collection.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/anthropic/tools/computer.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/anthropic/tools/edit.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/anthropic/tools/manager.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/anthropic/tools/run.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/anthropic/types.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/__init__.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/callbacks.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/clients/anthropic.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/clients/base.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/clients/groq.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/clients/openai.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/clients/utils.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/experiment.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/image_utils.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/parser.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/prompts.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/tool_manager.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/tools/__init__.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/tools/bash.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/tools/computer.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/tools/manager.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/utils.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/visualization.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/types/__init__.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/types/base.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/types/messages.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/types/tools.py +0 -0
- {cua_agent-0.1.0 → cua_agent-0.1.1}/tests/test_agent.py +0 -0
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
<h1>
|
|
3
|
+
<div class="image-wrapper" style="display: inline-block;">
|
|
4
|
+
<picture>
|
|
5
|
+
<source media="(prefers-color-scheme: dark)" alt="logo" height="150" srcset="../../img/logo_white.png" style="display: block; margin: auto;">
|
|
6
|
+
<source media="(prefers-color-scheme: light)" alt="logo" height="150" srcset="../../img/logo_black.png" style="display: block; margin: auto;">
|
|
7
|
+
<img alt="Shows my svg">
|
|
8
|
+
</picture>
|
|
9
|
+
</div>
|
|
10
|
+
|
|
11
|
+
[](#)
|
|
12
|
+
[](#)
|
|
13
|
+
[Discord](https://discord.com/invite/mVnXXpdE85)
|
|
14
|
+
[PyPI](https://pypi.org/project/cua-agent/)
|
|
15
|
+
</h1>
|
|
16
|
+
</div>
|
|
17
|
+
|
|
18
|
+
**Agent** is a Computer Use (CUA) framework for running multi-app agentic workflows in macOS and Linux sandboxes. It supports local (Ollama) and cloud model providers (OpenAI, Anthropic, Groq, DeepSeek, Qwen) and integrates with Microsoft's OmniParser for enhanced UI understanding and interaction.
|
|
19
|
+
|
|
20
|
+
### Get started with Agent
|
|
21
|
+
|
|
22
|
+
```python
|
|
23
|
+
from agent import ComputerAgent, AgenticLoop, APIProvider
|
|
24
|
+
from computer import Computer
|
|
25
|
+
|
|
26
|
+
computer = Computer(verbosity=logging.INFO)
|
|
27
|
+
|
|
28
|
+
agent = ComputerAgent(
|
|
29
|
+
computer=computer,
|
|
30
|
+
api_key="<your-anthropic-api-key>",
|
|
31
|
+
loop_type=AgenticLoop.ANTHROPIC, # or AgenticLoop.OMNI
|
|
32
|
+
ai_provider=APIProvider.ANTHROPIC,
|
|
33
|
+
model='claude-3-7-sonnet-20250219',
|
|
34
|
+
save_trajectory=True,
|
|
35
|
+
trajectory_dir=str(Path("trajectories") / datetime.now().strftime("%Y%m%d_%H%M%S")),
|
|
36
|
+
only_n_most_recent_images=3,
|
|
37
|
+
verbosity=logging.INFO,
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
tasks = [
|
|
41
|
+
"""
|
|
42
|
+
Please help me with the following task:
|
|
43
|
+
1. Open Safari browser
|
|
44
|
+
2. Go to Wikipedia.org
|
|
45
|
+
3. Search for "Claude AI"
|
|
46
|
+
4. Summarize the main points you find about Claude AI
|
|
47
|
+
"""
|
|
48
|
+
]
|
|
49
|
+
|
|
50
|
+
async with agent:
|
|
51
|
+
for i, task in enumerate(tasks, 1):
|
|
52
|
+
print(f"\nExecuting task {i}/{len(tasks)}: {task}")
|
|
53
|
+
async for result in agent.run(task):
|
|
54
|
+
print(result)
|
|
55
|
+
print(f"Task {i} completed")
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Install
|
|
59
|
+
|
|
60
|
+
### cua-agent
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
|
|
64
|
+
pip install cua-agent[all]
|
|
65
|
+
|
|
66
|
+
# or install specific loop providers
|
|
67
|
+
pip install cua-agent[anthropic]
|
|
68
|
+
pip install cua-agent[omni]
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## Features
|
|
74
|
+
|
|
75
|
+
### OmniParser Integration
|
|
76
|
+
- Enhanced UI understanding with element detection
|
|
77
|
+
- Automatic bounding box detection for UI elements
|
|
78
|
+
- Improved accuracy for complex UI interactions
|
|
79
|
+
- Support for icon and text element recognition
|
|
80
|
+
|
|
81
|
+
### Basic Computer Control
|
|
82
|
+
- Direct keyboard and mouse control
|
|
83
|
+
- Window and application management
|
|
84
|
+
- Screenshot capabilities
|
|
85
|
+
- Basic UI element detection
|
|
86
|
+
|
|
87
|
+
### Provider Support
|
|
88
|
+
- OpenAI (GPT-4V) - Recommended for OmniParser integration
|
|
89
|
+
- Anthropic (Claude) - Strong general performance
|
|
90
|
+
- Groq - Fast inference with Llama models
|
|
91
|
+
- DeepSeek - Alternative model provider
|
|
92
|
+
- Qwen - Alibaba's multimodal model
|
|
93
|
+
|
|
94
|
+
## Run
|
|
95
|
+
|
|
96
|
+
Refer to this notebook for a step-by-step guide on how to use the Computer-Use Agent (CUA):
|
|
97
|
+
|
|
98
|
+
- [Agent Notebook](../../notebooks/agent_nb.ipynb) - Complete examples and workflows
|
|
99
|
+
|
|
100
|
+
## Components
|
|
101
|
+
|
|
102
|
+
The library consists of several components:
|
|
103
|
+
|
|
104
|
+
- **Core**
|
|
105
|
+
- `ComputerAgent`: Unified agent class supporting multiple loop types
|
|
106
|
+
- `BaseComputerAgent`: Abstract base class for computer agents
|
|
107
|
+
|
|
108
|
+
- **Providers**
|
|
109
|
+
- `Anthropic`: Implementation for Anthropic Claude models
|
|
110
|
+
- `Omni`: Implementation for multiple providers (OpenAI, Groq, etc.)
|
|
111
|
+
|
|
112
|
+
- **Loops**
|
|
113
|
+
- `AnthropicLoop`: Loop implementation for Anthropic
|
|
114
|
+
- `OmniLoop`: Generic loop supporting multiple providers
|
|
115
|
+
|
|
116
|
+
## Configuration
|
|
117
|
+
|
|
118
|
+
The agent can be configured with various parameters:
|
|
119
|
+
|
|
120
|
+
- **loop_type**: The type of loop to use (ANTHROPIC or OMNI)
|
|
121
|
+
- **provider**: AI provider to use with the loop
|
|
122
|
+
- **model**: The AI model to use
|
|
123
|
+
- **save_trajectory**: Whether to save screenshots and logs
|
|
124
|
+
- **only_n_most_recent_images**: Only keep a specific number of recent images
|
|
125
|
+
|
|
126
|
+
See the [Core README](./agent/core/README.md) for more details on the unified agent.
|
|
@@ -5,6 +5,6 @@ __version__ = "0.1.0"
|
|
|
5
5
|
from .core.factory import AgentFactory
|
|
6
6
|
from .core.agent import ComputerAgent
|
|
7
7
|
from .types.base import Provider, AgenticLoop
|
|
8
|
-
from .providers.omni.types import APIProvider
|
|
8
|
+
from .providers.omni.types import LLMProvider, LLM, Model, LLMModel, APIProvider
|
|
9
9
|
|
|
10
|
-
__all__ = ["AgentFactory", "Provider", "ComputerAgent", "AgenticLoop", "APIProvider"]
|
|
10
|
+
__all__ = ["AgentFactory", "Provider", "ComputerAgent", "AgenticLoop", "LLMProvider", "LLM", "Model", "LLMModel", "APIProvider"]
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
import os
|
|
4
4
|
import logging
|
|
5
5
|
import asyncio
|
|
6
|
-
from typing import Any, AsyncGenerator, Dict, List, Optional, TYPE_CHECKING
|
|
6
|
+
from typing import Any, AsyncGenerator, Dict, List, Optional, TYPE_CHECKING, Union, cast
|
|
7
7
|
from datetime import datetime
|
|
8
8
|
|
|
9
9
|
from computer import Computer
|
|
@@ -17,23 +17,23 @@ if TYPE_CHECKING:
|
|
|
17
17
|
from ..providers.omni.loop import OmniLoop
|
|
18
18
|
from ..providers.omni.parser import OmniParser
|
|
19
19
|
|
|
20
|
-
# Import the
|
|
21
|
-
from ..providers.omni.types import APIProvider
|
|
20
|
+
# Import the provider types
|
|
21
|
+
from ..providers.omni.types import LLMProvider, LLM, Model, LLMModel, APIProvider
|
|
22
22
|
|
|
23
23
|
logger = logging.getLogger(__name__)
|
|
24
24
|
|
|
25
25
|
# Default models for different providers
|
|
26
26
|
DEFAULT_MODELS = {
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
27
|
+
LLMProvider.OPENAI: "gpt-4o",
|
|
28
|
+
LLMProvider.ANTHROPIC: "claude-3-7-sonnet-20250219",
|
|
29
|
+
LLMProvider.GROQ: "llama3-70b-8192",
|
|
30
30
|
}
|
|
31
31
|
|
|
32
32
|
# Map providers to their environment variable names
|
|
33
33
|
ENV_VARS = {
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
34
|
+
LLMProvider.OPENAI: "OPENAI_API_KEY",
|
|
35
|
+
LLMProvider.GROQ: "GROQ_API_KEY",
|
|
36
|
+
LLMProvider.ANTHROPIC: "ANTHROPIC_API_KEY",
|
|
37
37
|
}
|
|
38
38
|
|
|
39
39
|
|
|
@@ -48,9 +48,8 @@ class ComputerAgent(BaseComputerAgent):
|
|
|
48
48
|
self,
|
|
49
49
|
computer: Computer,
|
|
50
50
|
loop_type: AgenticLoop = AgenticLoop.OMNI,
|
|
51
|
-
|
|
51
|
+
model: Optional[Union[LLM, Dict[str, str], str]] = None,
|
|
52
52
|
api_key: Optional[str] = None,
|
|
53
|
-
model: Optional[str] = None,
|
|
54
53
|
save_trajectory: bool = True,
|
|
55
54
|
trajectory_dir: Optional[str] = "trajectories",
|
|
56
55
|
only_n_most_recent_images: Optional[int] = None,
|
|
@@ -63,9 +62,12 @@ class ComputerAgent(BaseComputerAgent):
|
|
|
63
62
|
Args:
|
|
64
63
|
computer: Computer instance to control
|
|
65
64
|
loop_type: The type of loop to use (Anthropic or Omni)
|
|
66
|
-
|
|
65
|
+
model: LLM configuration. Can be:
|
|
66
|
+
- LLM object with provider and name
|
|
67
|
+
- Dict with 'provider' and 'name' keys
|
|
68
|
+
- String with model name (provider defaults to Anthropic for the Anthropic loop, OpenAI otherwise)
|
|
69
|
+
- None (defaults based on loop_type)
|
|
67
70
|
api_key: Optional API key (will use environment variable if not provided)
|
|
68
|
-
model: Optional model name (will use provider default if not specified)
|
|
69
71
|
save_trajectory: Whether to save screenshots and logs
|
|
70
72
|
trajectory_dir: Directory to save trajectories (defaults to "trajectories")
|
|
71
73
|
only_n_most_recent_images: Limit history to N most recent images
|
|
@@ -88,7 +90,6 @@ class ComputerAgent(BaseComputerAgent):
|
|
|
88
90
|
)
|
|
89
91
|
|
|
90
92
|
self.loop_type = loop_type
|
|
91
|
-
self.provider = ai_provider
|
|
92
93
|
self.save_trajectory = save_trajectory
|
|
93
94
|
self.trajectory_dir = trajectory_dir
|
|
94
95
|
self.only_n_most_recent_images = only_n_most_recent_images
|
|
@@ -98,14 +99,19 @@ class ComputerAgent(BaseComputerAgent):
|
|
|
98
99
|
# Configure logging based on verbosity
|
|
99
100
|
self._configure_logging(verbosity)
|
|
100
101
|
|
|
102
|
+
# Process model configuration
|
|
103
|
+
self.model_config = self._process_model_config(model, loop_type)
|
|
104
|
+
|
|
101
105
|
# Get API key from environment if not provided
|
|
102
106
|
if api_key is None:
|
|
103
107
|
env_var = (
|
|
104
|
-
ENV_VARS.get(
|
|
108
|
+
ENV_VARS.get(self.model_config.provider)
|
|
109
|
+
if loop_type == AgenticLoop.OMNI
|
|
110
|
+
else "ANTHROPIC_API_KEY"
|
|
105
111
|
)
|
|
106
112
|
if not env_var:
|
|
107
113
|
raise ValueError(
|
|
108
|
-
f"Unsupported provider: {
|
|
114
|
+
f"Unsupported provider: {self.model_config.provider}. Please use one of: {list(ENV_VARS.keys())}"
|
|
109
115
|
)
|
|
110
116
|
|
|
111
117
|
api_key = os.environ.get(env_var)
|
|
@@ -119,17 +125,51 @@ class ComputerAgent(BaseComputerAgent):
|
|
|
119
125
|
)
|
|
120
126
|
self.api_key = api_key
|
|
121
127
|
|
|
122
|
-
# Set model based on provider if not specified
|
|
123
|
-
if model is None:
|
|
124
|
-
if loop_type == AgenticLoop.OMNI:
|
|
125
|
-
self.model = DEFAULT_MODELS[ai_provider]
|
|
126
|
-
else: # Anthropic loop
|
|
127
|
-
self.model = DEFAULT_MODELS[APIProvider.ANTHROPIC]
|
|
128
|
-
else:
|
|
129
|
-
self.model = model
|
|
130
|
-
|
|
131
128
|
# Initialize the appropriate loop based on loop_type
|
|
132
129
|
self.loop = self._init_loop()
|
|
130
|
+
|
|
131
|
+
def _process_model_config(
|
|
132
|
+
self, model_input: Optional[Union[LLM, Dict[str, str], str]], loop_type: AgenticLoop
|
|
133
|
+
) -> LLM:
|
|
134
|
+
"""Process and normalize model configuration.
|
|
135
|
+
|
|
136
|
+
Args:
|
|
137
|
+
model_input: Input model configuration (LLM, dict, string, or None)
|
|
138
|
+
loop_type: The loop type being used
|
|
139
|
+
|
|
140
|
+
Returns:
|
|
141
|
+
Normalized LLM instance
|
|
142
|
+
"""
|
|
143
|
+
# Handle case where model_input is None
|
|
144
|
+
if model_input is None:
|
|
145
|
+
# Use Anthropic for Anthropic loop, OpenAI for Omni loop
|
|
146
|
+
default_provider = (
|
|
147
|
+
LLMProvider.ANTHROPIC if loop_type == AgenticLoop.ANTHROPIC else LLMProvider.OPENAI
|
|
148
|
+
)
|
|
149
|
+
return LLM(provider=default_provider)
|
|
150
|
+
|
|
151
|
+
# Handle case where model_input is already an LLM or one of its aliases
|
|
152
|
+
if isinstance(model_input, (LLM, Model, LLMModel)):
|
|
153
|
+
return model_input
|
|
154
|
+
|
|
155
|
+
# Handle case where model_input is a dict
|
|
156
|
+
if isinstance(model_input, dict):
|
|
157
|
+
provider = model_input.get("provider", LLMProvider.OPENAI)
|
|
158
|
+
if isinstance(provider, str):
|
|
159
|
+
provider = LLMProvider(provider)
|
|
160
|
+
return LLM(
|
|
161
|
+
provider=provider,
|
|
162
|
+
name=model_input.get("name")
|
|
163
|
+
)
|
|
164
|
+
|
|
165
|
+
# Handle case where model_input is a string (model name)
|
|
166
|
+
if isinstance(model_input, str):
|
|
167
|
+
default_provider = (
|
|
168
|
+
LLMProvider.ANTHROPIC if loop_type == AgenticLoop.ANTHROPIC else LLMProvider.OPENAI
|
|
169
|
+
)
|
|
170
|
+
return LLM(provider=default_provider, name=model_input)
|
|
171
|
+
|
|
172
|
+
raise ValueError(f"Unsupported model configuration: {model_input}")
|
|
133
173
|
|
|
134
174
|
def _configure_logging(self, verbosity: int):
|
|
135
175
|
"""Configure logging based on verbosity level."""
|
|
@@ -162,9 +202,12 @@ class ComputerAgent(BaseComputerAgent):
|
|
|
162
202
|
if self.loop_type == AgenticLoop.ANTHROPIC:
|
|
163
203
|
from ..providers.anthropic.loop import AnthropicLoop
|
|
164
204
|
|
|
205
|
+
# Ensure we always have a valid model name
|
|
206
|
+
model_name = self.model_config.name or DEFAULT_MODELS[LLMProvider.ANTHROPIC]
|
|
207
|
+
|
|
165
208
|
return AnthropicLoop(
|
|
166
209
|
api_key=self.api_key,
|
|
167
|
-
model=
|
|
210
|
+
model=model_name,
|
|
168
211
|
computer=self.computer,
|
|
169
212
|
save_trajectory=self.save_trajectory,
|
|
170
213
|
base_dir=self.trajectory_dir,
|
|
@@ -176,10 +219,13 @@ class ComputerAgent(BaseComputerAgent):
|
|
|
176
219
|
if "parser" not in self._kwargs:
|
|
177
220
|
self._kwargs["parser"] = OmniParser()
|
|
178
221
|
|
|
222
|
+
# Ensure we always have a valid model name
|
|
223
|
+
model_name = self.model_config.name or DEFAULT_MODELS[self.model_config.provider]
|
|
224
|
+
|
|
179
225
|
return OmniLoop(
|
|
180
|
-
provider=self.provider,
|
|
226
|
+
provider=self.model_config.provider,
|
|
181
227
|
api_key=self.api_key,
|
|
182
|
-
model=
|
|
228
|
+
model=model_name,
|
|
183
229
|
computer=self.computer,
|
|
184
230
|
save_trajectory=self.save_trajectory,
|
|
185
231
|
base_dir=self.trajectory_dir,
|
|
@@ -37,6 +37,17 @@ class BaseMessageManager:
|
|
|
37
37
|
if self.image_retention_config.min_removal_threshold < 1:
|
|
38
38
|
raise ValueError("min_removal_threshold must be at least 1")
|
|
39
39
|
|
|
40
|
+
# Track provider for message formatting
|
|
41
|
+
self.provider = "openai" # Default provider
|
|
42
|
+
|
|
43
|
+
def set_provider(self, provider: str) -> None:
|
|
44
|
+
"""Set the current provider to format messages for.
|
|
45
|
+
|
|
46
|
+
Args:
|
|
47
|
+
provider: Provider name (e.g., 'openai', 'anthropic')
|
|
48
|
+
"""
|
|
49
|
+
self.provider = provider.lower()
|
|
50
|
+
|
|
40
51
|
def prepare_messages(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
|
41
52
|
"""Prepare messages by applying image retention and caching as configured.
|
|
42
53
|
|
|
@@ -96,6 +107,10 @@ class BaseMessageManager:
|
|
|
96
107
|
Args:
|
|
97
108
|
messages: Messages to inject caching into
|
|
98
109
|
"""
|
|
110
|
+
# Only apply cache_control for Anthropic API, not OpenAI
|
|
111
|
+
if self.provider != "anthropic":
|
|
112
|
+
return
|
|
113
|
+
|
|
99
114
|
# Default to caching last 3 turns
|
|
100
115
|
turns_to_cache = 3
|
|
101
116
|
for message in reversed(messages):
|
|
@@ -219,9 +219,13 @@ class OmniLoop(BaseLoop):
|
|
|
219
219
|
if self.client is None:
|
|
220
220
|
raise RuntimeError("Failed to initialize client")
|
|
221
221
|
|
|
222
|
+
# Set the provider in message manager based on current provider
|
|
223
|
+
provider_name = str(self.provider).split(".")[-1].lower() # Extract name from enum
|
|
224
|
+
self.message_manager.set_provider(provider_name)
|
|
225
|
+
|
|
222
226
|
# Apply image retention and prepare messages
|
|
223
227
|
# This will limit the number of images based on only_n_most_recent_images
|
|
224
|
-
prepared_messages = self.message_manager.
|
|
228
|
+
prepared_messages = self.message_manager.get_formatted_messages(provider_name)
|
|
225
229
|
|
|
226
230
|
# Filter out system messages for Anthropic
|
|
227
231
|
if self.provider == APIProvider.ANTHROPIC:
|
|
@@ -103,6 +103,9 @@ class OmniMessageManager(BaseMessageManager):
|
|
|
103
103
|
Returns:
|
|
104
104
|
List of formatted messages
|
|
105
105
|
"""
|
|
106
|
+
# Set the provider for message formatting
|
|
107
|
+
self.set_provider(provider)
|
|
108
|
+
|
|
106
109
|
if provider == "anthropic":
|
|
107
110
|
return self._format_for_anthropic()
|
|
108
111
|
elif provider == "openai":
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""Type definitions for the Omni provider."""
|
|
2
|
+
|
|
3
|
+
from enum import StrEnum
|
|
4
|
+
from typing import Dict, Optional
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class LLMProvider(StrEnum):
|
|
9
|
+
"""Supported LLM providers."""
|
|
10
|
+
|
|
11
|
+
ANTHROPIC = "anthropic"
|
|
12
|
+
OPENAI = "openai"
|
|
13
|
+
GROQ = "groq"
|
|
14
|
+
QWEN = "qwen"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# For backward compatibility
|
|
18
|
+
APIProvider = LLMProvider
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
|
|
22
|
+
class LLM:
|
|
23
|
+
"""Configuration for LLM model and provider."""
|
|
24
|
+
|
|
25
|
+
provider: LLMProvider
|
|
26
|
+
name: Optional[str] = None
|
|
27
|
+
|
|
28
|
+
def __post_init__(self):
|
|
29
|
+
"""Set default model name if not provided."""
|
|
30
|
+
if self.name is None:
|
|
31
|
+
self.name = PROVIDER_TO_DEFAULT_MODEL.get(self.provider)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# For backward compatibility
|
|
35
|
+
LLMModel = LLM
|
|
36
|
+
Model = LLM
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# Default models for each provider
|
|
40
|
+
PROVIDER_TO_DEFAULT_MODEL: Dict[LLMProvider, str] = {
|
|
41
|
+
LLMProvider.ANTHROPIC: "claude-3-7-sonnet-20250219",
|
|
42
|
+
LLMProvider.OPENAI: "gpt-4o",
|
|
43
|
+
LLMProvider.GROQ: "deepseek-r1-distill-llama-70b",
|
|
44
|
+
LLMProvider.QWEN: "qwen2.5-vl-72b-instruct",
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
# Environment variable names for each provider
|
|
48
|
+
PROVIDER_TO_ENV_VAR: Dict[LLMProvider, str] = {
|
|
49
|
+
LLMProvider.ANTHROPIC: "ANTHROPIC_API_KEY",
|
|
50
|
+
LLMProvider.OPENAI: "OPENAI_API_KEY",
|
|
51
|
+
LLMProvider.GROQ: "GROQ_API_KEY",
|
|
52
|
+
LLMProvider.QWEN: "QWEN_API_KEY",
|
|
53
|
+
}
|
|
@@ -6,7 +6,7 @@ build-backend = "pdm.backend"
|
|
|
6
6
|
|
|
7
7
|
[project]
|
|
8
8
|
name = "cua-agent"
|
|
9
|
-
version = "0.1.
|
|
9
|
+
version = "0.1.1"
|
|
10
10
|
description = "CUA (Computer Use) Agent for AI-driven computer interaction"
|
|
11
11
|
authors = [
|
|
12
12
|
{ name = "TryCua", email = "gh@trycua.com" },
|
|
@@ -78,7 +78,7 @@ target-version = [
|
|
|
78
78
|
|
|
79
79
|
[tool.ruff]
|
|
80
80
|
line-length = 100
|
|
81
|
-
target-version = "0.1.
|
|
81
|
+
target-version = "0.1.1"
|
|
82
82
|
select = [
|
|
83
83
|
"E",
|
|
84
84
|
"F",
|
|
@@ -92,7 +92,7 @@ docstring-code-format = true
|
|
|
92
92
|
|
|
93
93
|
[tool.mypy]
|
|
94
94
|
strict = true
|
|
95
|
-
python_version = "0.1.
|
|
95
|
+
python_version = "0.1.1"
|
|
96
96
|
ignore_missing_imports = true
|
|
97
97
|
disallow_untyped_defs = true
|
|
98
98
|
check_untyped_defs = true
|
cua_agent-0.1.0/README.md
DELETED
|
@@ -1,213 +0,0 @@
|
|
|
1
|
-
<div align="center">
|
|
2
|
-
<h1>
|
|
3
|
-
<div class="image-wrapper" style="display: inline-block;">
|
|
4
|
-
<picture>
|
|
5
|
-
<source media="(prefers-color-scheme: dark)" alt="logo" height="150" srcset="../../img/logo_white.png" style="display: block; margin: auto;">
|
|
6
|
-
<source media="(prefers-color-scheme: light)" alt="logo" height="150" srcset="../../img/logo_black.png" style="display: block; margin: auto;">
|
|
7
|
-
<img alt="Shows my svg">
|
|
8
|
-
</picture>
|
|
9
|
-
</div>
|
|
10
|
-
|
|
11
|
-
[](#)
|
|
12
|
-
[](#)
|
|
13
|
-
[](https://discord.com/invite/mVnXXpdE85)
|
|
14
|
-
[](https://pypi.org/project/cua-computer/)
|
|
15
|
-
</h1>
|
|
16
|
-
</div>
|
|
17
|
-
|
|
18
|
-
**Agent** is a Computer Use (CUA) framework for running multi-app agentic workflows targeting macOS and Linux sandbox, supporting local (Ollama) and cloud model providers (OpenAI, Anthropic, Groq, DeepSeek, Qwen). The framework integrates with Microsoft's OmniParser for enhanced UI understanding and interaction.
|
|
19
|
-
|
|
20
|
-
### Get started with Agent
|
|
21
|
-
|
|
22
|
-
There are two ways to use the agent: with OmniParser for enhanced UI understanding (recommended) or with basic computer control.
|
|
23
|
-
|
|
24
|
-
#### Option 1: With OmniParser (Recommended)
|
|
25
|
-
|
|
26
|
-
<div align="center">
|
|
27
|
-
<img src="../../img/agent.png"/>
|
|
28
|
-
</div>
|
|
29
|
-
|
|
30
|
-
```python
|
|
31
|
-
from agent.providers.omni import OmniComputerAgent, APIProvider
|
|
32
|
-
|
|
33
|
-
# Set your API key
|
|
34
|
-
export OPENAI_API_KEY="your-openai-api-key"
|
|
35
|
-
|
|
36
|
-
# Initialize agent with OmniParser for enhanced UI understanding
|
|
37
|
-
agent = OmniComputerAgent(
|
|
38
|
-
provider=APIProvider.OPENAI,
|
|
39
|
-
model="gpt-4o",
|
|
40
|
-
start_omniparser=True # Automatically starts OmniParser server
|
|
41
|
-
)
|
|
42
|
-
|
|
43
|
-
task = """
|
|
44
|
-
1. Search the ai-gradio repo on GitHub.
|
|
45
|
-
2. Clone it to the desktop.
|
|
46
|
-
3. Open the repo with the Cursor app.
|
|
47
|
-
4. Work with Cursor to add a new provider for Cua.
|
|
48
|
-
"""
|
|
49
|
-
|
|
50
|
-
async with agent: # Ensures proper cleanup
|
|
51
|
-
async for result in agent.run(task):
|
|
52
|
-
print(result)
|
|
53
|
-
```
|
|
54
|
-
|
|
55
|
-
#### Option 2: Basic Computer Control
|
|
56
|
-
|
|
57
|
-
```python
|
|
58
|
-
from agent.computer_agent import ComputerAgent
|
|
59
|
-
from agent.base.types import Provider
|
|
60
|
-
|
|
61
|
-
# Set your API key (supports any provider)
|
|
62
|
-
export OPENAI_API_KEY="your-openai-api-key" # or other provider keys
|
|
63
|
-
|
|
64
|
-
# Initialize basic agent
|
|
65
|
-
agent = ComputerAgent(
|
|
66
|
-
provider=Provider.OPENAI, # or ANTHROPIC, GROQ, etc.
|
|
67
|
-
)
|
|
68
|
-
|
|
69
|
-
task = """
|
|
70
|
-
1. Open Chrome and navigate to github.com
|
|
71
|
-
2. Search for 'trycua/cua'
|
|
72
|
-
3. Star the repository
|
|
73
|
-
"""
|
|
74
|
-
|
|
75
|
-
async with agent:
|
|
76
|
-
async for result in agent.run(task):
|
|
77
|
-
print(result)
|
|
78
|
-
```
|
|
79
|
-
|
|
80
|
-
## Install
|
|
81
|
-
|
|
82
|
-
### cua-agent
|
|
83
|
-
|
|
84
|
-
```bash
|
|
85
|
-
# Basic installation with Anthropic
|
|
86
|
-
pip install cua-agent[anthropic]
|
|
87
|
-
|
|
88
|
-
# Install with OmniParser (recommended)
|
|
89
|
-
# Includes all provider dependencies (OpenAI, Anthropic, etc.)
|
|
90
|
-
pip install cua-agent[omni]
|
|
91
|
-
|
|
92
|
-
# Install with all features and providers
|
|
93
|
-
pip install cua-agent[all]
|
|
94
|
-
```
|
|
95
|
-
|
|
96
|
-
## Features
|
|
97
|
-
|
|
98
|
-
### OmniParser Integration
|
|
99
|
-
- Enhanced UI understanding with element detection
|
|
100
|
-
- Automatic bounding box detection for UI elements
|
|
101
|
-
- Improved accuracy for complex UI interactions
|
|
102
|
-
- Support for icon and text element recognition
|
|
103
|
-
|
|
104
|
-
### Basic Computer Control
|
|
105
|
-
- Direct keyboard and mouse control
|
|
106
|
-
- Window and application management
|
|
107
|
-
- Screenshot capabilities
|
|
108
|
-
- Basic UI element detection
|
|
109
|
-
|
|
110
|
-
### Provider Support
|
|
111
|
-
- OpenAI (GPT-4V) - Recommended for OmniParser integration
|
|
112
|
-
- Anthropic (Claude) - Strong general performance
|
|
113
|
-
- Groq - Fast inference with Llama models
|
|
114
|
-
- DeepSeek - Alternative model provider
|
|
115
|
-
- Qwen - Alibaba's multimodal model
|
|
116
|
-
|
|
117
|
-
## Run
|
|
118
|
-
|
|
119
|
-
Refer to these notebooks for step-by-step guides on how to use the Computer-Use Agent (CUA):
|
|
120
|
-
|
|
121
|
-
- [Getting Started with OmniParser](../../notebooks/omniparser_nb.ipynb) - Enhanced UI understanding
|
|
122
|
-
- [Basic Computer Control](../../notebooks/basic_agent_nb.ipynb) - Simple computer interactions
|
|
123
|
-
- [Advanced Usage](../../notebooks/agent_nb.ipynb) - Complete examples and workflows
|
|
124
|
-
|
|
125
|
-
# Computer Agent Library
|
|
126
|
-
|
|
127
|
-
A Python library for controlling computer interactions with AI agents.
|
|
128
|
-
|
|
129
|
-
## Introduction
|
|
130
|
-
|
|
131
|
-
This library provides a unified interface for AI-powered computer interaction agents, allowing applications to automate UI interactions through various AI providers.
|
|
132
|
-
|
|
133
|
-
## Key Features
|
|
134
|
-
|
|
135
|
-
- **Unified Agent**: Single `ComputerAgent` class with configurable loop types
|
|
136
|
-
- **Multiple AI providers**: Support for OpenAI, Anthropic, Groq, and other providers
|
|
137
|
-
- **Screen analysis**: Intelligent screen parsing and element identification
|
|
138
|
-
- **Tool execution**: Execute tools and commands to interact with the computer
|
|
139
|
-
- **Trajectory saving**: Option to save screenshots and logs for debugging and analysis
|
|
140
|
-
|
|
141
|
-
## Installation
|
|
142
|
-
|
|
143
|
-
To install the library along with its dependencies:
|
|
144
|
-
|
|
145
|
-
```bash
|
|
146
|
-
pip install -e .
|
|
147
|
-
```
|
|
148
|
-
|
|
149
|
-
## Usage
|
|
150
|
-
|
|
151
|
-
Here's a simple example of how to use the ComputerAgent:
|
|
152
|
-
|
|
153
|
-
```python
|
|
154
|
-
import asyncio
|
|
155
|
-
from computer import Computer
|
|
156
|
-
from agent.core.agent import ComputerAgent
|
|
157
|
-
from agent.types.base import AgenticLoop
|
|
158
|
-
from agent.providers.omni.types import APIProvider
|
|
159
|
-
|
|
160
|
-
async def main():
|
|
161
|
-
# Initialize the computer interface
|
|
162
|
-
computer = Computer()
|
|
163
|
-
|
|
164
|
-
# Create a computer agent
|
|
165
|
-
agent = ComputerAgent(
|
|
166
|
-
computer=computer,
|
|
167
|
-
loop_type=AgenticLoop.OMNI, # Use OMNI loop
|
|
168
|
-
provider=APIProvider.OPENAI, # With OpenAI provider
|
|
169
|
-
model="gpt-4o", # Specify the model
|
|
170
|
-
save_trajectory=True, # Save logs and screenshots
|
|
171
|
-
)
|
|
172
|
-
|
|
173
|
-
# Use the agent with a context manager
|
|
174
|
-
async with agent:
|
|
175
|
-
# Run a task
|
|
176
|
-
async for result in agent.run("Open Safari and navigate to github.com"):
|
|
177
|
-
# Process the result
|
|
178
|
-
title = result["metadata"].get("title", "Screen Analysis")
|
|
179
|
-
content = result["content"]
|
|
180
|
-
print(f"\n{title}")
|
|
181
|
-
print(content)
|
|
182
|
-
|
|
183
|
-
if __name__ == "__main__":
|
|
184
|
-
asyncio.run(main())
|
|
185
|
-
```
|
|
186
|
-
|
|
187
|
-
## Components
|
|
188
|
-
|
|
189
|
-
The library consists of several components:
|
|
190
|
-
|
|
191
|
-
- **Core**
|
|
192
|
-
- `ComputerAgent`: Unified agent class supporting multiple loop types
|
|
193
|
-
- `BaseComputerAgent`: Abstract base class for computer agents
|
|
194
|
-
|
|
195
|
-
- **Providers**
|
|
196
|
-
- `Anthropic`: Implementation for Anthropic Claude models
|
|
197
|
-
- `Omni`: Implementation for multiple providers (OpenAI, Groq, etc.)
|
|
198
|
-
|
|
199
|
-
- **Loops**
|
|
200
|
-
- `AnthropicLoop`: Loop implementation for Anthropic
|
|
201
|
-
- `OmniLoop`: Generic loop supporting multiple providers
|
|
202
|
-
|
|
203
|
-
## Configuration
|
|
204
|
-
|
|
205
|
-
The agent can be configured with various parameters:
|
|
206
|
-
|
|
207
|
-
- **loop_type**: The type of loop to use (ANTHROPIC or OMNI)
|
|
208
|
-
- **provider**: AI provider to use with the loop
|
|
209
|
-
- **model**: The AI model to use
|
|
210
|
-
- **save_trajectory**: Whether to save screenshots and logs
|
|
211
|
-
- **only_n_most_recent_images**: Only keep a specific number of recent images
|
|
212
|
-
|
|
213
|
-
See the [Core README](./agent/core/README.md) for more details on the unified agent.
|
|
@@ -1,30 +0,0 @@
|
|
|
1
|
-
"""Type definitions for the Omni provider."""
|
|
2
|
-
|
|
3
|
-
from enum import StrEnum
|
|
4
|
-
from typing import Dict
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
class APIProvider(StrEnum):
|
|
8
|
-
"""Supported API providers."""
|
|
9
|
-
|
|
10
|
-
ANTHROPIC = "anthropic"
|
|
11
|
-
OPENAI = "openai"
|
|
12
|
-
GROQ = "groq"
|
|
13
|
-
QWEN = "qwen"
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
# Default models for each provider
|
|
17
|
-
PROVIDER_TO_DEFAULT_MODEL: Dict[APIProvider, str] = {
|
|
18
|
-
APIProvider.ANTHROPIC: "claude-3-7-sonnet-20250219",
|
|
19
|
-
APIProvider.OPENAI: "gpt-4o",
|
|
20
|
-
APIProvider.GROQ: "deepseek-r1-distill-llama-70b",
|
|
21
|
-
APIProvider.QWEN: "qwen2.5-vl-72b-instruct",
|
|
22
|
-
}
|
|
23
|
-
|
|
24
|
-
# Environment variable names for each provider
|
|
25
|
-
PROVIDER_TO_ENV_VAR: Dict[APIProvider, str] = {
|
|
26
|
-
APIProvider.ANTHROPIC: "ANTHROPIC_API_KEY",
|
|
27
|
-
APIProvider.OPENAI: "OPENAI_API_KEY",
|
|
28
|
-
APIProvider.GROQ: "GROQ_API_KEY",
|
|
29
|
-
APIProvider.QWEN: "QWEN_API_KEY",
|
|
30
|
-
}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|