cua-agent 0.1.1.tar.gz → 0.1.2.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- {cua_agent-0.1.1 → cua_agent-0.1.2}/PKG-INFO +1 -1
- {cua_agent-0.1.1 → cua_agent-0.1.2}/README.md +6 -6
- cua_agent-0.1.2/agent/__init__.py +10 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/core/README.md +2 -2
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/core/agent.py +27 -30
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/providers/anthropic/__init__.py +2 -2
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/providers/anthropic/api/client.py +43 -46
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/providers/anthropic/loop.py +2 -2
- cua_agent-0.1.2/agent/providers/anthropic/types.py +16 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/providers/omni/__init__.py +2 -2
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/providers/omni/loop.py +12 -12
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/providers/omni/prompts.py +0 -14
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/providers/omni/types.py +3 -4
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/types/base.py +2 -1
- {cua_agent-0.1.1 → cua_agent-0.1.2}/pyproject.toml +3 -3
- {cua_agent-0.1.1 → cua_agent-0.1.2}/tests/test_agent.py +3 -3
- cua_agent-0.1.1/agent/__init__.py +0 -10
- cua_agent-0.1.1/agent/providers/anthropic/types.py +0 -16
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/README.md +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/core/__init__.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/core/base_agent.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/core/callbacks.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/core/computer_agent.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/core/experiment.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/core/factory.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/core/loop.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/core/messages.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/core/tools/__init__.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/core/tools/base.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/core/tools/bash.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/core/tools/collection.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/core/tools/computer.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/core/tools/edit.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/core/tools/manager.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/providers/__init__.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/providers/anthropic/api/logging.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/providers/anthropic/callbacks/manager.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/providers/anthropic/messages/manager.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/providers/anthropic/prompts.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/providers/anthropic/tools/__init__.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/providers/anthropic/tools/base.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/providers/anthropic/tools/bash.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/providers/anthropic/tools/collection.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/providers/anthropic/tools/computer.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/providers/anthropic/tools/edit.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/providers/anthropic/tools/manager.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/providers/anthropic/tools/run.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/providers/omni/callbacks.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/providers/omni/clients/anthropic.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/providers/omni/clients/base.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/providers/omni/clients/groq.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/providers/omni/clients/openai.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/providers/omni/clients/utils.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/providers/omni/experiment.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/providers/omni/image_utils.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/providers/omni/messages.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/providers/omni/parser.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/providers/omni/tool_manager.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/providers/omni/tools/__init__.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/providers/omni/tools/bash.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/providers/omni/tools/computer.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/providers/omni/tools/manager.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/providers/omni/utils.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/providers/omni/visualization.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/types/__init__.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/types/messages.py +0 -0
- {cua_agent-0.1.1 → cua_agent-0.1.2}/agent/types/tools.py +0 -0
|
@@ -20,19 +20,19 @@
|
|
|
20
20
|
### Get started with Agent
|
|
21
21
|
|
|
22
22
|
```python
|
|
23
|
-
from agent import ComputerAgent,
|
|
23
|
+
from agent import ComputerAgent, AgentLoop, LLMProvider
|
|
24
24
|
from computer import Computer
|
|
25
25
|
|
|
26
26
|
computer = Computer(verbosity=logging.INFO)
|
|
27
27
|
|
|
28
28
|
agent = ComputerAgent(
|
|
29
29
|
computer=computer,
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
model=
|
|
30
|
+
loop=AgentLoop.ANTHROPIC,
|
|
31
|
+
# loop=AgentLoop.OMNI,
|
|
32
|
+
model=LLM(provider=LLMProvider.ANTHROPIC, name="claude-3-7-sonnet-20250219"),
|
|
33
|
+
# model=LLM(provider=LLMProvider.OPENAI, name="gpt-4.5-preview"),
|
|
34
34
|
save_trajectory=True,
|
|
35
|
-
trajectory_dir=str(Path("trajectories")
|
|
35
|
+
trajectory_dir=str(Path("trajectories")),
|
|
36
36
|
only_n_most_recent_images=3,
|
|
37
37
|
verbosity=logging.INFO,
|
|
38
38
|
)
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"""CUA (Computer Use) Agent for AI-driven computer interaction."""
|
|
2
|
+
|
|
3
|
+
__version__ = "0.1.0"
|
|
4
|
+
|
|
5
|
+
from .core.factory import AgentFactory
|
|
6
|
+
from .core.agent import ComputerAgent
|
|
7
|
+
from .providers.omni.types import LLMProvider, LLM
|
|
8
|
+
from .types.base import Provider, AgentLoop
|
|
9
|
+
|
|
10
|
+
__all__ = ["AgentFactory", "Provider", "ComputerAgent", "AgentLoop", "LLMProvider", "LLM"]
|
|
@@ -34,7 +34,7 @@ Here's how to use the unified ComputerAgent:
|
|
|
34
34
|
```python
|
|
35
35
|
from agent.core.agent import ComputerAgent
|
|
36
36
|
from agent.types.base import AgenticLoop
|
|
37
|
-
from agent.providers.omni.types import
|
|
37
|
+
from agent.providers.omni.types import LLMProvider
|
|
38
38
|
from computer import Computer
|
|
39
39
|
|
|
40
40
|
# Create a Computer instance
|
|
@@ -44,7 +44,7 @@ computer = Computer()
|
|
|
44
44
|
agent = ComputerAgent(
|
|
45
45
|
computer=computer,
|
|
46
46
|
loop_type=AgenticLoop.OMNI,
|
|
47
|
-
provider=
|
|
47
|
+
provider=LLMProvider.OPENAI,
|
|
48
48
|
model="gpt-4o",
|
|
49
49
|
api_key="your_api_key_here", # Can also use OPENAI_API_KEY environment variable
|
|
50
50
|
save_trajectory=True,
|
|
@@ -8,7 +8,7 @@ from datetime import datetime
|
|
|
8
8
|
|
|
9
9
|
from computer import Computer
|
|
10
10
|
|
|
11
|
-
from ..types.base import Provider, AgenticLoop
|
|
11
|
+
from ..types.base import Provider, AgentLoop
|
|
12
12
|
from .base_agent import BaseComputerAgent
|
|
13
13
|
|
|
14
14
|
# Only import types for type checking to avoid circular imports
|
|
@@ -18,7 +18,7 @@ if TYPE_CHECKING:
|
|
|
18
18
|
from ..providers.omni.parser import OmniParser
|
|
19
19
|
|
|
20
20
|
# Import the provider types
|
|
21
|
-
from ..providers.omni.types import LLMProvider, LLM, Model, LLMModel
|
|
21
|
+
from ..providers.omni.types import LLMProvider, LLM, Model, LLMModel
|
|
22
22
|
|
|
23
23
|
logger = logging.getLogger(__name__)
|
|
24
24
|
|
|
@@ -47,7 +47,7 @@ class ComputerAgent(BaseComputerAgent):
|
|
|
47
47
|
def __init__(
|
|
48
48
|
self,
|
|
49
49
|
computer: Computer,
|
|
50
|
-
|
|
50
|
+
loop: AgentLoop = AgentLoop.OMNI,
|
|
51
51
|
model: Optional[Union[LLM, Dict[str, str], str]] = None,
|
|
52
52
|
api_key: Optional[str] = None,
|
|
53
53
|
save_trajectory: bool = True,
|
|
@@ -61,12 +61,12 @@ class ComputerAgent(BaseComputerAgent):
|
|
|
61
61
|
|
|
62
62
|
Args:
|
|
63
63
|
computer: Computer instance to control
|
|
64
|
-
|
|
64
|
+
loop: The type of loop to use (Anthropic or Omni)
|
|
65
65
|
model: LLM configuration. Can be:
|
|
66
66
|
- LLM object with provider and name
|
|
67
67
|
- Dict with 'provider' and 'name' keys
|
|
68
68
|
- String with model name (defaults to OpenAI provider)
|
|
69
|
-
- None (defaults based on
|
|
69
|
+
- None (defaults based on loop)
|
|
70
70
|
api_key: Optional API key (will use environment variable if not provided)
|
|
71
71
|
save_trajectory: Whether to save screenshots and logs
|
|
72
72
|
trajectory_dir: Directory to save trajectories (defaults to "trajectories")
|
|
@@ -89,7 +89,7 @@ class ComputerAgent(BaseComputerAgent):
|
|
|
89
89
|
**kwargs,
|
|
90
90
|
)
|
|
91
91
|
|
|
92
|
-
self.loop_type =
|
|
92
|
+
self.loop_type = loop
|
|
93
93
|
self.save_trajectory = save_trajectory
|
|
94
94
|
self.trajectory_dir = trajectory_dir
|
|
95
95
|
self.only_n_most_recent_images = only_n_most_recent_images
|
|
@@ -100,13 +100,13 @@ class ComputerAgent(BaseComputerAgent):
|
|
|
100
100
|
self._configure_logging(verbosity)
|
|
101
101
|
|
|
102
102
|
# Process model configuration
|
|
103
|
-
self.model_config = self._process_model_config(model,
|
|
104
|
-
|
|
103
|
+
self.model_config = self._process_model_config(model, loop)
|
|
104
|
+
|
|
105
105
|
# Get API key from environment if not provided
|
|
106
106
|
if api_key is None:
|
|
107
107
|
env_var = (
|
|
108
|
-
ENV_VARS.get(self.model_config.provider)
|
|
109
|
-
if
|
|
108
|
+
ENV_VARS.get(self.model_config.provider)
|
|
109
|
+
if loop == AgentLoop.OMNI
|
|
110
110
|
else "ANTHROPIC_API_KEY"
|
|
111
111
|
)
|
|
112
112
|
if not env_var:
|
|
@@ -127,16 +127,16 @@ class ComputerAgent(BaseComputerAgent):
|
|
|
127
127
|
|
|
128
128
|
# Initialize the appropriate loop based on loop_type
|
|
129
129
|
self.loop = self._init_loop()
|
|
130
|
-
|
|
130
|
+
|
|
131
131
|
def _process_model_config(
|
|
132
|
-
self, model_input: Optional[Union[LLM, Dict[str, str], str]],
|
|
132
|
+
self, model_input: Optional[Union[LLM, Dict[str, str], str]], loop: AgentLoop
|
|
133
133
|
) -> LLM:
|
|
134
134
|
"""Process and normalize model configuration.
|
|
135
|
-
|
|
135
|
+
|
|
136
136
|
Args:
|
|
137
137
|
model_input: Input model configuration (LLM, dict, string, or None)
|
|
138
|
-
|
|
139
|
-
|
|
138
|
+
loop: The loop type being used
|
|
139
|
+
|
|
140
140
|
Returns:
|
|
141
141
|
Normalized LLM instance
|
|
142
142
|
"""
|
|
@@ -144,31 +144,28 @@ class ComputerAgent(BaseComputerAgent):
|
|
|
144
144
|
if model_input is None:
|
|
145
145
|
# Use Anthropic for Anthropic loop, OpenAI for Omni loop
|
|
146
146
|
default_provider = (
|
|
147
|
-
LLMProvider.ANTHROPIC if
|
|
147
|
+
LLMProvider.ANTHROPIC if loop == AgentLoop.ANTHROPIC else LLMProvider.OPENAI
|
|
148
148
|
)
|
|
149
149
|
return LLM(provider=default_provider)
|
|
150
|
-
|
|
150
|
+
|
|
151
151
|
# Handle case where model_input is already a LLM or one of its aliases
|
|
152
152
|
if isinstance(model_input, (LLM, Model, LLMModel)):
|
|
153
153
|
return model_input
|
|
154
|
-
|
|
154
|
+
|
|
155
155
|
# Handle case where model_input is a dict
|
|
156
156
|
if isinstance(model_input, dict):
|
|
157
157
|
provider = model_input.get("provider", LLMProvider.OPENAI)
|
|
158
158
|
if isinstance(provider, str):
|
|
159
159
|
provider = LLMProvider(provider)
|
|
160
|
-
return LLM(
|
|
161
|
-
|
|
162
|
-
name=model_input.get("name")
|
|
163
|
-
)
|
|
164
|
-
|
|
160
|
+
return LLM(provider=provider, name=model_input.get("name"))
|
|
161
|
+
|
|
165
162
|
# Handle case where model_input is a string (model name)
|
|
166
163
|
if isinstance(model_input, str):
|
|
167
164
|
default_provider = (
|
|
168
|
-
LLMProvider.ANTHROPIC if
|
|
165
|
+
LLMProvider.ANTHROPIC if loop == AgentLoop.ANTHROPIC else LLMProvider.OPENAI
|
|
169
166
|
)
|
|
170
167
|
return LLM(provider=default_provider, name=model_input)
|
|
171
|
-
|
|
168
|
+
|
|
172
169
|
raise ValueError(f"Unsupported model configuration: {model_input}")
|
|
173
170
|
|
|
174
171
|
def _configure_logging(self, verbosity: int):
|
|
@@ -199,12 +196,12 @@ class ComputerAgent(BaseComputerAgent):
|
|
|
199
196
|
from ..providers.omni.loop import OmniLoop
|
|
200
197
|
from ..providers.omni.parser import OmniParser
|
|
201
198
|
|
|
202
|
-
if self.loop_type ==
|
|
199
|
+
if self.loop_type == AgentLoop.ANTHROPIC:
|
|
203
200
|
from ..providers.anthropic.loop import AnthropicLoop
|
|
204
201
|
|
|
205
202
|
# Ensure we always have a valid model name
|
|
206
203
|
model_name = self.model_config.name or DEFAULT_MODELS[LLMProvider.ANTHROPIC]
|
|
207
|
-
|
|
204
|
+
|
|
208
205
|
return AnthropicLoop(
|
|
209
206
|
api_key=self.api_key,
|
|
210
207
|
model=model_name,
|
|
@@ -221,7 +218,7 @@ class ComputerAgent(BaseComputerAgent):
|
|
|
221
218
|
|
|
222
219
|
# Ensure we always have a valid model name
|
|
223
220
|
model_name = self.model_config.name or DEFAULT_MODELS[self.model_config.provider]
|
|
224
|
-
|
|
221
|
+
|
|
225
222
|
return OmniLoop(
|
|
226
223
|
provider=self.model_config.provider,
|
|
227
224
|
api_key=self.api_key,
|
|
@@ -244,7 +241,7 @@ class ComputerAgent(BaseComputerAgent):
|
|
|
244
241
|
"""
|
|
245
242
|
try:
|
|
246
243
|
# Format the messages based on loop type
|
|
247
|
-
if self.loop_type ==
|
|
244
|
+
if self.loop_type == AgentLoop.ANTHROPIC:
|
|
248
245
|
# Anthropic format
|
|
249
246
|
messages = [{"role": "user", "content": [{"type": "text", "text": task}]}]
|
|
250
247
|
else:
|
|
@@ -267,7 +264,7 @@ class ComputerAgent(BaseComputerAgent):
|
|
|
267
264
|
continue
|
|
268
265
|
|
|
269
266
|
# Extract content and metadata based on loop type
|
|
270
|
-
if self.loop_type ==
|
|
267
|
+
if self.loop_type == AgentLoop.ANTHROPIC:
|
|
271
268
|
# Handle Anthropic format
|
|
272
269
|
if "content" in result:
|
|
273
270
|
content_text = ""
|
|
@@ -3,25 +3,28 @@ import httpx
|
|
|
3
3
|
import asyncio
|
|
4
4
|
from anthropic import Anthropic, AnthropicBedrock, AnthropicVertex
|
|
5
5
|
from anthropic.types.beta import BetaMessage, BetaMessageParam, BetaToolUnionParam
|
|
6
|
-
from ..types import APIProvider
|
|
6
|
+
from ..types import LLMProvider
|
|
7
7
|
from .logging import log_api_interaction
|
|
8
8
|
import random
|
|
9
9
|
import logging
|
|
10
10
|
|
|
11
11
|
logger = logging.getLogger(__name__)
|
|
12
12
|
|
|
13
|
+
|
|
13
14
|
class APIConnectionError(Exception):
|
|
14
15
|
"""Error raised when there are connection issues with the API."""
|
|
16
|
+
|
|
15
17
|
pass
|
|
16
18
|
|
|
19
|
+
|
|
17
20
|
class BaseAnthropicClient:
|
|
18
21
|
"""Base class for Anthropic API clients."""
|
|
19
|
-
|
|
22
|
+
|
|
20
23
|
MAX_RETRIES = 10
|
|
21
24
|
INITIAL_RETRY_DELAY = 1.0
|
|
22
25
|
MAX_RETRY_DELAY = 60.0
|
|
23
26
|
JITTER_FACTOR = 0.1
|
|
24
|
-
|
|
27
|
+
|
|
25
28
|
async def create_message(
|
|
26
29
|
self,
|
|
27
30
|
*,
|
|
@@ -36,79 +39,67 @@ class BaseAnthropicClient:
|
|
|
36
39
|
|
|
37
40
|
async def _make_api_call_with_retries(self, api_call):
|
|
38
41
|
"""Make an API call with exponential backoff retry logic.
|
|
39
|
-
|
|
42
|
+
|
|
40
43
|
Args:
|
|
41
44
|
api_call: Async function that makes the actual API call
|
|
42
|
-
|
|
45
|
+
|
|
43
46
|
Returns:
|
|
44
47
|
API response
|
|
45
|
-
|
|
48
|
+
|
|
46
49
|
Raises:
|
|
47
50
|
APIConnectionError: If all retries fail
|
|
48
51
|
"""
|
|
49
52
|
retry_count = 0
|
|
50
53
|
last_error = None
|
|
51
|
-
|
|
54
|
+
|
|
52
55
|
while retry_count < self.MAX_RETRIES:
|
|
53
56
|
try:
|
|
54
57
|
return await api_call()
|
|
55
58
|
except Exception as e:
|
|
56
59
|
last_error = e
|
|
57
60
|
retry_count += 1
|
|
58
|
-
|
|
61
|
+
|
|
59
62
|
if retry_count == self.MAX_RETRIES:
|
|
60
63
|
break
|
|
61
|
-
|
|
64
|
+
|
|
62
65
|
# Calculate delay with exponential backoff and jitter
|
|
63
66
|
delay = min(
|
|
64
|
-
self.INITIAL_RETRY_DELAY * (2 ** (retry_count - 1)),
|
|
65
|
-
self.MAX_RETRY_DELAY
|
|
67
|
+
self.INITIAL_RETRY_DELAY * (2 ** (retry_count - 1)), self.MAX_RETRY_DELAY
|
|
66
68
|
)
|
|
67
69
|
# Add jitter to avoid thundering herd
|
|
68
70
|
jitter = delay * self.JITTER_FACTOR * (2 * random.random() - 1)
|
|
69
71
|
final_delay = delay + jitter
|
|
70
|
-
|
|
72
|
+
|
|
71
73
|
logger.info(
|
|
72
74
|
f"Retrying request (attempt {retry_count}/{self.MAX_RETRIES}) "
|
|
73
75
|
f"in {final_delay:.2f} seconds after error: {str(e)}"
|
|
74
76
|
)
|
|
75
77
|
await asyncio.sleep(final_delay)
|
|
76
|
-
|
|
78
|
+
|
|
77
79
|
raise APIConnectionError(
|
|
78
|
-
f"Failed after {self.MAX_RETRIES} retries. "
|
|
79
|
-
f"Last error: {str(last_error)}"
|
|
80
|
+
f"Failed after {self.MAX_RETRIES} retries. " f"Last error: {str(last_error)}"
|
|
80
81
|
)
|
|
81
82
|
|
|
83
|
+
|
|
82
84
|
class AnthropicDirectClient(BaseAnthropicClient):
|
|
83
85
|
"""Direct Anthropic API client implementation."""
|
|
84
|
-
|
|
86
|
+
|
|
85
87
|
def __init__(self, api_key: str, model: str):
|
|
86
88
|
self.model = model
|
|
87
|
-
self.client = Anthropic(
|
|
88
|
-
|
|
89
|
-
http_client=self._create_http_client()
|
|
90
|
-
)
|
|
91
|
-
|
|
89
|
+
self.client = Anthropic(api_key=api_key, http_client=self._create_http_client())
|
|
90
|
+
|
|
92
91
|
def _create_http_client(self) -> httpx.Client:
|
|
93
92
|
"""Create an HTTP client with appropriate settings."""
|
|
94
93
|
return httpx.Client(
|
|
95
94
|
verify=True,
|
|
96
|
-
timeout=httpx.Timeout(
|
|
97
|
-
connect=30.0,
|
|
98
|
-
read=300.0,
|
|
99
|
-
write=30.0,
|
|
100
|
-
pool=30.0
|
|
101
|
-
),
|
|
95
|
+
timeout=httpx.Timeout(connect=30.0, read=300.0, write=30.0, pool=30.0),
|
|
102
96
|
transport=httpx.HTTPTransport(
|
|
103
97
|
retries=3,
|
|
104
98
|
verify=True,
|
|
105
|
-
limits=httpx.Limits(
|
|
106
|
-
|
|
107
|
-
max_connections=10
|
|
108
|
-
)
|
|
109
|
-
)
|
|
99
|
+
limits=httpx.Limits(max_keepalive_connections=5, max_connections=10),
|
|
100
|
+
),
|
|
110
101
|
)
|
|
111
|
-
|
|
102
|
+
|
|
112
103
|
async def create_message(
|
|
113
104
|
self,
|
|
114
105
|
*,
|
|
@@ -119,6 +110,7 @@ class AnthropicDirectClient(BaseAnthropicClient):
|
|
|
119
110
|
betas: list[str],
|
|
120
111
|
) -> BetaMessage:
|
|
121
112
|
"""Create a message using the direct Anthropic API with retry logic."""
|
|
113
|
+
|
|
122
114
|
async def api_call():
|
|
123
115
|
response = self.client.beta.messages.with_raw_response.create(
|
|
124
116
|
max_tokens=max_tokens,
|
|
@@ -130,20 +122,21 @@ class AnthropicDirectClient(BaseAnthropicClient):
|
|
|
130
122
|
)
|
|
131
123
|
log_api_interaction(response.http_response.request, response.http_response, None)
|
|
132
124
|
return response.parse()
|
|
133
|
-
|
|
125
|
+
|
|
134
126
|
try:
|
|
135
127
|
return await self._make_api_call_with_retries(api_call)
|
|
136
128
|
except Exception as e:
|
|
137
129
|
log_api_interaction(None, None, e)
|
|
138
130
|
raise
|
|
139
131
|
|
|
132
|
+
|
|
140
133
|
class AnthropicVertexClient(BaseAnthropicClient):
|
|
141
134
|
"""Google Cloud Vertex AI implementation of Anthropic client."""
|
|
142
|
-
|
|
135
|
+
|
|
143
136
|
def __init__(self, model: str):
|
|
144
137
|
self.model = model
|
|
145
138
|
self.client = AnthropicVertex()
|
|
146
|
-
|
|
139
|
+
|
|
147
140
|
async def create_message(
|
|
148
141
|
self,
|
|
149
142
|
*,
|
|
@@ -154,6 +147,7 @@ class AnthropicVertexClient(BaseAnthropicClient):
|
|
|
154
147
|
betas: list[str],
|
|
155
148
|
) -> BetaMessage:
|
|
156
149
|
"""Create a message using Vertex AI with retry logic."""
|
|
150
|
+
|
|
157
151
|
async def api_call():
|
|
158
152
|
response = self.client.beta.messages.with_raw_response.create(
|
|
159
153
|
max_tokens=max_tokens,
|
|
@@ -165,20 +159,21 @@ class AnthropicVertexClient(BaseAnthropicClient):
|
|
|
165
159
|
)
|
|
166
160
|
log_api_interaction(response.http_response.request, response.http_response, None)
|
|
167
161
|
return response.parse()
|
|
168
|
-
|
|
162
|
+
|
|
169
163
|
try:
|
|
170
164
|
return await self._make_api_call_with_retries(api_call)
|
|
171
165
|
except Exception as e:
|
|
172
166
|
log_api_interaction(None, None, e)
|
|
173
167
|
raise
|
|
174
168
|
|
|
169
|
+
|
|
175
170
|
class AnthropicBedrockClient(BaseAnthropicClient):
|
|
176
171
|
"""AWS Bedrock implementation of Anthropic client."""
|
|
177
|
-
|
|
172
|
+
|
|
178
173
|
def __init__(self, model: str):
|
|
179
174
|
self.model = model
|
|
180
175
|
self.client = AnthropicBedrock()
|
|
181
|
-
|
|
176
|
+
|
|
182
177
|
async def create_message(
|
|
183
178
|
self,
|
|
184
179
|
*,
|
|
@@ -189,6 +184,7 @@ class AnthropicBedrockClient(BaseAnthropicClient):
|
|
|
189
184
|
betas: list[str],
|
|
190
185
|
) -> BetaMessage:
|
|
191
186
|
"""Create a message using AWS Bedrock with retry logic."""
|
|
187
|
+
|
|
192
188
|
async def api_call():
|
|
193
189
|
response = self.client.beta.messages.with_raw_response.create(
|
|
194
190
|
max_tokens=max_tokens,
|
|
@@ -200,23 +196,24 @@ class AnthropicBedrockClient(BaseAnthropicClient):
|
|
|
200
196
|
)
|
|
201
197
|
log_api_interaction(response.http_response.request, response.http_response, None)
|
|
202
198
|
return response.parse()
|
|
203
|
-
|
|
199
|
+
|
|
204
200
|
try:
|
|
205
201
|
return await self._make_api_call_with_retries(api_call)
|
|
206
202
|
except Exception as e:
|
|
207
203
|
log_api_interaction(None, None, e)
|
|
208
204
|
raise
|
|
209
205
|
|
|
206
|
+
|
|
210
207
|
class AnthropicClientFactory:
|
|
211
208
|
"""Factory for creating appropriate Anthropic client implementations."""
|
|
212
|
-
|
|
209
|
+
|
|
213
210
|
@staticmethod
|
|
214
|
-
def create_client(provider: APIProvider, api_key: str, model: str) -> BaseAnthropicClient:
|
|
211
|
+
def create_client(provider: LLMProvider, api_key: str, model: str) -> BaseAnthropicClient:
|
|
215
212
|
"""Create an appropriate client based on the provider."""
|
|
216
|
-
if provider == APIProvider.ANTHROPIC:
|
|
213
|
+
if provider == LLMProvider.ANTHROPIC:
|
|
217
214
|
return AnthropicDirectClient(api_key, model)
|
|
218
|
-
elif provider == APIProvider.VERTEX:
|
|
215
|
+
elif provider == LLMProvider.VERTEX:
|
|
219
216
|
return AnthropicVertexClient(model)
|
|
220
|
-
elif provider == APIProvider.BEDROCK:
|
|
217
|
+
elif provider == LLMProvider.BEDROCK:
|
|
221
218
|
return AnthropicBedrockClient(model)
|
|
222
|
-
raise ValueError(f"Unsupported provider: {provider}")
|
|
219
|
+
raise ValueError(f"Unsupported provider: {provider}")
|
|
@@ -32,7 +32,7 @@ from .tools.manager import ToolManager
|
|
|
32
32
|
from .messages.manager import MessageManager
|
|
33
33
|
from .callbacks.manager import CallbackManager
|
|
34
34
|
from .prompts import SYSTEM_PROMPT
|
|
35
|
-
from .types import APIProvider
|
|
35
|
+
from .types import LLMProvider
|
|
36
36
|
from .tools import ToolResult
|
|
37
37
|
|
|
38
38
|
# Constants
|
|
@@ -86,7 +86,7 @@ class AnthropicLoop(BaseLoop):
|
|
|
86
86
|
self.model = "claude-3-7-sonnet-20250219"
|
|
87
87
|
|
|
88
88
|
# Anthropic-specific attributes
|
|
89
|
-
self.provider = APIProvider.ANTHROPIC
|
|
89
|
+
self.provider = LLMProvider.ANTHROPIC
|
|
90
90
|
self.client = None
|
|
91
91
|
self.retry_count = 0
|
|
92
92
|
self.tool_manager = None
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from enum import StrEnum
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class LLMProvider(StrEnum):
|
|
5
|
+
"""Enum for supported API providers."""
|
|
6
|
+
|
|
7
|
+
ANTHROPIC = "anthropic"
|
|
8
|
+
BEDROCK = "bedrock"
|
|
9
|
+
VERTEX = "vertex"
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
PROVIDER_TO_DEFAULT_MODEL_NAME: dict[LLMProvider, str] = {
|
|
13
|
+
LLMProvider.ANTHROPIC: "claude-3-7-sonnet-20250219",
|
|
14
|
+
LLMProvider.BEDROCK: "anthropic.claude-3-7-sonnet-20250219-v2:0",
|
|
15
|
+
LLMProvider.VERTEX: "claude-3-5-sonnet-v2@20241022",
|
|
16
|
+
}
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
# The OmniComputerAgent has been replaced by the unified ComputerAgent
|
|
4
4
|
# which can be found in agent.core.agent
|
|
5
|
-
from .types import
|
|
5
|
+
from .types import LLMProvider
|
|
6
6
|
from .experiment import ExperimentManager
|
|
7
7
|
from .visualization import visualize_click, visualize_scroll, calculate_element_center
|
|
8
8
|
from .image_utils import (
|
|
@@ -14,7 +14,7 @@ from .image_utils import (
|
|
|
14
14
|
)
|
|
15
15
|
|
|
16
16
|
__all__ = [
|
|
17
|
-
"
|
|
17
|
+
"LLMProvider",
|
|
18
18
|
"ExperimentManager",
|
|
19
19
|
"visualize_click",
|
|
20
20
|
"visualize_scroll",
|
|
@@ -17,7 +17,7 @@ import copy
|
|
|
17
17
|
from .parser import OmniParser, ParseResult, ParserMetadata, UIElement
|
|
18
18
|
from ...core.loop import BaseLoop
|
|
19
19
|
from computer import Computer
|
|
20
|
-
from .types import
|
|
20
|
+
from .types import LLMProvider
|
|
21
21
|
from .clients.base import BaseOmniClient
|
|
22
22
|
from .clients.openai import OpenAIClient
|
|
23
23
|
from .clients.groq import GroqClient
|
|
@@ -46,7 +46,7 @@ class OmniLoop(BaseLoop):
|
|
|
46
46
|
def __init__(
|
|
47
47
|
self,
|
|
48
48
|
parser: OmniParser,
|
|
49
|
-
provider:
|
|
49
|
+
provider: LLMProvider,
|
|
50
50
|
api_key: str,
|
|
51
51
|
model: str,
|
|
52
52
|
computer: Computer,
|
|
@@ -180,11 +180,11 @@ class OmniLoop(BaseLoop):
|
|
|
180
180
|
try:
|
|
181
181
|
logger.info(f"Initializing {self.provider} client with model {self.model}...")
|
|
182
182
|
|
|
183
|
-
if self.provider ==
|
|
183
|
+
if self.provider == LLMProvider.OPENAI:
|
|
184
184
|
self.client = OpenAIClient(api_key=self.api_key, model=self.model)
|
|
185
|
-
elif self.provider ==
|
|
185
|
+
elif self.provider == LLMProvider.GROQ:
|
|
186
186
|
self.client = GroqClient(api_key=self.api_key, model=self.model)
|
|
187
|
-
elif self.provider ==
|
|
187
|
+
elif self.provider == LLMProvider.ANTHROPIC:
|
|
188
188
|
self.client = AnthropicClient(
|
|
189
189
|
api_key=self.api_key,
|
|
190
190
|
model=self.model,
|
|
@@ -228,7 +228,7 @@ class OmniLoop(BaseLoop):
|
|
|
228
228
|
prepared_messages = self.message_manager.get_formatted_messages(provider_name)
|
|
229
229
|
|
|
230
230
|
# Filter out system messages for Anthropic
|
|
231
|
-
if self.provider ==
|
|
231
|
+
if self.provider == LLMProvider.ANTHROPIC:
|
|
232
232
|
filtered_messages = [
|
|
233
233
|
msg for msg in prepared_messages if msg["role"] != "system"
|
|
234
234
|
]
|
|
@@ -238,7 +238,7 @@ class OmniLoop(BaseLoop):
|
|
|
238
238
|
# Log request
|
|
239
239
|
request_data = {"messages": filtered_messages, "max_tokens": self.max_tokens}
|
|
240
240
|
|
|
241
|
-
if self.provider ==
|
|
241
|
+
if self.provider == LLMProvider.ANTHROPIC:
|
|
242
242
|
request_data["system"] = self._get_system_prompt()
|
|
243
243
|
else:
|
|
244
244
|
request_data["system"] = system_prompt
|
|
@@ -255,7 +255,7 @@ class OmniLoop(BaseLoop):
|
|
|
255
255
|
|
|
256
256
|
if is_async:
|
|
257
257
|
# For async implementations (AnthropicClient)
|
|
258
|
-
if self.provider ==
|
|
258
|
+
if self.provider == LLMProvider.ANTHROPIC:
|
|
259
259
|
response = await run_method(
|
|
260
260
|
messages=filtered_messages,
|
|
261
261
|
system=self._get_system_prompt(),
|
|
@@ -269,7 +269,7 @@ class OmniLoop(BaseLoop):
|
|
|
269
269
|
)
|
|
270
270
|
else:
|
|
271
271
|
# For non-async implementations (GroqClient, etc.)
|
|
272
|
-
if self.provider ==
|
|
272
|
+
if self.provider == LLMProvider.ANTHROPIC:
|
|
273
273
|
response = run_method(
|
|
274
274
|
messages=filtered_messages,
|
|
275
275
|
system=self._get_system_prompt(),
|
|
@@ -339,7 +339,7 @@ class OmniLoop(BaseLoop):
|
|
|
339
339
|
action_screenshot_saved = False
|
|
340
340
|
try:
|
|
341
341
|
# Handle Anthropic response format
|
|
342
|
-
if self.provider ==
|
|
342
|
+
if self.provider == LLMProvider.ANTHROPIC:
|
|
343
343
|
if hasattr(response, "content") and isinstance(response.content, list):
|
|
344
344
|
# Extract text from content blocks
|
|
345
345
|
for block in response.content:
|
|
@@ -563,7 +563,7 @@ class OmniLoop(BaseLoop):
|
|
|
563
563
|
"""Process and add screen info to messages."""
|
|
564
564
|
try:
|
|
565
565
|
# Only add message if we have an image and provider supports it
|
|
566
|
-
if self.provider in [
|
|
566
|
+
if self.provider in [LLMProvider.OPENAI, LLMProvider.ANTHROPIC]:
|
|
567
567
|
image = parsed_screen.annotated_image_base64 or None
|
|
568
568
|
if image:
|
|
569
569
|
# Save screen info to current turn directory
|
|
@@ -577,7 +577,7 @@ class OmniLoop(BaseLoop):
|
|
|
577
577
|
logger.info(f"Saved elements to {elements_path}")
|
|
578
578
|
|
|
579
579
|
# Format the image content based on the provider
|
|
580
|
-
if self.provider ==
|
|
580
|
+
if self.provider == LLMProvider.ANTHROPIC:
|
|
581
581
|
# Compress the image before sending to Anthropic (5MB limit)
|
|
582
582
|
image_size = len(image)
|
|
583
583
|
logger.info(f"Image base64 is present, length: {image_size}")
|
|
@@ -62,17 +62,3 @@ IMPORTANT NOTES:
|
|
|
62
62
|
9. Reflect whether the element is clickable or not, for example reflect if it is an hyperlink or a button or a normal text.
|
|
63
63
|
10. If you are prompted with login information page or captcha page, or you think it need user's permission to do the next action, you should say "Action": "None" in the json field.
|
|
64
64
|
"""
|
|
65
|
-
|
|
66
|
-
# SYSTEM_PROMPT1 = """You are an AI assistant helping users interact with their computer.
|
|
67
|
-
# Analyze the screen information and respond with JSON containing:
|
|
68
|
-
# {
|
|
69
|
-
# "Box ID": "Numeric ID of the relevant UI element",
|
|
70
|
-
# "Action": "One of: left_click, right_click, double_click, move_cursor, drag_to, type_text, press_key, hotkey, scroll_down, scroll_up, wait",
|
|
71
|
-
# "Value": "Text to type, key to press",
|
|
72
|
-
# "Explanation": "Why this action was chosen"
|
|
73
|
-
# }
|
|
74
|
-
|
|
75
|
-
# Notes:
|
|
76
|
-
# - For starting applications, use the "hotkey" action with command+space for starting a Spotlight search.
|
|
77
|
-
# - Each UI element is highlighted with a colored bounding box, and its Box ID appears nearby in the same color for easy identification.
|
|
78
|
-
# """
|
|
@@ -14,17 +14,16 @@ class LLMProvider(StrEnum):
|
|
|
14
14
|
QWEN = "qwen"
|
|
15
15
|
|
|
16
16
|
|
|
17
|
-
|
|
18
|
-
APIProvider = LLMProvider
|
|
17
|
+
LLMProvider
|
|
19
18
|
|
|
20
19
|
|
|
21
20
|
@dataclass
|
|
22
21
|
class LLM:
|
|
23
22
|
"""Configuration for LLM model and provider."""
|
|
24
|
-
|
|
23
|
+
|
|
25
24
|
provider: LLMProvider
|
|
26
25
|
name: Optional[str] = None
|
|
27
|
-
|
|
26
|
+
|
|
28
27
|
def __post_init__(self):
|
|
29
28
|
"""Set default model name if not provided."""
|
|
30
29
|
if self.name is None:
|
|
@@ -44,9 +44,10 @@ class Annotation(BaseModel):
|
|
|
44
44
|
vm_url: str
|
|
45
45
|
|
|
46
46
|
|
|
47
|
-
class AgenticLoop(Enum):
|
|
47
|
+
class AgentLoop(Enum):
|
|
48
48
|
"""Enumeration of available loop types."""
|
|
49
49
|
|
|
50
50
|
ANTHROPIC = auto() # Anthropic implementation
|
|
51
|
+
OPENAI = auto() # OpenAI implementation
|
|
51
52
|
OMNI = auto() # OmniLoop implementation
|
|
52
53
|
# Add more loop types as needed
|
|
@@ -6,7 +6,7 @@ build-backend = "pdm.backend"
|
|
|
6
6
|
|
|
7
7
|
[project]
|
|
8
8
|
name = "cua-agent"
|
|
9
|
-
version = "0.1.1"
|
|
9
|
+
version = "0.1.2"
|
|
10
10
|
description = "CUA (Computer Use) Agent for AI-driven computer interaction"
|
|
11
11
|
authors = [
|
|
12
12
|
{ name = "TryCua", email = "gh@trycua.com" },
|
|
@@ -78,7 +78,7 @@ target-version = [
|
|
|
78
78
|
|
|
79
79
|
[tool.ruff]
|
|
80
80
|
line-length = 100
|
|
81
|
-
target-version = "0.1.1"
|
|
81
|
+
target-version = "0.1.2"
|
|
82
82
|
select = [
|
|
83
83
|
"E",
|
|
84
84
|
"F",
|
|
@@ -92,7 +92,7 @@ docstring-code-format = true
|
|
|
92
92
|
|
|
93
93
|
[tool.mypy]
|
|
94
94
|
strict = true
|
|
95
|
-
python_version = "0.1.1"
|
|
95
|
+
python_version = "0.1.2"
|
|
96
96
|
ignore_missing_imports = true
|
|
97
97
|
disallow_untyped_defs = true
|
|
98
98
|
check_untyped_defs = true
|
|
@@ -1,19 +1,19 @@
|
|
|
1
1
|
# """Basic tests for the agent package."""
|
|
2
2
|
|
|
3
3
|
# import pytest
|
|
4
|
-
# from agent import OmniComputerAgent,
|
|
4
|
+
# from agent import OmniComputerAgent, LLMProvider
|
|
5
5
|
# from agent.base.agent import BaseComputerAgent
|
|
6
6
|
# from computer import Computer
|
|
7
7
|
|
|
8
8
|
# def test_agent_import():
|
|
9
9
|
# """Test that we can import the OmniComputerAgent class."""
|
|
10
10
|
# assert OmniComputerAgent is not None
|
|
11
|
-
# assert
|
|
11
|
+
# assert LLMProvider is not None
|
|
12
12
|
|
|
13
13
|
# def test_agent_init():
|
|
14
14
|
# """Test that we can create an OmniComputerAgent instance."""
|
|
15
15
|
# agent = OmniComputerAgent(
|
|
16
|
-
# provider=
|
|
16
|
+
# provider=LLMProvider.OPENAI,
|
|
17
17
|
# use_host_computer_server=True
|
|
18
18
|
# )
|
|
19
19
|
# assert agent is not None
|
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
"""CUA (Computer Use) Agent for AI-driven computer interaction."""
|
|
2
|
-
|
|
3
|
-
__version__ = "0.1.0"
|
|
4
|
-
|
|
5
|
-
from .core.factory import AgentFactory
|
|
6
|
-
from .core.agent import ComputerAgent
|
|
7
|
-
from .types.base import Provider, AgenticLoop
|
|
8
|
-
from .providers.omni.types import LLMProvider, LLM, Model, LLMModel, APIProvider
|
|
9
|
-
|
|
10
|
-
__all__ = ["AgentFactory", "Provider", "ComputerAgent", "AgenticLoop", "LLMProvider", "LLM", "Model", "LLMModel", "APIProvider"]
|
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
from enum import StrEnum
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
class APIProvider(StrEnum):
|
|
5
|
-
"""Enum for supported API providers."""
|
|
6
|
-
|
|
7
|
-
ANTHROPIC = "anthropic"
|
|
8
|
-
BEDROCK = "bedrock"
|
|
9
|
-
VERTEX = "vertex"
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
PROVIDER_TO_DEFAULT_MODEL_NAME: dict[APIProvider, str] = {
|
|
13
|
-
APIProvider.ANTHROPIC: "claude-3-7-sonnet-20250219",
|
|
14
|
-
APIProvider.BEDROCK: "anthropic.claude-3-7-sonnet-20250219-v2:0",
|
|
15
|
-
APIProvider.VERTEX: "claude-3-5-sonnet-v2@20241022",
|
|
16
|
-
}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|