cua-agent 0.1.6__tar.gz → 0.1.17__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- {cua_agent-0.1.6 → cua_agent-0.1.17}/PKG-INFO +6 -36
- {cua_agent-0.1.6 → cua_agent-0.1.17}/README.md +5 -35
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/__init__.py +3 -2
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/core/__init__.py +0 -5
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/core/computer_agent.py +21 -28
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/core/loop.py +78 -124
- cua_agent-0.1.17/agent/core/messages.py +399 -0
- cua_agent-0.1.17/agent/core/types.py +35 -0
- cua_agent-0.1.17/agent/core/visualization.py +197 -0
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/anthropic/api/client.py +142 -1
- cua_agent-0.1.17/agent/providers/anthropic/api_handler.py +140 -0
- cua_agent-0.1.17/agent/providers/anthropic/callbacks/__init__.py +5 -0
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/anthropic/loop.py +206 -220
- cua_agent-0.1.17/agent/providers/anthropic/response_handler.py +229 -0
- cua_agent-0.1.17/agent/providers/anthropic/tools/bash.py +66 -0
- cua_agent-0.1.17/agent/providers/anthropic/utils.py +370 -0
- cua_agent-0.1.17/agent/providers/omni/__init__.py +8 -0
- cua_agent-0.1.17/agent/providers/omni/api_handler.py +42 -0
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/omni/clients/anthropic.py +4 -0
- cua_agent-0.1.17/agent/providers/omni/image_utils.py +34 -0
- cua_agent-0.1.17/agent/providers/omni/loop.py +855 -0
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/omni/parser.py +58 -4
- cua_agent-0.1.17/agent/providers/omni/tools/__init__.py +30 -0
- cua_agent-0.1.17/agent/providers/omni/tools/base.py +29 -0
- cua_agent-0.1.17/agent/providers/omni/tools/bash.py +74 -0
- cua_agent-0.1.17/agent/providers/omni/tools/computer.py +179 -0
- cua_agent-0.1.17/agent/providers/omni/tools/manager.py +61 -0
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/omni/types.py +0 -4
- cua_agent-0.1.17/agent/providers/omni/utils.py +236 -0
- {cua_agent-0.1.6 → cua_agent-0.1.17}/pyproject.toml +3 -3
- cua_agent-0.1.6/agent/core/messages.py +0 -245
- cua_agent-0.1.6/agent/providers/anthropic/tools/bash.py +0 -163
- cua_agent-0.1.6/agent/providers/omni/__init__.py +0 -27
- cua_agent-0.1.6/agent/providers/omni/callbacks.py +0 -78
- cua_agent-0.1.6/agent/providers/omni/clients/groq.py +0 -101
- cua_agent-0.1.6/agent/providers/omni/experiment.py +0 -276
- cua_agent-0.1.6/agent/providers/omni/image_utils.py +0 -106
- cua_agent-0.1.6/agent/providers/omni/loop.py +0 -971
- cua_agent-0.1.6/agent/providers/omni/messages.py +0 -171
- cua_agent-0.1.6/agent/providers/omni/tool_manager.py +0 -91
- cua_agent-0.1.6/agent/providers/omni/tools/__init__.py +0 -12
- cua_agent-0.1.6/agent/providers/omni/tools/bash.py +0 -69
- cua_agent-0.1.6/agent/providers/omni/tools/computer.py +0 -217
- cua_agent-0.1.6/agent/providers/omni/tools/manager.py +0 -81
- cua_agent-0.1.6/agent/providers/omni/utils.py +0 -157
- cua_agent-0.1.6/agent/providers/omni/visualization.py +0 -130
- cua_agent-0.1.6/agent/types/__init__.py +0 -23
- cua_agent-0.1.6/agent/types/base.py +0 -41
- cua_agent-0.1.6/agent/types/messages.py +0 -36
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/README.md +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/core/README.md +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/core/callbacks.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/core/experiment.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/core/telemetry.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/core/tools/__init__.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/core/tools/base.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/core/tools/bash.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/core/tools/collection.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/core/tools/computer.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/core/tools/edit.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/core/tools/manager.py +0 -0
- {cua_agent-0.1.6/agent/types → cua_agent-0.1.17/agent/core}/tools.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/__init__.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/anthropic/__init__.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/anthropic/api/logging.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/anthropic/callbacks/manager.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/anthropic/messages/manager.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/anthropic/prompts.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/anthropic/tools/__init__.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/anthropic/tools/base.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/anthropic/tools/collection.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/anthropic/tools/computer.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/anthropic/tools/edit.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/anthropic/tools/manager.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/anthropic/tools/run.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/anthropic/types.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/omni/clients/base.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/omni/clients/openai.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/omni/clients/utils.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/omni/prompts.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/telemetry.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.17}/tests/test_agent.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: cua-agent
|
|
3
|
-
Version: 0.1.6
|
|
3
|
+
Version: 0.1.17
|
|
4
4
|
Summary: CUA (Computer Use) Agent for AI-driven computer interaction
|
|
5
5
|
Author-Email: TryCua <gh@trycua.com>
|
|
6
6
|
Requires-Python: <3.13,>=3.10
|
|
@@ -63,43 +63,13 @@ Description-Content-Type: text/markdown
|
|
|
63
63
|
|
|
64
64
|
**Agent** is a Computer Use (CUA) framework for running multi-app agentic workflows targeting macOS and Linux sandbox, supporting local (Ollama) and cloud model providers (OpenAI, Anthropic, Groq, DeepSeek, Qwen). The framework integrates with Microsoft's OmniParser for enhanced UI understanding and interaction.
|
|
65
65
|
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
```python
|
|
69
|
-
from agent import ComputerAgent, AgentLoop, LLMProvider
|
|
70
|
-
from computer import Computer
|
|
71
|
-
|
|
72
|
-
computer = Computer(verbosity=logging.INFO)
|
|
73
|
-
|
|
74
|
-
agent = ComputerAgent(
|
|
75
|
-
computer=computer,
|
|
76
|
-
loop=AgentLoop.ANTHROPIC,
|
|
77
|
-
# loop=AgentLoop.OMNI,
|
|
78
|
-
model=LLM(provider=LLMProvider.ANTHROPIC, name="claude-3-7-sonnet-20250219"),
|
|
79
|
-
# model=LLM(provider=LLMProvider.OPENAI, name="gpt-4.5-preview"),
|
|
80
|
-
save_trajectory=True,
|
|
81
|
-
trajectory_dir=str(Path("trajectories")),
|
|
82
|
-
only_n_most_recent_images=3,
|
|
83
|
-
verbosity=logging.INFO,
|
|
84
|
-
)
|
|
66
|
+
> While our north star is to create a 1-click experience, this preview of Agent might be still a bit rough around the edges. We appreciate your patience as we work to improve the experience.
|
|
85
67
|
|
|
86
|
-
|
|
87
|
-
"""
|
|
88
|
-
Please help me with the following task:
|
|
89
|
-
1. Open Safari browser
|
|
90
|
-
2. Go to Wikipedia.org
|
|
91
|
-
3. Search for "Claude AI"
|
|
92
|
-
4. Summarize the main points you find about Claude AI
|
|
93
|
-
"""
|
|
94
|
-
]
|
|
68
|
+
### Get started with Agent
|
|
95
69
|
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
async for result in agent.run(task):
|
|
100
|
-
print(result)
|
|
101
|
-
print(f"Task {i} completed")
|
|
102
|
-
```
|
|
70
|
+
<div align="center">
|
|
71
|
+
<img src="../../img/agent.png"/>
|
|
72
|
+
</div>
|
|
103
73
|
|
|
104
74
|
## Install
|
|
105
75
|
|
|
@@ -17,43 +17,13 @@
|
|
|
17
17
|
|
|
18
18
|
**Agent** is a Computer Use (CUA) framework for running multi-app agentic workflows targeting macOS and Linux sandbox, supporting local (Ollama) and cloud model providers (OpenAI, Anthropic, Groq, DeepSeek, Qwen). The framework integrates with Microsoft's OmniParser for enhanced UI understanding and interaction.
|
|
19
19
|
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
```python
|
|
23
|
-
from agent import ComputerAgent, AgentLoop, LLMProvider
|
|
24
|
-
from computer import Computer
|
|
25
|
-
|
|
26
|
-
computer = Computer(verbosity=logging.INFO)
|
|
27
|
-
|
|
28
|
-
agent = ComputerAgent(
|
|
29
|
-
computer=computer,
|
|
30
|
-
loop=AgentLoop.ANTHROPIC,
|
|
31
|
-
# loop=AgentLoop.OMNI,
|
|
32
|
-
model=LLM(provider=LLMProvider.ANTHROPIC, name="claude-3-7-sonnet-20250219"),
|
|
33
|
-
# model=LLM(provider=LLMProvider.OPENAI, name="gpt-4.5-preview"),
|
|
34
|
-
save_trajectory=True,
|
|
35
|
-
trajectory_dir=str(Path("trajectories")),
|
|
36
|
-
only_n_most_recent_images=3,
|
|
37
|
-
verbosity=logging.INFO,
|
|
38
|
-
)
|
|
20
|
+
> While our north star is to create a 1-click experience, this preview of Agent might be still a bit rough around the edges. We appreciate your patience as we work to improve the experience.
|
|
39
21
|
|
|
40
|
-
|
|
41
|
-
"""
|
|
42
|
-
Please help me with the following task:
|
|
43
|
-
1. Open Safari browser
|
|
44
|
-
2. Go to Wikipedia.org
|
|
45
|
-
3. Search for "Claude AI"
|
|
46
|
-
4. Summarize the main points you find about Claude AI
|
|
47
|
-
"""
|
|
48
|
-
]
|
|
22
|
+
### Get started with Agent
|
|
49
23
|
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
async for result in agent.run(task):
|
|
54
|
-
print(result)
|
|
55
|
-
print(f"Task {i} completed")
|
|
56
|
-
```
|
|
24
|
+
<div align="center">
|
|
25
|
+
<img src="../../img/agent.png"/>
|
|
26
|
+
</div>
|
|
57
27
|
|
|
58
28
|
## Install
|
|
59
29
|
|
|
@@ -49,6 +49,7 @@ except Exception as e:
|
|
|
49
49
|
logger.warning(f"Error initializing telemetry: {e}")
|
|
50
50
|
|
|
51
51
|
from .providers.omni.types import LLMProvider, LLM
|
|
52
|
-
from .
|
|
52
|
+
from .core.loop import AgentLoop
|
|
53
|
+
from .core.computer_agent import ComputerAgent
|
|
53
54
|
|
|
54
|
-
__all__ = ["AgentLoop", "LLMProvider", "LLM"]
|
|
55
|
+
__all__ = ["AgentLoop", "LLMProvider", "LLM", "ComputerAgent"]
|
|
@@ -3,8 +3,7 @@
|
|
|
3
3
|
import asyncio
|
|
4
4
|
import logging
|
|
5
5
|
import os
|
|
6
|
-
from typing import Any, AsyncGenerator, Dict, Optional, cast
|
|
7
|
-
from dataclasses import dataclass
|
|
6
|
+
from typing import Any, AsyncGenerator, Dict, Optional, cast, List
|
|
8
7
|
|
|
9
8
|
from computer import Computer
|
|
10
9
|
from ..providers.anthropic.loop import AnthropicLoop
|
|
@@ -12,6 +11,8 @@ from ..providers.omni.loop import OmniLoop
|
|
|
12
11
|
from ..providers.omni.parser import OmniParser
|
|
13
12
|
from ..providers.omni.types import LLMProvider, LLM
|
|
14
13
|
from .. import AgentLoop
|
|
14
|
+
from .messages import StandardMessageManager, ImageRetentionConfig
|
|
15
|
+
from .types import AgentResponse
|
|
15
16
|
|
|
16
17
|
logging.basicConfig(level=logging.INFO)
|
|
17
18
|
logger = logging.getLogger(__name__)
|
|
@@ -44,7 +45,6 @@ class ComputerAgent:
|
|
|
44
45
|
save_trajectory: bool = True,
|
|
45
46
|
trajectory_dir: str = "trajectories",
|
|
46
47
|
only_n_most_recent_images: Optional[int] = None,
|
|
47
|
-
parser: Optional[OmniParser] = None,
|
|
48
48
|
verbosity: int = logging.INFO,
|
|
49
49
|
):
|
|
50
50
|
"""Initialize the ComputerAgent.
|
|
@@ -61,12 +61,11 @@ class ComputerAgent:
|
|
|
61
61
|
save_trajectory: Whether to save the trajectory.
|
|
62
62
|
trajectory_dir: Directory to save the trajectory.
|
|
63
63
|
only_n_most_recent_images: Maximum number of recent screenshots to include in API requests.
|
|
64
|
-
parser: Parser instance for the OmniLoop. Only used if provider is not ANTHROPIC.
|
|
65
64
|
verbosity: Logging level.
|
|
66
65
|
"""
|
|
67
66
|
# Basic agent configuration
|
|
68
67
|
self.max_retries = max_retries
|
|
69
|
-
self.computer = computer
|
|
68
|
+
self.computer = computer
|
|
70
69
|
self.queue = asyncio.Queue()
|
|
71
70
|
self.screenshot_dir = screenshot_dir
|
|
72
71
|
self.log_dir = log_dir
|
|
@@ -100,7 +99,7 @@ class ComputerAgent:
|
|
|
100
99
|
)
|
|
101
100
|
|
|
102
101
|
# Ensure computer is properly cast for typing purposes
|
|
103
|
-
computer_instance =
|
|
102
|
+
computer_instance = self.computer
|
|
104
103
|
|
|
105
104
|
# Get API key from environment if not provided
|
|
106
105
|
actual_api_key = api_key or os.environ.get(ENV_VARS[self.provider], "")
|
|
@@ -118,10 +117,6 @@ class ComputerAgent:
|
|
|
118
117
|
only_n_most_recent_images=only_n_most_recent_images,
|
|
119
118
|
)
|
|
120
119
|
else:
|
|
121
|
-
# Default to OmniLoop for other loop types
|
|
122
|
-
# Initialize parser if not provided
|
|
123
|
-
actual_parser = parser or OmniParser()
|
|
124
|
-
|
|
125
120
|
self._loop = OmniLoop(
|
|
126
121
|
provider=self.provider,
|
|
127
122
|
api_key=actual_api_key,
|
|
@@ -130,9 +125,12 @@ class ComputerAgent:
|
|
|
130
125
|
save_trajectory=save_trajectory,
|
|
131
126
|
base_dir=trajectory_dir,
|
|
132
127
|
only_n_most_recent_images=only_n_most_recent_images,
|
|
133
|
-
parser=actual_parser,
|
|
128
|
+
parser=OmniParser(),
|
|
134
129
|
)
|
|
135
130
|
|
|
131
|
+
# Initialize the message manager from the loop
|
|
132
|
+
self.message_manager = self._loop.message_manager
|
|
133
|
+
|
|
136
134
|
logger.info(
|
|
137
135
|
f"ComputerAgent initialized with provider: {self.provider}, model: {actual_model_name}"
|
|
138
136
|
)
|
|
@@ -201,36 +199,30 @@ class ComputerAgent:
|
|
|
201
199
|
await self.computer.run()
|
|
202
200
|
self._initialized = True
|
|
203
201
|
|
|
204
|
-
async def
|
|
205
|
-
"""Initialize the computer interface if it hasn't been initialized yet."""
|
|
206
|
-
if not self.computer._initialized:
|
|
207
|
-
logger.info("Computer not initialized, initializing now...")
|
|
208
|
-
try:
|
|
209
|
-
# Call run directly
|
|
210
|
-
await self.computer.run()
|
|
211
|
-
logger.info("Computer interface initialized successfully")
|
|
212
|
-
except Exception as e:
|
|
213
|
-
logger.error(f"Error initializing computer interface: {str(e)}")
|
|
214
|
-
raise
|
|
215
|
-
|
|
216
|
-
async def run(self, task: str) -> AsyncGenerator[Dict[str, Any], None]:
|
|
202
|
+
async def run(self, task: str) -> AsyncGenerator[AgentResponse, None]:
|
|
217
203
|
"""Run a task using the computer agent.
|
|
218
204
|
|
|
219
205
|
Args:
|
|
220
206
|
task: Task description
|
|
221
207
|
|
|
222
208
|
Yields:
|
|
223
|
-
|
|
209
|
+
Agent response format
|
|
224
210
|
"""
|
|
225
211
|
try:
|
|
226
212
|
logger.info(f"Running task: {task}")
|
|
213
|
+
logger.info(
|
|
214
|
+
f"Message history before task has {len(self.message_manager.messages)} messages"
|
|
215
|
+
)
|
|
227
216
|
|
|
228
217
|
# Initialize the computer if needed
|
|
229
218
|
if not self._initialized:
|
|
230
219
|
await self.initialize()
|
|
231
220
|
|
|
232
|
-
#
|
|
233
|
-
|
|
221
|
+
# Add task as a user message using the message manager
|
|
222
|
+
self.message_manager.add_user_message([{"type": "text", "text": task}])
|
|
223
|
+
logger.info(
|
|
224
|
+
f"Added task message. Message history now has {len(self.message_manager.messages)} messages"
|
|
225
|
+
)
|
|
234
226
|
|
|
235
227
|
# Pass properly formatted messages to the loop
|
|
236
228
|
if self._loop is None:
|
|
@@ -239,7 +231,8 @@ class ComputerAgent:
|
|
|
239
231
|
return
|
|
240
232
|
|
|
241
233
|
# Execute the task and yield results
|
|
242
|
-
async for result in self._loop.run(messages):
|
|
234
|
+
async for result in self._loop.run(self.message_manager.messages):
|
|
235
|
+
# Yield the result to the caller
|
|
243
236
|
yield result
|
|
244
237
|
|
|
245
238
|
except Exception as e:
|
|
@@ -2,22 +2,34 @@
|
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
4
|
import asyncio
|
|
5
|
-
import json
|
|
6
|
-
import os
|
|
7
5
|
from abc import ABC, abstractmethod
|
|
6
|
+
from enum import Enum, auto
|
|
8
7
|
from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
|
|
9
8
|
from datetime import datetime
|
|
10
|
-
import base64
|
|
11
9
|
|
|
12
10
|
from computer import Computer
|
|
13
11
|
from .experiment import ExperimentManager
|
|
12
|
+
from .messages import StandardMessageManager, ImageRetentionConfig
|
|
13
|
+
from .types import AgentResponse
|
|
14
14
|
|
|
15
15
|
logger = logging.getLogger(__name__)
|
|
16
16
|
|
|
17
17
|
|
|
18
|
+
class AgentLoop(Enum):
|
|
19
|
+
"""Enumeration of available loop types."""
|
|
20
|
+
|
|
21
|
+
ANTHROPIC = auto() # Anthropic implementation
|
|
22
|
+
OMNI = auto() # OmniLoop implementation
|
|
23
|
+
# Add more loop types as needed
|
|
24
|
+
|
|
25
|
+
|
|
18
26
|
class BaseLoop(ABC):
|
|
19
27
|
"""Base class for agent loops that handle message processing and tool execution."""
|
|
20
28
|
|
|
29
|
+
###########################################
|
|
30
|
+
# INITIALIZATION AND CONFIGURATION
|
|
31
|
+
###########################################
|
|
32
|
+
|
|
21
33
|
def __init__(
|
|
22
34
|
self,
|
|
23
35
|
computer: Computer,
|
|
@@ -55,8 +67,6 @@ class BaseLoop(ABC):
|
|
|
55
67
|
self.save_trajectory = save_trajectory
|
|
56
68
|
self.only_n_most_recent_images = only_n_most_recent_images
|
|
57
69
|
self._kwargs = kwargs
|
|
58
|
-
self.message_history = []
|
|
59
|
-
# self.tool_manager = BaseToolManager(computer)
|
|
60
70
|
|
|
61
71
|
# Initialize experiment manager
|
|
62
72
|
if self.save_trajectory and self.base_dir:
|
|
@@ -75,6 +85,64 @@ class BaseLoop(ABC):
|
|
|
75
85
|
# Initialize basic tracking
|
|
76
86
|
self.turn_count = 0
|
|
77
87
|
|
|
88
|
+
async def initialize(self) -> None:
|
|
89
|
+
"""Initialize both the API client and computer interface with retries."""
|
|
90
|
+
for attempt in range(self.max_retries):
|
|
91
|
+
try:
|
|
92
|
+
logger.info(
|
|
93
|
+
f"Starting initialization (attempt {attempt + 1}/{self.max_retries})..."
|
|
94
|
+
)
|
|
95
|
+
|
|
96
|
+
# Initialize API client
|
|
97
|
+
await self.initialize_client()
|
|
98
|
+
|
|
99
|
+
logger.info("Initialization complete.")
|
|
100
|
+
return
|
|
101
|
+
except Exception as e:
|
|
102
|
+
if attempt < self.max_retries - 1:
|
|
103
|
+
logger.warning(
|
|
104
|
+
f"Initialization failed (attempt {attempt + 1}/{self.max_retries}): {str(e)}. Retrying..."
|
|
105
|
+
)
|
|
106
|
+
await asyncio.sleep(self.retry_delay)
|
|
107
|
+
else:
|
|
108
|
+
logger.error(
|
|
109
|
+
f"Initialization failed after {self.max_retries} attempts: {str(e)}"
|
|
110
|
+
)
|
|
111
|
+
raise RuntimeError(f"Failed to initialize: {str(e)}")
|
|
112
|
+
|
|
113
|
+
###########################################
|
|
114
|
+
|
|
115
|
+
# ABSTRACT METHODS TO BE IMPLEMENTED BY SUBCLASSES
|
|
116
|
+
###########################################
|
|
117
|
+
|
|
118
|
+
@abstractmethod
|
|
119
|
+
async def initialize_client(self) -> None:
|
|
120
|
+
"""Initialize the API client and any provider-specific components.
|
|
121
|
+
|
|
122
|
+
This method must be implemented by subclasses to set up
|
|
123
|
+
provider-specific clients and tools.
|
|
124
|
+
"""
|
|
125
|
+
raise NotImplementedError
|
|
126
|
+
|
|
127
|
+
@abstractmethod
|
|
128
|
+
async def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[AgentResponse, None]:
|
|
129
|
+
"""Run the agent loop with provided messages.
|
|
130
|
+
|
|
131
|
+
This method handles the main agent loop including message processing,
|
|
132
|
+
API calls, response handling, and action execution.
|
|
133
|
+
|
|
134
|
+
Args:
|
|
135
|
+
messages: List of message objects
|
|
136
|
+
|
|
137
|
+
Yields:
|
|
138
|
+
Agent response format
|
|
139
|
+
"""
|
|
140
|
+
raise NotImplementedError
|
|
141
|
+
|
|
142
|
+
###########################################
|
|
143
|
+
# EXPERIMENT AND TRAJECTORY MANAGEMENT
|
|
144
|
+
###########################################
|
|
145
|
+
|
|
78
146
|
def _setup_experiment_dirs(self) -> None:
|
|
79
147
|
"""Setup the experiment directory structure."""
|
|
80
148
|
if self.experiment_manager:
|
|
@@ -100,10 +168,13 @@ class BaseLoop(ABC):
|
|
|
100
168
|
) -> None:
|
|
101
169
|
"""Log API call details to file.
|
|
102
170
|
|
|
171
|
+
Preserves provider-specific formats for requests and responses to ensure
|
|
172
|
+
accurate logging for debugging and analysis purposes.
|
|
173
|
+
|
|
103
174
|
Args:
|
|
104
175
|
call_type: Type of API call (e.g., 'request', 'response', 'error')
|
|
105
|
-
request: The API request data
|
|
106
|
-
response: Optional API response data
|
|
176
|
+
request: The API request data in provider-specific format
|
|
177
|
+
response: Optional API response data in provider-specific format
|
|
107
178
|
error: Optional error information
|
|
108
179
|
"""
|
|
109
180
|
if self.experiment_manager:
|
|
@@ -129,120 +200,3 @@ class BaseLoop(ABC):
|
|
|
129
200
|
"""
|
|
130
201
|
if self.experiment_manager:
|
|
131
202
|
self.experiment_manager.save_screenshot(img_base64, action_type)
|
|
132
|
-
|
|
133
|
-
async def initialize(self) -> None:
|
|
134
|
-
"""Initialize both the API client and computer interface with retries."""
|
|
135
|
-
for attempt in range(self.max_retries):
|
|
136
|
-
try:
|
|
137
|
-
logger.info(
|
|
138
|
-
f"Starting initialization (attempt {attempt + 1}/{self.max_retries})..."
|
|
139
|
-
)
|
|
140
|
-
|
|
141
|
-
# Initialize API client
|
|
142
|
-
await self.initialize_client()
|
|
143
|
-
|
|
144
|
-
logger.info("Initialization complete.")
|
|
145
|
-
return
|
|
146
|
-
except Exception as e:
|
|
147
|
-
if attempt < self.max_retries - 1:
|
|
148
|
-
logger.warning(
|
|
149
|
-
f"Initialization failed (attempt {attempt + 1}/{self.max_retries}): {str(e)}. Retrying..."
|
|
150
|
-
)
|
|
151
|
-
await asyncio.sleep(self.retry_delay)
|
|
152
|
-
else:
|
|
153
|
-
logger.error(
|
|
154
|
-
f"Initialization failed after {self.max_retries} attempts: {str(e)}"
|
|
155
|
-
)
|
|
156
|
-
raise RuntimeError(f"Failed to initialize: {str(e)}")
|
|
157
|
-
|
|
158
|
-
async def _get_parsed_screen_som(self) -> Dict[str, Any]:
|
|
159
|
-
"""Get parsed screen information.
|
|
160
|
-
|
|
161
|
-
Returns:
|
|
162
|
-
Dict containing screen information
|
|
163
|
-
"""
|
|
164
|
-
try:
|
|
165
|
-
# Take screenshot
|
|
166
|
-
screenshot = await self.computer.interface.screenshot()
|
|
167
|
-
|
|
168
|
-
# Initialize with default values
|
|
169
|
-
width, height = 1024, 768
|
|
170
|
-
base64_image = ""
|
|
171
|
-
|
|
172
|
-
# Handle different types of screenshot returns
|
|
173
|
-
if isinstance(screenshot, (bytes, bytearray, memoryview)):
|
|
174
|
-
# Raw bytes screenshot
|
|
175
|
-
base64_image = base64.b64encode(screenshot).decode("utf-8")
|
|
176
|
-
elif hasattr(screenshot, "base64_image"):
|
|
177
|
-
# Object-style screenshot with attributes
|
|
178
|
-
# Type checking can't infer these attributes, but they exist at runtime
|
|
179
|
-
# on certain screenshot return types
|
|
180
|
-
base64_image = getattr(screenshot, "base64_image")
|
|
181
|
-
width = (
|
|
182
|
-
getattr(screenshot, "width", width) if hasattr(screenshot, "width") else width
|
|
183
|
-
)
|
|
184
|
-
height = (
|
|
185
|
-
getattr(screenshot, "height", height)
|
|
186
|
-
if hasattr(screenshot, "height")
|
|
187
|
-
else height
|
|
188
|
-
)
|
|
189
|
-
|
|
190
|
-
# Create parsed screen data
|
|
191
|
-
parsed_screen = {
|
|
192
|
-
"width": width,
|
|
193
|
-
"height": height,
|
|
194
|
-
"parsed_content_list": [],
|
|
195
|
-
"timestamp": datetime.now().isoformat(),
|
|
196
|
-
"screenshot_base64": base64_image,
|
|
197
|
-
}
|
|
198
|
-
|
|
199
|
-
# Save screenshot if requested
|
|
200
|
-
if self.save_trajectory and self.experiment_manager:
|
|
201
|
-
try:
|
|
202
|
-
img_data = base64_image
|
|
203
|
-
if "," in img_data:
|
|
204
|
-
img_data = img_data.split(",")[1]
|
|
205
|
-
self._save_screenshot(img_data, action_type="state")
|
|
206
|
-
except Exception as e:
|
|
207
|
-
logger.error(f"Error saving screenshot: {str(e)}")
|
|
208
|
-
|
|
209
|
-
return parsed_screen
|
|
210
|
-
except Exception as e:
|
|
211
|
-
logger.error(f"Error taking screenshot: {str(e)}")
|
|
212
|
-
return {
|
|
213
|
-
"width": 1024,
|
|
214
|
-
"height": 768,
|
|
215
|
-
"parsed_content_list": [],
|
|
216
|
-
"timestamp": datetime.now().isoformat(),
|
|
217
|
-
"error": f"Error taking screenshot: {str(e)}",
|
|
218
|
-
"screenshot_base64": "",
|
|
219
|
-
}
|
|
220
|
-
|
|
221
|
-
@abstractmethod
|
|
222
|
-
async def initialize_client(self) -> None:
|
|
223
|
-
"""Initialize the API client and any provider-specific components."""
|
|
224
|
-
raise NotImplementedError
|
|
225
|
-
|
|
226
|
-
@abstractmethod
|
|
227
|
-
async def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[Dict[str, Any], None]:
|
|
228
|
-
"""Run the agent loop with provided messages.
|
|
229
|
-
|
|
230
|
-
Args:
|
|
231
|
-
messages: List of message objects
|
|
232
|
-
|
|
233
|
-
Yields:
|
|
234
|
-
Dict containing response data
|
|
235
|
-
"""
|
|
236
|
-
raise NotImplementedError
|
|
237
|
-
|
|
238
|
-
@abstractmethod
|
|
239
|
-
async def _process_screen(
|
|
240
|
-
self, parsed_screen: Dict[str, Any], messages: List[Dict[str, Any]]
|
|
241
|
-
) -> None:
|
|
242
|
-
"""Process screen information and add to messages.
|
|
243
|
-
|
|
244
|
-
Args:
|
|
245
|
-
parsed_screen: Dictionary containing parsed screen info
|
|
246
|
-
messages: List of messages to update
|
|
247
|
-
"""
|
|
248
|
-
raise NotImplementedError
|