cua-agent 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (65) hide show
  1. agent/README.md +63 -0
  2. agent/__init__.py +10 -0
  3. agent/core/README.md +101 -0
  4. agent/core/__init__.py +34 -0
  5. agent/core/agent.py +284 -0
  6. agent/core/base_agent.py +164 -0
  7. agent/core/callbacks.py +147 -0
  8. agent/core/computer_agent.py +69 -0
  9. agent/core/experiment.py +222 -0
  10. agent/core/factory.py +102 -0
  11. agent/core/loop.py +244 -0
  12. agent/core/messages.py +230 -0
  13. agent/core/tools/__init__.py +21 -0
  14. agent/core/tools/base.py +74 -0
  15. agent/core/tools/bash.py +52 -0
  16. agent/core/tools/collection.py +46 -0
  17. agent/core/tools/computer.py +113 -0
  18. agent/core/tools/edit.py +67 -0
  19. agent/core/tools/manager.py +56 -0
  20. agent/providers/__init__.py +4 -0
  21. agent/providers/anthropic/__init__.py +6 -0
  22. agent/providers/anthropic/api/client.py +222 -0
  23. agent/providers/anthropic/api/logging.py +150 -0
  24. agent/providers/anthropic/callbacks/manager.py +55 -0
  25. agent/providers/anthropic/loop.py +521 -0
  26. agent/providers/anthropic/messages/manager.py +110 -0
  27. agent/providers/anthropic/prompts.py +20 -0
  28. agent/providers/anthropic/tools/__init__.py +33 -0
  29. agent/providers/anthropic/tools/base.py +88 -0
  30. agent/providers/anthropic/tools/bash.py +163 -0
  31. agent/providers/anthropic/tools/collection.py +34 -0
  32. agent/providers/anthropic/tools/computer.py +550 -0
  33. agent/providers/anthropic/tools/edit.py +326 -0
  34. agent/providers/anthropic/tools/manager.py +54 -0
  35. agent/providers/anthropic/tools/run.py +42 -0
  36. agent/providers/anthropic/types.py +16 -0
  37. agent/providers/omni/__init__.py +27 -0
  38. agent/providers/omni/callbacks.py +78 -0
  39. agent/providers/omni/clients/anthropic.py +99 -0
  40. agent/providers/omni/clients/base.py +44 -0
  41. agent/providers/omni/clients/groq.py +101 -0
  42. agent/providers/omni/clients/openai.py +159 -0
  43. agent/providers/omni/clients/utils.py +25 -0
  44. agent/providers/omni/experiment.py +273 -0
  45. agent/providers/omni/image_utils.py +106 -0
  46. agent/providers/omni/loop.py +961 -0
  47. agent/providers/omni/messages.py +168 -0
  48. agent/providers/omni/parser.py +252 -0
  49. agent/providers/omni/prompts.py +78 -0
  50. agent/providers/omni/tool_manager.py +91 -0
  51. agent/providers/omni/tools/__init__.py +13 -0
  52. agent/providers/omni/tools/bash.py +69 -0
  53. agent/providers/omni/tools/computer.py +216 -0
  54. agent/providers/omni/tools/manager.py +83 -0
  55. agent/providers/omni/types.py +30 -0
  56. agent/providers/omni/utils.py +155 -0
  57. agent/providers/omni/visualization.py +130 -0
  58. agent/types/__init__.py +26 -0
  59. agent/types/base.py +52 -0
  60. agent/types/messages.py +36 -0
  61. agent/types/tools.py +32 -0
  62. cua_agent-0.1.0.dist-info/METADATA +44 -0
  63. cua_agent-0.1.0.dist-info/RECORD +65 -0
  64. cua_agent-0.1.0.dist-info/WHEEL +4 -0
  65. cua_agent-0.1.0.dist-info/entry_points.txt +4 -0
agent/README.md ADDED
@@ -0,0 +1,63 @@
1
+ # Agent Package Structure
2
+
3
+ ## Overview
4
+ The agent package provides a modular and extensible framework for AI-powered computer agents.
5
+
6
+ ## Directory Structure
7
+ ```
8
+ agent/
9
+ ├── __init__.py # Package exports
10
+ ├── core/ # Core functionality
11
+ │ ├── __init__.py
12
+ │ ├── computer_agent.py # Main entry point
13
+ │ └── factory.py # Provider factory
14
+ ├── base/ # Base implementations
15
+ │ ├── __init__.py
16
+ │ ├── agent.py # Base agent class
17
+ │ ├── core/ # Core components
18
+ │ │ ├── callbacks.py
19
+ │ │ ├── loop.py
20
+ │ │ └── messages.py
21
+ │ └── tools/ # Tool implementations
22
+ ├── providers/ # Provider implementations
23
+ │ ├── __init__.py
24
+ │ ├── anthropic/ # Anthropic provider
25
+ │ │ ├── agent.py
26
+ │ │ ├── loop.py
27
+ │ │ └── tool_manager.py
28
+ │ └── omni/ # Omni provider
29
+ │ ├── agent.py
30
+ │ ├── loop.py
31
+ │ └── tool_manager.py
32
+ └── types/ # Type definitions
33
+ ├── __init__.py
34
+ ├── base.py # Core types
35
+ ├── messages.py # Message types
36
+ ├── tools.py # Tool types
37
+ └── providers/ # Provider-specific types
38
+ ├── anthropic.py
39
+ └── omni.py
40
+ ```
41
+
42
+ ## Key Components
43
+
44
+ ### Core
45
+ - `computer_agent.py`: Main entry point for creating and using agents
46
+ - `factory.py`: Factory for creating provider-specific implementations
47
+
48
+ ### Base
49
+ - `agent.py`: Base agent implementation with shared functionality
50
+ - `core/`: Core components used across providers
51
+ - `tools/`: Shared tool implementations
52
+
53
+ ### Providers
54
+ Each provider follows the same structure:
55
+ - `agent.py`: Provider-specific agent implementation
56
+ - `loop.py`: Provider-specific message loop
57
+ - `tool_manager.py`: Tool management for provider
58
+
59
+ ### Types
60
+ - `base.py`: Core type definitions
61
+ - `messages.py`: Message-related types
62
+ - `tools.py`: Tool-related types
63
+ - `providers/`: Provider-specific type definitions
agent/__init__.py ADDED
@@ -0,0 +1,10 @@
1
+ """CUA (Computer Use) Agent for AI-driven computer interaction."""
2
+
3
+ __version__ = "0.1.0"
4
+
5
+ from .core.factory import AgentFactory
6
+ from .core.agent import ComputerAgent
7
+ from .types.base import Provider, AgenticLoop
8
+ from .providers.omni.types import APIProvider
9
+
10
+ __all__ = ["AgentFactory", "Provider", "ComputerAgent", "AgenticLoop", "APIProvider"]
agent/core/README.md ADDED
@@ -0,0 +1,101 @@
1
+ # Unified ComputerAgent
2
+
3
+ The `ComputerAgent` class provides a unified implementation that consolidates the previously separate agent implementations (AnthropicComputerAgent and OmniComputerAgent) into a single, configurable class.
4
+
5
+ ## Features
6
+
7
+ - **Multiple Loop Types**: Switch between different agentic loop implementations using the `loop_type` parameter (Anthropic or Omni).
8
+ - **Provider Support**: Use different AI providers (OpenAI, Anthropic, etc.) with the appropriate loop.
9
+ - **Trajectory Saving**: Control whether to save screenshots and logs with the `save_trajectory` parameter.
10
+ - **Consistent Interface**: Maintains a consistent interface regardless of the underlying loop implementation.
11
+
12
+ ## API Key Requirements
13
+
14
+ To use the ComputerAgent, you'll need API keys for the providers you want to use:
15
+
16
+ - For **OpenAI**: Set the `OPENAI_API_KEY` environment variable or pass it directly as `api_key`.
17
+ - For **Anthropic**: Set the `ANTHROPIC_API_KEY` environment variable or pass it directly as `api_key`.
18
+ - For **Groq**: Set the `GROQ_API_KEY` environment variable or pass it directly as `api_key`.
19
+
20
+ You can set environment variables in several ways:
21
+
22
+ ```bash
23
+ # In your terminal before running the code
24
+ export OPENAI_API_KEY=your_api_key_here
25
+
26
+ # Or in a .env file
27
+ OPENAI_API_KEY=your_api_key_here
28
+ ```
29
+
30
+ ## Usage
31
+
32
+ Here's how to use the unified ComputerAgent:
33
+
34
+ ```python
35
+ from agent.core.agent import ComputerAgent
36
+ from agent.types.base import AgenticLoop
37
+ from agent.providers.omni.types import APIProvider
38
+ from computer import Computer
39
+
40
+ # Create a Computer instance
41
+ computer = Computer()
42
+
43
+ # Create an agent with the OMNI loop and OpenAI provider
44
+ agent = ComputerAgent(
45
+ computer=computer,
46
+ loop_type=AgenticLoop.OMNI,
47
+ ai_provider=APIProvider.OPENAI,
48
+ model="gpt-4o",
49
+ api_key="your_api_key_here", # Can also use OPENAI_API_KEY environment variable
50
+ save_trajectory=True,
51
+ only_n_most_recent_images=5
52
+ )
53
+
54
+ # Create an agent with the ANTHROPIC loop
55
+ agent = ComputerAgent(
56
+ computer=computer,
57
+ loop_type=AgenticLoop.ANTHROPIC,
58
+ model="claude-3-7-sonnet-20250219",
59
+ api_key="your_api_key_here", # Can also use ANTHROPIC_API_KEY environment variable
60
+ save_trajectory=True,
61
+ only_n_most_recent_images=5
62
+ )
63
+
64
+ # Use the agent
65
+ async with agent:
66
+ async for result in agent.run("Your task description here"):
67
+ # Process the result
68
+ title = result["metadata"].get("title", "Screen Analysis")
69
+ content = result["content"]
70
+ print(f"\n{title}")
71
+ print(content)
72
+ ```
73
+
74
+ ## Parameters
75
+
76
+ - `computer`: Computer instance to control
77
+ - `loop_type`: The type of loop to use (AgenticLoop.ANTHROPIC or AgenticLoop.OMNI)
78
+ - `ai_provider`: AI provider to use (required for Omni loop)
79
+ - `api_key`: Optional API key (will use environment variable if not provided)
80
+ - `model`: Optional model name (will use provider default if not specified)
81
+ - `save_trajectory`: Whether to save screenshots and logs
82
+ - `only_n_most_recent_images`: Only keep N most recent images
83
+ - `max_retries`: Maximum number of retry attempts
84
+
85
+ ## Directory Structure
86
+
87
+ When `save_trajectory` is enabled, the agent will create the following directory structure:
88
+
89
+ ```
90
+ experiments/
91
+ ├── screenshots/ # Screenshots captured during agent execution
92
+ └── logs/ # API call logs and other logging information
93
+ ```
94
+
95
+ ## Extending with New Loop Types
96
+
97
+ To add a new loop type:
98
+
99
+ 1. Implement a new loop class
100
+ 2. Add a new value to the `AgenticLoop` enum
101
+ 3. Update the `_init_loop` method in `ComputerAgent` to handle the new loop type
agent/core/__init__.py ADDED
@@ -0,0 +1,34 @@
1
+ """Core agent components."""
2
+
3
+ from .base_agent import BaseComputerAgent
4
+ from .loop import BaseLoop
5
+ from .messages import (
6
+ create_user_message,
7
+ create_assistant_message,
8
+ create_system_message,
9
+ create_image_message,
10
+ create_screen_message,
11
+ BaseMessageManager,
12
+ ImageRetentionConfig,
13
+ )
14
+ from .callbacks import (
15
+ CallbackManager,
16
+ CallbackHandler,
17
+ BaseCallbackManager,
18
+ ContentCallback,
19
+ ToolCallback,
20
+ APICallback,
21
+ )
22
+
23
+ __all__ = [
24
+ "BaseComputerAgent",
25
+ "BaseLoop",
26
+ "CallbackManager",
27
+ "CallbackHandler",
28
+ "BaseMessageManager",
29
+ "ImageRetentionConfig",
30
+ "BaseCallbackManager",
31
+ "ContentCallback",
32
+ "ToolCallback",
33
+ "APICallback",
34
+ ]
agent/core/agent.py ADDED
@@ -0,0 +1,284 @@
1
+ """Unified computer agent implementation that supports multiple loops."""
2
+
3
+ import os
4
+ import logging
5
+ import asyncio
6
+ from typing import Any, AsyncGenerator, Dict, List, Optional, TYPE_CHECKING
7
+ from datetime import datetime
8
+
9
+ from computer import Computer
10
+
11
+ from ..types.base import Provider, AgenticLoop
12
+ from .base_agent import BaseComputerAgent
13
+
14
+ # Only import types for type checking to avoid circular imports
15
+ if TYPE_CHECKING:
16
+ from ..providers.anthropic.loop import AnthropicLoop
17
+ from ..providers.omni.loop import OmniLoop
18
+ from ..providers.omni.parser import OmniParser
19
+
20
+ # Import the APIProvider enum without importing the whole module
21
+ from ..providers.omni.types import APIProvider
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
# Default models for different providers.
# ComputerAgent falls back to these when no ``model`` is passed; the
# ANTHROPIC entry is also the fallback for the Anthropic loop.
DEFAULT_MODELS: Dict[APIProvider, str] = {
    APIProvider.OPENAI: "gpt-4o",
    APIProvider.ANTHROPIC: "claude-3-7-sonnet-20250219",
    APIProvider.GROQ: "llama3-70b-8192",
}

# Map providers to their environment variable names.
# ComputerAgent reads the matching variable when no ``api_key`` is passed
# and the Omni loop is selected.
ENV_VARS: Dict[APIProvider, str] = {
    APIProvider.OPENAI: "OPENAI_API_KEY",
    APIProvider.GROQ: "GROQ_API_KEY",
    APIProvider.ANTHROPIC: "ANTHROPIC_API_KEY",
}
38
+
39
+
40
class ComputerAgent(BaseComputerAgent):
    """Unified implementation of the computer agent supporting multiple loop types.

    This class consolidates the previous AnthropicComputerAgent and OmniComputerAgent
    into a single implementation with configurable loop type.
    """

    def __init__(
        self,
        computer: Computer,
        loop_type: AgenticLoop = AgenticLoop.OMNI,
        ai_provider: APIProvider = APIProvider.OPENAI,
        api_key: Optional[str] = None,
        model: Optional[str] = None,
        save_trajectory: bool = True,
        trajectory_dir: Optional[str] = "trajectories",
        only_n_most_recent_images: Optional[int] = None,
        max_retries: int = 3,
        verbosity: int = logging.INFO,
        **kwargs,
    ):
        """Initialize the computer agent.

        Args:
            computer: Computer instance to control
            loop_type: The type of loop to use (Anthropic or Omni)
            ai_provider: AI provider to use (required for Omni loop)
            api_key: Optional API key (will use environment variable if not provided)
            model: Optional model name (will use provider default if not specified)
            save_trajectory: Whether to save screenshots and logs
            trajectory_dir: Directory to save trajectories (defaults to "trajectories")
            only_n_most_recent_images: Limit history to N most recent images
            max_retries: Maximum number of retry attempts for failed operations
            verbosity: Logging level (standard Python logging levels: logging.DEBUG, logging.INFO, etc.)
            **kwargs: Additional keyword arguments to pass to the loop

        Raises:
            ValueError: If no API key can be resolved, or if no default model
                is known for the selected provider and ``model`` is omitted.
        """
        # The base class is told not to create screenshot/log directories;
        # the trajectory settings are forwarded to the loop instead.
        super().__init__(
            max_retries=max_retries,
            computer=computer,
            screenshot_dir=None,
            log_dir=None,
            **kwargs,
        )

        self.loop_type = loop_type
        self.provider = ai_provider
        self.save_trajectory = save_trajectory
        self.trajectory_dir = trajectory_dir
        self.only_n_most_recent_images = only_n_most_recent_images
        self.verbosity = verbosity
        self._kwargs = kwargs  # Keep this for loop initialization

        # Configure logging based on verbosity
        self._configure_logging(verbosity)

        # Get API key from environment if not provided
        if api_key is None:
            api_key = self._api_key_from_env(loop_type, ai_provider)
        self.api_key = api_key

        # Set model based on provider if not specified
        if model is None:
            model = self._default_model(loop_type, ai_provider)
        self.model = model

        # Initialize the appropriate loop based on loop_type
        self.loop = self._init_loop()

    @staticmethod
    def _api_key_from_env(loop_type: AgenticLoop, ai_provider: APIProvider) -> str:
        """Look up the API key in the environment variable for the given loop/provider.

        Raises:
            ValueError: If the provider has no known environment variable, or
                the variable is unset or empty.
        """
        # The Anthropic loop always uses the Anthropic key, whatever ai_provider says.
        env_var = (
            ENV_VARS.get(ai_provider) if loop_type == AgenticLoop.OMNI else "ANTHROPIC_API_KEY"
        )
        if not env_var:
            raise ValueError(
                f"Unsupported provider: {ai_provider}. Please use one of: {list(ENV_VARS.keys())}"
            )

        api_key = os.environ.get(env_var)
        if not api_key:
            raise ValueError(
                f"No API key provided and {env_var} environment variable is not set.\n"
                f"Please set the {env_var} environment variable or pass the api_key directly:\n"
                f"  - Export in terminal: export {env_var}=your_api_key_here\n"
                f"  - Add to .env file: {env_var}=your_api_key_here\n"
                f"  - Pass directly: api_key='your_api_key_here'"
            )
        return api_key

    @staticmethod
    def _default_model(loop_type: AgenticLoop, ai_provider: APIProvider) -> str:
        """Return the default model name for the given loop/provider.

        Raises:
            ValueError: If no default model is registered for the provider.
        """
        lookup_provider = ai_provider if loop_type == AgenticLoop.OMNI else APIProvider.ANTHROPIC
        default = DEFAULT_MODELS.get(lookup_provider)
        if default is None:
            # Raise the same exception type as the API-key path instead of a bare KeyError.
            raise ValueError(
                f"No default model for provider: {lookup_provider}. "
                f"Please pass model explicitly or use one of: {list(DEFAULT_MODELS.keys())}"
            )
        return default

    def _configure_logging(self, verbosity: int):
        """Configure logging based on verbosity level."""
        # Use the logging level directly without mapping
        logger.setLevel(verbosity)
        logging.getLogger("agent").setLevel(verbosity)

        # Log the verbosity level that was set.
        # NOTE: the ERROR and CRITICAL announcements are emitted at WARNING level,
        # i.e. below the level that was just set on this logger.
        if verbosity <= logging.DEBUG:
            logger.info("Agent logging set to DEBUG level (full debug information)")
        elif verbosity <= logging.INFO:
            logger.info("Agent logging set to INFO level (standard output)")
        elif verbosity <= logging.WARNING:
            logger.warning("Agent logging set to WARNING level (warnings and errors only)")
        elif verbosity <= logging.ERROR:
            logger.warning("Agent logging set to ERROR level (errors only)")
        elif verbosity <= logging.CRITICAL:
            logger.warning("Agent logging set to CRITICAL level (critical errors only)")

    def _init_loop(self) -> Any:
        """Initialize the loop based on the loop_type.

        Provider modules are imported lazily (inside the branch that needs
        them) to avoid circular imports and to avoid importing the Omni
        modules when the Anthropic loop is selected.

        Returns:
            Initialized loop instance
        """
        if self.loop_type == AgenticLoop.ANTHROPIC:
            from ..providers.anthropic.loop import AnthropicLoop

            return AnthropicLoop(
                api_key=self.api_key,
                model=self.model,
                computer=self.computer,
                save_trajectory=self.save_trajectory,
                base_dir=self.trajectory_dir,
                only_n_most_recent_images=self.only_n_most_recent_images,
                **self._kwargs,
            )

        from ..providers.omni.loop import OmniLoop
        from ..providers.omni.parser import OmniParser

        # Supply a default parser for OmniLoop unless the caller passed one
        if "parser" not in self._kwargs:
            self._kwargs["parser"] = OmniParser()

        return OmniLoop(
            provider=self.provider,
            api_key=self.api_key,
            model=self.model,
            computer=self.computer,
            save_trajectory=self.save_trajectory,
            base_dir=self.trajectory_dir,
            only_n_most_recent_images=self.only_n_most_recent_images,
            **self._kwargs,
        )

    @staticmethod
    def _format_anthropic_result(result: Dict[str, Any]) -> Dict[str, Any]:
        """Convert one Anthropic-loop result into the agent's output message."""
        if "content" not in result:
            return {
                "role": "assistant",
                "content": str(result),
                "metadata": {"title": "Screen Analysis"},
            }

        blocks = result["content"]
        if isinstance(blocks, str):
            # Plain-string content is passed through as-is; iterating it block
            # by block would walk its characters and drop the text.
            content_text = blocks
        else:
            pieces = []
            for block in blocks:
                # Blocks may be objects with a ``text`` attribute or plain dicts.
                text = getattr(block, "text", None)
                if text is None and isinstance(block, dict):
                    text = block.get("text")
                if isinstance(text, str):
                    pieces.append(text)
            content_text = "".join(pieces)

        return {
            "role": "assistant",
            "content": content_text,
            "metadata": result.get("parsed_screen", {}),
        }

    @staticmethod
    def _format_omni_result(result: Dict[str, Any]) -> Dict[str, Any]:
        """Convert one Omni-loop result into the agent's output message."""
        content: Any = ""
        metadata = {"title": "Screen Analysis"}

        if "content" in result:
            content = result["content"]
            if isinstance(content, dict):
                # Ensure metadata has a title
                if "metadata" in content:
                    metadata = content["metadata"]
                    if "title" not in metadata:
                        metadata["title"] = "Screen Analysis"
                # Unwrap nested content
                if "content" in content:
                    content = content.get("content", "")

        return {"role": "assistant", "content": content, "metadata": metadata}

    async def _execute_task(self, task: str) -> AsyncGenerator[Dict[str, Any], None]:
        """Execute a task using the appropriate loop.

        Args:
            task: Task description to execute

        Yields:
            Dict containing response content and metadata
        """
        # Format the messages based on loop type
        if self.loop_type == AgenticLoop.ANTHROPIC:
            # Anthropic format
            messages = [{"role": "user", "content": [{"type": "text", "text": task}]}]
            format_result = self._format_anthropic_result
        else:
            # Omni format
            messages = [{"role": "user", "content": task}]
            format_result = self._format_omni_result

        # Run the loop; any failure while running or formatting is reported
        # to the consumer as an error message instead of propagating.
        try:
            async for result in self.loop.run(messages):
                if result is None:
                    break

                # Handle error case
                if "error" in result:
                    yield {
                        "role": "assistant",
                        "content": result["error"],
                        "metadata": {"title": "❌ Error"},
                    }
                    continue

                yield format_result(result)
        except Exception as e:
            logger.error(f"Error running the loop: {str(e)}")
            yield {
                "role": "assistant",
                "content": f"Error running the agent loop: {str(e)}",
                "metadata": {"title": "❌ Loop Error"},
            }
agent/core/base_agent.py ADDED
@@ -0,0 +1,164 @@
1
+ """Base computer agent implementation."""
2
+
3
+ import asyncio
4
+ import logging
5
+ import os
6
+ from abc import ABC, abstractmethod
7
+ from typing import Any, AsyncGenerator, Dict, Optional
8
+
9
+ from computer import Computer
10
+
11
+ from ..types.base import Provider
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class BaseComputerAgent(ABC):
    """Abstract foundation shared by computer agents.

    Holds the controlled ``Computer``, drives its initialization, and exposes
    ``run`` as the public entry point; subclasses supply ``_execute_task``.
    """

    def __init__(
        self,
        max_retries: int = 3,
        computer: Optional[Computer] = None,
        screenshot_dir: Optional[str] = None,
        log_dir: Optional[str] = None,
        **kwargs,
    ):
        """Set up agent state and create any requested output directories.

        Args:
            max_retries: Maximum number of retry attempts
            computer: Computer instance to use; a new ``Computer()`` is created when omitted
            screenshot_dir: Directory to save screenshots (created if given)
            log_dir: Directory to save logs (created if given; None disables it)
            **kwargs: Additional provider-specific arguments (accepted, not used here)
        """
        self.max_retries = max_retries
        self.computer = computer or Computer()
        self.queue = asyncio.Queue()
        self.screenshot_dir = screenshot_dir
        self.log_dir = log_dir
        self._retry_count = 0
        self.provider = Provider.UNKNOWN

        # Create the logs directory first, then the screenshots directory.
        for label, directory in (("logs", self.log_dir), ("screenshots", self.screenshot_dir)):
            if directory:
                os.makedirs(directory, exist_ok=True)
                logger.info(f"Created {label} directory: {directory}")

        logger.info("BaseComputerAgent initialized")

    async def run(self, task: str) -> AsyncGenerator[Dict[str, Any], None]:
        """Run a task and stream its updates.

        Args:
            task: Task description

        Yields:
            Updates produced by ``_execute_task``; if anything raises, a single
            error message is yielded instead of propagating the exception.
        """
        try:
            logger.info(f"Running task: {task}")

            # Make sure the computer is ready before executing anything
            await self._init_if_needed()

            async for update in self._execute_task(task):
                yield update

        except Exception as exc:
            detail = str(exc)
            logger.error(f"Error in agent run method: {detail}")
            failure = {
                "role": "assistant",
                "content": f"Error: {detail}",
                "metadata": {"title": "❌ Error"},
            }
            yield failure

    async def _init_if_needed(self):
        """Start the computer interface unless it is already initialized."""
        if self.computer._initialized:
            return

        logger.info("Computer not initialized, initializing now...")
        try:
            # Call run directly without setting the flag first
            await self.computer.run()
            logger.info("Computer interface initialized successfully")
        except Exception as exc:
            logger.error(f"Error initializing computer interface: {str(exc)}")
            raise

    @staticmethod
    def _screenshot_size(screenshot: Any) -> Any:
        """Return the byte length, the base64 length, or "unknown" for a screenshot."""
        if isinstance(screenshot, bytes):
            return len(screenshot)
        # Otherwise assume an object exposing a base64_image attribute
        try:
            return len(screenshot.base64_image)
        except AttributeError:
            return "unknown"

    async def __aenter__(self):
        """Enter the agent context, initializing the computer when required."""
        logger.info("Entering BaseComputerAgent context")

        try:
            logger.info("Checking if computer is already initialized...")
            if self.computer._initialized:
                logger.info("Computer already initialized, skipping initialization")
            else:
                logger.info("Initializing computer in __aenter__...")
                # Use the computer's __aenter__ directly instead of calling run()
                # This avoids the circular dependency
                await self.computer.__aenter__()
                logger.info("Computer initialized in __aenter__")

            # Smoke-test the computer with a screenshot; a failure is only logged
            logger.info("Testing computer with a screenshot...")
            try:
                probe = await self.computer.screenshot()
                size = self._screenshot_size(probe)
                logger.info(f"Screenshot test successful, size: {size}")
            except Exception as exc:
                logger.error(f"Screenshot test failed: {str(exc)}")
                # Even though screenshot failed, we continue since some tests might not need it
        except Exception as exc:
            logger.error(f"Error initializing computer in __aenter__: {str(exc)}")
            raise

        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Log how the context ended and signal completion on the queue."""
        logger.info("Cleaning up agent resources")

        # The computer is deliberately left running here as it might be shared
        if not exc_type:
            logger.info("Exiting agent context normally")
        else:
            logger.error(f"Exiting agent context with error: {exc_type.__name__}: {exc_val}")

        # If we have a queue, push the None sentinel to signal that we're done
        pending = getattr(self, "queue", None)
        if pending:
            await pending.put(None)

    @abstractmethod
    async def _execute_task(self, task: str) -> AsyncGenerator[Dict[str, Any], None]:
        """Execute a task. Must be implemented by subclasses.

        Implementations are async generators: they ``yield`` result dicts as
        the task progresses. This base version yields one placeholder message
        and then raises ``NotImplementedError``.
        """
        placeholder = {
            "role": "assistant",
            "content": "Base class method called",
            "metadata": {"title": "Error"},
        }
        yield placeholder
        raise NotImplementedError("Subclasses must implement _execute_task")