cua-agent 0.1.6__tar.gz → 0.1.17__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic.

Files changed (82)
  1. {cua_agent-0.1.6 → cua_agent-0.1.17}/PKG-INFO +6 -36
  2. {cua_agent-0.1.6 → cua_agent-0.1.17}/README.md +5 -35
  3. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/__init__.py +3 -2
  4. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/core/__init__.py +0 -5
  5. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/core/computer_agent.py +21 -28
  6. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/core/loop.py +78 -124
  7. cua_agent-0.1.17/agent/core/messages.py +399 -0
  8. cua_agent-0.1.17/agent/core/types.py +35 -0
  9. cua_agent-0.1.17/agent/core/visualization.py +197 -0
  10. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/anthropic/api/client.py +142 -1
  11. cua_agent-0.1.17/agent/providers/anthropic/api_handler.py +140 -0
  12. cua_agent-0.1.17/agent/providers/anthropic/callbacks/__init__.py +5 -0
  13. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/anthropic/loop.py +206 -220
  14. cua_agent-0.1.17/agent/providers/anthropic/response_handler.py +229 -0
  15. cua_agent-0.1.17/agent/providers/anthropic/tools/bash.py +66 -0
  16. cua_agent-0.1.17/agent/providers/anthropic/utils.py +370 -0
  17. cua_agent-0.1.17/agent/providers/omni/__init__.py +8 -0
  18. cua_agent-0.1.17/agent/providers/omni/api_handler.py +42 -0
  19. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/omni/clients/anthropic.py +4 -0
  20. cua_agent-0.1.17/agent/providers/omni/image_utils.py +34 -0
  21. cua_agent-0.1.17/agent/providers/omni/loop.py +855 -0
  22. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/omni/parser.py +58 -4
  23. cua_agent-0.1.17/agent/providers/omni/tools/__init__.py +30 -0
  24. cua_agent-0.1.17/agent/providers/omni/tools/base.py +29 -0
  25. cua_agent-0.1.17/agent/providers/omni/tools/bash.py +74 -0
  26. cua_agent-0.1.17/agent/providers/omni/tools/computer.py +179 -0
  27. cua_agent-0.1.17/agent/providers/omni/tools/manager.py +61 -0
  28. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/omni/types.py +0 -4
  29. cua_agent-0.1.17/agent/providers/omni/utils.py +236 -0
  30. {cua_agent-0.1.6 → cua_agent-0.1.17}/pyproject.toml +3 -3
  31. cua_agent-0.1.6/agent/core/messages.py +0 -245
  32. cua_agent-0.1.6/agent/providers/anthropic/tools/bash.py +0 -163
  33. cua_agent-0.1.6/agent/providers/omni/__init__.py +0 -27
  34. cua_agent-0.1.6/agent/providers/omni/callbacks.py +0 -78
  35. cua_agent-0.1.6/agent/providers/omni/clients/groq.py +0 -101
  36. cua_agent-0.1.6/agent/providers/omni/experiment.py +0 -276
  37. cua_agent-0.1.6/agent/providers/omni/image_utils.py +0 -106
  38. cua_agent-0.1.6/agent/providers/omni/loop.py +0 -971
  39. cua_agent-0.1.6/agent/providers/omni/messages.py +0 -171
  40. cua_agent-0.1.6/agent/providers/omni/tool_manager.py +0 -91
  41. cua_agent-0.1.6/agent/providers/omni/tools/__init__.py +0 -12
  42. cua_agent-0.1.6/agent/providers/omni/tools/bash.py +0 -69
  43. cua_agent-0.1.6/agent/providers/omni/tools/computer.py +0 -217
  44. cua_agent-0.1.6/agent/providers/omni/tools/manager.py +0 -81
  45. cua_agent-0.1.6/agent/providers/omni/utils.py +0 -157
  46. cua_agent-0.1.6/agent/providers/omni/visualization.py +0 -130
  47. cua_agent-0.1.6/agent/types/__init__.py +0 -23
  48. cua_agent-0.1.6/agent/types/base.py +0 -41
  49. cua_agent-0.1.6/agent/types/messages.py +0 -36
  50. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/README.md +0 -0
  51. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/core/README.md +0 -0
  52. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/core/callbacks.py +0 -0
  53. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/core/experiment.py +0 -0
  54. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/core/telemetry.py +0 -0
  55. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/core/tools/__init__.py +0 -0
  56. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/core/tools/base.py +0 -0
  57. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/core/tools/bash.py +0 -0
  58. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/core/tools/collection.py +0 -0
  59. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/core/tools/computer.py +0 -0
  60. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/core/tools/edit.py +0 -0
  61. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/core/tools/manager.py +0 -0
  62. {cua_agent-0.1.6/agent/types → cua_agent-0.1.17/agent/core}/tools.py +0 -0
  63. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/__init__.py +0 -0
  64. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/anthropic/__init__.py +0 -0
  65. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/anthropic/api/logging.py +0 -0
  66. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/anthropic/callbacks/manager.py +0 -0
  67. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/anthropic/messages/manager.py +0 -0
  68. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/anthropic/prompts.py +0 -0
  69. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/anthropic/tools/__init__.py +0 -0
  70. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/anthropic/tools/base.py +0 -0
  71. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/anthropic/tools/collection.py +0 -0
  72. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/anthropic/tools/computer.py +0 -0
  73. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/anthropic/tools/edit.py +0 -0
  74. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/anthropic/tools/manager.py +0 -0
  75. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/anthropic/tools/run.py +0 -0
  76. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/anthropic/types.py +0 -0
  77. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/omni/clients/base.py +0 -0
  78. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/omni/clients/openai.py +0 -0
  79. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/omni/clients/utils.py +0 -0
  80. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/providers/omni/prompts.py +0 -0
  81. {cua_agent-0.1.6 → cua_agent-0.1.17}/agent/telemetry.py +0 -0
  82. {cua_agent-0.1.6 → cua_agent-0.1.17}/tests/test_agent.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cua-agent
3
- Version: 0.1.6
3
+ Version: 0.1.17
4
4
  Summary: CUA (Computer Use) Agent for AI-driven computer interaction
5
5
  Author-Email: TryCua <gh@trycua.com>
6
6
  Requires-Python: <3.13,>=3.10
@@ -63,43 +63,13 @@ Description-Content-Type: text/markdown
63
63
 
64
64
  **Agent** is a Computer Use (CUA) framework for running multi-app agentic workflows targeting macOS and Linux sandbox, supporting local (Ollama) and cloud model providers (OpenAI, Anthropic, Groq, DeepSeek, Qwen). The framework integrates with Microsoft's OmniParser for enhanced UI understanding and interaction.
65
65
 
66
- ### Get started with Agent
67
-
68
- ```python
69
- from agent import ComputerAgent, AgentLoop, LLMProvider
70
- from computer import Computer
71
-
72
- computer = Computer(verbosity=logging.INFO)
73
-
74
- agent = ComputerAgent(
75
- computer=computer,
76
- loop=AgentLoop.ANTHROPIC,
77
- # loop=AgentLoop.OMNI,
78
- model=LLM(provider=LLMProvider.ANTHROPIC, name="claude-3-7-sonnet-20250219"),
79
- # model=LLM(provider=LLMProvider.OPENAI, name="gpt-4.5-preview"),
80
- save_trajectory=True,
81
- trajectory_dir=str(Path("trajectories")),
82
- only_n_most_recent_images=3,
83
- verbosity=logging.INFO,
84
- )
66
+ > While our north star is to create a 1-click experience, this preview of Agent might be still a bit rough around the edges. We appreciate your patience as we work to improve the experience.
85
67
 
86
- tasks = [
87
- """
88
- Please help me with the following task:
89
- 1. Open Safari browser
90
- 2. Go to Wikipedia.org
91
- 3. Search for "Claude AI"
92
- 4. Summarize the main points you find about Claude AI
93
- """
94
- ]
68
+ ### Get started with Agent
95
69
 
96
- async with agent:
97
- for i, task in enumerate(tasks, 1):
98
- print(f"\nExecuting task {i}/{len(tasks)}: {task}")
99
- async for result in agent.run(task):
100
- print(result)
101
- print(f"Task {i} completed")
102
- ```
70
+ <div align="center">
71
+ <img src="../../img/agent.png"/>
72
+ </div>
103
73
 
104
74
  ## Install
105
75
 
@@ -17,43 +17,13 @@
17
17
 
18
18
  **Agent** is a Computer Use (CUA) framework for running multi-app agentic workflows targeting macOS and Linux sandbox, supporting local (Ollama) and cloud model providers (OpenAI, Anthropic, Groq, DeepSeek, Qwen). The framework integrates with Microsoft's OmniParser for enhanced UI understanding and interaction.
19
19
 
20
- ### Get started with Agent
21
-
22
- ```python
23
- from agent import ComputerAgent, AgentLoop, LLMProvider
24
- from computer import Computer
25
-
26
- computer = Computer(verbosity=logging.INFO)
27
-
28
- agent = ComputerAgent(
29
- computer=computer,
30
- loop=AgentLoop.ANTHROPIC,
31
- # loop=AgentLoop.OMNI,
32
- model=LLM(provider=LLMProvider.ANTHROPIC, name="claude-3-7-sonnet-20250219"),
33
- # model=LLM(provider=LLMProvider.OPENAI, name="gpt-4.5-preview"),
34
- save_trajectory=True,
35
- trajectory_dir=str(Path("trajectories")),
36
- only_n_most_recent_images=3,
37
- verbosity=logging.INFO,
38
- )
20
+ > While our north star is to create a 1-click experience, this preview of Agent might be still a bit rough around the edges. We appreciate your patience as we work to improve the experience.
39
21
 
40
- tasks = [
41
- """
42
- Please help me with the following task:
43
- 1. Open Safari browser
44
- 2. Go to Wikipedia.org
45
- 3. Search for "Claude AI"
46
- 4. Summarize the main points you find about Claude AI
47
- """
48
- ]
22
+ ### Get started with Agent
49
23
 
50
- async with agent:
51
- for i, task in enumerate(tasks, 1):
52
- print(f"\nExecuting task {i}/{len(tasks)}: {task}")
53
- async for result in agent.run(task):
54
- print(result)
55
- print(f"Task {i} completed")
56
- ```
24
+ <div align="center">
25
+ <img src="../../img/agent.png"/>
26
+ </div>
57
27
 
58
28
  ## Install
59
29
 
@@ -49,6 +49,7 @@ except Exception as e:
49
49
  logger.warning(f"Error initializing telemetry: {e}")
50
50
 
51
51
  from .providers.omni.types import LLMProvider, LLM
52
- from .types.base import AgentLoop
52
+ from .core.loop import AgentLoop
53
+ from .core.computer_agent import ComputerAgent
53
54
 
54
- __all__ = ["AgentLoop", "LLMProvider", "LLM"]
55
+ __all__ = ["AgentLoop", "LLMProvider", "LLM", "ComputerAgent"]
@@ -2,11 +2,6 @@
2
2
 
3
3
  from .loop import BaseLoop
4
4
  from .messages import (
5
- create_user_message,
6
- create_assistant_message,
7
- create_system_message,
8
- create_image_message,
9
- create_screen_message,
10
5
  BaseMessageManager,
11
6
  ImageRetentionConfig,
12
7
  )
@@ -3,8 +3,7 @@
3
3
  import asyncio
4
4
  import logging
5
5
  import os
6
- from typing import Any, AsyncGenerator, Dict, Optional, cast
7
- from dataclasses import dataclass
6
+ from typing import Any, AsyncGenerator, Dict, Optional, cast, List
8
7
 
9
8
  from computer import Computer
10
9
  from ..providers.anthropic.loop import AnthropicLoop
@@ -12,6 +11,8 @@ from ..providers.omni.loop import OmniLoop
12
11
  from ..providers.omni.parser import OmniParser
13
12
  from ..providers.omni.types import LLMProvider, LLM
14
13
  from .. import AgentLoop
14
+ from .messages import StandardMessageManager, ImageRetentionConfig
15
+ from .types import AgentResponse
15
16
 
16
17
  logging.basicConfig(level=logging.INFO)
17
18
  logger = logging.getLogger(__name__)
@@ -44,7 +45,6 @@ class ComputerAgent:
44
45
  save_trajectory: bool = True,
45
46
  trajectory_dir: str = "trajectories",
46
47
  only_n_most_recent_images: Optional[int] = None,
47
- parser: Optional[OmniParser] = None,
48
48
  verbosity: int = logging.INFO,
49
49
  ):
50
50
  """Initialize the ComputerAgent.
@@ -61,12 +61,11 @@ class ComputerAgent:
61
61
  save_trajectory: Whether to save the trajectory.
62
62
  trajectory_dir: Directory to save the trajectory.
63
63
  only_n_most_recent_images: Maximum number of recent screenshots to include in API requests.
64
- parser: Parser instance for the OmniLoop. Only used if provider is not ANTHROPIC.
65
64
  verbosity: Logging level.
66
65
  """
67
66
  # Basic agent configuration
68
67
  self.max_retries = max_retries
69
- self.computer = computer or Computer()
68
+ self.computer = computer
70
69
  self.queue = asyncio.Queue()
71
70
  self.screenshot_dir = screenshot_dir
72
71
  self.log_dir = log_dir
@@ -100,7 +99,7 @@ class ComputerAgent:
100
99
  )
101
100
 
102
101
  # Ensure computer is properly cast for typing purposes
103
- computer_instance = cast(Computer, self.computer)
102
+ computer_instance = self.computer
104
103
 
105
104
  # Get API key from environment if not provided
106
105
  actual_api_key = api_key or os.environ.get(ENV_VARS[self.provider], "")
@@ -118,10 +117,6 @@ class ComputerAgent:
118
117
  only_n_most_recent_images=only_n_most_recent_images,
119
118
  )
120
119
  else:
121
- # Default to OmniLoop for other loop types
122
- # Initialize parser if not provided
123
- actual_parser = parser or OmniParser()
124
-
125
120
  self._loop = OmniLoop(
126
121
  provider=self.provider,
127
122
  api_key=actual_api_key,
@@ -130,9 +125,12 @@ class ComputerAgent:
130
125
  save_trajectory=save_trajectory,
131
126
  base_dir=trajectory_dir,
132
127
  only_n_most_recent_images=only_n_most_recent_images,
133
- parser=actual_parser,
128
+ parser=OmniParser(),
134
129
  )
135
130
 
131
+ # Initialize the message manager from the loop
132
+ self.message_manager = self._loop.message_manager
133
+
136
134
  logger.info(
137
135
  f"ComputerAgent initialized with provider: {self.provider}, model: {actual_model_name}"
138
136
  )
@@ -201,36 +199,30 @@ class ComputerAgent:
201
199
  await self.computer.run()
202
200
  self._initialized = True
203
201
 
204
- async def _init_if_needed(self):
205
- """Initialize the computer interface if it hasn't been initialized yet."""
206
- if not self.computer._initialized:
207
- logger.info("Computer not initialized, initializing now...")
208
- try:
209
- # Call run directly
210
- await self.computer.run()
211
- logger.info("Computer interface initialized successfully")
212
- except Exception as e:
213
- logger.error(f"Error initializing computer interface: {str(e)}")
214
- raise
215
-
216
- async def run(self, task: str) -> AsyncGenerator[Dict[str, Any], None]:
202
+ async def run(self, task: str) -> AsyncGenerator[AgentResponse, None]:
217
203
  """Run a task using the computer agent.
218
204
 
219
205
  Args:
220
206
  task: Task description
221
207
 
222
208
  Yields:
223
- Task execution updates
209
+ Agent response format
224
210
  """
225
211
  try:
226
212
  logger.info(f"Running task: {task}")
213
+ logger.info(
214
+ f"Message history before task has {len(self.message_manager.messages)} messages"
215
+ )
227
216
 
228
217
  # Initialize the computer if needed
229
218
  if not self._initialized:
230
219
  await self.initialize()
231
220
 
232
- # Format task as a message
233
- messages = [{"role": "user", "content": task}]
221
+ # Add task as a user message using the message manager
222
+ self.message_manager.add_user_message([{"type": "text", "text": task}])
223
+ logger.info(
224
+ f"Added task message. Message history now has {len(self.message_manager.messages)} messages"
225
+ )
234
226
 
235
227
  # Pass properly formatted messages to the loop
236
228
  if self._loop is None:
@@ -239,7 +231,8 @@ class ComputerAgent:
239
231
  return
240
232
 
241
233
  # Execute the task and yield results
242
- async for result in self._loop.run(messages):
234
+ async for result in self._loop.run(self.message_manager.messages):
235
+ # Yield the result to the caller
243
236
  yield result
244
237
 
245
238
  except Exception as e:
@@ -2,22 +2,34 @@
2
2
 
3
3
  import logging
4
4
  import asyncio
5
- import json
6
- import os
7
5
  from abc import ABC, abstractmethod
6
+ from enum import Enum, auto
8
7
  from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
9
8
  from datetime import datetime
10
- import base64
11
9
 
12
10
  from computer import Computer
13
11
  from .experiment import ExperimentManager
12
+ from .messages import StandardMessageManager, ImageRetentionConfig
13
+ from .types import AgentResponse
14
14
 
15
15
  logger = logging.getLogger(__name__)
16
16
 
17
17
 
18
+ class AgentLoop(Enum):
19
+ """Enumeration of available loop types."""
20
+
21
+ ANTHROPIC = auto() # Anthropic implementation
22
+ OMNI = auto() # OmniLoop implementation
23
+ # Add more loop types as needed
24
+
25
+
18
26
  class BaseLoop(ABC):
19
27
  """Base class for agent loops that handle message processing and tool execution."""
20
28
 
29
+ ###########################################
30
+ # INITIALIZATION AND CONFIGURATION
31
+ ###########################################
32
+
21
33
  def __init__(
22
34
  self,
23
35
  computer: Computer,
@@ -55,8 +67,6 @@ class BaseLoop(ABC):
55
67
  self.save_trajectory = save_trajectory
56
68
  self.only_n_most_recent_images = only_n_most_recent_images
57
69
  self._kwargs = kwargs
58
- self.message_history = []
59
- # self.tool_manager = BaseToolManager(computer)
60
70
 
61
71
  # Initialize experiment manager
62
72
  if self.save_trajectory and self.base_dir:
@@ -75,6 +85,64 @@ class BaseLoop(ABC):
75
85
  # Initialize basic tracking
76
86
  self.turn_count = 0
77
87
 
88
+ async def initialize(self) -> None:
89
+ """Initialize both the API client and computer interface with retries."""
90
+ for attempt in range(self.max_retries):
91
+ try:
92
+ logger.info(
93
+ f"Starting initialization (attempt {attempt + 1}/{self.max_retries})..."
94
+ )
95
+
96
+ # Initialize API client
97
+ await self.initialize_client()
98
+
99
+ logger.info("Initialization complete.")
100
+ return
101
+ except Exception as e:
102
+ if attempt < self.max_retries - 1:
103
+ logger.warning(
104
+ f"Initialization failed (attempt {attempt + 1}/{self.max_retries}): {str(e)}. Retrying..."
105
+ )
106
+ await asyncio.sleep(self.retry_delay)
107
+ else:
108
+ logger.error(
109
+ f"Initialization failed after {self.max_retries} attempts: {str(e)}"
110
+ )
111
+ raise RuntimeError(f"Failed to initialize: {str(e)}")
112
+
113
+ ###########################################
114
+
115
+ # ABSTRACT METHODS TO BE IMPLEMENTED BY SUBCLASSES
116
+ ###########################################
117
+
118
+ @abstractmethod
119
+ async def initialize_client(self) -> None:
120
+ """Initialize the API client and any provider-specific components.
121
+
122
+ This method must be implemented by subclasses to set up
123
+ provider-specific clients and tools.
124
+ """
125
+ raise NotImplementedError
126
+
127
+ @abstractmethod
128
+ async def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[AgentResponse, None]:
129
+ """Run the agent loop with provided messages.
130
+
131
+ This method handles the main agent loop including message processing,
132
+ API calls, response handling, and action execution.
133
+
134
+ Args:
135
+ messages: List of message objects
136
+
137
+ Yields:
138
+ Agent response format
139
+ """
140
+ raise NotImplementedError
141
+
142
+ ###########################################
143
+ # EXPERIMENT AND TRAJECTORY MANAGEMENT
144
+ ###########################################
145
+
78
146
  def _setup_experiment_dirs(self) -> None:
79
147
  """Setup the experiment directory structure."""
80
148
  if self.experiment_manager:
@@ -100,10 +168,13 @@ class BaseLoop(ABC):
100
168
  ) -> None:
101
169
  """Log API call details to file.
102
170
 
171
+ Preserves provider-specific formats for requests and responses to ensure
172
+ accurate logging for debugging and analysis purposes.
173
+
103
174
  Args:
104
175
  call_type: Type of API call (e.g., 'request', 'response', 'error')
105
- request: The API request data
106
- response: Optional API response data
176
+ request: The API request data in provider-specific format
177
+ response: Optional API response data in provider-specific format
107
178
  error: Optional error information
108
179
  """
109
180
  if self.experiment_manager:
@@ -129,120 +200,3 @@ class BaseLoop(ABC):
129
200
  """
130
201
  if self.experiment_manager:
131
202
  self.experiment_manager.save_screenshot(img_base64, action_type)
132
-
133
- async def initialize(self) -> None:
134
- """Initialize both the API client and computer interface with retries."""
135
- for attempt in range(self.max_retries):
136
- try:
137
- logger.info(
138
- f"Starting initialization (attempt {attempt + 1}/{self.max_retries})..."
139
- )
140
-
141
- # Initialize API client
142
- await self.initialize_client()
143
-
144
- logger.info("Initialization complete.")
145
- return
146
- except Exception as e:
147
- if attempt < self.max_retries - 1:
148
- logger.warning(
149
- f"Initialization failed (attempt {attempt + 1}/{self.max_retries}): {str(e)}. Retrying..."
150
- )
151
- await asyncio.sleep(self.retry_delay)
152
- else:
153
- logger.error(
154
- f"Initialization failed after {self.max_retries} attempts: {str(e)}"
155
- )
156
- raise RuntimeError(f"Failed to initialize: {str(e)}")
157
-
158
- async def _get_parsed_screen_som(self) -> Dict[str, Any]:
159
- """Get parsed screen information.
160
-
161
- Returns:
162
- Dict containing screen information
163
- """
164
- try:
165
- # Take screenshot
166
- screenshot = await self.computer.interface.screenshot()
167
-
168
- # Initialize with default values
169
- width, height = 1024, 768
170
- base64_image = ""
171
-
172
- # Handle different types of screenshot returns
173
- if isinstance(screenshot, (bytes, bytearray, memoryview)):
174
- # Raw bytes screenshot
175
- base64_image = base64.b64encode(screenshot).decode("utf-8")
176
- elif hasattr(screenshot, "base64_image"):
177
- # Object-style screenshot with attributes
178
- # Type checking can't infer these attributes, but they exist at runtime
179
- # on certain screenshot return types
180
- base64_image = getattr(screenshot, "base64_image")
181
- width = (
182
- getattr(screenshot, "width", width) if hasattr(screenshot, "width") else width
183
- )
184
- height = (
185
- getattr(screenshot, "height", height)
186
- if hasattr(screenshot, "height")
187
- else height
188
- )
189
-
190
- # Create parsed screen data
191
- parsed_screen = {
192
- "width": width,
193
- "height": height,
194
- "parsed_content_list": [],
195
- "timestamp": datetime.now().isoformat(),
196
- "screenshot_base64": base64_image,
197
- }
198
-
199
- # Save screenshot if requested
200
- if self.save_trajectory and self.experiment_manager:
201
- try:
202
- img_data = base64_image
203
- if "," in img_data:
204
- img_data = img_data.split(",")[1]
205
- self._save_screenshot(img_data, action_type="state")
206
- except Exception as e:
207
- logger.error(f"Error saving screenshot: {str(e)}")
208
-
209
- return parsed_screen
210
- except Exception as e:
211
- logger.error(f"Error taking screenshot: {str(e)}")
212
- return {
213
- "width": 1024,
214
- "height": 768,
215
- "parsed_content_list": [],
216
- "timestamp": datetime.now().isoformat(),
217
- "error": f"Error taking screenshot: {str(e)}",
218
- "screenshot_base64": "",
219
- }
220
-
221
- @abstractmethod
222
- async def initialize_client(self) -> None:
223
- """Initialize the API client and any provider-specific components."""
224
- raise NotImplementedError
225
-
226
- @abstractmethod
227
- async def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[Dict[str, Any], None]:
228
- """Run the agent loop with provided messages.
229
-
230
- Args:
231
- messages: List of message objects
232
-
233
- Yields:
234
- Dict containing response data
235
- """
236
- raise NotImplementedError
237
-
238
- @abstractmethod
239
- async def _process_screen(
240
- self, parsed_screen: Dict[str, Any], messages: List[Dict[str, Any]]
241
- ) -> None:
242
- """Process screen information and add to messages.
243
-
244
- Args:
245
- parsed_screen: Dictionary containing parsed screen info
246
- messages: List of messages to update
247
- """
248
- raise NotImplementedError