cua-agent 0.1.5__py3-none-any.whl → 0.1.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic.
- agent/__init__.py +3 -4
- agent/core/__init__.py +3 -10
- agent/core/computer_agent.py +207 -32
- agent/core/experiment.py +20 -3
- agent/core/loop.py +78 -120
- agent/core/messages.py +279 -125
- agent/core/telemetry.py +44 -32
- agent/core/types.py +35 -0
- agent/core/visualization.py +197 -0
- agent/providers/anthropic/api/client.py +142 -1
- agent/providers/anthropic/api_handler.py +140 -0
- agent/providers/anthropic/callbacks/__init__.py +5 -0
- agent/providers/anthropic/loop.py +224 -209
- agent/providers/anthropic/messages/manager.py +3 -1
- agent/providers/anthropic/response_handler.py +229 -0
- agent/providers/anthropic/tools/base.py +1 -1
- agent/providers/anthropic/tools/bash.py +0 -97
- agent/providers/anthropic/tools/collection.py +2 -2
- agent/providers/anthropic/tools/computer.py +34 -24
- agent/providers/anthropic/tools/manager.py +2 -2
- agent/providers/anthropic/utils.py +370 -0
- agent/providers/omni/__init__.py +1 -20
- agent/providers/omni/api_handler.py +42 -0
- agent/providers/omni/clients/anthropic.py +4 -0
- agent/providers/omni/image_utils.py +0 -72
- agent/providers/omni/loop.py +497 -607
- agent/providers/omni/parser.py +60 -5
- agent/providers/omni/tools/__init__.py +25 -8
- agent/providers/omni/tools/base.py +29 -0
- agent/providers/omni/tools/bash.py +43 -38
- agent/providers/omni/tools/computer.py +144 -181
- agent/providers/omni/tools/manager.py +26 -48
- agent/providers/omni/types.py +0 -4
- agent/providers/omni/utils.py +225 -144
- {cua_agent-0.1.5.dist-info → cua_agent-0.1.17.dist-info}/METADATA +6 -36
- cua_agent-0.1.17.dist-info/RECORD +63 -0
- agent/core/agent.py +0 -252
- agent/core/base_agent.py +0 -164
- agent/core/factory.py +0 -102
- agent/providers/omni/callbacks.py +0 -78
- agent/providers/omni/clients/groq.py +0 -101
- agent/providers/omni/experiment.py +0 -273
- agent/providers/omni/messages.py +0 -171
- agent/providers/omni/tool_manager.py +0 -91
- agent/providers/omni/visualization.py +0 -130
- agent/types/__init__.py +0 -26
- agent/types/base.py +0 -53
- agent/types/messages.py +0 -36
- cua_agent-0.1.5.dist-info/RECORD +0 -67
- /agent/{types → core}/tools.py +0 -0
- {cua_agent-0.1.5.dist-info → cua_agent-0.1.17.dist-info}/WHEEL +0 -0
- {cua_agent-0.1.5.dist-info → cua_agent-0.1.17.dist-info}/entry_points.txt +0 -0
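The largest change in this release is the rewritten omni agent loop (agent/providers/omni/loop.py, shown below): message handling moves to a shared StandardMessageManager, API calls go through an OmniAPIHandler, actions run through a ToolManager, and run() now yields OpenAI-compatible agent responses. As a rough illustration of how the reworked loop appears to be driven, here is a minimal sketch; the constructor keyword names and the model string are inferred from the diff, not taken from documented API, so treat everything below as an assumption rather than the package's official usage.

import asyncio

from agent.providers.omni.loop import OmniLoop
from agent.providers.omni.types import LLMProvider
from computer import Computer


async def main() -> None:
    # Computer() with default arguments is an assumption; configure it for your setup.
    computer = Computer()

    # Keyword names below mirror the attributes set in OmniLoop.__init__ in the diff;
    # the model name is hypothetical.
    loop = OmniLoop(
        provider=LLMProvider.OPENAI,
        computer=computer,
        api_key="YOUR_API_KEY",
        model="gpt-4o",
        only_n_most_recent_images=3,
    )
    await loop.initialize()

    # Messages are plain OpenAI-style dicts; run() yields OpenAI-compatible
    # agent responses until the loop decides the task is complete.
    messages = [{"role": "user", "content": "Open the browser and search for CUA."}]
    async for response in loop.run(messages):
        print(response)


asyncio.run(main())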
agent/providers/omni/loop.py
CHANGED
@@ -1,33 +1,28 @@
 """Omni-specific agent loop implementation."""
 
 import logging
-from typing import Any, Dict, List, Optional, Tuple, AsyncGenerator
-import base64
-from PIL import Image
-from io import BytesIO
+from typing import Any, Dict, List, Optional, Tuple, AsyncGenerator
 import json
 import re
 import os
-from datetime import datetime
 import asyncio
 from httpx import ConnectError, ReadTimeout
-import
-import copy
+from typing import cast
 
-from .parser import OmniParser, ParseResult
+from .parser import OmniParser, ParseResult
 from ...core.loop import BaseLoop
+from ...core.visualization import VisualizationHelper
+from ...core.messages import StandardMessageManager, ImageRetentionConfig
+from .utils import to_openai_agent_response_format
+from ...core.types import AgentResponse
 from computer import Computer
 from .types import LLMProvider
-from .clients.base import BaseOmniClient
 from .clients.openai import OpenAIClient
-from .clients.groq import GroqClient
 from .clients.anthropic import AnthropicClient
 from .prompts import SYSTEM_PROMPT
-from .
-from .
-from .
-from ...core.messages import ImageRetentionConfig
-from .messages import OmniMessageManager
+from .api_handler import OmniAPIHandler
+from .tools.manager import ToolManager
+from .tools import ToolResult
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -41,7 +36,16 @@ def extract_data(input_string: str, data_type: str) -> str:
 
 
 class OmniLoop(BaseLoop):
-    """Omni-specific implementation of the agent loop.
+    """Omni-specific implementation of the agent loop.
+
+    This class extends BaseLoop to provide support for multimodal models
+    from various providers (OpenAI, Anthropic, etc.) with UI parsing
+    and desktop automation capabilities.
+    """
+
+    ###########################################
+    # INITIALIZATION AND CONFIGURATION
+    ###########################################
 
     def __init__(
         self,
@@ -76,8 +80,9 @@ class OmniLoop(BaseLoop):
         self.provider = provider
 
         # Initialize message manager with image retention config
-
-
+        self.message_manager = StandardMessageManager(
+            config=ImageRetentionConfig(num_images_to_keep=only_n_most_recent_images)
+        )
 
         # Initialize base class (which will set up experiment manager)
         super().__init__(
@@ -96,94 +101,58 @@ class OmniLoop(BaseLoop):
         self.client = None
         self.retry_count = 0
 
-
-
-
-        Returns:
-            bool: Always returns False as debug image saving has been disabled.
-        """
-        # Debug image saving functionality has been removed
-        return False
-
-    def _extract_and_save_images(self, data: Any, prefix: str) -> None:
-        """Extract and save images from API data.
+        # Initialize handlers
+        self.api_handler = OmniAPIHandler(loop=self)
+        self.viz_helper = VisualizationHelper(agent=self)
 
-
+        # Initialize tool manager
+        self.tool_manager = ToolManager(computer=computer, provider=provider)
 
-
-            data: Data to extract images from
-            prefix: Prefix for the extracted image filenames
-        """
-        # Image extraction functionality has been removed
-        return
+        logger.info("OmniLoop initialized with StandardMessageManager")
 
-    def
-        """
-
-
-
-        Args:
-            image_data: Base64 encoded image data
-            filename: Name to use for the saved image
-        """
-        # Debug image saving functionality has been removed
-        return
-
-    def _visualize_action(self, x: int, y: int, img_base64: str) -> None:
-        """Visualize an action by drawing on the screenshot."""
-        if (
-            not self.save_trajectory
-            or not hasattr(self, "experiment_manager")
-            or not self.experiment_manager
-        ):
-            return
+    async def initialize(self) -> None:
+        """Initialize the loop by setting up tools and clients."""
+        # Initialize base class
+        await super().initialize()
 
+        # Initialize tool manager with error handling
         try:
-
-
-
-            # Save the visualization
-            self.experiment_manager.save_action_visualization(img, "click", f"x{x}_y{y}")
+            logger.info("Initializing tool manager...")
+            await self.tool_manager.initialize()
+            logger.info("Tool manager initialized successfully.")
        except Exception as e:
-            logger.error(f"Error
-
-
-
-            if
-
-
-
-            ):
-                return
-
-            try:
-                # Use the visualization utility
-                img = visualize_scroll(direction, clicks, img_base64)
-
-                # Save the visualization
-                self.experiment_manager.save_action_visualization(
-                    img, "scroll", f"{direction}_{clicks}"
+            logger.error(f"Error initializing tool manager: {str(e)}")
+            logger.warning("Will attempt to initialize tools on first use.")
+
+        # Initialize API clients based on provider
+        if self.provider == LLMProvider.ANTHROPIC:
+            self.client = AnthropicClient(
+                api_key=self.api_key,
+                model=self.model,
             )
-
-
+        elif self.provider == LLMProvider.OPENAI:
+            self.client = OpenAIClient(
+                api_key=self.api_key,
+                model=self.model,
+            )
+        else:
+            raise ValueError(f"Unsupported provider: {self.provider}")
 
-
-
-
-        """Save a visualization of an action."""
-        if hasattr(self, "experiment_manager") and self.experiment_manager:
-            return self.experiment_manager.save_action_visualization(img, action_name, details)
-        return ""
+    ###########################################
+    # CLIENT INITIALIZATION - IMPLEMENTING ABSTRACT METHOD
+    ###########################################
 
     async def initialize_client(self) -> None:
-        """Initialize the appropriate client based on provider.
+        """Initialize the appropriate client based on provider.
+
+        Implements abstract method from BaseLoop to set up the specific
+        provider client (OpenAI, Anthropic, etc.).
+        """
        try:
            logger.info(f"Initializing {self.provider} client with model {self.model}...")

            if self.provider == LLMProvider.OPENAI:
                self.client = OpenAIClient(api_key=self.api_key, model=self.model)
-            elif self.provider == LLMProvider.GROQ:
-                self.client = GroqClient(api_key=self.api_key, model=self.model)
            elif self.provider == LLMProvider.ANTHROPIC:
                self.client = AnthropicClient(
                    api_key=self.api_key,
@@ -200,6 +169,10 @@ class OmniLoop(BaseLoop):
             self.client = None
             raise RuntimeError(f"Failed to initialize client: {str(e)}")
 
+    ###########################################
+    # API CALL HANDLING
+    ###########################################
+
     async def _make_api_call(self, messages: List[Dict[str, Any]], system_prompt: str) -> Any:
         """Make API call to provider with retry logic."""
         # Create new turn directory for this API call
@@ -219,68 +192,73 @@ class OmniLoop(BaseLoop):
                 if self.client is None:
                     raise RuntimeError("Failed to initialize client")
 
-                #
-
-                self.message_manager.
+                # Get messages in standard format from the message manager
+                self.message_manager.messages = messages.copy()
+                prepared_messages = self.message_manager.get_messages()
 
-                #
-                # This will limit the number of images based on only_n_most_recent_images
-                prepared_messages = self.message_manager.get_formatted_messages(provider_name)
-
-                # Filter out system messages for Anthropic
+                # Special handling for Anthropic
                 if self.provider == LLMProvider.ANTHROPIC:
+                    # Convert to Anthropic format
+                    anthropic_messages, anthropic_system = self.message_manager.to_anthropic_format(
+                        prepared_messages
+                    )
+
+                    # Filter out any empty/invalid messages
                     filtered_messages = [
-                        msg
+                        msg
+                        for msg in anthropic_messages
+                        if msg.get("role") in ["user", "assistant"]
                     ]
-                else:
-                    filtered_messages = prepared_messages
 
-
-
+                    # Ensure there's at least one message for Anthropic
+                    if not filtered_messages:
+                        logger.warning(
+                            "No valid messages found for Anthropic API call. Adding a default user message."
+                        )
+                        filtered_messages = [
+                            {
+                                "role": "user",
+                                "content": [
+                                    {"type": "text", "text": "Please help with this task."}
+                                ],
+                            }
+                        ]
 
-
-
-                else:
-                    request_data["system"] = system_prompt
+                    # Combine system prompts if needed
+                    final_system_prompt = anthropic_system or system_prompt
 
-
+                    # Log request
+                    request_data = {
+                        "messages": filtered_messages,
+                        "max_tokens": self.max_tokens,
+                        "system": final_system_prompt,
+                    }
 
-
-
-
-
-
-
-
-
-                if is_async:
-                    # For async implementations (AnthropicClient)
-                    if self.provider == LLMProvider.ANTHROPIC:
-                        response = await run_method(
-                            messages=filtered_messages,
-                            system=self._get_system_prompt(),
-                            max_tokens=self.max_tokens,
-                        )
-                    else:
-                        response = await run_method(
-                            messages=messages,
-                            system=system_prompt,
-                            max_tokens=self.max_tokens,
-                        )
+                    self._log_api_call("request", request_data)
+
+                    # Make API call
+                    response = await self.client.run_interleaved(
+                        messages=filtered_messages,
+                        system=final_system_prompt,
+                        max_tokens=self.max_tokens,
+                    )
                 else:
-                    # For
-
-
-
-
-
-
-
-
-
-
-
+                    # For OpenAI and others, use standard format directly
+                    # Log request
+                    request_data = {
+                        "messages": prepared_messages,
+                        "max_tokens": self.max_tokens,
+                        "system": system_prompt,
+                    }
+
+                    self._log_api_call("request", request_data)
+
+                    # Make API call
+                    response = await self.client.run_interleaved(
+                        messages=prepared_messages,
+                        system=system_prompt,
+                        max_tokens=self.max_tokens,
+                    )
 
                 # Log success response
                 self._log_api_call("response", request_data, response)
@@ -328,201 +306,169 @@ class OmniLoop(BaseLoop):
         logger.error(error_message)
         raise RuntimeError(error_message)
 
+    ###########################################
+    # RESPONSE AND ACTION HANDLING
+    ###########################################
+
     async def _handle_response(
-        self, response: Any, messages: List[Dict[str, Any]], parsed_screen:
+        self, response: Any, messages: List[Dict[str, Any]], parsed_screen: ParseResult
     ) -> Tuple[bool, bool]:
         """Handle API response.
 
+        Args:
+            response: API response
+            messages: List of messages to update
+            parsed_screen: Current parsed screen information
+
         Returns:
             Tuple of (should_continue, action_screenshot_saved)
         """
         action_screenshot_saved = False
+
+        # Helper function to safely add assistant messages using the message manager
+        def add_assistant_message(content):
+            if isinstance(content, str):
+                # Convert string to proper format
+                formatted_content = [{"type": "text", "text": content}]
+                self.message_manager.add_assistant_message(formatted_content)
+                logger.info("Added formatted text assistant message")
+            elif isinstance(content, list):
+                # Already in proper format
+                self.message_manager.add_assistant_message(content)
+                logger.info("Added structured assistant message")
+            else:
+                # Default case - convert to string
+                formatted_content = [{"type": "text", "text": str(content)}]
+                self.message_manager.add_assistant_message(formatted_content)
+                logger.info("Added converted assistant message")
+
         try:
-            #
+            # Step 1: Normalize response to standard format based on provider
+            standard_content = []
+            raw_text = None
+
+            # Convert response to standardized content based on provider
             if self.provider == LLMProvider.ANTHROPIC:
                 if hasattr(response, "content") and isinstance(response.content, list):
-                    #
+                    # Convert Anthropic response to standard format
                     for block in response.content:
-                        if hasattr(block, "type")
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                                parsed_content = json.loads(json_str)
-                                logger.info("Successfully parsed JSON from text")
-                            else:
-                                logger.error(f"No JSON found in content: {content}")
-                                continue
-                        except json.JSONDecodeError as e:
-                            logger.error(f"Failed to parse JSON from text: {str(e)}")
-                            continue
-
-                        # Clean up Box ID format
-                        if "Box ID" in parsed_content and isinstance(
-                            parsed_content["Box ID"], str
-                        ):
-                            parsed_content["Box ID"] = parsed_content["Box ID"].replace(
-                                "Box #", ""
-                            )
-
-                        # Add any explanatory text as reasoning if not present
-                        if "Explanation" not in parsed_content:
-                            # Extract any text before the JSON as reasoning
-                            text_before_json = content.split("{")[0].strip()
-                            if text_before_json:
-                                parsed_content["Explanation"] = text_before_json
-
-                        # Log the parsed content for debugging
-                        logger.info(f"Parsed content: {json.dumps(parsed_content, indent=2)}")
-
-                        # Add response to messages
-                        messages.append(
-                            {"role": "assistant", "content": json.dumps(parsed_content)}
-                        )
-
-                        try:
-                            # Execute action with current parsed screen info
-                            await self._execute_action(parsed_content, parsed_screen)
-                            action_screenshot_saved = True
-                        except Exception as e:
-                            logger.error(f"Error executing action: {str(e)}")
-                            # Add error message to conversation
-                            messages.append(
-                                {
-                                    "role": "assistant",
-                                    "content": f"Error executing action: {str(e)}",
-                                    "metadata": {"title": "❌ Error"},
-                                }
-                            )
-                            return False, action_screenshot_saved
-
-                        # Check if task is complete
-                        if parsed_content.get("Action") == "None":
-                            return False, action_screenshot_saved
-                        return True, action_screenshot_saved
-
-                    logger.warning("No text block found in Anthropic response")
+                        if hasattr(block, "type"):
+                            if block.type == "text":
+                                standard_content.append({"type": "text", "text": block.text})
+                                # Store raw text for JSON parsing
+                                if raw_text is None:
+                                    raw_text = block.text
+                                else:
+                                    raw_text += "\n" + block.text
+                            else:
+                                # Add other block types
+                                block_dict = {}
+                                for key, value in vars(block).items():
+                                    if not key.startswith("_"):
+                                        block_dict[key] = value
+                                standard_content.append(block_dict)
+                else:
+                    logger.warning("Invalid Anthropic response format")
                     return True, action_screenshot_saved
-
-            # Handle other providers' response formats
-            if isinstance(response, dict) and "choices" in response:
-                content = response["choices"][0]["message"]["content"]
             else:
-
+                # Assume OpenAI or compatible format
+                try:
+                    raw_text = response["choices"][0]["message"]["content"]
+                    standard_content = [{"type": "text", "text": raw_text}]
+                except (KeyError, TypeError, IndexError) as e:
+                    logger.error(f"Invalid response format: {str(e)}")
+                    return True, action_screenshot_saved
 
-            #
-
+            # Step 2: Add the normalized response to message history
+            add_assistant_message(standard_content)
+
+            # Step 3: Extract JSON from the content for action execution
+            parsed_content = None
+
+            # If we have raw text, try to extract JSON from it
+            if raw_text:
+                # Try different approaches to extract JSON
                 try:
                     # First try to parse the whole content as JSON
-                    parsed_content = json.loads(
+                    parsed_content = json.loads(raw_text)
+                    logger.info("Successfully parsed whole content as JSON")
                 except json.JSONDecodeError:
                     try:
                         # Try to find JSON block
-                        json_content = extract_data(
+                        json_content = extract_data(raw_text, "json")
                         parsed_content = json.loads(json_content)
+                        logger.info("Successfully parsed JSON from code block")
                     except (json.JSONDecodeError, IndexError):
                         try:
                             # Look for JSON object pattern
                             json_pattern = r"\{[^}]+\}"
-                            json_match = re.search(json_pattern,
+                            json_match = re.search(json_pattern, raw_text)
                             if json_match:
                                 json_str = json_match.group(0)
                                 parsed_content = json.loads(json_str)
+                                logger.info("Successfully parsed JSON from text")
                             else:
-                                logger.error(f"No JSON found in content
+                                logger.error(f"No JSON found in content")
                                 return True, action_screenshot_saved
                         except json.JSONDecodeError as e:
                             logger.error(f"Failed to parse JSON from text: {str(e)}")
                             return True, action_screenshot_saved
 
+            # Step 4: Process the parsed content if available
+            if parsed_content:
                 # Clean up Box ID format
                 if "Box ID" in parsed_content and isinstance(parsed_content["Box ID"], str):
                     parsed_content["Box ID"] = parsed_content["Box ID"].replace("Box #", "")
 
                 # Add any explanatory text as reasoning if not present
-                if "Explanation" not in parsed_content:
+                if "Explanation" not in parsed_content and raw_text:
                     # Extract any text before the JSON as reasoning
-                    text_before_json =
+                    text_before_json = raw_text.split("{")[0].strip()
                    if text_before_json:
                        parsed_content["Explanation"] = text_before_json

-                #
-
+                # Log the parsed content for debugging
+                logger.info(f"Parsed content: {json.dumps(parsed_content, indent=2)}")
 
+                # Step 5: Execute the action
                 try:
-                    # Execute action
-
-
-
-
-                    # Add error message to conversation
-                    messages.append(
-                        {
-                            "role": "assistant",
-                            "content": f"Error executing action: {str(e)}",
-                            "metadata": {"title": "❌ Error"},
-                        }
+                    # Execute action using the common helper method
+                    should_continue, action_screenshot_saved = (
+                        await self._execute_action_with_tools(
+                            parsed_content, cast(ParseResult, parsed_screen)
+                        )
                     )
-                    return False, action_screenshot_saved
 
-
-
-
-                    return True, action_screenshot_saved
-                elif isinstance(content, dict):
-                    # Handle case where content is already a dictionary
-                    messages.append({"role": "assistant", "content": json.dumps(content)})
-
-                    try:
-                        # Execute action with current parsed screen info
-                        await self._execute_action(content, parsed_screen)
-                        action_screenshot_saved = True
+                    # Check if task is complete
+                    if parsed_content.get("Action") == "None":
+                        return False, action_screenshot_saved
+                    return should_continue, action_screenshot_saved
                 except Exception as e:
                     logger.error(f"Error executing action: {str(e)}")
-                    #
-
-
-
-                            "content": f"Error executing action: {str(e)}",
-                            "metadata": {"title": "❌ Error"},
-                        }
-                    )
-                    return False, action_screenshot_saved
-
-                    # Check if task is complete
-                    if content.get("Action") == "None":
+                    # Update the last assistant message with error
+                    error_message = [{"type": "text", "text": f"Error executing action: {str(e)}"}]
+                    # Replace the last assistant message with the error
+                    self.message_manager.add_assistant_message(error_message)
                     return False, action_screenshot_saved
 
-            return True, action_screenshot_saved
-
             return True, action_screenshot_saved
 
         except Exception as e:
             logger.error(f"Error handling response: {str(e)}")
-
-
-
-                    "content": f"Error: {str(e)}",
-                    "metadata": {"title": "❌ Error"},
-                }
-            )
+            # Add error message using the message manager
+            error_message = [{"type": "text", "text": f"Error: {str(e)}"}]
+            self.message_manager.add_assistant_message(error_message)
             raise
 
+    ###########################################
+    # SCREEN PARSING - IMPLEMENTING ABSTRACT METHOD
+    ###########################################
+
     async def _get_parsed_screen_som(self, save_screenshot: bool = True) -> ParseResult:
-        """Get parsed screen information with
+        """Get parsed screen information with Screen Object Model.
+
+        Extends the base class method to use the OmniParser to parse the screen
+        and extract UI elements.
 
         Args:
             save_screenshot: Whether to save the screenshot (set to False when screenshots will be saved elsewhere)
@@ -557,337 +503,26 @@ class OmniLoop(BaseLoop):
             logger.error(f"Error getting parsed screen: {str(e)}")
             raise
 
-    async def _process_screen(
-        self, parsed_screen: ParseResult, messages: List[Dict[str, Any]]
-    ) -> None:
-        """Process and add screen info to messages."""
-        try:
-            # Only add message if we have an image and provider supports it
-            if self.provider in [LLMProvider.OPENAI, LLMProvider.ANTHROPIC]:
-                image = parsed_screen.annotated_image_base64 or None
-                if image:
-                    # Save screen info to current turn directory
-                    if self.current_turn_dir:
-                        # Save elements as JSON
-                        elements_path = os.path.join(self.current_turn_dir, "elements.json")
-                        with open(elements_path, "w") as f:
-                            # Convert elements to dicts for JSON serialization
-                            elements_json = [elem.model_dump() for elem in parsed_screen.elements]
-                            json.dump(elements_json, f, indent=2)
-                        logger.info(f"Saved elements to {elements_path}")
-
-                    # Format the image content based on the provider
-                    if self.provider == LLMProvider.ANTHROPIC:
-                        # Compress the image before sending to Anthropic (5MB limit)
-                        image_size = len(image)
-                        logger.info(f"Image base64 is present, length: {image_size}")
-
-                        # Anthropic has a 5MB limit - check against base64 string length
-                        # which is what matters for the API call payload
-                        # Use slightly smaller limit (4.9MB) to account for request overhead
-                        max_size = int(4.9 * 1024 * 1024)  # 4.9MB
-
-                        # Default media type (will be overridden if compression is needed)
-                        media_type = "image/png"
-
-                        # Check if the image already has a media type prefix
-                        if image.startswith("data:"):
-                            parts = image.split(",", 1)
-                            if len(parts) == 2 and "image/jpeg" in parts[0].lower():
-                                media_type = "image/jpeg"
-                            elif len(parts) == 2 and "image/png" in parts[0].lower():
-                                media_type = "image/png"
-
-                        if image_size > max_size:
-                            logger.info(
-                                f"Image size ({image_size} bytes) exceeds Anthropic limit ({max_size} bytes), compressing..."
-                            )
-                            image, media_type = compress_image_base64(image, max_size)
-                            logger.info(
-                                f"Image compressed to {len(image)} bytes with media_type {media_type}"
-                            )
-
-                        # Anthropic uses "type": "image"
-                        screen_info_msg = {
-                            "role": "user",
-                            "content": [
-                                {
-                                    "type": "image",
-                                    "source": {
-                                        "type": "base64",
-                                        "media_type": media_type,
-                                        "data": image,
-                                    },
-                                }
-                            ],
-                        }
-                    else:
-                        # OpenAI and others use "type": "image_url"
-                        screen_info_msg = {
-                            "role": "user",
-                            "content": [
-                                {
-                                    "type": "image_url",
-                                    "image_url": {"url": f"data:image/png;base64,{image}"},
-                                }
-                            ],
-                        }
-                    messages.append(screen_info_msg)
-
-        except Exception as e:
-            logger.error(f"Error processing screen info: {str(e)}")
-            raise
-
     def _get_system_prompt(self) -> str:
         """Get the system prompt for the model."""
         return SYSTEM_PROMPT
 
-
-
-
-        Args:
-            content: Dictionary containing the action details
-            parsed_screen: Current parsed screen information
-        """
-        try:
-            action = content.get("Action", "").lower()
-            if not action:
-                return
-
-            # Track if we saved an action-specific screenshot
-            action_screenshot_saved = False
-
-            try:
-                # Prepare kwargs based on action type
-                kwargs = {}
-
-                if action in ["left_click", "right_click", "double_click", "move_cursor"]:
-                    try:
-                        box_id = int(content["Box ID"])
-                        logger.info(f"Processing Box ID: {box_id}")
-
-                        # Calculate click coordinates
-                        x, y = await self._calculate_click_coordinates(box_id, parsed_screen)
-                        logger.info(f"Calculated coordinates: x={x}, y={y}")
-
-                        kwargs["x"] = x
-                        kwargs["y"] = y
-
-                        # Visualize action if screenshot is available
-                        if parsed_screen.annotated_image_base64:
-                            img_data = parsed_screen.annotated_image_base64
-                            # Remove data URL prefix if present
-                            if img_data.startswith("data:image"):
-                                img_data = img_data.split(",")[1]
-                            # Only save visualization for coordinate-based actions
-                            self._visualize_action(x, y, img_data)
-                            action_screenshot_saved = True
-
-                    except ValueError as e:
-                        logger.error(f"Error processing Box ID: {str(e)}")
-                        return
-
-                elif action == "drag_to":
-                    try:
-                        box_id = int(content["Box ID"])
-                        x, y = await self._calculate_click_coordinates(box_id, parsed_screen)
-                        kwargs.update(
-                            {
-                                "x": x,
-                                "y": y,
-                                "button": content.get("button", "left"),
-                                "duration": float(content.get("duration", 0.5)),
-                            }
-                        )
-
-                        # Visualize drag destination if screenshot is available
-                        if parsed_screen.annotated_image_base64:
-                            img_data = parsed_screen.annotated_image_base64
-                            # Remove data URL prefix if present
-                            if img_data.startswith("data:image"):
-                                img_data = img_data.split(",")[1]
-                            # Only save visualization for coordinate-based actions
-                            self._visualize_action(x, y, img_data)
-                            action_screenshot_saved = True
-
-                    except ValueError as e:
-                        logger.error(f"Error processing drag coordinates: {str(e)}")
-                        return
-
-                elif action == "type_text":
-                    kwargs["text"] = content["Value"]
-                    # For type_text, store the value in the action type
-                    action_type = f"type_{content['Value'][:20]}"  # Truncate if too long
-                elif action == "press_key":
-                    kwargs["key"] = content["Value"]
-                    action_type = f"press_{content['Value']}"
-                elif action == "hotkey":
-                    if isinstance(content.get("Value"), list):
-                        keys = content["Value"]
-                        action_type = f"hotkey_{'_'.join(keys)}"
-                    else:
-                        # Simply split string format like "command+space" into a list
-                        keys = [k.strip() for k in content["Value"].lower().split("+")]
-                        action_type = f"hotkey_{content['Value'].replace('+', '_')}"
-                    logger.info(f"Preparing hotkey with keys: {keys}")
-                    # Get the method but call it with *args instead of **kwargs
-                    method = getattr(self.computer.interface, action)
-                    await method(*keys)  # Unpack the keys list as positional arguments
-                    logger.info(f"Tool execution completed successfully: {action}")
-
-                    # For hotkeys, take a screenshot after the action
-                    try:
-                        # Get a new screenshot after the action and save it with the action type
-                        new_parsed_screen = await self._get_parsed_screen_som(save_screenshot=False)
-                        if new_parsed_screen and new_parsed_screen.annotated_image_base64:
-                            img_data = new_parsed_screen.annotated_image_base64
-                            # Remove data URL prefix if present
-                            if img_data.startswith("data:image"):
-                                img_data = img_data.split(",")[1]
-                            # Save with action type to indicate this is a post-action screenshot
-                            self._save_screenshot(img_data, action_type=action_type)
-                            action_screenshot_saved = True
-                    except Exception as screenshot_error:
-                        logger.error(
-                            f"Error taking post-hotkey screenshot: {str(screenshot_error)}"
-                        )
-
-                    return
-
-                elif action in ["scroll_down", "scroll_up"]:
-                    clicks = int(content.get("amount", 1))
-                    kwargs["clicks"] = clicks
-                    action_type = f"scroll_{action.split('_')[1]}_{clicks}"
+    ###########################################
+    # MAIN LOOP - IMPLEMENTING ABSTRACT METHOD
+    ###########################################
 
-
-                    if parsed_screen.annotated_image_base64:
-                        img_data = parsed_screen.annotated_image_base64
-                        # Remove data URL prefix if present
-                        if img_data.startswith("data:image"):
-                            img_data = img_data.split(",")[1]
-                        direction = "down" if action == "scroll_down" else "up"
-                        # For scrolling, we only save the visualization to avoid duplicate images
-                        self._visualize_scroll(direction, clicks, img_data)
-                        action_screenshot_saved = True
-
-                else:
-                    logger.warning(f"Unknown action: {action}")
-                    return
-
-                # Execute tool and handle result
-                try:
-                    method = getattr(self.computer.interface, action)
-                    logger.info(f"Found method for action '{action}': {method}")
-                    await method(**kwargs)
-                    logger.info(f"Tool execution completed successfully: {action}")
-
-                    # For non-coordinate based actions that don't already have visualizations,
-                    # take a new screenshot after the action
-                    if not action_screenshot_saved:
-                        # Take a new screenshot
-                        try:
-                            # Get a new screenshot after the action and save it with the action type
-                            new_parsed_screen = await self._get_parsed_screen_som(
-                                save_screenshot=False
-                            )
-                            if new_parsed_screen and new_parsed_screen.annotated_image_base64:
-                                img_data = new_parsed_screen.annotated_image_base64
-                                # Remove data URL prefix if present
-                                if img_data.startswith("data:image"):
-                                    img_data = img_data.split(",")[1]
-                                # Save with action type to indicate this is a post-action screenshot
-                                if "action_type" in locals():
-                                    self._save_screenshot(img_data, action_type=action_type)
-                                else:
-                                    self._save_screenshot(img_data, action_type=action)
-                                # Update the action screenshot flag for this turn
-                                action_screenshot_saved = True
-                        except Exception as screenshot_error:
-                            logger.error(
-                                f"Error taking post-action screenshot: {str(screenshot_error)}"
-                            )
-
-                except AttributeError as e:
-                    logger.error(f"Method not found for action '{action}': {str(e)}")
-                    return
-                except Exception as tool_error:
-                    logger.error(f"Tool execution failed: {str(tool_error)}")
-                    return
-
-            except Exception as e:
-                logger.error(f"Error executing action {action}: {str(e)}")
-                return
-
-        except Exception as e:
-            logger.error(f"Error in _execute_action: {str(e)}")
-            return
-
-    async def _calculate_click_coordinates(
-        self, box_id: int, parsed_screen: ParseResult
-    ) -> Tuple[int, int]:
-        """Calculate click coordinates based on box ID.
-
-        Args:
-            box_id: The ID of the box to click
-            parsed_screen: The parsed screen information
-
-        Returns:
-            Tuple of (x, y) coordinates
-
-        Raises:
-            ValueError: If box_id is invalid or missing from parsed screen
-        """
-        # First try to use structured elements data
-        logger.info(f"Elements count: {len(parsed_screen.elements)}")
-
-        # Try to find element with matching ID
-        for element in parsed_screen.elements:
-            if element.id == box_id:
-                logger.info(f"Found element with ID {box_id}: {element}")
-                bbox = element.bbox
-
-                # Get screen dimensions from the metadata if available, or fallback
-                width = parsed_screen.metadata.width if parsed_screen.metadata else 1920
-                height = parsed_screen.metadata.height if parsed_screen.metadata else 1080
-                logger.info(f"Screen dimensions: width={width}, height={height}")
-
-                # Calculate center of the box in pixels
-                center_x = int((bbox.x1 + bbox.x2) / 2 * width)
-                center_y = int((bbox.y1 + bbox.y2) / 2 * height)
-                logger.info(f"Calculated center: ({center_x}, {center_y})")
-
-                # Validate coordinates - if they're (0,0) or unreasonably small,
-                # use a default position in the center of the screen
-                if center_x == 0 and center_y == 0:
-                    logger.warning("Got (0,0) coordinates, using fallback position")
-                    center_x = width // 2
-                    center_y = height // 2
-                    logger.info(f"Using fallback center: ({center_x}, {center_y})")
-
-                return center_x, center_y
-
-        # If we couldn't find the box, use center of screen
-        logger.error(
-            f"Box ID {box_id} not found in structured elements (count={len(parsed_screen.elements)})"
-        )
-
-        # Use center of screen as fallback
-        width = parsed_screen.metadata.width if parsed_screen.metadata else 1920
-        height = parsed_screen.metadata.height if parsed_screen.metadata else 1080
-        logger.warning(f"Using fallback position in center of screen ({width//2}, {height//2})")
-        return width // 2, height // 2
-
-    async def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[Dict[str, Any], None]:
+    async def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[AgentResponse, None]:
         """Run the agent loop with provided messages.
 
         Args:
-            messages: List of
+            messages: List of messages in standard OpenAI format
 
         Yields:
-
+            Agent response format
         """
-        #
-
+        # Initialize the message manager with the provided messages
+        self.message_manager.messages = messages.copy()
+        logger.info(f"Starting OmniLoop run with {len(self.message_manager.messages)} messages")
 
         # Continue running until explicitly told to stop
         running = True
@@ -916,26 +551,66 @@ class OmniLoop(BaseLoop):
                 # Get up-to-date screen information
                 parsed_screen = await self._get_parsed_screen_som()
 
-                # Process screen info and update messages
-
+                # Process screen info and update messages in standard format
+                try:
+                    # Get image from parsed screen
+                    image = parsed_screen.annotated_image_base64 or None
+                    if image:
+                        # Save elements as JSON if we have a turn directory
+                        if self.current_turn_dir and hasattr(parsed_screen, "elements"):
+                            elements_path = os.path.join(self.current_turn_dir, "elements.json")
+                            with open(elements_path, "w") as f:
+                                # Convert elements to dicts for JSON serialization
+                                elements_json = [
+                                    elem.model_dump() for elem in parsed_screen.elements
+                                ]
+                                json.dump(elements_json, f, indent=2)
+                            logger.info(f"Saved elements to {elements_path}")
+
+                        # Remove data URL prefix if present
+                        if "," in image:
+                            image = image.split(",")[1]
+
+                        # Add screenshot to message history using message manager
+                        self.message_manager.add_user_message(
+                            [
+                                {
+                                    "type": "image_url",
+                                    "image_url": {"url": f"data:image/png;base64,{image}"},
+                                }
+                            ]
+                        )
+                        logger.info("Added screenshot to message history")
+                except Exception as e:
+                    logger.error(f"Error processing screen info: {str(e)}")
+                    raise
 
                 # Get system prompt
                 system_prompt = self._get_system_prompt()
 
-                # Make API call with retries
-                response = await self.
+                # Make API call with retries using the APIHandler
+                response = await self.api_handler.make_api_call(
+                    self.message_manager.messages, system_prompt
+                )
 
                 # Handle the response (may execute actions)
                 # Returns: (should_continue, action_screenshot_saved)
                 should_continue, new_screenshot_saved = await self._handle_response(
-                    response,
+                    response, self.message_manager.messages, parsed_screen
                 )
 
                 # Update whether an action screenshot was saved this turn
                 action_screenshot_saved = action_screenshot_saved or new_screenshot_saved
 
+                # Create OpenAI-compatible response format using utility function
+                openai_compatible_response = await to_openai_agent_response_format(
+                    response=response,
+                    messages=self.message_manager.messages,
+                    model=self.model,
+                )
+
                 # Yield the response to the caller
-                yield
+                yield openai_compatible_response
 
                 # Check if we should continue this conversation
                 running = should_continue
@@ -963,3 +638,218 @@ class OmniLoop(BaseLoop):
 
                 # Create a brief delay before retrying
                 await asyncio.sleep(1)
+
+    async def process_model_response(self, response_text: str) -> Optional[Dict[str, Any]]:
+        """Process model response to extract tool calls.
+
+        Args:
+            response_text: Model response text
+
+        Returns:
+            Extracted tool information, or None if no tool call was found
+        """
+        try:
+            # Ensure tools are initialized before use
+            await self._ensure_tools_initialized()
+
+            # Look for tool use in the response
+            if "function_call" in response_text or "tool_use" in response_text:
+                # The extract_tool_call method should be implemented in the OmniAPIHandler
+                # For now, we'll just use a simple approach
+                # This will be replaced with the proper implementation
+                tool_info = None
+                if "function_call" in response_text:
+                    # Extract function call params
+                    try:
+                        # Simple extraction - in real code this would be more robust
+                        import json
+                        import re
+
+                        match = re.search(r'"function_call"\s*:\s*{([^}]+)}', response_text)
+                        if match:
+                            function_text = "{" + match.group(1) + "}"
+                            tool_info = json.loads(function_text)
+                    except Exception as e:
+                        logger.error(f"Error extracting function call: {str(e)}")
+
+                if tool_info:
+                    try:
+                        # Execute the tool
+                        result = await self.tool_manager.execute_tool(
+                            name=tool_info.get("name"), tool_input=tool_info.get("arguments", {})
+                        )
+                        # Handle the result
+                        return {"tool_result": result}
+                    except Exception as e:
+                        error_msg = (
+                            f"Error executing tool '{tool_info.get('name', 'unknown')}': {str(e)}"
+                        )
+                        logger.error(error_msg)
+                        return {"tool_result": ToolResult(error=error_msg)}
+        except Exception as e:
+            logger.error(f"Error processing tool call: {str(e)}")
+
+        return None
+
+    async def process_response_with_tools(
+        self, response_text: str, parsed_screen: Optional[ParseResult] = None
+    ) -> Tuple[bool, str]:
+        """Process model response and execute tools.
+
+        Args:
+            response_text: Model response text
+            parsed_screen: Current parsed screen information (optional)
+
+        Returns:
+            Tuple of (action_taken, observation)
+        """
+        logger.info("Processing response with tools")
+
+        # Process the response to extract tool calls
+        tool_result = await self.process_model_response(response_text)
+
+        if tool_result and "tool_result" in tool_result:
+            # A tool was executed
+            result = tool_result["tool_result"]
+            if result.error:
+                return False, f"ERROR: {result.error}"
+            else:
+                return True, result.output or "Tool executed successfully"
+
+        # No action or tool call found
+        return False, "No action taken - no tool call detected in response"
+
+    ###########################################
+    # UTILITY METHODS
+    ###########################################
+
+    async def _ensure_tools_initialized(self) -> None:
+        """Ensure the tool manager and tools are initialized before use."""
+        if not hasattr(self.tool_manager, "tools") or self.tool_manager.tools is None:
+            logger.info("Tools not initialized. Initializing now...")
+            await self.tool_manager.initialize()
+            logger.info("Tools initialized successfully.")
+
+    async def _execute_action_with_tools(
+        self, action_data: Dict[str, Any], parsed_screen: ParseResult
+    ) -> Tuple[bool, bool]:
+        """Execute an action using the tools-based approach.
+
+        Args:
+            action_data: Dictionary containing action details
+            parsed_screen: Current parsed screen information
+
+        Returns:
+            Tuple of (should_continue, action_screenshot_saved)
+        """
+        action_screenshot_saved = False
+        action_type = None  # Initialize for possible use in post-action screenshot
+
+        try:
+            # Extract the action
+            parsed_action = action_data.get("Action", "").lower()
+
+            # Only process if we have a valid action
+            if not parsed_action or parsed_action == "none":
+                return False, action_screenshot_saved
+
+            # Convert the parsed content to a format suitable for the tools system
+            tool_name = "computer"  # Default to computer tool
+            tool_args = {"action": parsed_action}
+
+            # Add specific arguments based on action type
+            if parsed_action in ["left_click", "right_click", "double_click", "move_cursor"]:
+                # Calculate coordinates from Box ID using parser
+                try:
+                    box_id = int(action_data["Box ID"])
+                    x, y = await self.parser.calculate_click_coordinates(
+                        box_id, cast(ParseResult, parsed_screen)
+                    )
+                    tool_args["x"] = x
+                    tool_args["y"] = y
+
+                    # Visualize action if screenshot is available
+                    if parsed_screen and parsed_screen.annotated_image_base64:
+                        img_data = parsed_screen.annotated_image_base64
+                        # Remove data URL prefix if present
+                        if img_data.startswith("data:image"):
+                            img_data = img_data.split(",")[1]
+                        # Save visualization for coordinate-based actions
+                        self.viz_helper.visualize_action(x, y, img_data)
+                        action_screenshot_saved = True
+
+                except (ValueError, KeyError) as e:
+                    logger.error(f"Error processing Box ID: {str(e)}")
+                    return False, action_screenshot_saved
+
+            elif parsed_action == "type_text":
+                tool_args["text"] = action_data.get("Value", "")
+                # For type_text, store the value in the action type for screenshot naming
+                action_type = f"type_{tool_args['text'][:20]}"  # Truncate if too long
+
+            elif parsed_action == "press_key":
+                tool_args["key"] = action_data.get("Value", "")
+                action_type = f"press_{tool_args['key']}"
+
+            elif parsed_action == "hotkey":
+                value = action_data.get("Value", "")
+                if isinstance(value, list):
+                    tool_args["keys"] = value
+                    action_type = f"hotkey_{'_'.join(value)}"
+                else:
+                    # Split string format like "command+space" into a list
+                    keys = [k.strip() for k in value.lower().split("+")]
+                    tool_args["keys"] = keys
+                    action_type = f"hotkey_{value.replace('+', '_')}"
+
+            elif parsed_action in ["scroll_down", "scroll_up"]:
+                clicks = int(action_data.get("amount", 1))
+                tool_args["amount"] = clicks
+                action_type = f"scroll_{parsed_action.split('_')[1]}_{clicks}"
+
+                # Visualize scrolling if screenshot is available
+                if parsed_screen and parsed_screen.annotated_image_base64:
+                    img_data = parsed_screen.annotated_image_base64
+                    # Remove data URL prefix if present
+                    if img_data.startswith("data:image"):
+                        img_data = img_data.split(",")[1]
+                    direction = "down" if parsed_action == "scroll_down" else "up"
+                    # For scrolling, we save the visualization
+                    self.viz_helper.visualize_scroll(direction, clicks, img_data)
+                    action_screenshot_saved = True
+
+            # Ensure tools are initialized before use
+            await self._ensure_tools_initialized()
+
+            # Execute tool with prepared arguments
+            result = await self.tool_manager.execute_tool(name=tool_name, tool_input=tool_args)
+
+            # Take a new screenshot after the action if we haven't already saved one
+            if not action_screenshot_saved:
+                try:
+                    # Get a new screenshot after the action
+                    new_parsed_screen = await self._get_parsed_screen_som(save_screenshot=False)
+                    if new_parsed_screen and new_parsed_screen.annotated_image_base64:
+                        img_data = new_parsed_screen.annotated_image_base64
+                        # Remove data URL prefix if present
+                        if img_data.startswith("data:image"):
+                            img_data = img_data.split(",")[1]
+                        # Save with action type if defined, otherwise use the action name
+                        if action_type:
+                            self._save_screenshot(img_data, action_type=action_type)
+                        else:
+                            self._save_screenshot(img_data, action_type=parsed_action)
+                        action_screenshot_saved = True
+                except Exception as screenshot_error:
+                    logger.error(f"Error taking post-action screenshot: {str(screenshot_error)}")
+
+            # Continue the loop if the action is not "None"
+            return True, action_screenshot_saved
+
+        except Exception as e:
+            logger.error(f"Error executing action: {str(e)}")
+            # Update the last assistant message with error
+            error_message = [{"type": "text", "text": f"Error executing action: {str(e)}"}]
+            # Replace the last assistant message with the error
+            self.message_manager.add_assistant_message(error_message)
+            return False, action_screenshot_saved