cua-agent 0.1.6__py3-none-any.whl → 0.1.18__py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic; the original page links to more details.
- agent/__init__.py +3 -2
- agent/core/__init__.py +1 -6
- agent/core/{computer_agent.py → agent.py} +31 -76
- agent/core/{loop.py → base.py} +68 -127
- agent/core/factory.py +104 -0
- agent/core/messages.py +279 -125
- agent/core/provider_config.py +15 -0
- agent/core/types.py +45 -0
- agent/core/visualization.py +197 -0
- agent/providers/anthropic/api/client.py +142 -1
- agent/providers/anthropic/api_handler.py +140 -0
- agent/providers/anthropic/callbacks/__init__.py +5 -0
- agent/providers/anthropic/loop.py +207 -221
- agent/providers/anthropic/response_handler.py +226 -0
- agent/providers/anthropic/tools/bash.py +0 -97
- agent/providers/anthropic/utils.py +368 -0
- agent/providers/omni/__init__.py +1 -20
- agent/providers/omni/api_handler.py +42 -0
- agent/providers/omni/clients/anthropic.py +4 -0
- agent/providers/omni/image_utils.py +0 -72
- agent/providers/omni/loop.py +491 -607
- agent/providers/omni/parser.py +58 -4
- agent/providers/omni/tools/__init__.py +25 -7
- agent/providers/omni/tools/base.py +29 -0
- agent/providers/omni/tools/bash.py +43 -38
- agent/providers/omni/tools/computer.py +144 -182
- agent/providers/omni/tools/manager.py +25 -45
- agent/providers/omni/types.py +1 -3
- agent/providers/omni/utils.py +224 -145
- agent/providers/openai/__init__.py +6 -0
- agent/providers/openai/api_handler.py +453 -0
- agent/providers/openai/loop.py +440 -0
- agent/providers/openai/response_handler.py +205 -0
- agent/providers/openai/tools/__init__.py +15 -0
- agent/providers/openai/tools/base.py +79 -0
- agent/providers/openai/tools/computer.py +319 -0
- agent/providers/openai/tools/manager.py +106 -0
- agent/providers/openai/types.py +36 -0
- agent/providers/openai/utils.py +98 -0
- cua_agent-0.1.18.dist-info/METADATA +165 -0
- cua_agent-0.1.18.dist-info/RECORD +73 -0
- agent/README.md +0 -63
- agent/providers/anthropic/messages/manager.py +0 -112
- agent/providers/omni/callbacks.py +0 -78
- agent/providers/omni/clients/groq.py +0 -101
- agent/providers/omni/experiment.py +0 -276
- agent/providers/omni/messages.py +0 -171
- agent/providers/omni/tool_manager.py +0 -91
- agent/providers/omni/visualization.py +0 -130
- agent/types/__init__.py +0 -23
- agent/types/base.py +0 -41
- agent/types/messages.py +0 -36
- cua_agent-0.1.6.dist-info/METADATA +0 -120
- cua_agent-0.1.6.dist-info/RECORD +0 -64
- /agent/{types → core}/tools.py +0 -0
- {cua_agent-0.1.6.dist-info → cua_agent-0.1.18.dist-info}/WHEEL +0 -0
- {cua_agent-0.1.6.dist-info → cua_agent-0.1.18.dist-info}/entry_points.txt +0 -0
agent/providers/omni/loop.py
CHANGED
|
@@ -1,34 +1,28 @@
|
|
|
1
1
|
"""Omni-specific agent loop implementation."""
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
|
-
from typing import Any, Dict, List, Optional, Tuple, AsyncGenerator
|
|
5
|
-
import base64
|
|
6
|
-
from PIL import Image
|
|
7
|
-
from io import BytesIO
|
|
4
|
+
from typing import Any, Dict, List, Optional, Tuple, AsyncGenerator
|
|
8
5
|
import json
|
|
9
6
|
import re
|
|
10
7
|
import os
|
|
11
|
-
from datetime import datetime
|
|
12
8
|
import asyncio
|
|
13
9
|
from httpx import ConnectError, ReadTimeout
|
|
14
|
-
import shutil
|
|
15
|
-
import copy
|
|
16
10
|
from typing import cast
|
|
17
11
|
|
|
18
|
-
from .parser import OmniParser, ParseResult
|
|
19
|
-
from ...core.
|
|
12
|
+
from .parser import OmniParser, ParseResult
|
|
13
|
+
from ...core.base import BaseLoop
|
|
14
|
+
from ...core.visualization import VisualizationHelper
|
|
15
|
+
from ...core.messages import StandardMessageManager, ImageRetentionConfig
|
|
16
|
+
from .utils import to_openai_agent_response_format
|
|
17
|
+
from ...core.types import AgentResponse
|
|
20
18
|
from computer import Computer
|
|
21
19
|
from .types import LLMProvider
|
|
22
|
-
from .clients.base import BaseOmniClient
|
|
23
20
|
from .clients.openai import OpenAIClient
|
|
24
|
-
from .clients.groq import GroqClient
|
|
25
21
|
from .clients.anthropic import AnthropicClient
|
|
26
22
|
from .prompts import SYSTEM_PROMPT
|
|
27
|
-
from .
|
|
28
|
-
from .
|
|
29
|
-
from .
|
|
30
|
-
from ...core.messages import ImageRetentionConfig
|
|
31
|
-
from .messages import OmniMessageManager
|
|
23
|
+
from .api_handler import OmniAPIHandler
|
|
24
|
+
from .tools.manager import ToolManager
|
|
25
|
+
from .tools import ToolResult
|
|
32
26
|
|
|
33
27
|
logging.basicConfig(level=logging.INFO)
|
|
34
28
|
logger = logging.getLogger(__name__)
|
|
@@ -42,7 +36,16 @@ def extract_data(input_string: str, data_type: str) -> str:
|
|
|
42
36
|
|
|
43
37
|
|
|
44
38
|
class OmniLoop(BaseLoop):
|
|
45
|
-
"""Omni-specific implementation of the agent loop.
|
|
39
|
+
"""Omni-specific implementation of the agent loop.
|
|
40
|
+
|
|
41
|
+
This class extends BaseLoop to provide support for multimodal models
|
|
42
|
+
from various providers (OpenAI, Anthropic, etc.) with UI parsing
|
|
43
|
+
and desktop automation capabilities.
|
|
44
|
+
"""
|
|
45
|
+
|
|
46
|
+
###########################################
|
|
47
|
+
# INITIALIZATION AND CONFIGURATION
|
|
48
|
+
###########################################
|
|
46
49
|
|
|
47
50
|
def __init__(
|
|
48
51
|
self,
|
|
@@ -77,8 +80,9 @@ class OmniLoop(BaseLoop):
|
|
|
77
80
|
self.provider = provider
|
|
78
81
|
|
|
79
82
|
# Initialize message manager with image retention config
|
|
80
|
-
|
|
81
|
-
|
|
83
|
+
self.message_manager = StandardMessageManager(
|
|
84
|
+
config=ImageRetentionConfig(num_images_to_keep=only_n_most_recent_images)
|
|
85
|
+
)
|
|
82
86
|
|
|
83
87
|
# Initialize base class (which will set up experiment manager)
|
|
84
88
|
super().__init__(
|
|
@@ -97,87 +101,53 @@ class OmniLoop(BaseLoop):
|
|
|
97
101
|
self.client = None
|
|
98
102
|
self.retry_count = 0
|
|
99
103
|
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
Returns:
|
|
104
|
-
bool: Always returns False as debug image saving has been disabled.
|
|
105
|
-
"""
|
|
106
|
-
# Debug image saving functionality has been removed
|
|
107
|
-
return False
|
|
108
|
-
|
|
109
|
-
def _extract_and_save_images(self, data: Any, prefix: str) -> None:
|
|
110
|
-
"""Extract and save images from API data.
|
|
104
|
+
# Initialize handlers
|
|
105
|
+
self.api_handler = OmniAPIHandler(loop=self)
|
|
106
|
+
self.viz_helper = VisualizationHelper(agent=self)
|
|
111
107
|
|
|
112
|
-
|
|
108
|
+
# Initialize tool manager
|
|
109
|
+
self.tool_manager = ToolManager(computer=computer, provider=provider)
|
|
113
110
|
|
|
114
|
-
|
|
115
|
-
data: Data to extract images from
|
|
116
|
-
prefix: Prefix for the extracted image filenames
|
|
117
|
-
"""
|
|
118
|
-
# Image extraction functionality has been removed
|
|
119
|
-
return
|
|
111
|
+
logger.info("OmniLoop initialized with StandardMessageManager")
|
|
120
112
|
|
|
121
|
-
def
|
|
122
|
-
"""
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
Args:
|
|
127
|
-
image_data: Base64 encoded image data
|
|
128
|
-
filename: Name to use for the saved image
|
|
129
|
-
"""
|
|
130
|
-
# Debug image saving functionality has been removed
|
|
131
|
-
return
|
|
132
|
-
|
|
133
|
-
def _visualize_action(self, x: int, y: int, img_base64: str) -> None:
|
|
134
|
-
"""Visualize an action by drawing on the screenshot."""
|
|
135
|
-
if (
|
|
136
|
-
not self.save_trajectory
|
|
137
|
-
or not hasattr(self, "experiment_manager")
|
|
138
|
-
or not self.experiment_manager
|
|
139
|
-
):
|
|
140
|
-
return
|
|
113
|
+
async def initialize(self) -> None:
|
|
114
|
+
"""Initialize the loop by setting up tools and clients."""
|
|
115
|
+
# Initialize base class
|
|
116
|
+
await super().initialize()
|
|
141
117
|
|
|
118
|
+
# Initialize tool manager with error handling
|
|
142
119
|
try:
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
# Save the visualization
|
|
147
|
-
self.experiment_manager.save_action_visualization(img, "click", f"x{x}_y{y}")
|
|
120
|
+
logger.info("Initializing tool manager...")
|
|
121
|
+
await self.tool_manager.initialize()
|
|
122
|
+
logger.info("Tool manager initialized successfully.")
|
|
148
123
|
except Exception as e:
|
|
149
|
-
logger.error(f"Error
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
if
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
):
|
|
158
|
-
return
|
|
159
|
-
|
|
160
|
-
try:
|
|
161
|
-
# Use the visualization utility
|
|
162
|
-
img = visualize_scroll(direction, clicks, img_base64)
|
|
163
|
-
|
|
164
|
-
# Save the visualization
|
|
165
|
-
self.experiment_manager.save_action_visualization(
|
|
166
|
-
img, "scroll", f"{direction}_{clicks}"
|
|
124
|
+
logger.error(f"Error initializing tool manager: {str(e)}")
|
|
125
|
+
logger.warning("Will attempt to initialize tools on first use.")
|
|
126
|
+
|
|
127
|
+
# Initialize API clients based on provider
|
|
128
|
+
if self.provider == LLMProvider.ANTHROPIC:
|
|
129
|
+
self.client = AnthropicClient(
|
|
130
|
+
api_key=self.api_key,
|
|
131
|
+
model=self.model,
|
|
167
132
|
)
|
|
168
|
-
|
|
169
|
-
|
|
133
|
+
elif self.provider == LLMProvider.OPENAI:
|
|
134
|
+
self.client = OpenAIClient(
|
|
135
|
+
api_key=self.api_key,
|
|
136
|
+
model=self.model,
|
|
137
|
+
)
|
|
138
|
+
else:
|
|
139
|
+
raise ValueError(f"Unsupported provider: {self.provider}")
|
|
170
140
|
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
"""Save a visualization of an action."""
|
|
175
|
-
if hasattr(self, "experiment_manager") and self.experiment_manager:
|
|
176
|
-
return self.experiment_manager.save_action_visualization(img, action_name, details)
|
|
177
|
-
return ""
|
|
141
|
+
###########################################
|
|
142
|
+
# CLIENT INITIALIZATION - IMPLEMENTING ABSTRACT METHOD
|
|
143
|
+
###########################################
|
|
178
144
|
|
|
179
145
|
async def initialize_client(self) -> None:
|
|
180
|
-
"""Initialize the appropriate client based on provider.
|
|
146
|
+
"""Initialize the appropriate client based on provider.
|
|
147
|
+
|
|
148
|
+
Implements abstract method from BaseLoop to set up the specific
|
|
149
|
+
provider client (OpenAI, Anthropic, etc.).
|
|
150
|
+
"""
|
|
181
151
|
try:
|
|
182
152
|
logger.info(f"Initializing {self.provider} client with model {self.model}...")
|
|
183
153
|
|
|
@@ -199,6 +169,10 @@ class OmniLoop(BaseLoop):
|
|
|
199
169
|
self.client = None
|
|
200
170
|
raise RuntimeError(f"Failed to initialize client: {str(e)}")
|
|
201
171
|
|
|
172
|
+
###########################################
|
|
173
|
+
# API CALL HANDLING
|
|
174
|
+
###########################################
|
|
175
|
+
|
|
202
176
|
async def _make_api_call(self, messages: List[Dict[str, Any]], system_prompt: str) -> Any:
|
|
203
177
|
"""Make API call to provider with retry logic."""
|
|
204
178
|
# Create new turn directory for this API call
|
|
@@ -218,68 +192,73 @@ class OmniLoop(BaseLoop):
|
|
|
218
192
|
if self.client is None:
|
|
219
193
|
raise RuntimeError("Failed to initialize client")
|
|
220
194
|
|
|
221
|
-
#
|
|
222
|
-
|
|
223
|
-
self.message_manager.
|
|
224
|
-
|
|
225
|
-
# Apply image retention and prepare messages
|
|
226
|
-
# This will limit the number of images based on only_n_most_recent_images
|
|
227
|
-
prepared_messages = self.message_manager.get_formatted_messages(provider_name)
|
|
195
|
+
# Get messages in standard format from the message manager
|
|
196
|
+
self.message_manager.messages = messages.copy()
|
|
197
|
+
prepared_messages = self.message_manager.get_messages()
|
|
228
198
|
|
|
229
|
-
#
|
|
199
|
+
# Special handling for Anthropic
|
|
230
200
|
if self.provider == LLMProvider.ANTHROPIC:
|
|
201
|
+
# Convert to Anthropic format
|
|
202
|
+
anthropic_messages, anthropic_system = self.message_manager.to_anthropic_format(
|
|
203
|
+
prepared_messages
|
|
204
|
+
)
|
|
205
|
+
|
|
206
|
+
# Filter out any empty/invalid messages
|
|
231
207
|
filtered_messages = [
|
|
232
|
-
msg
|
|
208
|
+
msg
|
|
209
|
+
for msg in anthropic_messages
|
|
210
|
+
if msg.get("role") in ["user", "assistant"]
|
|
233
211
|
]
|
|
234
|
-
else:
|
|
235
|
-
filtered_messages = prepared_messages
|
|
236
212
|
|
|
237
|
-
|
|
238
|
-
|
|
213
|
+
# Ensure there's at least one message for Anthropic
|
|
214
|
+
if not filtered_messages:
|
|
215
|
+
logger.warning(
|
|
216
|
+
"No valid messages found for Anthropic API call. Adding a default user message."
|
|
217
|
+
)
|
|
218
|
+
filtered_messages = [
|
|
219
|
+
{
|
|
220
|
+
"role": "user",
|
|
221
|
+
"content": [
|
|
222
|
+
{"type": "text", "text": "Please help with this task."}
|
|
223
|
+
],
|
|
224
|
+
}
|
|
225
|
+
]
|
|
239
226
|
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
else:
|
|
243
|
-
request_data["system"] = system_prompt
|
|
227
|
+
# Combine system prompts if needed
|
|
228
|
+
final_system_prompt = anthropic_system or system_prompt
|
|
244
229
|
|
|
245
|
-
|
|
230
|
+
# Log request
|
|
231
|
+
request_data = {
|
|
232
|
+
"messages": filtered_messages,
|
|
233
|
+
"max_tokens": self.max_tokens,
|
|
234
|
+
"system": final_system_prompt,
|
|
235
|
+
}
|
|
246
236
|
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
if is_async:
|
|
256
|
-
# For async implementations (AnthropicClient)
|
|
257
|
-
if self.provider == LLMProvider.ANTHROPIC:
|
|
258
|
-
response = await run_method(
|
|
259
|
-
messages=filtered_messages,
|
|
260
|
-
system=self._get_system_prompt(),
|
|
261
|
-
max_tokens=self.max_tokens,
|
|
262
|
-
)
|
|
263
|
-
else:
|
|
264
|
-
response = await run_method(
|
|
265
|
-
messages=messages,
|
|
266
|
-
system=system_prompt,
|
|
267
|
-
max_tokens=self.max_tokens,
|
|
268
|
-
)
|
|
237
|
+
self._log_api_call("request", request_data)
|
|
238
|
+
|
|
239
|
+
# Make API call
|
|
240
|
+
response = await self.client.run_interleaved(
|
|
241
|
+
messages=filtered_messages,
|
|
242
|
+
system=final_system_prompt,
|
|
243
|
+
max_tokens=self.max_tokens,
|
|
244
|
+
)
|
|
269
245
|
else:
|
|
270
|
-
# For
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
246
|
+
# For OpenAI and others, use standard format directly
|
|
247
|
+
# Log request
|
|
248
|
+
request_data = {
|
|
249
|
+
"messages": prepared_messages,
|
|
250
|
+
"max_tokens": self.max_tokens,
|
|
251
|
+
"system": system_prompt,
|
|
252
|
+
}
|
|
253
|
+
|
|
254
|
+
self._log_api_call("request", request_data)
|
|
255
|
+
|
|
256
|
+
# Make API call
|
|
257
|
+
response = await self.client.run_interleaved(
|
|
258
|
+
messages=prepared_messages,
|
|
259
|
+
system=system_prompt,
|
|
260
|
+
max_tokens=self.max_tokens,
|
|
261
|
+
)
|
|
283
262
|
|
|
284
263
|
# Log success response
|
|
285
264
|
self._log_api_call("response", request_data, response)
|
|
@@ -327,6 +306,10 @@ class OmniLoop(BaseLoop):
|
|
|
327
306
|
logger.error(error_message)
|
|
328
307
|
raise RuntimeError(error_message)
|
|
329
308
|
|
|
309
|
+
###########################################
|
|
310
|
+
# RESPONSE AND ACTION HANDLING
|
|
311
|
+
###########################################
|
|
312
|
+
|
|
330
313
|
async def _handle_response(
|
|
331
314
|
self, response: Any, messages: List[Dict[str, Any]], parsed_screen: ParseResult
|
|
332
315
|
) -> Tuple[bool, bool]:
|
|
@@ -341,194 +324,151 @@ class OmniLoop(BaseLoop):
|
|
|
341
324
|
Tuple of (should_continue, action_screenshot_saved)
|
|
342
325
|
"""
|
|
343
326
|
action_screenshot_saved = False
|
|
327
|
+
|
|
328
|
+
# Helper function to safely add assistant messages using the message manager
|
|
329
|
+
def add_assistant_message(content):
|
|
330
|
+
if isinstance(content, str):
|
|
331
|
+
# Convert string to proper format
|
|
332
|
+
formatted_content = [{"type": "text", "text": content}]
|
|
333
|
+
self.message_manager.add_assistant_message(formatted_content)
|
|
334
|
+
logger.info("Added formatted text assistant message")
|
|
335
|
+
elif isinstance(content, list):
|
|
336
|
+
# Already in proper format
|
|
337
|
+
self.message_manager.add_assistant_message(content)
|
|
338
|
+
logger.info("Added structured assistant message")
|
|
339
|
+
else:
|
|
340
|
+
# Default case - convert to string
|
|
341
|
+
formatted_content = [{"type": "text", "text": str(content)}]
|
|
342
|
+
self.message_manager.add_assistant_message(formatted_content)
|
|
343
|
+
logger.info("Added converted assistant message")
|
|
344
|
+
|
|
344
345
|
try:
|
|
345
|
-
#
|
|
346
|
+
# Step 1: Normalize response to standard format based on provider
|
|
347
|
+
standard_content = []
|
|
348
|
+
raw_text = None
|
|
349
|
+
|
|
350
|
+
# Convert response to standardized content based on provider
|
|
346
351
|
if self.provider == LLMProvider.ANTHROPIC:
|
|
347
352
|
if hasattr(response, "content") and isinstance(response.content, list):
|
|
348
|
-
#
|
|
353
|
+
# Convert Anthropic response to standard format
|
|
349
354
|
for block in response.content:
|
|
350
|
-
if hasattr(block, "type")
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
parsed_content = json.loads(json_str)
|
|
368
|
-
logger.info("Successfully parsed JSON from text")
|
|
369
|
-
else:
|
|
370
|
-
logger.error(f"No JSON found in content: {content}")
|
|
371
|
-
continue
|
|
372
|
-
except json.JSONDecodeError as e:
|
|
373
|
-
logger.error(f"Failed to parse JSON from text: {str(e)}")
|
|
374
|
-
continue
|
|
375
|
-
|
|
376
|
-
# Clean up Box ID format
|
|
377
|
-
if "Box ID" in parsed_content and isinstance(
|
|
378
|
-
parsed_content["Box ID"], str
|
|
379
|
-
):
|
|
380
|
-
parsed_content["Box ID"] = parsed_content["Box ID"].replace(
|
|
381
|
-
"Box #", ""
|
|
382
|
-
)
|
|
383
|
-
|
|
384
|
-
# Add any explanatory text as reasoning if not present
|
|
385
|
-
if "Explanation" not in parsed_content:
|
|
386
|
-
# Extract any text before the JSON as reasoning
|
|
387
|
-
text_before_json = content.split("{")[0].strip()
|
|
388
|
-
if text_before_json:
|
|
389
|
-
parsed_content["Explanation"] = text_before_json
|
|
390
|
-
|
|
391
|
-
# Log the parsed content for debugging
|
|
392
|
-
logger.info(f"Parsed content: {json.dumps(parsed_content, indent=2)}")
|
|
393
|
-
|
|
394
|
-
# Add response to messages
|
|
395
|
-
messages.append(
|
|
396
|
-
{"role": "assistant", "content": json.dumps(parsed_content)}
|
|
397
|
-
)
|
|
398
|
-
|
|
399
|
-
try:
|
|
400
|
-
# Execute action with current parsed screen info
|
|
401
|
-
await self._execute_action(
|
|
402
|
-
parsed_content, cast(ParseResult, parsed_screen)
|
|
403
|
-
)
|
|
404
|
-
action_screenshot_saved = True
|
|
405
|
-
except Exception as e:
|
|
406
|
-
logger.error(f"Error executing action: {str(e)}")
|
|
407
|
-
# Add error message to conversation
|
|
408
|
-
messages.append(
|
|
409
|
-
{
|
|
410
|
-
"role": "assistant",
|
|
411
|
-
"content": f"Error executing action: {str(e)}",
|
|
412
|
-
"metadata": {"title": "❌ Error"},
|
|
413
|
-
}
|
|
414
|
-
)
|
|
415
|
-
return False, action_screenshot_saved
|
|
416
|
-
|
|
417
|
-
# Check if task is complete
|
|
418
|
-
if parsed_content.get("Action") == "None":
|
|
419
|
-
return False, action_screenshot_saved
|
|
420
|
-
return True, action_screenshot_saved
|
|
421
|
-
|
|
422
|
-
logger.warning("No text block found in Anthropic response")
|
|
355
|
+
if hasattr(block, "type"):
|
|
356
|
+
if block.type == "text":
|
|
357
|
+
standard_content.append({"type": "text", "text": block.text})
|
|
358
|
+
# Store raw text for JSON parsing
|
|
359
|
+
if raw_text is None:
|
|
360
|
+
raw_text = block.text
|
|
361
|
+
else:
|
|
362
|
+
raw_text += "\n" + block.text
|
|
363
|
+
else:
|
|
364
|
+
# Add other block types
|
|
365
|
+
block_dict = {}
|
|
366
|
+
for key, value in vars(block).items():
|
|
367
|
+
if not key.startswith("_"):
|
|
368
|
+
block_dict[key] = value
|
|
369
|
+
standard_content.append(block_dict)
|
|
370
|
+
else:
|
|
371
|
+
logger.warning("Invalid Anthropic response format")
|
|
423
372
|
return True, action_screenshot_saved
|
|
424
|
-
|
|
425
|
-
# Handle other providers' response formats
|
|
426
|
-
if isinstance(response, dict) and "choices" in response:
|
|
427
|
-
content = response["choices"][0]["message"]["content"]
|
|
428
373
|
else:
|
|
429
|
-
|
|
374
|
+
# Assume OpenAI or compatible format
|
|
375
|
+
try:
|
|
376
|
+
raw_text = response["choices"][0]["message"]["content"]
|
|
377
|
+
standard_content = [{"type": "text", "text": raw_text}]
|
|
378
|
+
except (KeyError, TypeError, IndexError) as e:
|
|
379
|
+
logger.error(f"Invalid response format: {str(e)}")
|
|
380
|
+
return True, action_screenshot_saved
|
|
430
381
|
|
|
431
|
-
#
|
|
432
|
-
|
|
382
|
+
# Step 2: Add the normalized response to message history
|
|
383
|
+
add_assistant_message(standard_content)
|
|
384
|
+
|
|
385
|
+
# Step 3: Extract JSON from the content for action execution
|
|
386
|
+
parsed_content = None
|
|
387
|
+
|
|
388
|
+
# If we have raw text, try to extract JSON from it
|
|
389
|
+
if raw_text:
|
|
390
|
+
# Try different approaches to extract JSON
|
|
433
391
|
try:
|
|
434
392
|
# First try to parse the whole content as JSON
|
|
435
|
-
parsed_content = json.loads(
|
|
393
|
+
parsed_content = json.loads(raw_text)
|
|
394
|
+
logger.info("Successfully parsed whole content as JSON")
|
|
436
395
|
except json.JSONDecodeError:
|
|
437
396
|
try:
|
|
438
397
|
# Try to find JSON block
|
|
439
|
-
json_content = extract_data(
|
|
398
|
+
json_content = extract_data(raw_text, "json")
|
|
440
399
|
parsed_content = json.loads(json_content)
|
|
400
|
+
logger.info("Successfully parsed JSON from code block")
|
|
441
401
|
except (json.JSONDecodeError, IndexError):
|
|
442
402
|
try:
|
|
443
403
|
# Look for JSON object pattern
|
|
444
404
|
json_pattern = r"\{[^}]+\}"
|
|
445
|
-
json_match = re.search(json_pattern,
|
|
405
|
+
json_match = re.search(json_pattern, raw_text)
|
|
446
406
|
if json_match:
|
|
447
407
|
json_str = json_match.group(0)
|
|
448
408
|
parsed_content = json.loads(json_str)
|
|
409
|
+
logger.info("Successfully parsed JSON from text")
|
|
449
410
|
else:
|
|
450
|
-
logger.error(f"No JSON found in content
|
|
411
|
+
logger.error(f"No JSON found in content")
|
|
451
412
|
return True, action_screenshot_saved
|
|
452
413
|
except json.JSONDecodeError as e:
|
|
453
414
|
logger.error(f"Failed to parse JSON from text: {str(e)}")
|
|
454
415
|
return True, action_screenshot_saved
|
|
455
416
|
|
|
417
|
+
# Step 4: Process the parsed content if available
|
|
418
|
+
if parsed_content:
|
|
456
419
|
# Clean up Box ID format
|
|
457
420
|
if "Box ID" in parsed_content and isinstance(parsed_content["Box ID"], str):
|
|
458
421
|
parsed_content["Box ID"] = parsed_content["Box ID"].replace("Box #", "")
|
|
459
422
|
|
|
460
423
|
# Add any explanatory text as reasoning if not present
|
|
461
|
-
if "Explanation" not in parsed_content:
|
|
424
|
+
if "Explanation" not in parsed_content and raw_text:
|
|
462
425
|
# Extract any text before the JSON as reasoning
|
|
463
|
-
text_before_json =
|
|
426
|
+
text_before_json = raw_text.split("{")[0].strip()
|
|
464
427
|
if text_before_json:
|
|
465
428
|
parsed_content["Explanation"] = text_before_json
|
|
466
429
|
|
|
467
|
-
#
|
|
468
|
-
|
|
430
|
+
# Log the parsed content for debugging
|
|
431
|
+
logger.info(f"Parsed content: {json.dumps(parsed_content, indent=2)}")
|
|
469
432
|
|
|
433
|
+
# Step 5: Execute the action
|
|
470
434
|
try:
|
|
471
|
-
# Execute action
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
# Add error message to conversation
|
|
477
|
-
messages.append(
|
|
478
|
-
{
|
|
479
|
-
"role": "assistant",
|
|
480
|
-
"content": f"Error executing action: {str(e)}",
|
|
481
|
-
"metadata": {"title": "❌ Error"},
|
|
482
|
-
}
|
|
435
|
+
# Execute action using the common helper method
|
|
436
|
+
should_continue, action_screenshot_saved = (
|
|
437
|
+
await self._execute_action_with_tools(
|
|
438
|
+
parsed_content, cast(ParseResult, parsed_screen)
|
|
439
|
+
)
|
|
483
440
|
)
|
|
484
|
-
return False, action_screenshot_saved
|
|
485
|
-
|
|
486
|
-
# Check if task is complete
|
|
487
|
-
if parsed_content.get("Action") == "None":
|
|
488
|
-
return False, action_screenshot_saved
|
|
489
|
-
|
|
490
|
-
return True, action_screenshot_saved
|
|
491
|
-
elif isinstance(content, dict):
|
|
492
|
-
# Handle case where content is already a dictionary
|
|
493
|
-
messages.append({"role": "assistant", "content": json.dumps(content)})
|
|
494
441
|
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
442
|
+
# Check if task is complete
|
|
443
|
+
if parsed_content.get("Action") == "None":
|
|
444
|
+
return False, action_screenshot_saved
|
|
445
|
+
return should_continue, action_screenshot_saved
|
|
499
446
|
except Exception as e:
|
|
500
447
|
logger.error(f"Error executing action: {str(e)}")
|
|
501
|
-
#
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
"content": f"Error executing action: {str(e)}",
|
|
506
|
-
"metadata": {"title": "❌ Error"},
|
|
507
|
-
}
|
|
508
|
-
)
|
|
448
|
+
# Update the last assistant message with error
|
|
449
|
+
error_message = [{"type": "text", "text": f"Error executing action: {str(e)}"}]
|
|
450
|
+
# Replace the last assistant message with the error
|
|
451
|
+
self.message_manager.add_assistant_message(error_message)
|
|
509
452
|
return False, action_screenshot_saved
|
|
510
453
|
|
|
511
|
-
# Check if task is complete
|
|
512
|
-
if content.get("Action") == "None":
|
|
513
|
-
return False, action_screenshot_saved
|
|
514
|
-
|
|
515
|
-
return True, action_screenshot_saved
|
|
516
|
-
|
|
517
454
|
return True, action_screenshot_saved
|
|
518
455
|
|
|
519
456
|
except Exception as e:
|
|
520
457
|
logger.error(f"Error handling response: {str(e)}")
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
"content": f"Error: {str(e)}",
|
|
525
|
-
"metadata": {"title": "❌ Error"},
|
|
526
|
-
}
|
|
527
|
-
)
|
|
458
|
+
# Add error message using the message manager
|
|
459
|
+
error_message = [{"type": "text", "text": f"Error: {str(e)}"}]
|
|
460
|
+
self.message_manager.add_assistant_message(error_message)
|
|
528
461
|
raise
|
|
529
462
|
|
|
463
|
+
###########################################
|
|
464
|
+
# SCREEN PARSING - IMPLEMENTING ABSTRACT METHOD
|
|
465
|
+
###########################################
|
|
466
|
+
|
|
530
467
|
async def _get_parsed_screen_som(self, save_screenshot: bool = True) -> ParseResult:
|
|
531
|
-
"""Get parsed screen information with
|
|
468
|
+
"""Get parsed screen information with Screen Object Model.
|
|
469
|
+
|
|
470
|
+
Extends the base class method to use the OmniParser to parse the screen
|
|
471
|
+
and extract UI elements.
|
|
532
472
|
|
|
533
473
|
Args:
|
|
534
474
|
save_screenshot: Whether to save the screenshot (set to False when screenshots will be saved elsewhere)
|
|
@@ -563,337 +503,26 @@ class OmniLoop(BaseLoop):
|
|
|
563
503
|
logger.error(f"Error getting parsed screen: {str(e)}")
|
|
564
504
|
raise
|
|
565
505
|
|
|
566
|
-
async def _process_screen(
|
|
567
|
-
self, parsed_screen: ParseResult, messages: List[Dict[str, Any]]
|
|
568
|
-
) -> None:
|
|
569
|
-
"""Process and add screen info to messages."""
|
|
570
|
-
try:
|
|
571
|
-
# Only add message if we have an image and provider supports it
|
|
572
|
-
if self.provider in [LLMProvider.OPENAI, LLMProvider.ANTHROPIC]:
|
|
573
|
-
image = parsed_screen.annotated_image_base64 or None
|
|
574
|
-
if image:
|
|
575
|
-
# Save screen info to current turn directory
|
|
576
|
-
if self.current_turn_dir:
|
|
577
|
-
# Save elements as JSON
|
|
578
|
-
elements_path = os.path.join(self.current_turn_dir, "elements.json")
|
|
579
|
-
with open(elements_path, "w") as f:
|
|
580
|
-
# Convert elements to dicts for JSON serialization
|
|
581
|
-
elements_json = [elem.model_dump() for elem in parsed_screen.elements]
|
|
582
|
-
json.dump(elements_json, f, indent=2)
|
|
583
|
-
logger.info(f"Saved elements to {elements_path}")
|
|
584
|
-
|
|
585
|
-
# Format the image content based on the provider
|
|
586
|
-
if self.provider == LLMProvider.ANTHROPIC:
|
|
587
|
-
# Compress the image before sending to Anthropic (5MB limit)
|
|
588
|
-
image_size = len(image)
|
|
589
|
-
logger.info(f"Image base64 is present, length: {image_size}")
|
|
590
|
-
|
|
591
|
-
# Anthropic has a 5MB limit - check against base64 string length
|
|
592
|
-
# which is what matters for the API call payload
|
|
593
|
-
# Use slightly smaller limit (4.9MB) to account for request overhead
|
|
594
|
-
max_size = int(4.9 * 1024 * 1024) # 4.9MB
|
|
595
|
-
|
|
596
|
-
# Default media type (will be overridden if compression is needed)
|
|
597
|
-
media_type = "image/png"
|
|
598
|
-
|
|
599
|
-
# Check if the image already has a media type prefix
|
|
600
|
-
if image.startswith("data:"):
|
|
601
|
-
parts = image.split(",", 1)
|
|
602
|
-
if len(parts) == 2 and "image/jpeg" in parts[0].lower():
|
|
603
|
-
media_type = "image/jpeg"
|
|
604
|
-
elif len(parts) == 2 and "image/png" in parts[0].lower():
|
|
605
|
-
media_type = "image/png"
|
|
606
|
-
|
|
607
|
-
if image_size > max_size:
|
|
608
|
-
logger.info(
|
|
609
|
-
f"Image size ({image_size} bytes) exceeds Anthropic limit ({max_size} bytes), compressing..."
|
|
610
|
-
)
|
|
611
|
-
image, media_type = compress_image_base64(image, max_size)
|
|
612
|
-
logger.info(
|
|
613
|
-
f"Image compressed to {len(image)} bytes with media_type {media_type}"
|
|
614
|
-
)
|
|
615
|
-
|
|
616
|
-
# Anthropic uses "type": "image"
|
|
617
|
-
screen_info_msg = {
|
|
618
|
-
"role": "user",
|
|
619
|
-
"content": [
|
|
620
|
-
{
|
|
621
|
-
"type": "image",
|
|
622
|
-
"source": {
|
|
623
|
-
"type": "base64",
|
|
624
|
-
"media_type": media_type,
|
|
625
|
-
"data": image,
|
|
626
|
-
},
|
|
627
|
-
}
|
|
628
|
-
],
|
|
629
|
-
}
|
|
630
|
-
else:
|
|
631
|
-
# OpenAI and others use "type": "image_url"
|
|
632
|
-
screen_info_msg = {
|
|
633
|
-
"role": "user",
|
|
634
|
-
"content": [
|
|
635
|
-
{
|
|
636
|
-
"type": "image_url",
|
|
637
|
-
"image_url": {"url": f"data:image/png;base64,{image}"},
|
|
638
|
-
}
|
|
639
|
-
],
|
|
640
|
-
}
|
|
641
|
-
messages.append(screen_info_msg)
|
|
642
|
-
|
|
643
|
-
except Exception as e:
|
|
644
|
-
logger.error(f"Error processing screen info: {str(e)}")
|
|
645
|
-
raise
|
|
646
|
-
|
|
647
506
|
def _get_system_prompt(self) -> str:
    """Return the module-level ``SYSTEM_PROMPT`` used as the model's system prompt."""
    return SYSTEM_PROMPT
|
|
650
509
|
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
Args:
|
|
655
|
-
content: Dictionary containing the action details
|
|
656
|
-
parsed_screen: Current parsed screen information
|
|
657
|
-
"""
|
|
658
|
-
try:
|
|
659
|
-
action = content.get("Action", "").lower()
|
|
660
|
-
if not action:
|
|
661
|
-
return
|
|
662
|
-
|
|
663
|
-
# Track if we saved an action-specific screenshot
|
|
664
|
-
action_screenshot_saved = False
|
|
665
|
-
|
|
666
|
-
try:
|
|
667
|
-
# Prepare kwargs based on action type
|
|
668
|
-
kwargs = {}
|
|
669
|
-
|
|
670
|
-
if action in ["left_click", "right_click", "double_click", "move_cursor"]:
|
|
671
|
-
try:
|
|
672
|
-
box_id = int(content["Box ID"])
|
|
673
|
-
logger.info(f"Processing Box ID: {box_id}")
|
|
674
|
-
|
|
675
|
-
# Calculate click coordinates
|
|
676
|
-
x, y = await self._calculate_click_coordinates(box_id, parsed_screen)
|
|
677
|
-
logger.info(f"Calculated coordinates: x={x}, y={y}")
|
|
678
|
-
|
|
679
|
-
kwargs["x"] = x
|
|
680
|
-
kwargs["y"] = y
|
|
681
|
-
|
|
682
|
-
# Visualize action if screenshot is available
|
|
683
|
-
if parsed_screen.annotated_image_base64:
|
|
684
|
-
img_data = parsed_screen.annotated_image_base64
|
|
685
|
-
# Remove data URL prefix if present
|
|
686
|
-
if img_data.startswith("data:image"):
|
|
687
|
-
img_data = img_data.split(",")[1]
|
|
688
|
-
# Only save visualization for coordinate-based actions
|
|
689
|
-
self._visualize_action(x, y, img_data)
|
|
690
|
-
action_screenshot_saved = True
|
|
691
|
-
|
|
692
|
-
except ValueError as e:
|
|
693
|
-
logger.error(f"Error processing Box ID: {str(e)}")
|
|
694
|
-
return
|
|
695
|
-
|
|
696
|
-
elif action == "drag_to":
|
|
697
|
-
try:
|
|
698
|
-
box_id = int(content["Box ID"])
|
|
699
|
-
x, y = await self._calculate_click_coordinates(box_id, parsed_screen)
|
|
700
|
-
kwargs.update(
|
|
701
|
-
{
|
|
702
|
-
"x": x,
|
|
703
|
-
"y": y,
|
|
704
|
-
"button": content.get("button", "left"),
|
|
705
|
-
"duration": float(content.get("duration", 0.5)),
|
|
706
|
-
}
|
|
707
|
-
)
|
|
708
|
-
|
|
709
|
-
# Visualize drag destination if screenshot is available
|
|
710
|
-
if parsed_screen.annotated_image_base64:
|
|
711
|
-
img_data = parsed_screen.annotated_image_base64
|
|
712
|
-
# Remove data URL prefix if present
|
|
713
|
-
if img_data.startswith("data:image"):
|
|
714
|
-
img_data = img_data.split(",")[1]
|
|
715
|
-
# Only save visualization for coordinate-based actions
|
|
716
|
-
self._visualize_action(x, y, img_data)
|
|
717
|
-
action_screenshot_saved = True
|
|
718
|
-
|
|
719
|
-
except ValueError as e:
|
|
720
|
-
logger.error(f"Error processing drag coordinates: {str(e)}")
|
|
721
|
-
return
|
|
722
|
-
|
|
723
|
-
elif action == "type_text":
|
|
724
|
-
kwargs["text"] = content["Value"]
|
|
725
|
-
# For type_text, store the value in the action type
|
|
726
|
-
action_type = f"type_{content['Value'][:20]}" # Truncate if too long
|
|
727
|
-
elif action == "press_key":
|
|
728
|
-
kwargs["key"] = content["Value"]
|
|
729
|
-
action_type = f"press_{content['Value']}"
|
|
730
|
-
elif action == "hotkey":
|
|
731
|
-
if isinstance(content.get("Value"), list):
|
|
732
|
-
keys = content["Value"]
|
|
733
|
-
action_type = f"hotkey_{'_'.join(keys)}"
|
|
734
|
-
else:
|
|
735
|
-
# Simply split string format like "command+space" into a list
|
|
736
|
-
keys = [k.strip() for k in content["Value"].lower().split("+")]
|
|
737
|
-
action_type = f"hotkey_{content['Value'].replace('+', '_')}"
|
|
738
|
-
logger.info(f"Preparing hotkey with keys: {keys}")
|
|
739
|
-
# Get the method but call it with *args instead of **kwargs
|
|
740
|
-
method = getattr(self.computer.interface, action)
|
|
741
|
-
await method(*keys) # Unpack the keys list as positional arguments
|
|
742
|
-
logger.info(f"Tool execution completed successfully: {action}")
|
|
743
|
-
|
|
744
|
-
# For hotkeys, take a screenshot after the action
|
|
745
|
-
try:
|
|
746
|
-
# Get a new screenshot after the action and save it with the action type
|
|
747
|
-
new_parsed_screen = await self._get_parsed_screen_som(save_screenshot=False)
|
|
748
|
-
if new_parsed_screen and new_parsed_screen.annotated_image_base64:
|
|
749
|
-
img_data = new_parsed_screen.annotated_image_base64
|
|
750
|
-
# Remove data URL prefix if present
|
|
751
|
-
if img_data.startswith("data:image"):
|
|
752
|
-
img_data = img_data.split(",")[1]
|
|
753
|
-
# Save with action type to indicate this is a post-action screenshot
|
|
754
|
-
self._save_screenshot(img_data, action_type=action_type)
|
|
755
|
-
action_screenshot_saved = True
|
|
756
|
-
except Exception as screenshot_error:
|
|
757
|
-
logger.error(
|
|
758
|
-
f"Error taking post-hotkey screenshot: {str(screenshot_error)}"
|
|
759
|
-
)
|
|
760
|
-
|
|
761
|
-
return
|
|
762
|
-
|
|
763
|
-
elif action in ["scroll_down", "scroll_up"]:
|
|
764
|
-
clicks = int(content.get("amount", 1))
|
|
765
|
-
kwargs["clicks"] = clicks
|
|
766
|
-
action_type = f"scroll_{action.split('_')[1]}_{clicks}"
|
|
767
|
-
|
|
768
|
-
# Visualize scrolling if screenshot is available
|
|
769
|
-
if parsed_screen.annotated_image_base64:
|
|
770
|
-
img_data = parsed_screen.annotated_image_base64
|
|
771
|
-
# Remove data URL prefix if present
|
|
772
|
-
if img_data.startswith("data:image"):
|
|
773
|
-
img_data = img_data.split(",")[1]
|
|
774
|
-
direction = "down" if action == "scroll_down" else "up"
|
|
775
|
-
# For scrolling, we only save the visualization to avoid duplicate images
|
|
776
|
-
self._visualize_scroll(direction, clicks, img_data)
|
|
777
|
-
action_screenshot_saved = True
|
|
778
|
-
|
|
779
|
-
else:
|
|
780
|
-
logger.warning(f"Unknown action: {action}")
|
|
781
|
-
return
|
|
510
|
+
###########################################
|
|
511
|
+
# MAIN LOOP - IMPLEMENTING ABSTRACT METHOD
|
|
512
|
+
###########################################
|
|
782
513
|
|
|
783
|
-
|
|
784
|
-
try:
|
|
785
|
-
method = getattr(self.computer.interface, action)
|
|
786
|
-
logger.info(f"Found method for action '{action}': {method}")
|
|
787
|
-
await method(**kwargs)
|
|
788
|
-
logger.info(f"Tool execution completed successfully: {action}")
|
|
789
|
-
|
|
790
|
-
# For non-coordinate based actions that don't already have visualizations,
|
|
791
|
-
# take a new screenshot after the action
|
|
792
|
-
if not action_screenshot_saved:
|
|
793
|
-
# Take a new screenshot
|
|
794
|
-
try:
|
|
795
|
-
# Get a new screenshot after the action and save it with the action type
|
|
796
|
-
new_parsed_screen = await self._get_parsed_screen_som(
|
|
797
|
-
save_screenshot=False
|
|
798
|
-
)
|
|
799
|
-
if new_parsed_screen and new_parsed_screen.annotated_image_base64:
|
|
800
|
-
img_data = new_parsed_screen.annotated_image_base64
|
|
801
|
-
# Remove data URL prefix if present
|
|
802
|
-
if img_data.startswith("data:image"):
|
|
803
|
-
img_data = img_data.split(",")[1]
|
|
804
|
-
# Save with action type to indicate this is a post-action screenshot
|
|
805
|
-
if "action_type" in locals():
|
|
806
|
-
self._save_screenshot(img_data, action_type=action_type)
|
|
807
|
-
else:
|
|
808
|
-
self._save_screenshot(img_data, action_type=action)
|
|
809
|
-
# Update the action screenshot flag for this turn
|
|
810
|
-
action_screenshot_saved = True
|
|
811
|
-
except Exception as screenshot_error:
|
|
812
|
-
logger.error(
|
|
813
|
-
f"Error taking post-action screenshot: {str(screenshot_error)}"
|
|
814
|
-
)
|
|
815
|
-
|
|
816
|
-
except AttributeError as e:
|
|
817
|
-
logger.error(f"Method not found for action '{action}': {str(e)}")
|
|
818
|
-
return
|
|
819
|
-
except Exception as tool_error:
|
|
820
|
-
logger.error(f"Tool execution failed: {str(tool_error)}")
|
|
821
|
-
return
|
|
822
|
-
|
|
823
|
-
except Exception as e:
|
|
824
|
-
logger.error(f"Error executing action {action}: {str(e)}")
|
|
825
|
-
return
|
|
826
|
-
|
|
827
|
-
except Exception as e:
|
|
828
|
-
logger.error(f"Error in _execute_action: {str(e)}")
|
|
829
|
-
return
|
|
830
|
-
|
|
831
|
-
async def _calculate_click_coordinates(
    self, box_id: int, parsed_screen: ParseResult
) -> Tuple[int, int]:
    """Calculate click coordinates based on box ID.

    The matching element's bounding box is treated as fractions of the screen
    size and its centre is scaled to pixels. An unknown ``box_id`` does not
    raise: the centre of the screen is returned instead.

    Args:
        box_id: The ID of the box to click
        parsed_screen: The parsed screen information

    Returns:
        Tuple of (x, y) pixel coordinates. Falls back to the centre of the
        screen when the box is not found or its centre resolves to (0, 0).
    """
    elements = parsed_screen.elements
    logger.info(f"Elements count: {len(elements)}")

    # Screen dimensions come from the metadata when present; 1920x1080 otherwise.
    metadata = parsed_screen.metadata
    width = metadata.width if metadata else 1920
    height = metadata.height if metadata else 1080

    # First element whose ID matches, if any
    element = next((candidate for candidate in elements if candidate.id == box_id), None)

    if element is None:
        logger.error(
            f"Box ID {box_id} not found in structured elements (count={len(elements)})"
        )
        # Use center of screen as fallback
        logger.warning(f"Using fallback position in center of screen ({width//2}, {height//2})")
        return width // 2, height // 2

    logger.info(f"Found element with ID {box_id}: {element}")
    bbox = element.bbox
    logger.info(f"Screen dimensions: width={width}, height={height}")

    # Calculate center of the box in pixels
    center_x = int((bbox.x1 + bbox.x2) / 2 * width)
    center_y = int((bbox.y1 + bbox.y2) / 2 * height)
    logger.info(f"Calculated center: ({center_x}, {center_y})")

    # A (0, 0) centre is treated as a bogus bounding box rather than a real
    # target, so the middle of the screen is used instead.
    if center_x == 0 and center_y == 0:
        logger.warning("Got (0,0) coordinates, using fallback position")
        center_x = width // 2
        center_y = height // 2
        logger.info(f"Using fallback center: ({center_x}, {center_y})")

    return center_x, center_y
|
|
885
|
-
|
|
886
|
-
async def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[Dict[str, Any], None]:
|
|
514
|
+
async def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[AgentResponse, None]:
|
|
887
515
|
"""Run the agent loop with provided messages.
|
|
888
516
|
|
|
889
517
|
Args:
|
|
890
|
-
messages: List of
|
|
518
|
+
messages: List of messages in standard OpenAI format
|
|
891
519
|
|
|
892
520
|
Yields:
|
|
893
|
-
|
|
521
|
+
Agent response format
|
|
894
522
|
"""
|
|
895
|
-
#
|
|
896
|
-
|
|
523
|
+
# Initialize the message manager with the provided messages
|
|
524
|
+
self.message_manager.messages = messages.copy()
|
|
525
|
+
logger.info(f"Starting OmniLoop run with {len(self.message_manager.messages)} messages")
|
|
897
526
|
|
|
898
527
|
# Continue running until explicitly told to stop
|
|
899
528
|
running = True
|
|
@@ -922,26 +551,66 @@ class OmniLoop(BaseLoop):
|
|
|
922
551
|
# Get up-to-date screen information
|
|
923
552
|
parsed_screen = await self._get_parsed_screen_som()
|
|
924
553
|
|
|
925
|
-
# Process screen info and update messages
|
|
926
|
-
|
|
554
|
+
# Process screen info and update messages in standard format
|
|
555
|
+
try:
|
|
556
|
+
# Get image from parsed screen
|
|
557
|
+
image = parsed_screen.annotated_image_base64 or None
|
|
558
|
+
if image:
|
|
559
|
+
# Save elements as JSON if we have a turn directory
|
|
560
|
+
if self.current_turn_dir and hasattr(parsed_screen, "elements"):
|
|
561
|
+
elements_path = os.path.join(self.current_turn_dir, "elements.json")
|
|
562
|
+
with open(elements_path, "w") as f:
|
|
563
|
+
# Convert elements to dicts for JSON serialization
|
|
564
|
+
elements_json = [
|
|
565
|
+
elem.model_dump() for elem in parsed_screen.elements
|
|
566
|
+
]
|
|
567
|
+
json.dump(elements_json, f, indent=2)
|
|
568
|
+
logger.info(f"Saved elements to {elements_path}")
|
|
569
|
+
|
|
570
|
+
# Remove data URL prefix if present
|
|
571
|
+
if "," in image:
|
|
572
|
+
image = image.split(",")[1]
|
|
573
|
+
|
|
574
|
+
# Add screenshot to message history using message manager
|
|
575
|
+
self.message_manager.add_user_message(
|
|
576
|
+
[
|
|
577
|
+
{
|
|
578
|
+
"type": "image_url",
|
|
579
|
+
"image_url": {"url": f"data:image/png;base64,{image}"},
|
|
580
|
+
}
|
|
581
|
+
]
|
|
582
|
+
)
|
|
583
|
+
logger.info("Added screenshot to message history")
|
|
584
|
+
except Exception as e:
|
|
585
|
+
logger.error(f"Error processing screen info: {str(e)}")
|
|
586
|
+
raise
|
|
927
587
|
|
|
928
588
|
# Get system prompt
|
|
929
589
|
system_prompt = self._get_system_prompt()
|
|
930
590
|
|
|
931
|
-
# Make API call with retries
|
|
932
|
-
response = await self.
|
|
591
|
+
# Make API call with retries using the APIHandler
|
|
592
|
+
response = await self.api_handler.make_api_call(
|
|
593
|
+
self.message_manager.messages, system_prompt
|
|
594
|
+
)
|
|
933
595
|
|
|
934
596
|
# Handle the response (may execute actions)
|
|
935
597
|
# Returns: (should_continue, action_screenshot_saved)
|
|
936
598
|
should_continue, new_screenshot_saved = await self._handle_response(
|
|
937
|
-
response,
|
|
599
|
+
response, self.message_manager.messages, parsed_screen
|
|
938
600
|
)
|
|
939
601
|
|
|
940
602
|
# Update whether an action screenshot was saved this turn
|
|
941
603
|
action_screenshot_saved = action_screenshot_saved or new_screenshot_saved
|
|
942
604
|
|
|
605
|
+
# Create OpenAI-compatible response format using utility function
|
|
606
|
+
openai_compatible_response = await to_openai_agent_response_format(
|
|
607
|
+
response=response,
|
|
608
|
+
messages=self.message_manager.messages,
|
|
609
|
+
model=self.model,
|
|
610
|
+
)
|
|
611
|
+
|
|
943
612
|
# Yield the response to the caller
|
|
944
|
-
yield
|
|
613
|
+
yield openai_compatible_response
|
|
945
614
|
|
|
946
615
|
# Check if we should continue this conversation
|
|
947
616
|
running = should_continue
|
|
@@ -969,3 +638,218 @@ class OmniLoop(BaseLoop):
|
|
|
969
638
|
|
|
970
639
|
# Create a brief delay before retrying
|
|
971
640
|
await asyncio.sleep(1)
|
|
641
|
+
|
|
642
|
+
async def process_model_response(self, response_text: str) -> Optional[Dict[str, Any]]:
    """Extract a ``function_call`` from the model response and execute it.

    Args:
        response_text: Raw model response text, expected to embed a JSON
            object under a ``"function_call"`` key.

    Returns:
        ``{"tool_result": <result>}`` when a function call was found and
        dispatched (the result is a ``ToolResult`` carrying the error text if
        execution raised), or ``None`` when no usable call was found or
        processing failed.
    """
    # Imported locally, as the original did, so this method does not rely on
    # module-level imports.
    import json
    import re

    try:
        # Ensure tools are initialized before use
        await self._ensure_tools_initialized()

        # Responses that only mention "tool_use" are not extracted here; only
        # "function_call" payloads can lead to a tool invocation.
        if "function_call" not in response_text:
            return None

        tool_info = None
        try:
            # Find where the object starts, then let the JSON decoder consume
            # exactly one value. Unlike a "{[^}]+}" pattern this copes with
            # nested objects and with "}" inside string values.
            marker = re.search(r'"function_call"\s*:\s*(?=\{)', response_text)
            if marker:
                tool_info, _ = json.JSONDecoder().raw_decode(response_text, marker.end())
        except ValueError as e:
            logger.error(f"Error extracting function call: {str(e)}")

        if not tool_info:
            return None

        # OpenAI-style payloads encode the arguments as a JSON string; decode
        # it so the tool receives a mapping. Undecodable text is passed through.
        tool_input = tool_info.get("arguments", {})
        if isinstance(tool_input, str):
            try:
                tool_input = json.loads(tool_input)
            except ValueError:
                logger.warning(
                    "Function call arguments are not valid JSON; passing them through unchanged"
                )

        try:
            # Execute the tool
            result = await self.tool_manager.execute_tool(
                name=tool_info.get("name"), tool_input=tool_input
            )
            return {"tool_result": result}
        except Exception as e:
            error_msg = (
                f"Error executing tool '{tool_info.get('name', 'unknown')}': {str(e)}"
            )
            logger.error(error_msg)
            return {"tool_result": ToolResult(error=error_msg)}
    except Exception as e:
        # Best effort: initialization or unexpected failures are logged and
        # reported to the caller as "no tool call".
        logger.error(f"Error processing tool call: {str(e)}")

    return None
|
|
693
|
+
|
|
694
|
+
async def process_response_with_tools(
    self, response_text: str, parsed_screen: Optional[ParseResult] = None
) -> Tuple[bool, str]:
    """Run any tool call found in the model response and summarise the outcome.

    Args:
        response_text: Model response text
        parsed_screen: Current parsed screen information (optional; not read
            by this method)

    Returns:
        Tuple of (action_taken, observation). ``action_taken`` is True only
        when a tool ran and reported no error.
    """
    logger.info("Processing response with tools")

    # Delegate extraction and execution of the tool call
    outcome = await self.process_model_response(response_text)

    # Guard: nothing was extracted, so nothing was executed
    if not outcome or "tool_result" not in outcome:
        return False, "No action taken - no tool call detected in response"

    executed = outcome["tool_result"]
    if executed.error:
        return False, f"ERROR: {executed.error}"
    return True, executed.output or "Tool executed successfully"
|
|
721
|
+
|
|
722
|
+
###########################################
|
|
723
|
+
# UTILITY METHODS
|
|
724
|
+
###########################################
|
|
725
|
+
|
|
726
|
+
async def _ensure_tools_initialized(self) -> None:
    """Initialize the tool manager on demand.

    Does nothing when the manager already exposes a non-None ``tools``
    attribute; otherwise awaits ``tool_manager.initialize()``.
    """
    # A missing attribute and an explicit None both mean "not initialized yet"
    if getattr(self.tool_manager, "tools", None) is not None:
        return

    logger.info("Tools not initialized. Initializing now...")
    await self.tool_manager.initialize()
    logger.info("Tools initialized successfully.")
|
|
732
|
+
|
|
733
|
+
async def _execute_action_with_tools(
    self, action_data: Dict[str, Any], parsed_screen: ParseResult
) -> Tuple[bool, bool]:
    """Execute an action using the tools-based approach.

    Translates the parsed model action into arguments for the ``computer``
    tool, saves a visualization or post-action screenshot, and runs the tool.

    Args:
        action_data: Dictionary containing action details. Reads ``"Action"``
            and, depending on the action, ``"Box ID"``, ``"Value"`` or
            ``"amount"``.
        parsed_screen: Current parsed screen information

    Returns:
        Tuple of (should_continue, action_screenshot_saved). ``should_continue``
        is False when there is no action, the Box ID cannot be resolved, or an
        unexpected error occurs.
    """

    def _strip_data_url(encoded: str) -> str:
        # Drop a leading "data:image...," header so only the base64 payload remains.
        if encoded.startswith("data:image"):
            return encoded.split(",")[1]
        return encoded

    action_screenshot_saved = False
    action_type = None  # Optional richer label used when naming the post-action screenshot

    try:
        # Extract the action
        parsed_action = action_data.get("Action", "").lower()

        # Only process if we have a valid action
        if not parsed_action or parsed_action == "none":
            return False, action_screenshot_saved

        # Convert the parsed content to a format suitable for the tools system
        tool_name = "computer"  # Default to computer tool
        tool_args = {"action": parsed_action}

        # Add specific arguments based on action type
        if parsed_action in ["left_click", "right_click", "double_click", "move_cursor"]:
            # Calculate coordinates from Box ID using parser
            try:
                box_id = int(action_data["Box ID"])
                x, y = await self.parser.calculate_click_coordinates(
                    box_id, cast(ParseResult, parsed_screen)
                )
                tool_args["x"] = x
                tool_args["y"] = y

                # Visualize action if screenshot is available
                if parsed_screen and parsed_screen.annotated_image_base64:
                    img_data = _strip_data_url(parsed_screen.annotated_image_base64)
                    # Save visualization for coordinate-based actions
                    self.viz_helper.visualize_action(x, y, img_data)
                    action_screenshot_saved = True

            except (ValueError, KeyError) as e:
                logger.error(f"Error processing Box ID: {str(e)}")
                return False, action_screenshot_saved

        elif parsed_action == "type_text":
            tool_args["text"] = action_data.get("Value", "")
            # For type_text, store the value in the action type for screenshot naming
            action_type = f"type_{tool_args['text'][:20]}"  # Truncate if too long

        elif parsed_action == "press_key":
            tool_args["key"] = action_data.get("Value", "")
            action_type = f"press_{tool_args['key']}"

        elif parsed_action == "hotkey":
            value = action_data.get("Value", "")
            if isinstance(value, list):
                tool_args["keys"] = value
                action_type = f"hotkey_{'_'.join(value)}"
            else:
                # Split string format like "command+space" into a list
                keys = [k.strip() for k in value.lower().split("+")]
                tool_args["keys"] = keys
                action_type = f"hotkey_{value.replace('+', '_')}"

        elif parsed_action in ["scroll_down", "scroll_up"]:
            clicks = int(action_data.get("amount", 1))
            tool_args["amount"] = clicks
            action_type = f"scroll_{parsed_action.split('_')[1]}_{clicks}"

            # Visualize scrolling if screenshot is available
            if parsed_screen and parsed_screen.annotated_image_base64:
                img_data = _strip_data_url(parsed_screen.annotated_image_base64)
                direction = "down" if parsed_action == "scroll_down" else "up"
                # For scrolling, we save the visualization
                self.viz_helper.visualize_scroll(direction, clicks, img_data)
                action_screenshot_saved = True

        # Ensure tools are initialized before use
        await self._ensure_tools_initialized()

        # Execute tool with prepared arguments
        result = await self.tool_manager.execute_tool(name=tool_name, tool_input=tool_args)

        # The tool reports failures through its result rather than by raising,
        # so surface them in the log instead of dropping them silently.
        tool_error = getattr(result, "error", None)
        if tool_error:
            logger.error(
                f"Tool '{tool_name}' reported an error for action '{parsed_action}': {tool_error}"
            )

        # Take a new screenshot after the action if we haven't already saved one
        if not action_screenshot_saved:
            try:
                # Get a new screenshot after the action
                new_parsed_screen = await self._get_parsed_screen_som(save_screenshot=False)
                if new_parsed_screen and new_parsed_screen.annotated_image_base64:
                    img_data = _strip_data_url(new_parsed_screen.annotated_image_base64)
                    # Save with action type if defined, otherwise use the action name
                    if action_type:
                        self._save_screenshot(img_data, action_type=action_type)
                    else:
                        self._save_screenshot(img_data, action_type=parsed_action)
                    action_screenshot_saved = True
            except Exception as screenshot_error:
                # Screenshots are best effort; a failure here must not abort the turn
                logger.error(f"Error taking post-action screenshot: {str(screenshot_error)}")

        # Continue the loop if the action is not "None"
        return True, action_screenshot_saved

    except Exception as e:
        logger.error(f"Error executing action: {str(e)}")
        # Append an assistant message carrying the error so it appears in the history
        error_message = [{"type": "text", "text": f"Error executing action: {str(e)}"}]
        self.message_manager.add_assistant_message(error_message)
        return False, action_screenshot_saved
|