cua-agent 0.1.5__py3-none-any.whl → 0.1.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release of cua-agent has been flagged as potentially problematic.

Files changed (52)
  1. agent/__init__.py +3 -4
  2. agent/core/__init__.py +3 -10
  3. agent/core/computer_agent.py +207 -32
  4. agent/core/experiment.py +20 -3
  5. agent/core/loop.py +78 -120
  6. agent/core/messages.py +279 -125
  7. agent/core/telemetry.py +44 -32
  8. agent/core/types.py +35 -0
  9. agent/core/visualization.py +197 -0
  10. agent/providers/anthropic/api/client.py +142 -1
  11. agent/providers/anthropic/api_handler.py +140 -0
  12. agent/providers/anthropic/callbacks/__init__.py +5 -0
  13. agent/providers/anthropic/loop.py +224 -209
  14. agent/providers/anthropic/messages/manager.py +3 -1
  15. agent/providers/anthropic/response_handler.py +229 -0
  16. agent/providers/anthropic/tools/base.py +1 -1
  17. agent/providers/anthropic/tools/bash.py +0 -97
  18. agent/providers/anthropic/tools/collection.py +2 -2
  19. agent/providers/anthropic/tools/computer.py +34 -24
  20. agent/providers/anthropic/tools/manager.py +2 -2
  21. agent/providers/anthropic/utils.py +370 -0
  22. agent/providers/omni/__init__.py +1 -20
  23. agent/providers/omni/api_handler.py +42 -0
  24. agent/providers/omni/clients/anthropic.py +4 -0
  25. agent/providers/omni/image_utils.py +0 -72
  26. agent/providers/omni/loop.py +497 -607
  27. agent/providers/omni/parser.py +60 -5
  28. agent/providers/omni/tools/__init__.py +25 -8
  29. agent/providers/omni/tools/base.py +29 -0
  30. agent/providers/omni/tools/bash.py +43 -38
  31. agent/providers/omni/tools/computer.py +144 -181
  32. agent/providers/omni/tools/manager.py +26 -48
  33. agent/providers/omni/types.py +0 -4
  34. agent/providers/omni/utils.py +225 -144
  35. {cua_agent-0.1.5.dist-info → cua_agent-0.1.17.dist-info}/METADATA +6 -36
  36. cua_agent-0.1.17.dist-info/RECORD +63 -0
  37. agent/core/agent.py +0 -252
  38. agent/core/base_agent.py +0 -164
  39. agent/core/factory.py +0 -102
  40. agent/providers/omni/callbacks.py +0 -78
  41. agent/providers/omni/clients/groq.py +0 -101
  42. agent/providers/omni/experiment.py +0 -273
  43. agent/providers/omni/messages.py +0 -171
  44. agent/providers/omni/tool_manager.py +0 -91
  45. agent/providers/omni/visualization.py +0 -130
  46. agent/types/__init__.py +0 -26
  47. agent/types/base.py +0 -53
  48. agent/types/messages.py +0 -36
  49. cua_agent-0.1.5.dist-info/RECORD +0 -67
  50. /agent/{types → core}/tools.py +0 -0
  51. {cua_agent-0.1.5.dist-info → cua_agent-0.1.17.dist-info}/WHEEL +0 -0
  52. {cua_agent-0.1.5.dist-info → cua_agent-0.1.17.dist-info}/entry_points.txt +0 -0
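
Most of the diff below reworks agent/providers/omni/loop.py. For orientation: the loop asks the model to reply with a small JSON action object, which the response handler extracts and executes. A representative payload is sketched here; the key names are inferred only from the parsing code visible in the diff (`Action`, `Box ID`, `Value`, `amount`, `Explanation`), and the authoritative schema is defined by SYSTEM_PROMPT in agent/providers/omni/prompts.py:

    # Hypothetical model output consumed by OmniLoop._handle_response (sketch).
    # Key names come from the parsing code in the diff below; the exact schema
    # lives in SYSTEM_PROMPT and is not shown in this diff.
    action = {
        "Action": "left_click",  # or type_text, press_key, hotkey, scroll_up/scroll_down, drag_to; "None" stops the loop
        "Box ID": "42",          # UI element id from the parsed screen; "Box #42" is normalized to "42"
        "Value": None,           # text for type_text, key(s) for press_key/hotkey
        "amount": 1,             # scroll clicks for scroll_up/scroll_down
        "Explanation": "Click the Submit button",  # reasoning; backfilled from any text before the JSON
    }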
@@ -1,33 +1,28 @@
  """Omni-specific agent loop implementation."""

  import logging
- from typing import Any, Dict, List, Optional, Tuple, AsyncGenerator, Union
- import base64
- from PIL import Image
- from io import BytesIO
+ from typing import Any, Dict, List, Optional, Tuple, AsyncGenerator
  import json
  import re
  import os
- from datetime import datetime
  import asyncio
  from httpx import ConnectError, ReadTimeout
- import shutil
- import copy
+ from typing import cast

- from .parser import OmniParser, ParseResult, ParserMetadata, UIElement
+ from .parser import OmniParser, ParseResult
  from ...core.loop import BaseLoop
+ from ...core.visualization import VisualizationHelper
+ from ...core.messages import StandardMessageManager, ImageRetentionConfig
+ from .utils import to_openai_agent_response_format
+ from ...core.types import AgentResponse
  from computer import Computer
  from .types import LLMProvider
- from .clients.base import BaseOmniClient
  from .clients.openai import OpenAIClient
- from .clients.groq import GroqClient
  from .clients.anthropic import AnthropicClient
  from .prompts import SYSTEM_PROMPT
- from .utils import compress_image_base64
- from .visualization import visualize_click, visualize_scroll, calculate_element_center
- from .image_utils import decode_base64_image, clean_base64_data
- from ...core.messages import ImageRetentionConfig
- from .messages import OmniMessageManager
+ from .api_handler import OmniAPIHandler
+ from .tools.manager import ToolManager
+ from .tools import ToolResult

  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)
@@ -41,7 +36,16 @@ def extract_data(input_string: str, data_type: str) -> str:


  class OmniLoop(BaseLoop):
-     """Omni-specific implementation of the agent loop."""
+     """Omni-specific implementation of the agent loop.
+
+     This class extends BaseLoop to provide support for multimodal models
+     from various providers (OpenAI, Anthropic, etc.) with UI parsing
+     and desktop automation capabilities.
+     """
+
+     ###########################################
+     # INITIALIZATION AND CONFIGURATION
+     ###########################################

      def __init__(
          self,
@@ -76,8 +80,9 @@ class OmniLoop(BaseLoop):
          self.provider = provider

          # Initialize message manager with image retention config
-         image_retention_config = ImageRetentionConfig(num_images_to_keep=only_n_most_recent_images)
-         self.message_manager = OmniMessageManager(config=image_retention_config)
+         self.message_manager = StandardMessageManager(
+             config=ImageRetentionConfig(num_images_to_keep=only_n_most_recent_images)
+         )

          # Initialize base class (which will set up experiment manager)
          super().__init__(
@@ -96,94 +101,58 @@ class OmniLoop(BaseLoop):
          self.client = None
          self.retry_count = 0

-     def _should_save_debug_image(self) -> bool:
-         """Check if debug images should be saved.
-
-         Returns:
-             bool: Always returns False as debug image saving has been disabled.
-         """
-         # Debug image saving functionality has been removed
-         return False
-
-     def _extract_and_save_images(self, data: Any, prefix: str) -> None:
-         """Extract and save images from API data.
+         # Initialize handlers
+         self.api_handler = OmniAPIHandler(loop=self)
+         self.viz_helper = VisualizationHelper(agent=self)

-         This method is now a no-op as image extraction functionality has been removed.
+         # Initialize tool manager
+         self.tool_manager = ToolManager(computer=computer, provider=provider)

-         Args:
-             data: Data to extract images from
-             prefix: Prefix for the extracted image filenames
-         """
-         # Image extraction functionality has been removed
-         return
+         logger.info("OmniLoop initialized with StandardMessageManager")

-     def _save_debug_image(self, image_data: str, filename: str) -> None:
-         """Save a debug image to the current turn directory.
-
-         This method is now a no-op as debug image saving functionality has been removed.
-
-         Args:
-             image_data: Base64 encoded image data
-             filename: Name to use for the saved image
-         """
-         # Debug image saving functionality has been removed
-         return
-
-     def _visualize_action(self, x: int, y: int, img_base64: str) -> None:
-         """Visualize an action by drawing on the screenshot."""
-         if (
-             not self.save_trajectory
-             or not hasattr(self, "experiment_manager")
-             or not self.experiment_manager
-         ):
-             return
+     async def initialize(self) -> None:
+         """Initialize the loop by setting up tools and clients."""
+         # Initialize base class
+         await super().initialize()

+         # Initialize tool manager with error handling
          try:
-             # Use the visualization utility
-             img = visualize_click(x, y, img_base64)
-
-             # Save the visualization
-             self.experiment_manager.save_action_visualization(img, "click", f"x{x}_y{y}")
+             logger.info("Initializing tool manager...")
+             await self.tool_manager.initialize()
+             logger.info("Tool manager initialized successfully.")
          except Exception as e:
-             logger.error(f"Error visualizing action: {str(e)}")
-
-     def _visualize_scroll(self, direction: str, clicks: int, img_base64: str) -> None:
-         """Visualize a scroll action by drawing arrows on the screenshot."""
-         if (
-             not self.save_trajectory
-             or not hasattr(self, "experiment_manager")
-             or not self.experiment_manager
-         ):
-             return
-
-         try:
-             # Use the visualization utility
-             img = visualize_scroll(direction, clicks, img_base64)
-
-             # Save the visualization
-             self.experiment_manager.save_action_visualization(
-                 img, "scroll", f"{direction}_{clicks}"
+             logger.error(f"Error initializing tool manager: {str(e)}")
+             logger.warning("Will attempt to initialize tools on first use.")
+
+         # Initialize API clients based on provider
+         if self.provider == LLMProvider.ANTHROPIC:
+             self.client = AnthropicClient(
+                 api_key=self.api_key,
+                 model=self.model,
              )
-         except Exception as e:
-             logger.error(f"Error visualizing scroll: {str(e)}")
+         elif self.provider == LLMProvider.OPENAI:
+             self.client = OpenAIClient(
+                 api_key=self.api_key,
+                 model=self.model,
+             )
+         else:
+             raise ValueError(f"Unsupported provider: {self.provider}")

-     def _save_action_visualization(
-         self, img: Image.Image, action_name: str, details: str = ""
-     ) -> str:
-         """Save a visualization of an action."""
-         if hasattr(self, "experiment_manager") and self.experiment_manager:
-             return self.experiment_manager.save_action_visualization(img, action_name, details)
-         return ""
+     ###########################################
+     # CLIENT INITIALIZATION - IMPLEMENTING ABSTRACT METHOD
+     ###########################################

      async def initialize_client(self) -> None:
-         """Initialize the appropriate client based on provider."""
+         """Initialize the appropriate client based on provider.
+
+         Implements abstract method from BaseLoop to set up the specific
+         provider client (OpenAI, Anthropic, etc.).
+         """
          try:
              logger.info(f"Initializing {self.provider} client with model {self.model}...")

              if self.provider == LLMProvider.OPENAI:
                  self.client = OpenAIClient(api_key=self.api_key, model=self.model)
-             elif self.provider == LLMProvider.GROQ:
-                 self.client = GroqClient(api_key=self.api_key, model=self.model)
              elif self.provider == LLMProvider.ANTHROPIC:
                  self.client = AnthropicClient(
                      api_key=self.api_key,
@@ -200,6 +169,10 @@ class OmniLoop(BaseLoop):
              self.client = None
              raise RuntimeError(f"Failed to initialize client: {str(e)}")

+     ###########################################
+     # API CALL HANDLING
+     ###########################################
+
      async def _make_api_call(self, messages: List[Dict[str, Any]], system_prompt: str) -> Any:
          """Make API call to provider with retry logic."""
          # Create new turn directory for this API call
@@ -219,68 +192,73 @@ class OmniLoop(BaseLoop):
              if self.client is None:
                  raise RuntimeError("Failed to initialize client")

-             # Set the provider in message manager based on current provider
-             provider_name = str(self.provider).split(".")[-1].lower()  # Extract name from enum
-             self.message_manager.set_provider(provider_name)
+             # Get messages in standard format from the message manager
+             self.message_manager.messages = messages.copy()
+             prepared_messages = self.message_manager.get_messages()

-             # Apply image retention and prepare messages
-             # This will limit the number of images based on only_n_most_recent_images
-             prepared_messages = self.message_manager.get_formatted_messages(provider_name)
-
-             # Filter out system messages for Anthropic
+             # Special handling for Anthropic
              if self.provider == LLMProvider.ANTHROPIC:
+                 # Convert to Anthropic format
+                 anthropic_messages, anthropic_system = self.message_manager.to_anthropic_format(
+                     prepared_messages
+                 )
+
+                 # Filter out any empty/invalid messages
                  filtered_messages = [
-                     msg for msg in prepared_messages if msg["role"] != "system"
+                     msg
+                     for msg in anthropic_messages
+                     if msg.get("role") in ["user", "assistant"]
                  ]
-             else:
-                 filtered_messages = prepared_messages

-             # Log request
-             request_data = {"messages": filtered_messages, "max_tokens": self.max_tokens}
+                 # Ensure there's at least one message for Anthropic
+                 if not filtered_messages:
+                     logger.warning(
+                         "No valid messages found for Anthropic API call. Adding a default user message."
+                     )
+                     filtered_messages = [
+                         {
+                             "role": "user",
+                             "content": [
+                                 {"type": "text", "text": "Please help with this task."}
+                             ],
+                         }
+                     ]

-             if self.provider == LLMProvider.ANTHROPIC:
-                 request_data["system"] = self._get_system_prompt()
-             else:
-                 request_data["system"] = system_prompt
+                 # Combine system prompts if needed
+                 final_system_prompt = anthropic_system or system_prompt

-             self._log_api_call("request", request_data)
+                 # Log request
+                 request_data = {
+                     "messages": filtered_messages,
+                     "max_tokens": self.max_tokens,
+                     "system": final_system_prompt,
+                 }

-             # Make API call with appropriate parameters
-             if self.client is None:
-                 raise RuntimeError("Client not initialized. Call initialize_client() first.")
-
-             # Check if the method is async by inspecting the client implementation
-             run_method = self.client.run_interleaved
-             is_async = asyncio.iscoroutinefunction(run_method)
-
-             if is_async:
-                 # For async implementations (AnthropicClient)
-                 if self.provider == LLMProvider.ANTHROPIC:
-                     response = await run_method(
-                         messages=filtered_messages,
-                         system=self._get_system_prompt(),
-                         max_tokens=self.max_tokens,
-                     )
-                 else:
-                     response = await run_method(
-                         messages=messages,
-                         system=system_prompt,
-                         max_tokens=self.max_tokens,
-                     )
+                 self._log_api_call("request", request_data)
+
+                 # Make API call
+                 response = await self.client.run_interleaved(
+                     messages=filtered_messages,
+                     system=final_system_prompt,
+                     max_tokens=self.max_tokens,
+                 )
              else:
-                 # For non-async implementations (GroqClient, etc.)
-                 if self.provider == LLMProvider.ANTHROPIC:
-                     response = run_method(
-                         messages=filtered_messages,
-                         system=self._get_system_prompt(),
-                         max_tokens=self.max_tokens,
-                     )
-                 else:
-                     response = run_method(
-                         messages=messages,
-                         system=system_prompt,
-                         max_tokens=self.max_tokens,
-                     )
+                 # For OpenAI and others, use standard format directly
+                 # Log request
+                 request_data = {
+                     "messages": prepared_messages,
+                     "max_tokens": self.max_tokens,
+                     "system": system_prompt,
+                 }
+
+                 self._log_api_call("request", request_data)
+
+                 # Make API call
+                 response = await self.client.run_interleaved(
+                     messages=prepared_messages,
+                     system=system_prompt,
+                     max_tokens=self.max_tokens,
+                 )

              # Log success response
              self._log_api_call("response", request_data, response)
@@ -328,201 +306,169 @@ class OmniLoop(BaseLoop):
              logger.error(error_message)
              raise RuntimeError(error_message)

+     ###########################################
+     # RESPONSE AND ACTION HANDLING
+     ###########################################
+
      async def _handle_response(
-         self, response: Any, messages: List[Dict[str, Any]], parsed_screen: Dict[str, Any]
+         self, response: Any, messages: List[Dict[str, Any]], parsed_screen: ParseResult
      ) -> Tuple[bool, bool]:
          """Handle API response.

+         Args:
+             response: API response
+             messages: List of messages to update
+             parsed_screen: Current parsed screen information
+
          Returns:
              Tuple of (should_continue, action_screenshot_saved)
          """
          action_screenshot_saved = False
+
+         # Helper function to safely add assistant messages using the message manager
+         def add_assistant_message(content):
+             if isinstance(content, str):
+                 # Convert string to proper format
+                 formatted_content = [{"type": "text", "text": content}]
+                 self.message_manager.add_assistant_message(formatted_content)
+                 logger.info("Added formatted text assistant message")
+             elif isinstance(content, list):
+                 # Already in proper format
+                 self.message_manager.add_assistant_message(content)
+                 logger.info("Added structured assistant message")
+             else:
+                 # Default case - convert to string
+                 formatted_content = [{"type": "text", "text": str(content)}]
+                 self.message_manager.add_assistant_message(formatted_content)
+                 logger.info("Added converted assistant message")
+
          try:
-             # Handle Anthropic response format
+             # Step 1: Normalize response to standard format based on provider
+             standard_content = []
+             raw_text = None
+
+             # Convert response to standardized content based on provider
              if self.provider == LLMProvider.ANTHROPIC:
                  if hasattr(response, "content") and isinstance(response.content, list):
-                     # Extract text from content blocks
+                     # Convert Anthropic response to standard format
                      for block in response.content:
-                         if hasattr(block, "type") and block.type == "text":
-                             content = block.text
-
-                             # Try to find JSON in the content
-                             try:
-                                 # First look for JSON block
-                                 json_content = extract_data(content, "json")
-                                 parsed_content = json.loads(json_content)
-                                 logger.info("Successfully parsed JSON from code block")
-                             except (json.JSONDecodeError, IndexError):
-                                 # If no JSON block, try to find JSON object in the text
-                                 try:
-                                     # Look for JSON object pattern
-                                     json_pattern = r"\{[^}]+\}"
-                                     json_match = re.search(json_pattern, content)
-                                     if json_match:
-                                         json_str = json_match.group(0)
-                                         parsed_content = json.loads(json_str)
-                                         logger.info("Successfully parsed JSON from text")
-                                     else:
-                                         logger.error(f"No JSON found in content: {content}")
-                                         continue
-                                 except json.JSONDecodeError as e:
-                                     logger.error(f"Failed to parse JSON from text: {str(e)}")
-                                     continue
-
-                             # Clean up Box ID format
-                             if "Box ID" in parsed_content and isinstance(
-                                 parsed_content["Box ID"], str
-                             ):
-                                 parsed_content["Box ID"] = parsed_content["Box ID"].replace(
-                                     "Box #", ""
-                                 )
-
-                             # Add any explanatory text as reasoning if not present
-                             if "Explanation" not in parsed_content:
-                                 # Extract any text before the JSON as reasoning
-                                 text_before_json = content.split("{")[0].strip()
-                                 if text_before_json:
-                                     parsed_content["Explanation"] = text_before_json
-
-                             # Log the parsed content for debugging
-                             logger.info(f"Parsed content: {json.dumps(parsed_content, indent=2)}")
-
-                             # Add response to messages
-                             messages.append(
-                                 {"role": "assistant", "content": json.dumps(parsed_content)}
-                             )
-
-                             try:
-                                 # Execute action with current parsed screen info
-                                 await self._execute_action(parsed_content, parsed_screen)
-                                 action_screenshot_saved = True
-                             except Exception as e:
-                                 logger.error(f"Error executing action: {str(e)}")
-                                 # Add error message to conversation
-                                 messages.append(
-                                     {
-                                         "role": "assistant",
-                                         "content": f"Error executing action: {str(e)}",
-                                         "metadata": {"title": "❌ Error"},
-                                     }
-                                 )
-                                 return False, action_screenshot_saved
-
-                             # Check if task is complete
-                             if parsed_content.get("Action") == "None":
-                                 return False, action_screenshot_saved
-                             return True, action_screenshot_saved
-
-                     logger.warning("No text block found in Anthropic response")
+                         if hasattr(block, "type"):
+                             if block.type == "text":
+                                 standard_content.append({"type": "text", "text": block.text})
+                                 # Store raw text for JSON parsing
+                                 if raw_text is None:
+                                     raw_text = block.text
+                                 else:
+                                     raw_text += "\n" + block.text
+                             else:
+                                 # Add other block types
+                                 block_dict = {}
+                                 for key, value in vars(block).items():
+                                     if not key.startswith("_"):
+                                         block_dict[key] = value
+                                 standard_content.append(block_dict)
+                 else:
+                     logger.warning("Invalid Anthropic response format")
                      return True, action_screenshot_saved
-
-             # Handle other providers' response formats
-             if isinstance(response, dict) and "choices" in response:
-                 content = response["choices"][0]["message"]["content"]
              else:
-                 content = response
+                 # Assume OpenAI or compatible format
+                 try:
+                     raw_text = response["choices"][0]["message"]["content"]
+                     standard_content = [{"type": "text", "text": raw_text}]
+                 except (KeyError, TypeError, IndexError) as e:
+                     logger.error(f"Invalid response format: {str(e)}")
+                     return True, action_screenshot_saved

-             # Parse JSON content
-             if isinstance(content, str):
+             # Step 2: Add the normalized response to message history
+             add_assistant_message(standard_content)
+
+             # Step 3: Extract JSON from the content for action execution
+             parsed_content = None
+
+             # If we have raw text, try to extract JSON from it
+             if raw_text:
+                 # Try different approaches to extract JSON
                  try:
                      # First try to parse the whole content as JSON
-                     parsed_content = json.loads(content)
+                     parsed_content = json.loads(raw_text)
+                     logger.info("Successfully parsed whole content as JSON")
                  except json.JSONDecodeError:
                      try:
                          # Try to find JSON block
-                         json_content = extract_data(content, "json")
+                         json_content = extract_data(raw_text, "json")
                          parsed_content = json.loads(json_content)
+                         logger.info("Successfully parsed JSON from code block")
                      except (json.JSONDecodeError, IndexError):
                          try:
                              # Look for JSON object pattern
                              json_pattern = r"\{[^}]+\}"
-                             json_match = re.search(json_pattern, content)
+                             json_match = re.search(json_pattern, raw_text)
                              if json_match:
                                  json_str = json_match.group(0)
                                  parsed_content = json.loads(json_str)
+                                 logger.info("Successfully parsed JSON from text")
                              else:
-                                 logger.error(f"No JSON found in content: {content}")
+                                 logger.error(f"No JSON found in content")
                                  return True, action_screenshot_saved
                          except json.JSONDecodeError as e:
                              logger.error(f"Failed to parse JSON from text: {str(e)}")
                              return True, action_screenshot_saved

+             # Step 4: Process the parsed content if available
+             if parsed_content:
                  # Clean up Box ID format
                  if "Box ID" in parsed_content and isinstance(parsed_content["Box ID"], str):
                      parsed_content["Box ID"] = parsed_content["Box ID"].replace("Box #", "")

                  # Add any explanatory text as reasoning if not present
-                 if "Explanation" not in parsed_content:
+                 if "Explanation" not in parsed_content and raw_text:
                      # Extract any text before the JSON as reasoning
-                     text_before_json = content.split("{")[0].strip()
+                     text_before_json = raw_text.split("{")[0].strip()
                      if text_before_json:
                          parsed_content["Explanation"] = text_before_json

-                 # Add response to messages with stringified content
-                 messages.append({"role": "assistant", "content": json.dumps(parsed_content)})
+                 # Log the parsed content for debugging
+                 logger.info(f"Parsed content: {json.dumps(parsed_content, indent=2)}")

+                 # Step 5: Execute the action
                  try:
-                     # Execute action with current parsed screen info
-                     await self._execute_action(parsed_content, parsed_screen)
-                     action_screenshot_saved = True
-                 except Exception as e:
-                     logger.error(f"Error executing action: {str(e)}")
-                     # Add error message to conversation
-                     messages.append(
-                         {
-                             "role": "assistant",
-                             "content": f"Error executing action: {str(e)}",
-                             "metadata": {"title": "❌ Error"},
-                         }
+                     # Execute action using the common helper method
+                     should_continue, action_screenshot_saved = (
+                         await self._execute_action_with_tools(
+                             parsed_content, cast(ParseResult, parsed_screen)
+                         )
                      )
-                     return False, action_screenshot_saved

-                 # Check if task is complete
-                 if parsed_content.get("Action") == "None":
-                     return False, action_screenshot_saved
-
-                 return True, action_screenshot_saved
-             elif isinstance(content, dict):
-                 # Handle case where content is already a dictionary
-                 messages.append({"role": "assistant", "content": json.dumps(content)})
-
-                 try:
-                     # Execute action with current parsed screen info
-                     await self._execute_action(content, parsed_screen)
-                     action_screenshot_saved = True
+                     # Check if task is complete
+                     if parsed_content.get("Action") == "None":
+                         return False, action_screenshot_saved
+                     return should_continue, action_screenshot_saved
                  except Exception as e:
                      logger.error(f"Error executing action: {str(e)}")
-                     # Add error message to conversation
-                     messages.append(
-                         {
-                             "role": "assistant",
-                             "content": f"Error executing action: {str(e)}",
-                             "metadata": {"title": "❌ Error"},
-                         }
-                     )
-                     return False, action_screenshot_saved
-
-                 # Check if task is complete
-                 if content.get("Action") == "None":
+                     # Update the last assistant message with error
+                     error_message = [{"type": "text", "text": f"Error executing action: {str(e)}"}]
+                     # Replace the last assistant message with the error
+                     self.message_manager.add_assistant_message(error_message)
                      return False, action_screenshot_saved

-                 return True, action_screenshot_saved
-
              return True, action_screenshot_saved

          except Exception as e:
              logger.error(f"Error handling response: {str(e)}")
-             messages.append(
-                 {
-                     "role": "assistant",
-                     "content": f"Error: {str(e)}",
-                     "metadata": {"title": "❌ Error"},
-                 }
-             )
+             # Add error message using the message manager
+             error_message = [{"type": "text", "text": f"Error: {str(e)}"}]
+             self.message_manager.add_assistant_message(error_message)
              raise

+     ###########################################
+     # SCREEN PARSING - IMPLEMENTING ABSTRACT METHOD
+     ###########################################
+
      async def _get_parsed_screen_som(self, save_screenshot: bool = True) -> ParseResult:
-         """Get parsed screen information with SOM.
+         """Get parsed screen information with Screen Object Model.
+
+         Extends the base class method to use the OmniParser to parse the screen
+         and extract UI elements.

          Args:
              save_screenshot: Whether to save the screenshot (set to False when screenshots will be saved elsewhere)
@@ -557,337 +503,26 @@ class OmniLoop(BaseLoop):
              logger.error(f"Error getting parsed screen: {str(e)}")
              raise

-     async def _process_screen(
-         self, parsed_screen: ParseResult, messages: List[Dict[str, Any]]
-     ) -> None:
-         """Process and add screen info to messages."""
-         try:
-             # Only add message if we have an image and provider supports it
-             if self.provider in [LLMProvider.OPENAI, LLMProvider.ANTHROPIC]:
-                 image = parsed_screen.annotated_image_base64 or None
-                 if image:
-                     # Save screen info to current turn directory
-                     if self.current_turn_dir:
-                         # Save elements as JSON
-                         elements_path = os.path.join(self.current_turn_dir, "elements.json")
-                         with open(elements_path, "w") as f:
-                             # Convert elements to dicts for JSON serialization
-                             elements_json = [elem.model_dump() for elem in parsed_screen.elements]
-                             json.dump(elements_json, f, indent=2)
-                         logger.info(f"Saved elements to {elements_path}")
-
-                     # Format the image content based on the provider
-                     if self.provider == LLMProvider.ANTHROPIC:
-                         # Compress the image before sending to Anthropic (5MB limit)
-                         image_size = len(image)
-                         logger.info(f"Image base64 is present, length: {image_size}")
-
-                         # Anthropic has a 5MB limit - check against base64 string length
-                         # which is what matters for the API call payload
-                         # Use slightly smaller limit (4.9MB) to account for request overhead
-                         max_size = int(4.9 * 1024 * 1024)  # 4.9MB
-
-                         # Default media type (will be overridden if compression is needed)
-                         media_type = "image/png"
-
-                         # Check if the image already has a media type prefix
-                         if image.startswith("data:"):
-                             parts = image.split(",", 1)
-                             if len(parts) == 2 and "image/jpeg" in parts[0].lower():
-                                 media_type = "image/jpeg"
-                             elif len(parts) == 2 and "image/png" in parts[0].lower():
-                                 media_type = "image/png"
-
-                         if image_size > max_size:
-                             logger.info(
-                                 f"Image size ({image_size} bytes) exceeds Anthropic limit ({max_size} bytes), compressing..."
-                             )
-                             image, media_type = compress_image_base64(image, max_size)
-                             logger.info(
-                                 f"Image compressed to {len(image)} bytes with media_type {media_type}"
-                             )
-
-                         # Anthropic uses "type": "image"
-                         screen_info_msg = {
-                             "role": "user",
-                             "content": [
-                                 {
-                                     "type": "image",
-                                     "source": {
-                                         "type": "base64",
-                                         "media_type": media_type,
-                                         "data": image,
-                                     },
-                                 }
-                             ],
-                         }
-                     else:
-                         # OpenAI and others use "type": "image_url"
-                         screen_info_msg = {
-                             "role": "user",
-                             "content": [
-                                 {
-                                     "type": "image_url",
-                                     "image_url": {"url": f"data:image/png;base64,{image}"},
-                                 }
-                             ],
-                         }
-                     messages.append(screen_info_msg)
-
-         except Exception as e:
-             logger.error(f"Error processing screen info: {str(e)}")
-             raise
-
      def _get_system_prompt(self) -> str:
          """Get the system prompt for the model."""
          return SYSTEM_PROMPT

-     async def _execute_action(self, content: Dict[str, Any], parsed_screen: ParseResult) -> None:
-         """Execute the action specified in the content using the tool manager.
-
-         Args:
-             content: Dictionary containing the action details
-             parsed_screen: Current parsed screen information
-         """
-         try:
-             action = content.get("Action", "").lower()
-             if not action:
-                 return
-
-             # Track if we saved an action-specific screenshot
-             action_screenshot_saved = False
-
-             try:
-                 # Prepare kwargs based on action type
-                 kwargs = {}
-
-                 if action in ["left_click", "right_click", "double_click", "move_cursor"]:
-                     try:
-                         box_id = int(content["Box ID"])
-                         logger.info(f"Processing Box ID: {box_id}")
-
-                         # Calculate click coordinates
-                         x, y = await self._calculate_click_coordinates(box_id, parsed_screen)
-                         logger.info(f"Calculated coordinates: x={x}, y={y}")
-
-                         kwargs["x"] = x
-                         kwargs["y"] = y
-
-                         # Visualize action if screenshot is available
-                         if parsed_screen.annotated_image_base64:
-                             img_data = parsed_screen.annotated_image_base64
-                             # Remove data URL prefix if present
-                             if img_data.startswith("data:image"):
-                                 img_data = img_data.split(",")[1]
-                             # Only save visualization for coordinate-based actions
-                             self._visualize_action(x, y, img_data)
-                             action_screenshot_saved = True
-
-                     except ValueError as e:
-                         logger.error(f"Error processing Box ID: {str(e)}")
-                         return
-
-                 elif action == "drag_to":
-                     try:
-                         box_id = int(content["Box ID"])
-                         x, y = await self._calculate_click_coordinates(box_id, parsed_screen)
-                         kwargs.update(
-                             {
-                                 "x": x,
-                                 "y": y,
-                                 "button": content.get("button", "left"),
-                                 "duration": float(content.get("duration", 0.5)),
-                             }
-                         )
-
-                         # Visualize drag destination if screenshot is available
-                         if parsed_screen.annotated_image_base64:
-                             img_data = parsed_screen.annotated_image_base64
-                             # Remove data URL prefix if present
-                             if img_data.startswith("data:image"):
-                                 img_data = img_data.split(",")[1]
-                             # Only save visualization for coordinate-based actions
-                             self._visualize_action(x, y, img_data)
-                             action_screenshot_saved = True
-
-                     except ValueError as e:
-                         logger.error(f"Error processing drag coordinates: {str(e)}")
-                         return
-
-                 elif action == "type_text":
-                     kwargs["text"] = content["Value"]
-                     # For type_text, store the value in the action type
-                     action_type = f"type_{content['Value'][:20]}"  # Truncate if too long
-                 elif action == "press_key":
-                     kwargs["key"] = content["Value"]
-                     action_type = f"press_{content['Value']}"
-                 elif action == "hotkey":
-                     if isinstance(content.get("Value"), list):
-                         keys = content["Value"]
-                         action_type = f"hotkey_{'_'.join(keys)}"
-                     else:
-                         # Simply split string format like "command+space" into a list
-                         keys = [k.strip() for k in content["Value"].lower().split("+")]
-                         action_type = f"hotkey_{content['Value'].replace('+', '_')}"
-                     logger.info(f"Preparing hotkey with keys: {keys}")
-                     # Get the method but call it with *args instead of **kwargs
-                     method = getattr(self.computer.interface, action)
-                     await method(*keys)  # Unpack the keys list as positional arguments
-                     logger.info(f"Tool execution completed successfully: {action}")
-
-                     # For hotkeys, take a screenshot after the action
-                     try:
-                         # Get a new screenshot after the action and save it with the action type
-                         new_parsed_screen = await self._get_parsed_screen_som(save_screenshot=False)
-                         if new_parsed_screen and new_parsed_screen.annotated_image_base64:
-                             img_data = new_parsed_screen.annotated_image_base64
-                             # Remove data URL prefix if present
-                             if img_data.startswith("data:image"):
-                                 img_data = img_data.split(",")[1]
-                             # Save with action type to indicate this is a post-action screenshot
-                             self._save_screenshot(img_data, action_type=action_type)
-                             action_screenshot_saved = True
-                     except Exception as screenshot_error:
-                         logger.error(
-                             f"Error taking post-hotkey screenshot: {str(screenshot_error)}"
-                         )
-
-                     return
-
-                 elif action in ["scroll_down", "scroll_up"]:
-                     clicks = int(content.get("amount", 1))
-                     kwargs["clicks"] = clicks
-                     action_type = f"scroll_{action.split('_')[1]}_{clicks}"
+     ###########################################
+     # MAIN LOOP - IMPLEMENTING ABSTRACT METHOD
+     ###########################################

-                     # Visualize scrolling if screenshot is available
-                     if parsed_screen.annotated_image_base64:
-                         img_data = parsed_screen.annotated_image_base64
-                         # Remove data URL prefix if present
-                         if img_data.startswith("data:image"):
-                             img_data = img_data.split(",")[1]
-                         direction = "down" if action == "scroll_down" else "up"
-                         # For scrolling, we only save the visualization to avoid duplicate images
-                         self._visualize_scroll(direction, clicks, img_data)
-                         action_screenshot_saved = True
-
-                 else:
-                     logger.warning(f"Unknown action: {action}")
-                     return
-
-                 # Execute tool and handle result
-                 try:
-                     method = getattr(self.computer.interface, action)
-                     logger.info(f"Found method for action '{action}': {method}")
-                     await method(**kwargs)
-                     logger.info(f"Tool execution completed successfully: {action}")
-
-                     # For non-coordinate based actions that don't already have visualizations,
-                     # take a new screenshot after the action
-                     if not action_screenshot_saved:
-                         # Take a new screenshot
-                         try:
-                             # Get a new screenshot after the action and save it with the action type
-                             new_parsed_screen = await self._get_parsed_screen_som(
-                                 save_screenshot=False
-                             )
-                             if new_parsed_screen and new_parsed_screen.annotated_image_base64:
-                                 img_data = new_parsed_screen.annotated_image_base64
-                                 # Remove data URL prefix if present
-                                 if img_data.startswith("data:image"):
-                                     img_data = img_data.split(",")[1]
-                                 # Save with action type to indicate this is a post-action screenshot
-                                 if "action_type" in locals():
-                                     self._save_screenshot(img_data, action_type=action_type)
-                                 else:
-                                     self._save_screenshot(img_data, action_type=action)
-                                 # Update the action screenshot flag for this turn
-                                 action_screenshot_saved = True
-                         except Exception as screenshot_error:
-                             logger.error(
-                                 f"Error taking post-action screenshot: {str(screenshot_error)}"
-                             )
-
-                 except AttributeError as e:
-                     logger.error(f"Method not found for action '{action}': {str(e)}")
-                     return
-                 except Exception as tool_error:
-                     logger.error(f"Tool execution failed: {str(tool_error)}")
-                     return
-
-             except Exception as e:
-                 logger.error(f"Error executing action {action}: {str(e)}")
-                 return
-
-         except Exception as e:
-             logger.error(f"Error in _execute_action: {str(e)}")
-             return
-
-     async def _calculate_click_coordinates(
-         self, box_id: int, parsed_screen: ParseResult
-     ) -> Tuple[int, int]:
-         """Calculate click coordinates based on box ID.
-
-         Args:
-             box_id: The ID of the box to click
-             parsed_screen: The parsed screen information
-
-         Returns:
-             Tuple of (x, y) coordinates
-
-         Raises:
-             ValueError: If box_id is invalid or missing from parsed screen
-         """
-         # First try to use structured elements data
-         logger.info(f"Elements count: {len(parsed_screen.elements)}")
-
-         # Try to find element with matching ID
-         for element in parsed_screen.elements:
-             if element.id == box_id:
-                 logger.info(f"Found element with ID {box_id}: {element}")
-                 bbox = element.bbox
-
-                 # Get screen dimensions from the metadata if available, or fallback
-                 width = parsed_screen.metadata.width if parsed_screen.metadata else 1920
-                 height = parsed_screen.metadata.height if parsed_screen.metadata else 1080
-                 logger.info(f"Screen dimensions: width={width}, height={height}")
-
-                 # Calculate center of the box in pixels
-                 center_x = int((bbox.x1 + bbox.x2) / 2 * width)
-                 center_y = int((bbox.y1 + bbox.y2) / 2 * height)
-                 logger.info(f"Calculated center: ({center_x}, {center_y})")
-
-                 # Validate coordinates - if they're (0,0) or unreasonably small,
-                 # use a default position in the center of the screen
-                 if center_x == 0 and center_y == 0:
-                     logger.warning("Got (0,0) coordinates, using fallback position")
-                     center_x = width // 2
-                     center_y = height // 2
-                     logger.info(f"Using fallback center: ({center_x}, {center_y})")
-
-                 return center_x, center_y
-
-         # If we couldn't find the box, use center of screen
-         logger.error(
-             f"Box ID {box_id} not found in structured elements (count={len(parsed_screen.elements)})"
-         )
-
-         # Use center of screen as fallback
-         width = parsed_screen.metadata.width if parsed_screen.metadata else 1920
-         height = parsed_screen.metadata.height if parsed_screen.metadata else 1080
-         logger.warning(f"Using fallback position in center of screen ({width//2}, {height//2})")
-         return width // 2, height // 2
-
-     async def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[Dict[str, Any], None]:
+     async def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[AgentResponse, None]:
          """Run the agent loop with provided messages.

          Args:
-             messages: List of message objects
+             messages: List of messages in standard OpenAI format

          Yields:
-             Dict containing response data
+             Agent response format
          """
-         # Keep track of conversation history
-         conversation_history = messages.copy()
+         # Initialize the message manager with the provided messages
+         self.message_manager.messages = messages.copy()
+         logger.info(f"Starting OmniLoop run with {len(self.message_manager.messages)} messages")

          # Continue running until explicitly told to stop
          running = True
@@ -916,26 +551,66 @@ class OmniLoop(BaseLoop):
                  # Get up-to-date screen information
                  parsed_screen = await self._get_parsed_screen_som()

-                 # Process screen info and update messages
-                 await self._process_screen(parsed_screen, conversation_history)
+                 # Process screen info and update messages in standard format
+                 try:
+                     # Get image from parsed screen
+                     image = parsed_screen.annotated_image_base64 or None
+                     if image:
+                         # Save elements as JSON if we have a turn directory
+                         if self.current_turn_dir and hasattr(parsed_screen, "elements"):
+                             elements_path = os.path.join(self.current_turn_dir, "elements.json")
+                             with open(elements_path, "w") as f:
+                                 # Convert elements to dicts for JSON serialization
+                                 elements_json = [
+                                     elem.model_dump() for elem in parsed_screen.elements
+                                 ]
+                                 json.dump(elements_json, f, indent=2)
+                             logger.info(f"Saved elements to {elements_path}")
+
+                         # Remove data URL prefix if present
+                         if "," in image:
+                             image = image.split(",")[1]
+
+                         # Add screenshot to message history using message manager
+                         self.message_manager.add_user_message(
+                             [
+                                 {
+                                     "type": "image_url",
+                                     "image_url": {"url": f"data:image/png;base64,{image}"},
+                                 }
+                             ]
+                         )
+                         logger.info("Added screenshot to message history")
+                 except Exception as e:
+                     logger.error(f"Error processing screen info: {str(e)}")
+                     raise

                  # Get system prompt
                  system_prompt = self._get_system_prompt()

-                 # Make API call with retries
-                 response = await self._make_api_call(conversation_history, system_prompt)
+                 # Make API call with retries using the APIHandler
+                 response = await self.api_handler.make_api_call(
+                     self.message_manager.messages, system_prompt
+                 )

                  # Handle the response (may execute actions)
                  # Returns: (should_continue, action_screenshot_saved)
                  should_continue, new_screenshot_saved = await self._handle_response(
-                     response, conversation_history, parsed_screen
+                     response, self.message_manager.messages, parsed_screen
                  )

                  # Update whether an action screenshot was saved this turn
                  action_screenshot_saved = action_screenshot_saved or new_screenshot_saved

+                 # Create OpenAI-compatible response format using utility function
+                 openai_compatible_response = await to_openai_agent_response_format(
+                     response=response,
+                     messages=self.message_manager.messages,
+                     model=self.model,
+                 )
+
                  # Yield the response to the caller
-                 yield {"response": response}
+                 yield openai_compatible_response

                  # Check if we should continue this conversation
                  running = should_continue
@@ -963,3 +638,218 @@ class OmniLoop(BaseLoop):

                  # Create a brief delay before retrying
                  await asyncio.sleep(1)
+
+     async def process_model_response(self, response_text: str) -> Optional[Dict[str, Any]]:
+         """Process model response to extract tool calls.
+
+         Args:
+             response_text: Model response text
+
+         Returns:
+             Extracted tool information, or None if no tool call was found
+         """
+         try:
+             # Ensure tools are initialized before use
+             await self._ensure_tools_initialized()
+
+             # Look for tool use in the response
+             if "function_call" in response_text or "tool_use" in response_text:
+                 # The extract_tool_call method should be implemented in the OmniAPIHandler
+                 # For now, we'll just use a simple approach
+                 # This will be replaced with the proper implementation
+                 tool_info = None
+                 if "function_call" in response_text:
+                     # Extract function call params
+                     try:
+                         # Simple extraction - in real code this would be more robust
+                         import json
+                         import re
+
+                         match = re.search(r'"function_call"\s*:\s*{([^}]+)}', response_text)
+                         if match:
+                             function_text = "{" + match.group(1) + "}"
+                             tool_info = json.loads(function_text)
+                     except Exception as e:
+                         logger.error(f"Error extracting function call: {str(e)}")
+
+                 if tool_info:
+                     try:
+                         # Execute the tool
+                         result = await self.tool_manager.execute_tool(
+                             name=tool_info.get("name"), tool_input=tool_info.get("arguments", {})
+                         )
+                         # Handle the result
+                         return {"tool_result": result}
+                     except Exception as e:
+                         error_msg = (
+                             f"Error executing tool '{tool_info.get('name', 'unknown')}': {str(e)}"
+                         )
+                         logger.error(error_msg)
+                         return {"tool_result": ToolResult(error=error_msg)}
+         except Exception as e:
+             logger.error(f"Error processing tool call: {str(e)}")
+
+         return None
+
+     async def process_response_with_tools(
+         self, response_text: str, parsed_screen: Optional[ParseResult] = None
+     ) -> Tuple[bool, str]:
+         """Process model response and execute tools.
+
+         Args:
+             response_text: Model response text
+             parsed_screen: Current parsed screen information (optional)
+
+         Returns:
+             Tuple of (action_taken, observation)
+         """
+         logger.info("Processing response with tools")
+
+         # Process the response to extract tool calls
+         tool_result = await self.process_model_response(response_text)
+
+         if tool_result and "tool_result" in tool_result:
+             # A tool was executed
+             result = tool_result["tool_result"]
+             if result.error:
+                 return False, f"ERROR: {result.error}"
+             else:
+                 return True, result.output or "Tool executed successfully"
+
+         # No action or tool call found
+         return False, "No action taken - no tool call detected in response"
+
+     ###########################################
+     # UTILITY METHODS
+     ###########################################
+
+     async def _ensure_tools_initialized(self) -> None:
+         """Ensure the tool manager and tools are initialized before use."""
+         if not hasattr(self.tool_manager, "tools") or self.tool_manager.tools is None:
+             logger.info("Tools not initialized. Initializing now...")
+             await self.tool_manager.initialize()
+             logger.info("Tools initialized successfully.")
+
+     async def _execute_action_with_tools(
+         self, action_data: Dict[str, Any], parsed_screen: ParseResult
+     ) -> Tuple[bool, bool]:
+         """Execute an action using the tools-based approach.
+
+         Args:
+             action_data: Dictionary containing action details
+             parsed_screen: Current parsed screen information
+
+         Returns:
+             Tuple of (should_continue, action_screenshot_saved)
+         """
+         action_screenshot_saved = False
+         action_type = None  # Initialize for possible use in post-action screenshot
+
+         try:
+             # Extract the action
+             parsed_action = action_data.get("Action", "").lower()
+
+             # Only process if we have a valid action
+             if not parsed_action or parsed_action == "none":
+                 return False, action_screenshot_saved
+
+             # Convert the parsed content to a format suitable for the tools system
+             tool_name = "computer"  # Default to computer tool
+             tool_args = {"action": parsed_action}
+
+             # Add specific arguments based on action type
+             if parsed_action in ["left_click", "right_click", "double_click", "move_cursor"]:
+                 # Calculate coordinates from Box ID using parser
+                 try:
+                     box_id = int(action_data["Box ID"])
+                     x, y = await self.parser.calculate_click_coordinates(
+                         box_id, cast(ParseResult, parsed_screen)
+                     )
+                     tool_args["x"] = x
+                     tool_args["y"] = y
+
+                     # Visualize action if screenshot is available
+                     if parsed_screen and parsed_screen.annotated_image_base64:
+                         img_data = parsed_screen.annotated_image_base64
+                         # Remove data URL prefix if present
+                         if img_data.startswith("data:image"):
+                             img_data = img_data.split(",")[1]
+                         # Save visualization for coordinate-based actions
+                         self.viz_helper.visualize_action(x, y, img_data)
+                         action_screenshot_saved = True
+
+                 except (ValueError, KeyError) as e:
+                     logger.error(f"Error processing Box ID: {str(e)}")
+                     return False, action_screenshot_saved
+
+             elif parsed_action == "type_text":
+                 tool_args["text"] = action_data.get("Value", "")
+                 # For type_text, store the value in the action type for screenshot naming
+                 action_type = f"type_{tool_args['text'][:20]}"  # Truncate if too long
+
+             elif parsed_action == "press_key":
+                 tool_args["key"] = action_data.get("Value", "")
+                 action_type = f"press_{tool_args['key']}"
+
+             elif parsed_action == "hotkey":
+                 value = action_data.get("Value", "")
+                 if isinstance(value, list):
+                     tool_args["keys"] = value
+                     action_type = f"hotkey_{'_'.join(value)}"
+                 else:
+                     # Split string format like "command+space" into a list
+                     keys = [k.strip() for k in value.lower().split("+")]
+                     tool_args["keys"] = keys
+                     action_type = f"hotkey_{value.replace('+', '_')}"
+
+             elif parsed_action in ["scroll_down", "scroll_up"]:
+                 clicks = int(action_data.get("amount", 1))
+                 tool_args["amount"] = clicks
+                 action_type = f"scroll_{parsed_action.split('_')[1]}_{clicks}"
+
+                 # Visualize scrolling if screenshot is available
+                 if parsed_screen and parsed_screen.annotated_image_base64:
+                     img_data = parsed_screen.annotated_image_base64
+                     # Remove data URL prefix if present
+                     if img_data.startswith("data:image"):
+                         img_data = img_data.split(",")[1]
+                     direction = "down" if parsed_action == "scroll_down" else "up"
+                     # For scrolling, we save the visualization
+                     self.viz_helper.visualize_scroll(direction, clicks, img_data)
+                     action_screenshot_saved = True
+
+             # Ensure tools are initialized before use
+             await self._ensure_tools_initialized()
+
+             # Execute tool with prepared arguments
+             result = await self.tool_manager.execute_tool(name=tool_name, tool_input=tool_args)
+
+             # Take a new screenshot after the action if we haven't already saved one
+             if not action_screenshot_saved:
+                 try:
+                     # Get a new screenshot after the action
+                     new_parsed_screen = await self._get_parsed_screen_som(save_screenshot=False)
+                     if new_parsed_screen and new_parsed_screen.annotated_image_base64:
+                         img_data = new_parsed_screen.annotated_image_base64
+                         # Remove data URL prefix if present
+                         if img_data.startswith("data:image"):
+                             img_data = img_data.split(",")[1]
+                         # Save with action type if defined, otherwise use the action name
+                         if action_type:
+                             self._save_screenshot(img_data, action_type=action_type)
+                         else:
+                             self._save_screenshot(img_data, action_type=parsed_action)
+                         action_screenshot_saved = True
+                 except Exception as screenshot_error:
+                     logger.error(f"Error taking post-action screenshot: {str(screenshot_error)}")
+
+             # Continue the loop if the action is not "None"
+             return True, action_screenshot_saved
+
+         except Exception as e:
+             logger.error(f"Error executing action: {str(e)}")
+             # Update the last assistant message with error
+             error_message = [{"type": "text", "text": f"Error executing action: {str(e)}"}]
+             # Replace the last assistant message with the error
+             self.message_manager.add_assistant_message(error_message)
+             return False, action_screenshot_saved
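
Taken together, the rewritten loop consumes and yields messages in standard OpenAI format instead of the old `{"response": response}` dict. A minimal driver sketch, untested and based only on what is visible in this diff — the keyword names provider/computer/api_key/model appear in the constructor code above, but argument order, other parameters, and the Computer() construction are assumptions:

    # Sketch of driving the 0.1.17 OmniLoop (assumptions noted inline).
    import asyncio

    from computer import Computer
    from agent.providers.omni.loop import OmniLoop
    from agent.providers.omni.types import LLMProvider

    async def main() -> None:
        computer = Computer()  # assumed default construction
        loop = OmniLoop(
            provider=LLMProvider.OPENAI,
            computer=computer,
            api_key="sk-...",      # your provider key
            model="gpt-4o",        # any multimodal model the provider supports (assumed name)
        )
        await loop.initialize()  # sets up the tool manager and provider client

        # Messages go in, and come out, in standard OpenAI format.
        messages = [{"role": "user", "content": "Open the browser and search for CUA."}]
        async for response in loop.run(messages):
            # Each yield is an OpenAI-compatible AgentResponse built by
            # to_openai_agent_response_format(), per the run() hunk above.
            print(response)

    asyncio.run(main())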