cua-agent 0.1.6__py3-none-any.whl → 0.1.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (42)
  1. agent/__init__.py +3 -2
  2. agent/core/__init__.py +0 -5
  3. agent/core/computer_agent.py +21 -28
  4. agent/core/loop.py +78 -124
  5. agent/core/messages.py +279 -125
  6. agent/core/types.py +35 -0
  7. agent/core/visualization.py +197 -0
  8. agent/providers/anthropic/api/client.py +142 -1
  9. agent/providers/anthropic/api_handler.py +140 -0
  10. agent/providers/anthropic/callbacks/__init__.py +5 -0
  11. agent/providers/anthropic/loop.py +206 -220
  12. agent/providers/anthropic/response_handler.py +229 -0
  13. agent/providers/anthropic/tools/bash.py +0 -97
  14. agent/providers/anthropic/utils.py +370 -0
  15. agent/providers/omni/__init__.py +1 -20
  16. agent/providers/omni/api_handler.py +42 -0
  17. agent/providers/omni/clients/anthropic.py +4 -0
  18. agent/providers/omni/image_utils.py +0 -72
  19. agent/providers/omni/loop.py +490 -606
  20. agent/providers/omni/parser.py +58 -4
  21. agent/providers/omni/tools/__init__.py +25 -7
  22. agent/providers/omni/tools/base.py +29 -0
  23. agent/providers/omni/tools/bash.py +43 -38
  24. agent/providers/omni/tools/computer.py +144 -182
  25. agent/providers/omni/tools/manager.py +25 -45
  26. agent/providers/omni/types.py +0 -4
  27. agent/providers/omni/utils.py +224 -145
  28. {cua_agent-0.1.6.dist-info → cua_agent-0.1.17.dist-info}/METADATA +6 -36
  29. cua_agent-0.1.17.dist-info/RECORD +63 -0
  30. agent/providers/omni/callbacks.py +0 -78
  31. agent/providers/omni/clients/groq.py +0 -101
  32. agent/providers/omni/experiment.py +0 -276
  33. agent/providers/omni/messages.py +0 -171
  34. agent/providers/omni/tool_manager.py +0 -91
  35. agent/providers/omni/visualization.py +0 -130
  36. agent/types/__init__.py +0 -23
  37. agent/types/base.py +0 -41
  38. agent/types/messages.py +0 -36
  39. cua_agent-0.1.6.dist-info/RECORD +0 -64
  40. /agent/{types → core}/tools.py +0 -0
  41. {cua_agent-0.1.6.dist-info → cua_agent-0.1.17.dist-info}/WHEEL +0 -0
  42. {cua_agent-0.1.6.dist-info → cua_agent-0.1.17.dist-info}/entry_points.txt +0 -0
@@ -1,34 +1,28 @@
1
1
  """Omni-specific agent loop implementation."""
2
2
 
3
3
  import logging
4
- from typing import Any, Dict, List, Optional, Tuple, AsyncGenerator, Union
5
- import base64
6
- from PIL import Image
7
- from io import BytesIO
4
+ from typing import Any, Dict, List, Optional, Tuple, AsyncGenerator
8
5
  import json
9
6
  import re
10
7
  import os
11
- from datetime import datetime
12
8
  import asyncio
13
9
  from httpx import ConnectError, ReadTimeout
14
- import shutil
15
- import copy
16
10
  from typing import cast
17
11
 
18
- from .parser import OmniParser, ParseResult, ParserMetadata, UIElement
12
+ from .parser import OmniParser, ParseResult
19
13
  from ...core.loop import BaseLoop
14
+ from ...core.visualization import VisualizationHelper
15
+ from ...core.messages import StandardMessageManager, ImageRetentionConfig
16
+ from .utils import to_openai_agent_response_format
17
+ from ...core.types import AgentResponse
20
18
  from computer import Computer
21
19
  from .types import LLMProvider
22
- from .clients.base import BaseOmniClient
23
20
  from .clients.openai import OpenAIClient
24
- from .clients.groq import GroqClient
25
21
  from .clients.anthropic import AnthropicClient
26
22
  from .prompts import SYSTEM_PROMPT
27
- from .utils import compress_image_base64
28
- from .visualization import visualize_click, visualize_scroll, calculate_element_center
29
- from .image_utils import decode_base64_image, clean_base64_data
30
- from ...core.messages import ImageRetentionConfig
31
- from .messages import OmniMessageManager
23
+ from .api_handler import OmniAPIHandler
24
+ from .tools.manager import ToolManager
25
+ from .tools import ToolResult
32
26
 
33
27
  logging.basicConfig(level=logging.INFO)
34
28
  logger = logging.getLogger(__name__)
@@ -42,7 +36,16 @@ def extract_data(input_string: str, data_type: str) -> str:
42
36
 
43
37
 
44
38
  class OmniLoop(BaseLoop):
45
- """Omni-specific implementation of the agent loop."""
39
+ """Omni-specific implementation of the agent loop.
40
+
41
+ This class extends BaseLoop to provide support for multimodal models
42
+ from various providers (OpenAI, Anthropic, etc.) with UI parsing
43
+ and desktop automation capabilities.
44
+ """
45
+
46
+ ###########################################
47
+ # INITIALIZATION AND CONFIGURATION
48
+ ###########################################
46
49
 
47
50
  def __init__(
48
51
  self,
@@ -77,8 +80,9 @@ class OmniLoop(BaseLoop):
77
80
  self.provider = provider
78
81
 
79
82
  # Initialize message manager with image retention config
80
- image_retention_config = ImageRetentionConfig(num_images_to_keep=only_n_most_recent_images)
81
- self.message_manager = OmniMessageManager(config=image_retention_config)
83
+ self.message_manager = StandardMessageManager(
84
+ config=ImageRetentionConfig(num_images_to_keep=only_n_most_recent_images)
85
+ )
82
86
 
83
87
  # Initialize base class (which will set up experiment manager)
84
88
  super().__init__(
@@ -97,87 +101,53 @@ class OmniLoop(BaseLoop):
97
101
  self.client = None
98
102
  self.retry_count = 0
99
103
 
100
- def _should_save_debug_image(self) -> bool:
101
- """Check if debug images should be saved.
102
-
103
- Returns:
104
- bool: Always returns False as debug image saving has been disabled.
105
- """
106
- # Debug image saving functionality has been removed
107
- return False
108
-
109
- def _extract_and_save_images(self, data: Any, prefix: str) -> None:
110
- """Extract and save images from API data.
104
+ # Initialize handlers
105
+ self.api_handler = OmniAPIHandler(loop=self)
106
+ self.viz_helper = VisualizationHelper(agent=self)
111
107
 
112
- This method is now a no-op as image extraction functionality has been removed.
108
+ # Initialize tool manager
109
+ self.tool_manager = ToolManager(computer=computer, provider=provider)
113
110
 
114
- Args:
115
- data: Data to extract images from
116
- prefix: Prefix for the extracted image filenames
117
- """
118
- # Image extraction functionality has been removed
119
- return
111
+ logger.info("OmniLoop initialized with StandardMessageManager")
120
112
 
121
- def _save_debug_image(self, image_data: str, filename: str) -> None:
122
- """Save a debug image to the current turn directory.
123
-
124
- This method is now a no-op as debug image saving functionality has been removed.
125
-
126
- Args:
127
- image_data: Base64 encoded image data
128
- filename: Name to use for the saved image
129
- """
130
- # Debug image saving functionality has been removed
131
- return
132
-
133
- def _visualize_action(self, x: int, y: int, img_base64: str) -> None:
134
- """Visualize an action by drawing on the screenshot."""
135
- if (
136
- not self.save_trajectory
137
- or not hasattr(self, "experiment_manager")
138
- or not self.experiment_manager
139
- ):
140
- return
113
+ async def initialize(self) -> None:
114
+ """Initialize the loop by setting up tools and clients."""
115
+ # Initialize base class
116
+ await super().initialize()
141
117
 
118
+ # Initialize tool manager with error handling
142
119
  try:
143
- # Use the visualization utility
144
- img = visualize_click(x, y, img_base64)
145
-
146
- # Save the visualization
147
- self.experiment_manager.save_action_visualization(img, "click", f"x{x}_y{y}")
120
+ logger.info("Initializing tool manager...")
121
+ await self.tool_manager.initialize()
122
+ logger.info("Tool manager initialized successfully.")
148
123
  except Exception as e:
149
- logger.error(f"Error visualizing action: {str(e)}")
150
-
151
- def _visualize_scroll(self, direction: str, clicks: int, img_base64: str) -> None:
152
- """Visualize a scroll action by drawing arrows on the screenshot."""
153
- if (
154
- not self.save_trajectory
155
- or not hasattr(self, "experiment_manager")
156
- or not self.experiment_manager
157
- ):
158
- return
159
-
160
- try:
161
- # Use the visualization utility
162
- img = visualize_scroll(direction, clicks, img_base64)
163
-
164
- # Save the visualization
165
- self.experiment_manager.save_action_visualization(
166
- img, "scroll", f"{direction}_{clicks}"
124
+ logger.error(f"Error initializing tool manager: {str(e)}")
125
+ logger.warning("Will attempt to initialize tools on first use.")
126
+
127
+ # Initialize API clients based on provider
128
+ if self.provider == LLMProvider.ANTHROPIC:
129
+ self.client = AnthropicClient(
130
+ api_key=self.api_key,
131
+ model=self.model,
167
132
  )
168
- except Exception as e:
169
- logger.error(f"Error visualizing scroll: {str(e)}")
133
+ elif self.provider == LLMProvider.OPENAI:
134
+ self.client = OpenAIClient(
135
+ api_key=self.api_key,
136
+ model=self.model,
137
+ )
138
+ else:
139
+ raise ValueError(f"Unsupported provider: {self.provider}")
170
140
 
171
- def _save_action_visualization(
172
- self, img: Image.Image, action_name: str, details: str = ""
173
- ) -> str:
174
- """Save a visualization of an action."""
175
- if hasattr(self, "experiment_manager") and self.experiment_manager:
176
- return self.experiment_manager.save_action_visualization(img, action_name, details)
177
- return ""
141
+ ###########################################
142
+ # CLIENT INITIALIZATION - IMPLEMENTING ABSTRACT METHOD
143
+ ###########################################
178
144
 
179
145
  async def initialize_client(self) -> None:
180
- """Initialize the appropriate client based on provider."""
146
+ """Initialize the appropriate client based on provider.
147
+
148
+ Implements abstract method from BaseLoop to set up the specific
149
+ provider client (OpenAI, Anthropic, etc.).
150
+ """
181
151
  try:
182
152
  logger.info(f"Initializing {self.provider} client with model {self.model}...")
183
153
 
@@ -199,6 +169,10 @@ class OmniLoop(BaseLoop):
199
169
  self.client = None
200
170
  raise RuntimeError(f"Failed to initialize client: {str(e)}")
201
171
 
172
+ ###########################################
173
+ # API CALL HANDLING
174
+ ###########################################
175
+
202
176
  async def _make_api_call(self, messages: List[Dict[str, Any]], system_prompt: str) -> Any:
203
177
  """Make API call to provider with retry logic."""
204
178
  # Create new turn directory for this API call
@@ -218,68 +192,73 @@ class OmniLoop(BaseLoop):
218
192
  if self.client is None:
219
193
  raise RuntimeError("Failed to initialize client")
220
194
 
221
- # Set the provider in message manager based on current provider
222
- provider_name = str(self.provider).split(".")[-1].lower() # Extract name from enum
223
- self.message_manager.set_provider(provider_name)
224
-
225
- # Apply image retention and prepare messages
226
- # This will limit the number of images based on only_n_most_recent_images
227
- prepared_messages = self.message_manager.get_formatted_messages(provider_name)
195
+ # Get messages in standard format from the message manager
196
+ self.message_manager.messages = messages.copy()
197
+ prepared_messages = self.message_manager.get_messages()
228
198
 
229
- # Filter out system messages for Anthropic
199
+ # Special handling for Anthropic
230
200
  if self.provider == LLMProvider.ANTHROPIC:
201
+ # Convert to Anthropic format
202
+ anthropic_messages, anthropic_system = self.message_manager.to_anthropic_format(
203
+ prepared_messages
204
+ )
205
+
206
+ # Filter out any empty/invalid messages
231
207
  filtered_messages = [
232
- msg for msg in prepared_messages if msg["role"] != "system"
208
+ msg
209
+ for msg in anthropic_messages
210
+ if msg.get("role") in ["user", "assistant"]
233
211
  ]
234
- else:
235
- filtered_messages = prepared_messages
236
212
 
237
- # Log request
238
- request_data = {"messages": filtered_messages, "max_tokens": self.max_tokens}
213
+ # Ensure there's at least one message for Anthropic
214
+ if not filtered_messages:
215
+ logger.warning(
216
+ "No valid messages found for Anthropic API call. Adding a default user message."
217
+ )
218
+ filtered_messages = [
219
+ {
220
+ "role": "user",
221
+ "content": [
222
+ {"type": "text", "text": "Please help with this task."}
223
+ ],
224
+ }
225
+ ]
239
226
 
240
- if self.provider == LLMProvider.ANTHROPIC:
241
- request_data["system"] = self._get_system_prompt()
242
- else:
243
- request_data["system"] = system_prompt
227
+ # Combine system prompts if needed
228
+ final_system_prompt = anthropic_system or system_prompt
244
229
 
245
- self._log_api_call("request", request_data)
230
+ # Log request
231
+ request_data = {
232
+ "messages": filtered_messages,
233
+ "max_tokens": self.max_tokens,
234
+ "system": final_system_prompt,
235
+ }
246
236
 
247
- # Make API call with appropriate parameters
248
- if self.client is None:
249
- raise RuntimeError("Client not initialized. Call initialize_client() first.")
250
-
251
- # Check if the method is async by inspecting the client implementation
252
- run_method = self.client.run_interleaved
253
- is_async = asyncio.iscoroutinefunction(run_method)
254
-
255
- if is_async:
256
- # For async implementations (AnthropicClient)
257
- if self.provider == LLMProvider.ANTHROPIC:
258
- response = await run_method(
259
- messages=filtered_messages,
260
- system=self._get_system_prompt(),
261
- max_tokens=self.max_tokens,
262
- )
263
- else:
264
- response = await run_method(
265
- messages=messages,
266
- system=system_prompt,
267
- max_tokens=self.max_tokens,
268
- )
237
+ self._log_api_call("request", request_data)
238
+
239
+ # Make API call
240
+ response = await self.client.run_interleaved(
241
+ messages=filtered_messages,
242
+ system=final_system_prompt,
243
+ max_tokens=self.max_tokens,
244
+ )
269
245
  else:
270
- # For non-async implementations (GroqClient, etc.)
271
- if self.provider == LLMProvider.ANTHROPIC:
272
- response = run_method(
273
- messages=filtered_messages,
274
- system=self._get_system_prompt(),
275
- max_tokens=self.max_tokens,
276
- )
277
- else:
278
- response = run_method(
279
- messages=messages,
280
- system=system_prompt,
281
- max_tokens=self.max_tokens,
282
- )
246
+ # For OpenAI and others, use standard format directly
247
+ # Log request
248
+ request_data = {
249
+ "messages": prepared_messages,
250
+ "max_tokens": self.max_tokens,
251
+ "system": system_prompt,
252
+ }
253
+
254
+ self._log_api_call("request", request_data)
255
+
256
+ # Make API call
257
+ response = await self.client.run_interleaved(
258
+ messages=prepared_messages,
259
+ system=system_prompt,
260
+ max_tokens=self.max_tokens,
261
+ )
283
262
 
284
263
  # Log success response
285
264
  self._log_api_call("response", request_data, response)
@@ -327,6 +306,10 @@ class OmniLoop(BaseLoop):
327
306
  logger.error(error_message)
328
307
  raise RuntimeError(error_message)
329
308
 
309
+ ###########################################
310
+ # RESPONSE AND ACTION HANDLING
311
+ ###########################################
312
+
330
313
  async def _handle_response(
331
314
  self, response: Any, messages: List[Dict[str, Any]], parsed_screen: ParseResult
332
315
  ) -> Tuple[bool, bool]:
@@ -341,194 +324,151 @@ class OmniLoop(BaseLoop):
341
324
  Tuple of (should_continue, action_screenshot_saved)
342
325
  """
343
326
  action_screenshot_saved = False
327
+
328
+ # Helper function to safely add assistant messages using the message manager
329
+ def add_assistant_message(content):
330
+ if isinstance(content, str):
331
+ # Convert string to proper format
332
+ formatted_content = [{"type": "text", "text": content}]
333
+ self.message_manager.add_assistant_message(formatted_content)
334
+ logger.info("Added formatted text assistant message")
335
+ elif isinstance(content, list):
336
+ # Already in proper format
337
+ self.message_manager.add_assistant_message(content)
338
+ logger.info("Added structured assistant message")
339
+ else:
340
+ # Default case - convert to string
341
+ formatted_content = [{"type": "text", "text": str(content)}]
342
+ self.message_manager.add_assistant_message(formatted_content)
343
+ logger.info("Added converted assistant message")
344
+
344
345
  try:
345
- # Handle Anthropic response format
346
+ # Step 1: Normalize response to standard format based on provider
347
+ standard_content = []
348
+ raw_text = None
349
+
350
+ # Convert response to standardized content based on provider
346
351
  if self.provider == LLMProvider.ANTHROPIC:
347
352
  if hasattr(response, "content") and isinstance(response.content, list):
348
- # Extract text from content blocks
353
+ # Convert Anthropic response to standard format
349
354
  for block in response.content:
350
- if hasattr(block, "type") and block.type == "text":
351
- content = block.text
352
-
353
- # Try to find JSON in the content
354
- try:
355
- # First look for JSON block
356
- json_content = extract_data(content, "json")
357
- parsed_content = json.loads(json_content)
358
- logger.info("Successfully parsed JSON from code block")
359
- except (json.JSONDecodeError, IndexError):
360
- # If no JSON block, try to find JSON object in the text
361
- try:
362
- # Look for JSON object pattern
363
- json_pattern = r"\{[^}]+\}"
364
- json_match = re.search(json_pattern, content)
365
- if json_match:
366
- json_str = json_match.group(0)
367
- parsed_content = json.loads(json_str)
368
- logger.info("Successfully parsed JSON from text")
369
- else:
370
- logger.error(f"No JSON found in content: {content}")
371
- continue
372
- except json.JSONDecodeError as e:
373
- logger.error(f"Failed to parse JSON from text: {str(e)}")
374
- continue
375
-
376
- # Clean up Box ID format
377
- if "Box ID" in parsed_content and isinstance(
378
- parsed_content["Box ID"], str
379
- ):
380
- parsed_content["Box ID"] = parsed_content["Box ID"].replace(
381
- "Box #", ""
382
- )
383
-
384
- # Add any explanatory text as reasoning if not present
385
- if "Explanation" not in parsed_content:
386
- # Extract any text before the JSON as reasoning
387
- text_before_json = content.split("{")[0].strip()
388
- if text_before_json:
389
- parsed_content["Explanation"] = text_before_json
390
-
391
- # Log the parsed content for debugging
392
- logger.info(f"Parsed content: {json.dumps(parsed_content, indent=2)}")
393
-
394
- # Add response to messages
395
- messages.append(
396
- {"role": "assistant", "content": json.dumps(parsed_content)}
397
- )
398
-
399
- try:
400
- # Execute action with current parsed screen info
401
- await self._execute_action(
402
- parsed_content, cast(ParseResult, parsed_screen)
403
- )
404
- action_screenshot_saved = True
405
- except Exception as e:
406
- logger.error(f"Error executing action: {str(e)}")
407
- # Add error message to conversation
408
- messages.append(
409
- {
410
- "role": "assistant",
411
- "content": f"Error executing action: {str(e)}",
412
- "metadata": {"title": "❌ Error"},
413
- }
414
- )
415
- return False, action_screenshot_saved
416
-
417
- # Check if task is complete
418
- if parsed_content.get("Action") == "None":
419
- return False, action_screenshot_saved
420
- return True, action_screenshot_saved
421
-
422
- logger.warning("No text block found in Anthropic response")
355
+ if hasattr(block, "type"):
356
+ if block.type == "text":
357
+ standard_content.append({"type": "text", "text": block.text})
358
+ # Store raw text for JSON parsing
359
+ if raw_text is None:
360
+ raw_text = block.text
361
+ else:
362
+ raw_text += "\n" + block.text
363
+ else:
364
+ # Add other block types
365
+ block_dict = {}
366
+ for key, value in vars(block).items():
367
+ if not key.startswith("_"):
368
+ block_dict[key] = value
369
+ standard_content.append(block_dict)
370
+ else:
371
+ logger.warning("Invalid Anthropic response format")
423
372
  return True, action_screenshot_saved
424
-
425
- # Handle other providers' response formats
426
- if isinstance(response, dict) and "choices" in response:
427
- content = response["choices"][0]["message"]["content"]
428
373
  else:
429
- content = response
374
+ # Assume OpenAI or compatible format
375
+ try:
376
+ raw_text = response["choices"][0]["message"]["content"]
377
+ standard_content = [{"type": "text", "text": raw_text}]
378
+ except (KeyError, TypeError, IndexError) as e:
379
+ logger.error(f"Invalid response format: {str(e)}")
380
+ return True, action_screenshot_saved
430
381
 
431
- # Parse JSON content
432
- if isinstance(content, str):
382
+ # Step 2: Add the normalized response to message history
383
+ add_assistant_message(standard_content)
384
+
385
+ # Step 3: Extract JSON from the content for action execution
386
+ parsed_content = None
387
+
388
+ # If we have raw text, try to extract JSON from it
389
+ if raw_text:
390
+ # Try different approaches to extract JSON
433
391
  try:
434
392
  # First try to parse the whole content as JSON
435
- parsed_content = json.loads(content)
393
+ parsed_content = json.loads(raw_text)
394
+ logger.info("Successfully parsed whole content as JSON")
436
395
  except json.JSONDecodeError:
437
396
  try:
438
397
  # Try to find JSON block
439
- json_content = extract_data(content, "json")
398
+ json_content = extract_data(raw_text, "json")
440
399
  parsed_content = json.loads(json_content)
400
+ logger.info("Successfully parsed JSON from code block")
441
401
  except (json.JSONDecodeError, IndexError):
442
402
  try:
443
403
  # Look for JSON object pattern
444
404
  json_pattern = r"\{[^}]+\}"
445
- json_match = re.search(json_pattern, content)
405
+ json_match = re.search(json_pattern, raw_text)
446
406
  if json_match:
447
407
  json_str = json_match.group(0)
448
408
  parsed_content = json.loads(json_str)
409
+ logger.info("Successfully parsed JSON from text")
449
410
  else:
450
- logger.error(f"No JSON found in content: {content}")
411
+ logger.error(f"No JSON found in content")
451
412
  return True, action_screenshot_saved
452
413
  except json.JSONDecodeError as e:
453
414
  logger.error(f"Failed to parse JSON from text: {str(e)}")
454
415
  return True, action_screenshot_saved
455
416
 
417
+ # Step 4: Process the parsed content if available
418
+ if parsed_content:
456
419
  # Clean up Box ID format
457
420
  if "Box ID" in parsed_content and isinstance(parsed_content["Box ID"], str):
458
421
  parsed_content["Box ID"] = parsed_content["Box ID"].replace("Box #", "")
459
422
 
460
423
  # Add any explanatory text as reasoning if not present
461
- if "Explanation" not in parsed_content:
424
+ if "Explanation" not in parsed_content and raw_text:
462
425
  # Extract any text before the JSON as reasoning
463
- text_before_json = content.split("{")[0].strip()
426
+ text_before_json = raw_text.split("{")[0].strip()
464
427
  if text_before_json:
465
428
  parsed_content["Explanation"] = text_before_json
466
429
 
467
- # Add response to messages with stringified content
468
- messages.append({"role": "assistant", "content": json.dumps(parsed_content)})
430
+ # Log the parsed content for debugging
431
+ logger.info(f"Parsed content: {json.dumps(parsed_content, indent=2)}")
469
432
 
433
+ # Step 5: Execute the action
470
434
  try:
471
- # Execute action with current parsed screen info
472
- await self._execute_action(parsed_content, cast(ParseResult, parsed_screen))
473
- action_screenshot_saved = True
474
- except Exception as e:
475
- logger.error(f"Error executing action: {str(e)}")
476
- # Add error message to conversation
477
- messages.append(
478
- {
479
- "role": "assistant",
480
- "content": f"Error executing action: {str(e)}",
481
- "metadata": {"title": "❌ Error"},
482
- }
435
+ # Execute action using the common helper method
436
+ should_continue, action_screenshot_saved = (
437
+ await self._execute_action_with_tools(
438
+ parsed_content, cast(ParseResult, parsed_screen)
439
+ )
483
440
  )
484
- return False, action_screenshot_saved
485
-
486
- # Check if task is complete
487
- if parsed_content.get("Action") == "None":
488
- return False, action_screenshot_saved
489
-
490
- return True, action_screenshot_saved
491
- elif isinstance(content, dict):
492
- # Handle case where content is already a dictionary
493
- messages.append({"role": "assistant", "content": json.dumps(content)})
494
441
 
495
- try:
496
- # Execute action with current parsed screen info
497
- await self._execute_action(content, cast(ParseResult, parsed_screen))
498
- action_screenshot_saved = True
442
+ # Check if task is complete
443
+ if parsed_content.get("Action") == "None":
444
+ return False, action_screenshot_saved
445
+ return should_continue, action_screenshot_saved
499
446
  except Exception as e:
500
447
  logger.error(f"Error executing action: {str(e)}")
501
- # Add error message to conversation
502
- messages.append(
503
- {
504
- "role": "assistant",
505
- "content": f"Error executing action: {str(e)}",
506
- "metadata": {"title": "❌ Error"},
507
- }
508
- )
448
+ # Update the last assistant message with error
449
+ error_message = [{"type": "text", "text": f"Error executing action: {str(e)}"}]
450
+ # Replace the last assistant message with the error
451
+ self.message_manager.add_assistant_message(error_message)
509
452
  return False, action_screenshot_saved
510
453
 
511
- # Check if task is complete
512
- if content.get("Action") == "None":
513
- return False, action_screenshot_saved
514
-
515
- return True, action_screenshot_saved
516
-
517
454
  return True, action_screenshot_saved
518
455
 
519
456
  except Exception as e:
520
457
  logger.error(f"Error handling response: {str(e)}")
521
- messages.append(
522
- {
523
- "role": "assistant",
524
- "content": f"Error: {str(e)}",
525
- "metadata": {"title": "❌ Error"},
526
- }
527
- )
458
+ # Add error message using the message manager
459
+ error_message = [{"type": "text", "text": f"Error: {str(e)}"}]
460
+ self.message_manager.add_assistant_message(error_message)
528
461
  raise
529
462
 
463
+ ###########################################
464
+ # SCREEN PARSING - IMPLEMENTING ABSTRACT METHOD
465
+ ###########################################
466
+
530
467
  async def _get_parsed_screen_som(self, save_screenshot: bool = True) -> ParseResult:
531
- """Get parsed screen information with SOM.
468
+ """Get parsed screen information with Screen Object Model.
469
+
470
+ Extends the base class method to use the OmniParser to parse the screen
471
+ and extract UI elements.
532
472
 
533
473
  Args:
534
474
  save_screenshot: Whether to save the screenshot (set to False when screenshots will be saved elsewhere)
@@ -563,337 +503,26 @@ class OmniLoop(BaseLoop):
563
503
  logger.error(f"Error getting parsed screen: {str(e)}")
564
504
  raise
565
505
 
566
- async def _process_screen(
567
- self, parsed_screen: ParseResult, messages: List[Dict[str, Any]]
568
- ) -> None:
569
- """Process and add screen info to messages."""
570
- try:
571
- # Only add message if we have an image and provider supports it
572
- if self.provider in [LLMProvider.OPENAI, LLMProvider.ANTHROPIC]:
573
- image = parsed_screen.annotated_image_base64 or None
574
- if image:
575
- # Save screen info to current turn directory
576
- if self.current_turn_dir:
577
- # Save elements as JSON
578
- elements_path = os.path.join(self.current_turn_dir, "elements.json")
579
- with open(elements_path, "w") as f:
580
- # Convert elements to dicts for JSON serialization
581
- elements_json = [elem.model_dump() for elem in parsed_screen.elements]
582
- json.dump(elements_json, f, indent=2)
583
- logger.info(f"Saved elements to {elements_path}")
584
-
585
- # Format the image content based on the provider
586
- if self.provider == LLMProvider.ANTHROPIC:
587
- # Compress the image before sending to Anthropic (5MB limit)
588
- image_size = len(image)
589
- logger.info(f"Image base64 is present, length: {image_size}")
590
-
591
- # Anthropic has a 5MB limit - check against base64 string length
592
- # which is what matters for the API call payload
593
- # Use slightly smaller limit (4.9MB) to account for request overhead
594
- max_size = int(4.9 * 1024 * 1024) # 4.9MB
595
-
596
- # Default media type (will be overridden if compression is needed)
597
- media_type = "image/png"
598
-
599
- # Check if the image already has a media type prefix
600
- if image.startswith("data:"):
601
- parts = image.split(",", 1)
602
- if len(parts) == 2 and "image/jpeg" in parts[0].lower():
603
- media_type = "image/jpeg"
604
- elif len(parts) == 2 and "image/png" in parts[0].lower():
605
- media_type = "image/png"
606
-
607
- if image_size > max_size:
608
- logger.info(
609
- f"Image size ({image_size} bytes) exceeds Anthropic limit ({max_size} bytes), compressing..."
610
- )
611
- image, media_type = compress_image_base64(image, max_size)
612
- logger.info(
613
- f"Image compressed to {len(image)} bytes with media_type {media_type}"
614
- )
615
-
616
- # Anthropic uses "type": "image"
617
- screen_info_msg = {
618
- "role": "user",
619
- "content": [
620
- {
621
- "type": "image",
622
- "source": {
623
- "type": "base64",
624
- "media_type": media_type,
625
- "data": image,
626
- },
627
- }
628
- ],
629
- }
630
- else:
631
- # OpenAI and others use "type": "image_url"
632
- screen_info_msg = {
633
- "role": "user",
634
- "content": [
635
- {
636
- "type": "image_url",
637
- "image_url": {"url": f"data:image/png;base64,{image}"},
638
- }
639
- ],
640
- }
641
- messages.append(screen_info_msg)
642
-
643
- except Exception as e:
644
- logger.error(f"Error processing screen info: {str(e)}")
645
- raise
646
-
647
506
  def _get_system_prompt(self) -> str:
648
507
  """Get the system prompt for the model."""
649
508
  return SYSTEM_PROMPT
650
509
 
651
- async def _execute_action(self, content: Dict[str, Any], parsed_screen: ParseResult) -> None:
652
- """Execute the action specified in the content using the tool manager.
653
-
654
- Args:
655
- content: Dictionary containing the action details
656
- parsed_screen: Current parsed screen information
657
- """
658
- try:
659
- action = content.get("Action", "").lower()
660
- if not action:
661
- return
662
-
663
- # Track if we saved an action-specific screenshot
664
- action_screenshot_saved = False
665
-
666
- try:
667
- # Prepare kwargs based on action type
668
- kwargs = {}
669
-
670
- if action in ["left_click", "right_click", "double_click", "move_cursor"]:
671
- try:
672
- box_id = int(content["Box ID"])
673
- logger.info(f"Processing Box ID: {box_id}")
674
-
675
- # Calculate click coordinates
676
- x, y = await self._calculate_click_coordinates(box_id, parsed_screen)
677
- logger.info(f"Calculated coordinates: x={x}, y={y}")
678
-
679
- kwargs["x"] = x
680
- kwargs["y"] = y
681
-
682
- # Visualize action if screenshot is available
683
- if parsed_screen.annotated_image_base64:
684
- img_data = parsed_screen.annotated_image_base64
685
- # Remove data URL prefix if present
686
- if img_data.startswith("data:image"):
687
- img_data = img_data.split(",")[1]
688
- # Only save visualization for coordinate-based actions
689
- self._visualize_action(x, y, img_data)
690
- action_screenshot_saved = True
691
-
692
- except ValueError as e:
693
- logger.error(f"Error processing Box ID: {str(e)}")
694
- return
695
-
696
- elif action == "drag_to":
697
- try:
698
- box_id = int(content["Box ID"])
699
- x, y = await self._calculate_click_coordinates(box_id, parsed_screen)
700
- kwargs.update(
701
- {
702
- "x": x,
703
- "y": y,
704
- "button": content.get("button", "left"),
705
- "duration": float(content.get("duration", 0.5)),
706
- }
707
- )
708
-
709
- # Visualize drag destination if screenshot is available
710
- if parsed_screen.annotated_image_base64:
711
- img_data = parsed_screen.annotated_image_base64
712
- # Remove data URL prefix if present
713
- if img_data.startswith("data:image"):
714
- img_data = img_data.split(",")[1]
715
- # Only save visualization for coordinate-based actions
716
- self._visualize_action(x, y, img_data)
717
- action_screenshot_saved = True
718
-
719
- except ValueError as e:
720
- logger.error(f"Error processing drag coordinates: {str(e)}")
721
- return
722
-
723
- elif action == "type_text":
724
- kwargs["text"] = content["Value"]
725
- # For type_text, store the value in the action type
726
- action_type = f"type_{content['Value'][:20]}" # Truncate if too long
727
- elif action == "press_key":
728
- kwargs["key"] = content["Value"]
729
- action_type = f"press_{content['Value']}"
730
- elif action == "hotkey":
731
- if isinstance(content.get("Value"), list):
732
- keys = content["Value"]
733
- action_type = f"hotkey_{'_'.join(keys)}"
734
- else:
735
- # Simply split string format like "command+space" into a list
736
- keys = [k.strip() for k in content["Value"].lower().split("+")]
737
- action_type = f"hotkey_{content['Value'].replace('+', '_')}"
738
- logger.info(f"Preparing hotkey with keys: {keys}")
739
- # Get the method but call it with *args instead of **kwargs
740
- method = getattr(self.computer.interface, action)
741
- await method(*keys) # Unpack the keys list as positional arguments
742
- logger.info(f"Tool execution completed successfully: {action}")
743
-
744
- # For hotkeys, take a screenshot after the action
745
- try:
746
- # Get a new screenshot after the action and save it with the action type
747
- new_parsed_screen = await self._get_parsed_screen_som(save_screenshot=False)
748
- if new_parsed_screen and new_parsed_screen.annotated_image_base64:
749
- img_data = new_parsed_screen.annotated_image_base64
750
- # Remove data URL prefix if present
751
- if img_data.startswith("data:image"):
752
- img_data = img_data.split(",")[1]
753
- # Save with action type to indicate this is a post-action screenshot
754
- self._save_screenshot(img_data, action_type=action_type)
755
- action_screenshot_saved = True
756
- except Exception as screenshot_error:
757
- logger.error(
758
- f"Error taking post-hotkey screenshot: {str(screenshot_error)}"
759
- )
760
-
761
- return
762
-
763
- elif action in ["scroll_down", "scroll_up"]:
764
- clicks = int(content.get("amount", 1))
765
- kwargs["clicks"] = clicks
766
- action_type = f"scroll_{action.split('_')[1]}_{clicks}"
767
-
768
- # Visualize scrolling if screenshot is available
769
- if parsed_screen.annotated_image_base64:
770
- img_data = parsed_screen.annotated_image_base64
771
- # Remove data URL prefix if present
772
- if img_data.startswith("data:image"):
773
- img_data = img_data.split(",")[1]
774
- direction = "down" if action == "scroll_down" else "up"
775
- # For scrolling, we only save the visualization to avoid duplicate images
776
- self._visualize_scroll(direction, clicks, img_data)
777
- action_screenshot_saved = True
778
-
779
- else:
780
- logger.warning(f"Unknown action: {action}")
781
- return
510
+ ###########################################
511
+ # MAIN LOOP - IMPLEMENTING ABSTRACT METHOD
512
+ ###########################################
782
513
 
783
- # Execute tool and handle result
784
- try:
785
- method = getattr(self.computer.interface, action)
786
- logger.info(f"Found method for action '{action}': {method}")
787
- await method(**kwargs)
788
- logger.info(f"Tool execution completed successfully: {action}")
789
-
790
- # For non-coordinate based actions that don't already have visualizations,
791
- # take a new screenshot after the action
792
- if not action_screenshot_saved:
793
- # Take a new screenshot
794
- try:
795
- # Get a new screenshot after the action and save it with the action type
796
- new_parsed_screen = await self._get_parsed_screen_som(
797
- save_screenshot=False
798
- )
799
- if new_parsed_screen and new_parsed_screen.annotated_image_base64:
800
- img_data = new_parsed_screen.annotated_image_base64
801
- # Remove data URL prefix if present
802
- if img_data.startswith("data:image"):
803
- img_data = img_data.split(",")[1]
804
- # Save with action type to indicate this is a post-action screenshot
805
- if "action_type" in locals():
806
- self._save_screenshot(img_data, action_type=action_type)
807
- else:
808
- self._save_screenshot(img_data, action_type=action)
809
- # Update the action screenshot flag for this turn
810
- action_screenshot_saved = True
811
- except Exception as screenshot_error:
812
- logger.error(
813
- f"Error taking post-action screenshot: {str(screenshot_error)}"
814
- )
815
-
816
- except AttributeError as e:
817
- logger.error(f"Method not found for action '{action}': {str(e)}")
818
- return
819
- except Exception as tool_error:
820
- logger.error(f"Tool execution failed: {str(tool_error)}")
821
- return
822
-
823
- except Exception as e:
824
- logger.error(f"Error executing action {action}: {str(e)}")
825
- return
826
-
827
- except Exception as e:
828
- logger.error(f"Error in _execute_action: {str(e)}")
829
- return
830
-
831
- async def _calculate_click_coordinates(
832
- self, box_id: int, parsed_screen: ParseResult
833
- ) -> Tuple[int, int]:
834
- """Calculate click coordinates based on box ID.
835
-
836
- Args:
837
- box_id: The ID of the box to click
838
- parsed_screen: The parsed screen information
839
-
840
- Returns:
841
- Tuple of (x, y) coordinates
842
-
843
- Raises:
844
- ValueError: If box_id is invalid or missing from parsed screen
845
- """
846
- # First try to use structured elements data
847
- logger.info(f"Elements count: {len(parsed_screen.elements)}")
848
-
849
- # Try to find element with matching ID
850
- for element in parsed_screen.elements:
851
- if element.id == box_id:
852
- logger.info(f"Found element with ID {box_id}: {element}")
853
- bbox = element.bbox
854
-
855
- # Get screen dimensions from the metadata if available, or fallback
856
- width = parsed_screen.metadata.width if parsed_screen.metadata else 1920
857
- height = parsed_screen.metadata.height if parsed_screen.metadata else 1080
858
- logger.info(f"Screen dimensions: width={width}, height={height}")
859
-
860
- # Calculate center of the box in pixels
861
- center_x = int((bbox.x1 + bbox.x2) / 2 * width)
862
- center_y = int((bbox.y1 + bbox.y2) / 2 * height)
863
- logger.info(f"Calculated center: ({center_x}, {center_y})")
864
-
865
- # Validate coordinates - if they're (0,0) or unreasonably small,
866
- # use a default position in the center of the screen
867
- if center_x == 0 and center_y == 0:
868
- logger.warning("Got (0,0) coordinates, using fallback position")
869
- center_x = width // 2
870
- center_y = height // 2
871
- logger.info(f"Using fallback center: ({center_x}, {center_y})")
872
-
873
- return center_x, center_y
874
-
875
- # If we couldn't find the box, use center of screen
876
- logger.error(
877
- f"Box ID {box_id} not found in structured elements (count={len(parsed_screen.elements)})"
878
- )
879
-
880
- # Use center of screen as fallback
881
- width = parsed_screen.metadata.width if parsed_screen.metadata else 1920
882
- height = parsed_screen.metadata.height if parsed_screen.metadata else 1080
883
- logger.warning(f"Using fallback position in center of screen ({width//2}, {height//2})")
884
- return width // 2, height // 2
885
-
886
- async def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[Dict[str, Any], None]:
514
+ async def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[AgentResponse, None]:
887
515
  """Run the agent loop with provided messages.
888
516
 
889
517
  Args:
890
- messages: List of message objects
518
+ messages: List of messages in standard OpenAI format
891
519
 
892
520
  Yields:
893
- Dict containing response data
521
+ Agent response format
894
522
  """
895
- # Keep track of conversation history
896
- conversation_history = messages.copy()
523
+ # Initialize the message manager with the provided messages
524
+ self.message_manager.messages = messages.copy()
525
+ logger.info(f"Starting OmniLoop run with {len(self.message_manager.messages)} messages")
897
526
 
898
527
  # Continue running until explicitly told to stop
899
528
  running = True
@@ -922,26 +551,66 @@ class OmniLoop(BaseLoop):
922
551
  # Get up-to-date screen information
923
552
  parsed_screen = await self._get_parsed_screen_som()
924
553
 
925
- # Process screen info and update messages
926
- await self._process_screen(parsed_screen, conversation_history)
554
+ # Process screen info and update messages in standard format
555
+ try:
556
+ # Get image from parsed screen
557
+ image = parsed_screen.annotated_image_base64 or None
558
+ if image:
559
+ # Save elements as JSON if we have a turn directory
560
+ if self.current_turn_dir and hasattr(parsed_screen, "elements"):
561
+ elements_path = os.path.join(self.current_turn_dir, "elements.json")
562
+ with open(elements_path, "w") as f:
563
+ # Convert elements to dicts for JSON serialization
564
+ elements_json = [
565
+ elem.model_dump() for elem in parsed_screen.elements
566
+ ]
567
+ json.dump(elements_json, f, indent=2)
568
+ logger.info(f"Saved elements to {elements_path}")
569
+
570
+ # Remove data URL prefix if present
571
+ if "," in image:
572
+ image = image.split(",")[1]
573
+
574
+ # Add screenshot to message history using message manager
575
+ self.message_manager.add_user_message(
576
+ [
577
+ {
578
+ "type": "image_url",
579
+ "image_url": {"url": f"data:image/png;base64,{image}"},
580
+ }
581
+ ]
582
+ )
583
+ logger.info("Added screenshot to message history")
584
+ except Exception as e:
585
+ logger.error(f"Error processing screen info: {str(e)}")
586
+ raise
927
587
 
928
588
  # Get system prompt
929
589
  system_prompt = self._get_system_prompt()
930
590
 
931
- # Make API call with retries
932
- response = await self._make_api_call(conversation_history, system_prompt)
591
+ # Make API call with retries using the APIHandler
592
+ response = await self.api_handler.make_api_call(
593
+ self.message_manager.messages, system_prompt
594
+ )
933
595
 
934
596
  # Handle the response (may execute actions)
935
597
  # Returns: (should_continue, action_screenshot_saved)
936
598
  should_continue, new_screenshot_saved = await self._handle_response(
937
- response, conversation_history, parsed_screen
599
+ response, self.message_manager.messages, parsed_screen
938
600
  )
939
601
 
940
602
  # Update whether an action screenshot was saved this turn
941
603
  action_screenshot_saved = action_screenshot_saved or new_screenshot_saved
942
604
 
605
+ # Create OpenAI-compatible response format using utility function
606
+ openai_compatible_response = await to_openai_agent_response_format(
607
+ response=response,
608
+ messages=self.message_manager.messages,
609
+ model=self.model,
610
+ )
611
+
943
612
  # Yield the response to the caller
944
- yield {"response": response}
613
+ yield openai_compatible_response
945
614
 
946
615
  # Check if we should continue this conversation
947
616
  running = should_continue
@@ -969,3 +638,218 @@ class OmniLoop(BaseLoop):
969
638
 
970
639
  # Create a brief delay before retrying
971
640
  await asyncio.sleep(1)
641
+
642
async def process_model_response(self, response_text: str) -> Optional[Dict[str, Any]]:
    """Extract a ``function_call`` from the model response and execute it.

    Only the ``"function_call": {...}`` form is extracted. A response that
    merely mentions ``tool_use`` without a ``function_call`` object yields
    ``None``.

    Args:
        response_text: Model response text

    Returns:
        ``{"tool_result": result}`` when a tool call was found and run (the
        result carries the error text if execution raised), or None if no
        tool call was found or it could not be parsed
    """
    # Function-scope imports, as in the original implementation
    import json
    import re

    try:
        # Ensure tools are initialized before use
        await self._ensure_tools_initialized()

        tool_info = None
        # Find the position of the opening brace of the function_call object
        marker = re.search(r'"function_call"\s*:\s*(?=\{)', response_text)
        if marker:
            try:
                # raw_decode parses exactly one complete JSON value starting
                # at the given index, so nested objects and braces inside
                # string values are handled. A "[^}]+" pattern would stop at
                # the first "}" and truncate any nested "arguments" object.
                tool_info, _ = json.JSONDecoder().raw_decode(response_text, marker.end())

                arguments = tool_info.get("arguments", {})
                if isinstance(arguments, str):
                    # Arguments may arrive as a JSON-encoded string; decode
                    # them when they encode an object, otherwise forward the
                    # raw value unchanged
                    try:
                        decoded = json.loads(arguments)
                    except ValueError:
                        decoded = None
                    if isinstance(decoded, dict):
                        tool_info["arguments"] = decoded
            except Exception as e:
                logger.error(f"Error extracting function call: {str(e)}")
                tool_info = None

        if tool_info:
            try:
                # Execute the tool
                result = await self.tool_manager.execute_tool(
                    name=tool_info.get("name"), tool_input=tool_info.get("arguments", {})
                )
                return {"tool_result": result}
            except Exception as e:
                error_msg = (
                    f"Error executing tool '{tool_info.get('name', 'unknown')}': {str(e)}"
                )
                logger.error(error_msg)
                # Report the failure through the same result channel as success
                return {"tool_result": ToolResult(error=error_msg)}
    except Exception as e:
        logger.error(f"Error processing tool call: {str(e)}")

    return None
693
+
694
async def process_response_with_tools(
    self, response_text: str, parsed_screen: Optional[ParseResult] = None
) -> Tuple[bool, str]:
    """Run any tool call found in the model response and report the outcome.

    Args:
        response_text: Model response text
        parsed_screen: Current parsed screen information (optional; not
            read by this method)

    Returns:
        Tuple of (action_taken, observation)
    """
    logger.info("Processing response with tools")

    # Delegate extraction and execution of the tool call
    outcome = await self.process_model_response(response_text)

    # Guard clause: no tool call was detected or executed
    if not outcome or "tool_result" not in outcome:
        return False, "No action taken - no tool call detected in response"

    executed = outcome["tool_result"]
    if executed.error:
        return False, f"ERROR: {executed.error}"
    return True, executed.output or "Tool executed successfully"
721
+
722
+ ###########################################
723
+ # UTILITY METHODS
724
+ ###########################################
725
+
726
async def _ensure_tools_initialized(self) -> None:
    """Initialize the tool manager if it has no ``tools`` attribute or it is None."""
    # A missing attribute and an explicit None both mean "not initialized yet"
    if getattr(self.tool_manager, "tools", None) is not None:
        return

    logger.info("Tools not initialized. Initializing now...")
    await self.tool_manager.initialize()
    logger.info("Tools initialized successfully.")
732
+
733
async def _execute_action_with_tools(
    self, action_data: Dict[str, Any], parsed_screen: ParseResult
) -> Tuple[bool, bool]:
    """Execute an action using the tools-based approach.

    Translates the parsed model output (``Action``, ``Box ID``, ``Value``,
    ``amount``) into arguments for the ``computer`` tool, saves a
    visualization or a post-action screenshot, and runs the tool.

    Args:
        action_data: Dictionary containing action details
        parsed_screen: Current parsed screen information

    Returns:
        Tuple of (should_continue, action_screenshot_saved). should_continue
        is False when there is no action, the action is "none", the Box ID
        cannot be processed, or an exception occurs.
    """

    def _strip_data_url(image_b64: str) -> str:
        # Remove data URL prefix if present
        return image_b64.split(",")[1] if image_b64.startswith("data:image") else image_b64

    action_screenshot_saved = False
    action_type = None  # Initialize for possible use in post-action screenshot

    try:
        # Extract the action
        parsed_action = action_data.get("Action", "").lower()

        # Only process if we have a valid action
        if not parsed_action or parsed_action == "none":
            return False, action_screenshot_saved

        # Convert the parsed content to a format suitable for the tools system
        tool_name = "computer"  # Default to computer tool
        tool_args = {"action": parsed_action}

        # Add specific arguments based on action type
        if parsed_action in ["left_click", "right_click", "double_click", "move_cursor"]:
            # Calculate coordinates from Box ID using parser
            try:
                box_id = int(action_data["Box ID"])
                x, y = await self.parser.calculate_click_coordinates(box_id, parsed_screen)
                tool_args["x"] = x
                tool_args["y"] = y

                # Visualize action if screenshot is available
                if parsed_screen and parsed_screen.annotated_image_base64:
                    img_data = _strip_data_url(parsed_screen.annotated_image_base64)
                    # Save visualization for coordinate-based actions
                    self.viz_helper.visualize_action(x, y, img_data)
                    action_screenshot_saved = True

            except (ValueError, KeyError) as e:
                logger.error(f"Error processing Box ID: {str(e)}")
                return False, action_screenshot_saved

        elif parsed_action == "type_text":
            tool_args["text"] = action_data.get("Value", "")
            # For type_text, store the value in the action type for screenshot naming
            action_type = f"type_{tool_args['text'][:20]}"  # Truncate if too long

        elif parsed_action == "press_key":
            tool_args["key"] = action_data.get("Value", "")
            action_type = f"press_{tool_args['key']}"

        elif parsed_action == "hotkey":
            value = action_data.get("Value", "")
            if isinstance(value, list):
                tool_args["keys"] = value
                action_type = f"hotkey_{'_'.join(value)}"
            else:
                # Split string format like "command+space" into a list
                keys = [k.strip() for k in value.lower().split("+")]
                tool_args["keys"] = keys
                action_type = f"hotkey_{value.replace('+', '_')}"

        elif parsed_action in ["scroll_down", "scroll_up"]:
            clicks = int(action_data.get("amount", 1))
            tool_args["amount"] = clicks
            action_type = f"scroll_{parsed_action.split('_')[1]}_{clicks}"

            # Visualize scrolling if screenshot is available
            if parsed_screen and parsed_screen.annotated_image_base64:
                img_data = _strip_data_url(parsed_screen.annotated_image_base64)
                direction = "down" if parsed_action == "scroll_down" else "up"
                # For scrolling, we save the visualization
                self.viz_helper.visualize_scroll(direction, clicks, img_data)
                action_screenshot_saved = True

        # Ensure tools are initialized before use
        await self._ensure_tools_initialized()

        # Execute tool with prepared arguments
        result = await self.tool_manager.execute_tool(name=tool_name, tool_input=tool_args)

        # process_response_with_tools treats ``result.error`` as the failure
        # signal; log it here as well so a failed action is not silently
        # discarded. The loop still continues, as before.
        tool_error = getattr(result, "error", None)
        if tool_error:
            logger.error(f"Tool '{tool_name}' reported an error for '{parsed_action}': {tool_error}")

        # Take a new screenshot after the action if we haven't already saved one
        if not action_screenshot_saved:
            try:
                # Get a new screenshot after the action
                new_parsed_screen = await self._get_parsed_screen_som(save_screenshot=False)
                if new_parsed_screen and new_parsed_screen.annotated_image_base64:
                    img_data = _strip_data_url(new_parsed_screen.annotated_image_base64)
                    # Save with action type if defined, otherwise use the action name
                    self._save_screenshot(img_data, action_type=action_type or parsed_action)
                    action_screenshot_saved = True
            except Exception as screenshot_error:
                # Best effort: a failed screenshot must not abort the action
                logger.error(f"Error taking post-action screenshot: {str(screenshot_error)}")

        # Continue the loop if the action is not "None"
        return True, action_screenshot_saved

    except Exception as e:
        logger.error(f"Error executing action: {str(e)}")
        # Append an assistant message describing the error to the history
        error_message = [{"type": "text", "text": f"Error executing action: {str(e)}"}]
        self.message_manager.add_assistant_message(error_message)
        return False, action_screenshot_saved