cua-agent 0.3.2__py3-none-any.whl → 0.4.0b2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (111) hide show
  1. agent/__init__.py +15 -51
  2. agent/__main__.py +21 -0
  3. agent/adapters/__init__.py +9 -0
  4. agent/adapters/huggingfacelocal_adapter.py +229 -0
  5. agent/agent.py +577 -0
  6. agent/callbacks/__init__.py +17 -0
  7. agent/callbacks/base.py +153 -0
  8. agent/callbacks/budget_manager.py +44 -0
  9. agent/callbacks/image_retention.py +139 -0
  10. agent/callbacks/logging.py +247 -0
  11. agent/callbacks/pii_anonymization.py +259 -0
  12. agent/callbacks/trajectory_saver.py +305 -0
  13. agent/cli.py +290 -0
  14. agent/computer_handler.py +107 -0
  15. agent/decorators.py +90 -0
  16. agent/loops/__init__.py +11 -0
  17. agent/loops/anthropic.py +728 -0
  18. agent/loops/omniparser.py +339 -0
  19. agent/loops/openai.py +95 -0
  20. agent/loops/uitars.py +688 -0
  21. agent/responses.py +207 -0
  22. agent/types.py +79 -0
  23. agent/ui/__init__.py +7 -1
  24. agent/ui/gradio/__init__.py +6 -19
  25. agent/ui/gradio/app.py +80 -1299
  26. agent/ui/gradio/ui_components.py +703 -0
  27. cua_agent-0.4.0b2.dist-info/METADATA +424 -0
  28. cua_agent-0.4.0b2.dist-info/RECORD +30 -0
  29. agent/core/__init__.py +0 -27
  30. agent/core/agent.py +0 -210
  31. agent/core/base.py +0 -217
  32. agent/core/callbacks.py +0 -200
  33. agent/core/experiment.py +0 -249
  34. agent/core/factory.py +0 -122
  35. agent/core/messages.py +0 -332
  36. agent/core/provider_config.py +0 -21
  37. agent/core/telemetry.py +0 -142
  38. agent/core/tools/__init__.py +0 -21
  39. agent/core/tools/base.py +0 -74
  40. agent/core/tools/bash.py +0 -52
  41. agent/core/tools/collection.py +0 -46
  42. agent/core/tools/computer.py +0 -113
  43. agent/core/tools/edit.py +0 -67
  44. agent/core/tools/manager.py +0 -56
  45. agent/core/tools.py +0 -32
  46. agent/core/types.py +0 -88
  47. agent/core/visualization.py +0 -197
  48. agent/providers/__init__.py +0 -4
  49. agent/providers/anthropic/__init__.py +0 -6
  50. agent/providers/anthropic/api/client.py +0 -360
  51. agent/providers/anthropic/api/logging.py +0 -150
  52. agent/providers/anthropic/api_handler.py +0 -140
  53. agent/providers/anthropic/callbacks/__init__.py +0 -5
  54. agent/providers/anthropic/callbacks/manager.py +0 -65
  55. agent/providers/anthropic/loop.py +0 -568
  56. agent/providers/anthropic/prompts.py +0 -23
  57. agent/providers/anthropic/response_handler.py +0 -226
  58. agent/providers/anthropic/tools/__init__.py +0 -33
  59. agent/providers/anthropic/tools/base.py +0 -88
  60. agent/providers/anthropic/tools/bash.py +0 -66
  61. agent/providers/anthropic/tools/collection.py +0 -34
  62. agent/providers/anthropic/tools/computer.py +0 -396
  63. agent/providers/anthropic/tools/edit.py +0 -326
  64. agent/providers/anthropic/tools/manager.py +0 -54
  65. agent/providers/anthropic/tools/run.py +0 -42
  66. agent/providers/anthropic/types.py +0 -16
  67. agent/providers/anthropic/utils.py +0 -381
  68. agent/providers/omni/__init__.py +0 -8
  69. agent/providers/omni/api_handler.py +0 -42
  70. agent/providers/omni/clients/anthropic.py +0 -103
  71. agent/providers/omni/clients/base.py +0 -35
  72. agent/providers/omni/clients/oaicompat.py +0 -195
  73. agent/providers/omni/clients/ollama.py +0 -122
  74. agent/providers/omni/clients/openai.py +0 -155
  75. agent/providers/omni/clients/utils.py +0 -25
  76. agent/providers/omni/image_utils.py +0 -34
  77. agent/providers/omni/loop.py +0 -990
  78. agent/providers/omni/parser.py +0 -307
  79. agent/providers/omni/prompts.py +0 -64
  80. agent/providers/omni/tools/__init__.py +0 -30
  81. agent/providers/omni/tools/base.py +0 -29
  82. agent/providers/omni/tools/bash.py +0 -74
  83. agent/providers/omni/tools/computer.py +0 -179
  84. agent/providers/omni/tools/manager.py +0 -61
  85. agent/providers/omni/utils.py +0 -236
  86. agent/providers/openai/__init__.py +0 -6
  87. agent/providers/openai/api_handler.py +0 -456
  88. agent/providers/openai/loop.py +0 -472
  89. agent/providers/openai/response_handler.py +0 -205
  90. agent/providers/openai/tools/__init__.py +0 -15
  91. agent/providers/openai/tools/base.py +0 -79
  92. agent/providers/openai/tools/computer.py +0 -326
  93. agent/providers/openai/tools/manager.py +0 -106
  94. agent/providers/openai/types.py +0 -36
  95. agent/providers/openai/utils.py +0 -98
  96. agent/providers/uitars/__init__.py +0 -1
  97. agent/providers/uitars/clients/base.py +0 -35
  98. agent/providers/uitars/clients/mlxvlm.py +0 -263
  99. agent/providers/uitars/clients/oaicompat.py +0 -214
  100. agent/providers/uitars/loop.py +0 -660
  101. agent/providers/uitars/prompts.py +0 -63
  102. agent/providers/uitars/tools/__init__.py +0 -1
  103. agent/providers/uitars/tools/computer.py +0 -283
  104. agent/providers/uitars/tools/manager.py +0 -60
  105. agent/providers/uitars/utils.py +0 -264
  106. agent/telemetry.py +0 -21
  107. agent/ui/__main__.py +0 -15
  108. cua_agent-0.3.2.dist-info/METADATA +0 -295
  109. cua_agent-0.3.2.dist-info/RECORD +0 -87
  110. {cua_agent-0.3.2.dist-info → cua_agent-0.4.0b2.dist-info}/WHEEL +0 -0
  111. {cua_agent-0.3.2.dist-info → cua_agent-0.4.0b2.dist-info}/entry_points.txt +0 -0
@@ -1,660 +0,0 @@
1
- """UI-TARS-specific agent loop implementation."""
2
-
3
- import logging
4
- import asyncio
5
- import re
6
- import os
7
- import json
8
- import base64
9
- import copy
10
- from typing import Any, Dict, List, Optional, Tuple, AsyncGenerator, cast
11
-
12
- from httpx import ConnectError, ReadTimeout
13
-
14
- from ...core.base import BaseLoop
15
- from ...core.messages import StandardMessageManager, ImageRetentionConfig
16
- from ...core.types import AgentResponse, LLMProvider
17
- from ...core.visualization import VisualizationHelper
18
- from computer import Computer
19
-
20
- from .utils import add_box_token, parse_actions, parse_action_parameters, to_agent_response_format
21
- from .tools.manager import ToolManager
22
- from .tools.computer import ToolResult
23
- from .prompts import COMPUTER_USE, SYSTEM_PROMPT, MAC_SPECIFIC_NOTES
24
-
25
- from .clients.oaicompat import OAICompatClient
26
- from .clients.mlxvlm import MLXVLMUITarsClient
27
-
28
- logger = logging.getLogger(__name__)
29
-
30
- class UITARSLoop(BaseLoop):
31
- """UI-TARS-specific implementation of the agent loop.
32
-
33
- This class extends BaseLoop to provide support for the UI-TARS model
34
- with computer control capabilities.
35
- """
36
-
37
- ###########################################
38
- # INITIALIZATION AND CONFIGURATION
39
- ###########################################
40
-
41
def __init__(
    self,
    computer: Computer,
    api_key: str,
    model: str,
    provider: Optional[LLMProvider] = None,
    provider_base_url: Optional[str] = "http://localhost:8000/v1",
    only_n_most_recent_images: Optional[int] = 2,
    base_dir: Optional[str] = "trajectories",
    max_retries: int = 3,
    retry_delay: float = 1.0,
    save_trajectory: bool = True,
    **kwargs,
):
    """Set up a UI-TARS agent loop.

    Args:
        computer: Computer instance the agent controls.
        api_key: API key (may not be needed for local endpoints).
        model: Model name (e.g., "ui-tars").
        provider: LLM provider to use; falls back to ``LLMProvider.OAICOMPAT``
            when omitted or falsy.
        provider_base_url: Base URL for the API provider.
        only_n_most_recent_images: Maximum number of recent screenshots to
            include in API requests.
        base_dir: Base directory for saving experiment data.
        max_retries: Maximum number of retries for API calls.
        retry_delay: Delay between retries in seconds.
        save_trajectory: Whether to save trajectory data.
        **kwargs: Forwarded unchanged to ``BaseLoop.__init__``.
    """
    # Provider settings and the message manager are assigned before the
    # base-class initializer runs.
    self.provider_base_url = provider_base_url
    self.provider = provider if provider else LLMProvider.OAICOMPAT

    retention = ImageRetentionConfig(num_images_to_keep=only_n_most_recent_images)
    self.message_manager = StandardMessageManager(config=retention)

    super().__init__(
        computer=computer,
        model=model,
        api_key=api_key,
        max_retries=max_retries,
        retry_delay=retry_delay,
        base_dir=base_dir,
        save_trajectory=save_trajectory,
        only_n_most_recent_images=only_n_most_recent_images,
        **kwargs,
    )

    # The API client is created later by initialize_client().
    self.client = None
    self.retry_count = 0
    # Handle of the background task started by run(); used by cancel().
    self.loop_task = None

    self.viz_helper = VisualizationHelper(agent=self)
    self.tool_manager = ToolManager(computer=computer)

    logger.info("UITARSLoop initialized with StandardMessageManager")
103
-
104
async def initialize(self) -> None:
    """Initialize the loop by setting up the base class, tools and client.

    Tool-manager initialization is best-effort: a failure is logged and
    left for ``_ensure_tools_initialized()`` to retry on first use.
    Client initialization is mandatory.

    Raises:
        RuntimeError: If the provider client cannot be initialized.
    """
    await super().initialize()

    try:
        logger.info("Initializing tool manager...")
        await self.tool_manager.initialize()
        logger.info("Tool manager initialized successfully.")
    except Exception as e:
        logger.error(f"Error initializing tool manager: {str(e)}")
        logger.warning("Will attempt to initialize tools on first use.")

    try:
        await self.initialize_client()
    except RuntimeError:
        # initialize_client() has already logged the failure and wrapped it
        # as "Failed to initialize client: ..."; wrapping it again would
        # only duplicate the prefix and the log entry.
        raise
    except Exception as e:
        logger.error(f"Error initializing client: {str(e)}")
        raise RuntimeError(f"Failed to initialize client: {str(e)}") from e
124
-
125
- ###########################################
126
- # CLIENT INITIALIZATION - IMPLEMENTING ABSTRACT METHOD
127
- ###########################################
128
-
129
async def initialize_client(self) -> None:
    """Create the provider client and store it on ``self.client``.

    ``LLMProvider.MLXVLM`` selects ``MLXVLMUITarsClient``; every other
    provider value uses ``OAICompatClient``.

    Raises:
        RuntimeError: If constructing the client fails. ``self.client`` is
            reset to ``None`` before the error is raised.
    """
    try:
        if self.provider != LLMProvider.MLXVLM:
            # Default path: any non-MLX provider goes through OAICompat.
            logger.info(f"Initializing OAICompat client for UI-TARS with model {self.model}...")
            # Local endpoints typically don't require an API key.
            effective_key = self.api_key or "EMPTY"
            self.client = OAICompatClient(
                api_key=effective_key,
                model=self.model,
                provider_base_url=self.provider_base_url,
            )
            logger.info(f"Initialized OAICompat client with model {self.model}")
            return

        logger.info(f"Initializing MLX VLM client for UI-TARS with model {self.model}...")
        self.client = MLXVLMUITarsClient(model=self.model)
        logger.info(f"Initialized MLX VLM client with model {self.model}")
    except Exception as e:
        logger.error(f"Error initializing client: {str(e)}")
        self.client = None
        raise RuntimeError(f"Failed to initialize client: {str(e)}")
159
-
160
- ###########################################
161
- # MESSAGE FORMATTING
162
- ###########################################
163
-
164
def to_uitars_format(
    self, messages: List[Dict[str, Any]], language: str = "English"
) -> List[Dict[str, Any]]:
    """Convert messages to UI-TARS compatible format.

    The input is deep-copied, so the caller's messages are never mutated.
    Two transformations are applied to the copy:

    1. The first user message that carries instruction text has that text
       replaced by the ``COMPUTER_USE`` prompt, built from the instruction
       followed by ``MAC_SPECIFIC_NOTES``. Image parts are left in place.
    2. Every text part of every list-valued assistant message is passed
       through ``add_box_token``.

    Args:
        messages: List of messages in standard format.
        language: Value substituted for the ``language`` field of the
            ``COMPUTER_USE`` prompt. Defaults to ``"English"``.

    Returns:
        A new list of messages formatted for UI-TARS.
    """
    uitars_messages = copy.deepcopy(messages)

    # Find the user message to rewrite. A string-valued user message ends the
    # search immediately (even when empty); a list-valued one ends it only
    # if its first text part is non-empty.
    target_idx = None
    instruction = ""
    for idx, msg in enumerate(uitars_messages):
        if msg.get("role") != "user":
            continue
        target_idx = idx
        content = msg.get("content", "")
        if isinstance(content, str):
            instruction = content
            break
        if isinstance(content, list):
            instruction = next(
                (
                    item.get("text", "")
                    for item in content
                    if isinstance(item, dict) and item.get("type") == "text"
                ),
                "",
            )
            if instruction:
                break

    if target_idx is not None and instruction:
        user_prompt = COMPUTER_USE.format(
            instruction="\n".join([instruction, MAC_SPECIFIC_NOTES]),
            language=language,
        )
        target = uitars_messages[target_idx]
        content = target.get("content", "")
        if isinstance(content, str):
            target["content"] = [{"type": "text", "text": user_prompt}]
        elif isinstance(content, list):
            # Replace only the first text part, keeping images.
            for item in content:
                if isinstance(item, dict) and item.get("type") == "text":
                    item["text"] = user_prompt
                    break

    # Add box tokens to assistant responses. Parts that are not dicts or
    # have no string "text" are skipped instead of raising.
    for msg in uitars_messages:
        if msg.get("role") != "assistant":
            continue
        content = msg.get("content", "")
        if not isinstance(content, list):
            continue
        for part in content:
            if not isinstance(part, dict) or part.get("type") != "text":
                continue
            text = part.get("text")
            if isinstance(text, str):
                part["text"] = add_box_token(text)

    return uitars_messages
223
-
224
- ###########################################
225
- # API CALL HANDLING
226
- ###########################################
227
-
228
async def _make_api_call(self, messages: List[Dict[str, Any]], system_prompt: str) -> Any:
    """Make API call to provider with retry logic.

    Up to ``self.max_retries`` attempts are made. Each attempt lazily
    (re)creates the client if needed, loads ``messages`` into the message
    manager, converts them with ``to_uitars_format`` and calls
    ``self.client.run_interleaved``.

    Args:
        messages: Conversation history in standard format.
        system_prompt: System prompt passed to the client as ``system``.

    Returns:
        The provider response returned by ``run_interleaved``.

    Raises:
        RuntimeError: If every attempt fails. The last underlying error is
            attached as ``__cause__``.
    """
    # Create new turn directory for this API call
    self._create_turn_dir()

    request_data = None
    last_error = None

    for attempt in range(self.max_retries):
        is_last_attempt = attempt >= self.max_retries - 1
        try:
            if self.client is None:
                logger.info(
                    f"Client not initialized in _make_api_call (attempt {attempt+1}), initializing now..."
                )
                await self.initialize_client()
                if self.client is None:
                    raise RuntimeError("Failed to initialize client")

            # Get messages in standard format from the message manager
            self.message_manager.messages = messages.copy()
            prepared_messages = self.message_manager.get_messages()

            uitars_messages = self.to_uitars_format(prepared_messages)

            request_data = {
                "messages": uitars_messages,
                "max_tokens": self.max_tokens,
                "system": system_prompt,
            }
            self._log_api_call("request", request_data)

            response = await self.client.run_interleaved(
                messages=uitars_messages,
                system=system_prompt,
                max_tokens=self.max_tokens,
            )

            self._log_api_call("response", request_data, response)
            return response

        except (ConnectError, ReadTimeout) as e:
            last_error = e
            logger.warning(
                f"Connection error on attempt {attempt + 1}/{self.max_retries}: {str(e)}"
            )
            if not is_last_attempt:
                # Linear backoff: the delay grows by retry_delay per attempt.
                await asyncio.sleep(self.retry_delay * (attempt + 1))
                # Reset client on connection errors to force re-initialization
                self.client = None

        except RuntimeError as e:
            # Raised by the client-initialization path above. A RuntimeError
            # coming out of run_interleaved itself lands here too.
            last_error = e
            self._log_api_call("error", request_data, error=e)
            logger.error(
                f"Client initialization error (attempt {attempt + 1}/{self.max_retries}): {str(e)}"
            )
            if not is_last_attempt:
                # Reset client to force re-initialization
                self.client = None
                await asyncio.sleep(self.retry_delay)

        except Exception as e:
            last_error = e
            self._log_api_call("error", request_data, error=e)
            logger.error(f"Unexpected error in API call: {str(e)}")
            if not is_last_attempt:
                await asyncio.sleep(self.retry_delay)

    # If we get here, all retries failed
    error_message = f"API call failed after {self.max_retries} attempts"
    if last_error:
        error_message += f": {str(last_error)}"

    logger.error(error_message)
    raise RuntimeError(error_message) from last_error
315
-
316
- ###########################################
317
- # RESPONSE AND ACTION HANDLING
318
- ###########################################
319
-
320
async def _handle_response(
    self, response: Any, messages: List[Dict[str, Any]]
) -> Tuple[bool, bool]:
    """Handle API response: record it, parse its actions and execute them.

    Args:
        response: API response in OpenAI-compatible format; the text is read
            from ``response["choices"][0]["message"]["content"]``.
        messages: Not used by this method. The assistant reply is appended
            through ``self.message_manager`` instead.

    Returns:
        Tuple of (should_continue, action_screenshot_saved).
        ``should_continue`` is ``False`` only when a "finished" action is
        encountered.

    Raises:
        Exception: Any unexpected error is logged, recorded as an assistant
            message and re-raised.
    """
    # Action names that are flagged as producing a screenshot.
    screenshot_actions = ("click", "left_double", "right_single", "drag", "scroll")
    action_screenshot_saved = False

    try:
        # Step 1: Extract the raw response text
        try:
            raw_text = response["choices"][0]["message"]["content"]
        except (KeyError, TypeError, IndexError) as e:
            logger.error(f"Invalid response format: {str(e)}")
            return True, action_screenshot_saved

        if raw_text is None:
            # A null content field carries nothing to record or parse;
            # treat it like any other malformed response.
            logger.error("Invalid response format: message content is None")
            return True, action_screenshot_saved

        # Step 2: Add the response to message history
        self.message_manager.add_assistant_message([{"type": "text", "text": raw_text}])

        # Step 3: Parse actions from the response
        parsed_actions = parse_actions(raw_text)
        if not parsed_actions:
            logger.warning("No action found in the response")
            return True, action_screenshot_saved

        # Step 4: Execute each action
        for action in parsed_actions:
            if action.startswith("finished"):
                logger.info("Agent completed the task")
                return False, action_screenshot_saved

            # A failure in one action is logged and does not stop the
            # remaining actions from running.
            try:
                action_name, tool_args = parse_action_parameters(action)
                if not action_name:
                    logger.warning(f"Could not parse action: {action}")
                    continue

                # Flagged before execution, so the flag is set even if the
                # tool call below fails.
                if action_name in screenshot_actions:
                    action_screenshot_saved = True

                await self._ensure_tools_initialized()

                logger.info(f"Executing computer tool with arguments: {tool_args}")
                result = await self.tool_manager.execute_tool(name="computer", tool_input=tool_args)

                if hasattr(result, "error") and result.error:
                    logger.error(f"Error executing tool: {result.error}")
                else:
                    logger.info(f"Successfully executed {action_name}")

                    # Save the screenshot whenever the tool returned one.
                    if hasattr(result, "base64_image") and result.base64_image:
                        self._save_screenshot(result.base64_image, action_type=action_name)
                        action_screenshot_saved = True

            except Exception as e:
                logger.error(f"Error executing action {action}: {str(e)}")

        # Continue the loop if there are actions to process
        return True, action_screenshot_saved

    except Exception as e:
        logger.error(f"Error handling response: {str(e)}")
        error_message = [{"type": "text", "text": f"Error: {str(e)}"}]
        self.message_manager.add_assistant_message(error_message)
        raise
409
-
410
- ###########################################
411
- # SCREEN HANDLING
412
- ###########################################
413
-
414
async def _get_current_screen(self, save_screenshot: bool = True) -> str:
    """Capture the screen and return it base64-encoded.

    The encoded image is always passed to ``handle_screenshot`` with
    ``action_type="state"``. It is additionally written via
    ``_save_screenshot`` when both ``save_screenshot`` and
    ``self.save_trajectory`` are truthy.

    Args:
        save_screenshot: Whether to save the screenshot.

    Returns:
        Base64 encoded screenshot.

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    try:
        raw_bytes = await self.computer.interface.screenshot()
        encoded = base64.b64encode(raw_bytes).decode("utf-8")

        # Run the screenshot through the hook pipeline first.
        await self.handle_screenshot(encoded, action_type="state")

        should_persist = save_screenshot and self.save_trajectory
        if should_persist:
            self._save_screenshot(encoded, action_type="state")
    except Exception as e:
        logger.error(f"Error getting current screen: {str(e)}")
        raise
    else:
        return encoded
442
-
443
- ###########################################
444
- # SYSTEM PROMPT
445
- ###########################################
446
-
447
def _get_system_prompt(self) -> str:
    """Return the ``SYSTEM_PROMPT`` constant imported from the prompts module."""
    prompt: str = SYSTEM_PROMPT
    return prompt
450
-
451
- ###########################################
452
- # MAIN LOOP - IMPLEMENTING ABSTRACT METHOD
453
- ###########################################
454
-
455
async def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[AgentResponse, None]:
    """Run the agent loop with provided messages.

    ``_run_loop`` is started as a background task and feeds a queue; this
    generator yields each queued item until the ``None`` stop signal
    arrives. A final "Task completed successfully." message is yielded only
    if the loop did not end on an error message.

    Args:
        messages: List of messages in standard OpenAI format.

    Yields:
        Agent response format.
    """
    # Kept in a local so the cleanup below targets this invocation's task
    # even if self.loop_task is later replaced by another run() call.
    task = None
    try:
        logger.info(f"Starting UITARSLoop run with {len(messages)} messages")

        # Initialize the message manager with the provided messages
        self.message_manager.messages = messages.copy()

        queue = asyncio.Queue()
        task = self.loop_task = asyncio.create_task(self._run_loop(queue, messages))

        # _run_loop stops either after a successful response or after an
        # error message on its final attempt, so an error item in last
        # position means the run failed.
        ended_with_error = False

        while True:
            try:
                item = await queue.get()
                if item is None:  # Stop signal
                    break
                metadata = item.get("metadata") if isinstance(item, dict) else None
                ended_with_error = (
                    isinstance(metadata, dict) and metadata.get("title") == "❌ Error"
                )
                yield item
                queue.task_done()
            except Exception as e:
                logger.error(f"Error processing queue item: {str(e)}")
                continue

        # Wait for loop to complete
        await task

        # Do not report success after the loop gave up on an error.
        if not ended_with_error:
            yield {
                "role": "assistant",
                "content": "Task completed successfully.",
                "metadata": {"title": "✅ Complete"},
            }

    except Exception as e:
        logger.error(f"Error in run method: {str(e)}")
        yield {
            "role": "assistant",
            "content": f"Error: {str(e)}",
            "metadata": {"title": "❌ Error"},
        }
    finally:
        # If the consumer closes the generator early, stop the background
        # task so it does not keep driving the computer unobserved.
        if task is not None and not task.done():
            task.cancel()
505
-
506
async def _run_loop(self, queue: asyncio.Queue, messages: List[Dict[str, Any]]) -> None:
    """Internal method to run the agent loop with provided messages.

    Each iteration takes a screenshot, appends it to the message history,
    calls the model, executes the returned actions and queues the
    standardized response. The loop ends when ``_handle_response`` reports
    that it should not continue, or after three consecutive failed
    iterations. ``None`` is always queued last as the stop signal.

    Args:
        queue: Queue to put responses into.
        messages: List of messages in standard OpenAI format; passed to
            ``to_agent_response_format`` when building each response.
    """
    running = True
    turn_created = False

    # Consecutive-failure counter; reset after every successful iteration.
    attempt = 0
    max_attempts = 3

    try:
        while running and attempt < max_attempts:
            try:
                # Create a new turn directory if it's not already created
                if not turn_created:
                    self._create_turn_dir()
                    turn_created = True

                # Ensure client is initialized
                if self.client is None:
                    logger.info("Initializing client...")
                    await self.initialize_client()
                    if self.client is None:
                        raise RuntimeError("Failed to initialize client")
                    logger.info("Client initialized successfully")

                base64_screenshot = await self._get_current_screen()

                # Add screenshot to message history
                self.message_manager.add_user_message(
                    [
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/png;base64,{base64_screenshot}"},
                        }
                    ]
                )
                logger.info("Added screenshot to message history")

                system_prompt = self._get_system_prompt()

                # Make API call with retries
                response = await self._make_api_call(
                    self.message_manager.messages, system_prompt
                )

                # Handle the response (may execute actions). The second
                # element (screenshot-saved flag) is not needed here.
                should_continue, _ = await self._handle_response(
                    response, self.message_manager.messages
                )

                agent_response = await to_agent_response_format(
                    response,
                    messages,
                    model=self.model,
                )
                # Log standardized response for ease of parsing
                self._log_api_call("agent_response", request=None, response=agent_response)

                await queue.put(agent_response)

                running = should_continue

                # A continuing conversation gets a fresh turn directory.
                if running:
                    turn_created = False

                # Reset attempt counter on success
                attempt = 0

            except Exception as e:
                attempt += 1
                error_msg = f"Error in run method (attempt {attempt}/{max_attempts}): {str(e)}"
                logger.error(error_msg)

                exhausted = attempt >= max_attempts
                if exhausted:
                    logger.error(f"Maximum retry attempts reached. Last error was: {str(e)}")

                await queue.put({
                    "role": "assistant",
                    "content": f"Error: {str(e)}",
                    "metadata": {"title": "❌ Error"},
                })

                # Pause briefly before retrying; pointless once no retry
                # will follow.
                if not exhausted:
                    await asyncio.sleep(1)
    finally:
        # Signal that we're done
        await queue.put(None)
610
-
611
async def cancel(self) -> None:
    """Cancel the currently running agent loop task, if any.

    When ``self.loop_task`` exists and has not finished, it is cancelled
    and awaited for up to two seconds. Timeouts, the resulting
    ``CancelledError`` and any other exception are logged rather than
    raised.
    """
    task = self.loop_task
    if not task or task.done():
        logger.info("No active UITARS loop task to cancel")
        return

    logger.info("Cancelling UITARS loop task")
    task.cancel()
    try:
        # Give the task a bounded amount of time to unwind.
        await asyncio.wait_for(task, timeout=2.0)
    except asyncio.TimeoutError:
        logger.warning("Timeout while waiting for loop task to cancel")
    except asyncio.CancelledError:
        logger.info("Loop task cancelled successfully")
    except Exception as e:
        logger.error(f"Error while cancelling loop task: {str(e)}")
    finally:
        logger.info("UITARS loop task cancelled")
633
-
634
- ###########################################
635
- # UTILITY METHODS
636
- ###########################################
637
-
638
async def _ensure_tools_initialized(self) -> None:
    """Initialize the tool manager if it has no ``tools`` yet.

    Nothing is done when ``self.tool_manager.tools`` exists and is not
    ``None``.
    """
    if getattr(self.tool_manager, "tools", None) is not None:
        return

    logger.info("Tools not initialized. Initializing now...")
    await self.tool_manager.initialize()
    logger.info("Tools initialized successfully.")
644
-
645
async def process_model_response(self, response_text: str) -> Optional[Dict[str, Any]]:
    """Extract UI-TARS actions from a model response.

    UI-TARS does not use the standard tool-call format, so the text is run
    through ``parse_actions`` instead.

    Args:
        response_text: Model response text.

    Returns:
        ``{"actions": <parsed actions>}`` when ``parse_actions`` returns a
        truthy result, otherwise ``None``.
    """
    actions = parse_actions(response_text)
    return {"actions": actions} if actions else None