cua-agent 0.3.2__py3-none-any.whl → 0.4.0b2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- agent/__init__.py +15 -51
- agent/__main__.py +21 -0
- agent/adapters/__init__.py +9 -0
- agent/adapters/huggingfacelocal_adapter.py +229 -0
- agent/agent.py +577 -0
- agent/callbacks/__init__.py +17 -0
- agent/callbacks/base.py +153 -0
- agent/callbacks/budget_manager.py +44 -0
- agent/callbacks/image_retention.py +139 -0
- agent/callbacks/logging.py +247 -0
- agent/callbacks/pii_anonymization.py +259 -0
- agent/callbacks/trajectory_saver.py +305 -0
- agent/cli.py +290 -0
- agent/computer_handler.py +107 -0
- agent/decorators.py +90 -0
- agent/loops/__init__.py +11 -0
- agent/loops/anthropic.py +728 -0
- agent/loops/omniparser.py +339 -0
- agent/loops/openai.py +95 -0
- agent/loops/uitars.py +688 -0
- agent/responses.py +207 -0
- agent/types.py +79 -0
- agent/ui/__init__.py +7 -1
- agent/ui/gradio/__init__.py +6 -19
- agent/ui/gradio/app.py +80 -1299
- agent/ui/gradio/ui_components.py +703 -0
- cua_agent-0.4.0b2.dist-info/METADATA +424 -0
- cua_agent-0.4.0b2.dist-info/RECORD +30 -0
- agent/core/__init__.py +0 -27
- agent/core/agent.py +0 -210
- agent/core/base.py +0 -217
- agent/core/callbacks.py +0 -200
- agent/core/experiment.py +0 -249
- agent/core/factory.py +0 -122
- agent/core/messages.py +0 -332
- agent/core/provider_config.py +0 -21
- agent/core/telemetry.py +0 -142
- agent/core/tools/__init__.py +0 -21
- agent/core/tools/base.py +0 -74
- agent/core/tools/bash.py +0 -52
- agent/core/tools/collection.py +0 -46
- agent/core/tools/computer.py +0 -113
- agent/core/tools/edit.py +0 -67
- agent/core/tools/manager.py +0 -56
- agent/core/tools.py +0 -32
- agent/core/types.py +0 -88
- agent/core/visualization.py +0 -197
- agent/providers/__init__.py +0 -4
- agent/providers/anthropic/__init__.py +0 -6
- agent/providers/anthropic/api/client.py +0 -360
- agent/providers/anthropic/api/logging.py +0 -150
- agent/providers/anthropic/api_handler.py +0 -140
- agent/providers/anthropic/callbacks/__init__.py +0 -5
- agent/providers/anthropic/callbacks/manager.py +0 -65
- agent/providers/anthropic/loop.py +0 -568
- agent/providers/anthropic/prompts.py +0 -23
- agent/providers/anthropic/response_handler.py +0 -226
- agent/providers/anthropic/tools/__init__.py +0 -33
- agent/providers/anthropic/tools/base.py +0 -88
- agent/providers/anthropic/tools/bash.py +0 -66
- agent/providers/anthropic/tools/collection.py +0 -34
- agent/providers/anthropic/tools/computer.py +0 -396
- agent/providers/anthropic/tools/edit.py +0 -326
- agent/providers/anthropic/tools/manager.py +0 -54
- agent/providers/anthropic/tools/run.py +0 -42
- agent/providers/anthropic/types.py +0 -16
- agent/providers/anthropic/utils.py +0 -381
- agent/providers/omni/__init__.py +0 -8
- agent/providers/omni/api_handler.py +0 -42
- agent/providers/omni/clients/anthropic.py +0 -103
- agent/providers/omni/clients/base.py +0 -35
- agent/providers/omni/clients/oaicompat.py +0 -195
- agent/providers/omni/clients/ollama.py +0 -122
- agent/providers/omni/clients/openai.py +0 -155
- agent/providers/omni/clients/utils.py +0 -25
- agent/providers/omni/image_utils.py +0 -34
- agent/providers/omni/loop.py +0 -990
- agent/providers/omni/parser.py +0 -307
- agent/providers/omni/prompts.py +0 -64
- agent/providers/omni/tools/__init__.py +0 -30
- agent/providers/omni/tools/base.py +0 -29
- agent/providers/omni/tools/bash.py +0 -74
- agent/providers/omni/tools/computer.py +0 -179
- agent/providers/omni/tools/manager.py +0 -61
- agent/providers/omni/utils.py +0 -236
- agent/providers/openai/__init__.py +0 -6
- agent/providers/openai/api_handler.py +0 -456
- agent/providers/openai/loop.py +0 -472
- agent/providers/openai/response_handler.py +0 -205
- agent/providers/openai/tools/__init__.py +0 -15
- agent/providers/openai/tools/base.py +0 -79
- agent/providers/openai/tools/computer.py +0 -326
- agent/providers/openai/tools/manager.py +0 -106
- agent/providers/openai/types.py +0 -36
- agent/providers/openai/utils.py +0 -98
- agent/providers/uitars/__init__.py +0 -1
- agent/providers/uitars/clients/base.py +0 -35
- agent/providers/uitars/clients/mlxvlm.py +0 -263
- agent/providers/uitars/clients/oaicompat.py +0 -214
- agent/providers/uitars/loop.py +0 -660
- agent/providers/uitars/prompts.py +0 -63
- agent/providers/uitars/tools/__init__.py +0 -1
- agent/providers/uitars/tools/computer.py +0 -283
- agent/providers/uitars/tools/manager.py +0 -60
- agent/providers/uitars/utils.py +0 -264
- agent/telemetry.py +0 -21
- agent/ui/__main__.py +0 -15
- cua_agent-0.3.2.dist-info/METADATA +0 -295
- cua_agent-0.3.2.dist-info/RECORD +0 -87
- {cua_agent-0.3.2.dist-info → cua_agent-0.4.0b2.dist-info}/WHEEL +0 -0
- {cua_agent-0.3.2.dist-info → cua_agent-0.4.0b2.dist-info}/entry_points.txt +0 -0
agent/providers/uitars/loop.py
DELETED
|
@@ -1,660 +0,0 @@
|
|
|
1
|
-
"""UI-TARS-specific agent loop implementation."""
|
|
2
|
-
|
|
3
|
-
import logging
|
|
4
|
-
import asyncio
|
|
5
|
-
import re
|
|
6
|
-
import os
|
|
7
|
-
import json
|
|
8
|
-
import base64
|
|
9
|
-
import copy
|
|
10
|
-
from typing import Any, Dict, List, Optional, Tuple, AsyncGenerator, cast
|
|
11
|
-
|
|
12
|
-
from httpx import ConnectError, ReadTimeout
|
|
13
|
-
|
|
14
|
-
from ...core.base import BaseLoop
|
|
15
|
-
from ...core.messages import StandardMessageManager, ImageRetentionConfig
|
|
16
|
-
from ...core.types import AgentResponse, LLMProvider
|
|
17
|
-
from ...core.visualization import VisualizationHelper
|
|
18
|
-
from computer import Computer
|
|
19
|
-
|
|
20
|
-
from .utils import add_box_token, parse_actions, parse_action_parameters, to_agent_response_format
|
|
21
|
-
from .tools.manager import ToolManager
|
|
22
|
-
from .tools.computer import ToolResult
|
|
23
|
-
from .prompts import COMPUTER_USE, SYSTEM_PROMPT, MAC_SPECIFIC_NOTES
|
|
24
|
-
|
|
25
|
-
from .clients.oaicompat import OAICompatClient
|
|
26
|
-
from .clients.mlxvlm import MLXVLMUITarsClient
|
|
27
|
-
|
|
28
|
-
logger = logging.getLogger(__name__)
|
|
29
|
-
|
|
30
|
-
class UITARSLoop(BaseLoop):
|
|
31
|
-
"""UI-TARS-specific implementation of the agent loop.
|
|
32
|
-
|
|
33
|
-
This class extends BaseLoop to provide support for the UI-TARS model
|
|
34
|
-
with computer control capabilities.
|
|
35
|
-
"""
|
|
36
|
-
|
|
37
|
-
###########################################
|
|
38
|
-
# INITIALIZATION AND CONFIGURATION
|
|
39
|
-
###########################################
|
|
40
|
-
|
|
41
|
-
def __init__(
    self,
    computer: Computer,
    api_key: str,
    model: str,
    provider: Optional[LLMProvider] = None,
    provider_base_url: Optional[str] = "http://localhost:8000/v1",
    only_n_most_recent_images: Optional[int] = 2,
    base_dir: Optional[str] = "trajectories",
    max_retries: int = 3,
    retry_delay: float = 1.0,
    save_trajectory: bool = True,
    **kwargs,
):
    """Build a UI-TARS loop around a computer and a model endpoint.

    Args:
        computer: Computer instance passed to the base loop and the tool manager.
        api_key: API key passed to the base loop (may not be needed for local endpoints).
        model: Model name (e.g., "ui-tars").
        provider: LLM provider; ``LLMProvider.OAICOMPAT`` is used when this is falsy.
        provider_base_url: Base URL for the API provider.
        only_n_most_recent_images: Image count given to the message manager's
            retention config and to the base loop.
        base_dir: Base directory passed to the base loop.
        max_retries: Maximum number of retries for API calls.
        retry_delay: Delay between retries in seconds.
        save_trajectory: Whether to save trajectory data.
        **kwargs: Extra keyword arguments forwarded unchanged to the base loop.
    """
    # Provider settings and the message manager are assigned before the
    # base-class constructor runs; keep this ordering.
    self.provider = provider if provider else LLMProvider.OAICOMPAT
    self.provider_base_url = provider_base_url

    retention = ImageRetentionConfig(num_images_to_keep=only_n_most_recent_images)
    self.message_manager = StandardMessageManager(config=retention)

    super().__init__(
        computer=computer,
        model=model,
        api_key=api_key,
        max_retries=max_retries,
        retry_delay=retry_delay,
        base_dir=base_dir,
        save_trajectory=save_trajectory,
        only_n_most_recent_images=only_n_most_recent_images,
        **kwargs,
    )

    # API client state; the client itself is created by initialize_client().
    self.client = None
    self.retry_count = 0
    # Handle of the background loop task so cancel() can stop it.
    self.loop_task = None

    self.viz_helper = VisualizationHelper(agent=self)
    self.tool_manager = ToolManager(computer=computer)

    logger.info("UITARSLoop initialized with StandardMessageManager")
|
|
103
|
-
|
|
104
|
-
async def initialize(self) -> None:
    """Initialize the base loop, the tool manager, and the provider client.

    A tool manager failure is logged and tolerated (the log states that
    initialization will be attempted again on first use). A client failure
    is fatal.

    Raises:
        RuntimeError: If the provider client cannot be initialized. The
            original exception is attached as ``__cause__``.
    """
    await super().initialize()

    # Best effort: a failure here must not prevent client initialization.
    try:
        logger.info("Initializing tool manager...")
        await self.tool_manager.initialize()
        logger.info("Tool manager initialized successfully.")
    except Exception as e:
        logger.error(f"Error initializing tool manager: {str(e)}")
        logger.warning("Will attempt to initialize tools on first use.")

    try:
        await self.initialize_client()
    except Exception as e:
        logger.error(f"Error initializing client: {str(e)}")
        # Chain explicitly so the traceback shows the root cause.
        raise RuntimeError(f"Failed to initialize client: {str(e)}") from e
|
|
124
|
-
|
|
125
|
-
###########################################
|
|
126
|
-
# CLIENT INITIALIZATION - IMPLEMENTING ABSTRACT METHOD
|
|
127
|
-
###########################################
|
|
128
|
-
|
|
129
|
-
async def initialize_client(self) -> None:
    """Create the provider client and store it on ``self.client``.

    ``LLMProvider.MLXVLM`` selects ``MLXVLMUITarsClient``; every other
    provider value falls back to ``OAICompatClient``.

    Raises:
        RuntimeError: If constructing the client fails. ``self.client`` is
            reset to ``None`` and the original exception is attached as
            ``__cause__``.
    """
    try:
        if self.provider == LLMProvider.MLXVLM:
            logger.info(f"Initializing MLX VLM client for UI-TARS with model {self.model}...")

            self.client = MLXVLMUITarsClient(
                model=self.model,
            )

            logger.info(f"Initialized MLX VLM client with model {self.model}")
        else:
            # Default to OAICompat client for other providers
            logger.info(f"Initializing OAICompat client for UI-TARS with model {self.model}...")

            self.client = OAICompatClient(
                api_key=self.api_key or "EMPTY",  # Local endpoints typically don't require an API key
                model=self.model,
                provider_base_url=self.provider_base_url,
            )

            logger.info(f"Initialized OAICompat client with model {self.model}")
    except Exception as e:
        logger.error(f"Error initializing client: {str(e)}")
        # Leave no half-built client behind; callers test for None.
        self.client = None
        # Chain explicitly so the traceback shows the root cause.
        raise RuntimeError(f"Failed to initialize client: {str(e)}") from e
|
|
159
|
-
|
|
160
|
-
###########################################
|
|
161
|
-
# MESSAGE FORMATTING
|
|
162
|
-
###########################################
|
|
163
|
-
|
|
164
|
-
def to_uitars_format(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Return a deep copy of ``messages`` rewritten for UI-TARS.

    Two rewrites are applied to the copy; the input list is not modified:

    * The text of the first user message that yields an instruction is
      replaced with the ``COMPUTER_USE`` prompt built from that instruction
      plus ``MAC_SPECIFIC_NOTES``. Non-text parts of that message are kept.
    * Every text part of a list-valued assistant message is passed through
      ``add_box_token``.

    Args:
        messages: List of messages in standard format

    Returns:
        List of messages formatted for UI-TARS
    """
    converted = copy.deepcopy(messages)

    # Locate the user message that carries the task instruction.
    target_idx = None
    instruction = ""
    for position, message in enumerate(converted):
        if message.get("role") != "user":
            continue
        target_idx = position
        body = message.get("content", "")
        if isinstance(body, str):
            # A string body ends the search even when it is empty.
            instruction = body
            break
        if isinstance(body, list):
            for piece in body:
                if piece.get("type") == "text":
                    instruction = piece.get("text", "")
                    break
            if instruction:
                break

    if target_idx is not None and instruction:
        prompt = COMPUTER_USE.format(
            instruction='\n'.join([instruction, MAC_SPECIFIC_NOTES]),
            language="English"
        )
        target = converted[target_idx]
        existing = target.get("content", "")
        if isinstance(existing, str):
            target["content"] = [{"type": "text", "text": prompt}]
        elif isinstance(existing, list):
            # Swap only the first text part so images stay attached.
            for piece in existing:
                if piece.get("type") == "text":
                    piece["text"] = prompt
                    break

    # Add box tokens to the text parts of assistant responses.
    for message in converted:
        if message.get("role") != "assistant":
            continue
        parts = message.get("content", "")
        if not (parts and isinstance(parts, list)):
            continue
        for part in parts:
            if part.get('type') == 'text':
                part["text"] = add_box_token(part['text'])

    return converted
|
|
223
|
-
|
|
224
|
-
###########################################
|
|
225
|
-
# API CALL HANDLING
|
|
226
|
-
###########################################
|
|
227
|
-
|
|
228
|
-
async def _make_api_call(self, messages: List[Dict[str, Any]], system_prompt: str) -> Any:
|
|
229
|
-
"""Make API call to provider with retry logic."""
|
|
230
|
-
# Create new turn directory for this API call
|
|
231
|
-
self._create_turn_dir()
|
|
232
|
-
|
|
233
|
-
request_data = None
|
|
234
|
-
last_error = None
|
|
235
|
-
|
|
236
|
-
for attempt in range(self.max_retries):
|
|
237
|
-
try:
|
|
238
|
-
# Ensure client is initialized
|
|
239
|
-
if self.client is None:
|
|
240
|
-
logger.info(
|
|
241
|
-
f"Client not initialized in _make_api_call (attempt {attempt+1}), initializing now..."
|
|
242
|
-
)
|
|
243
|
-
await self.initialize_client()
|
|
244
|
-
if self.client is None:
|
|
245
|
-
raise RuntimeError("Failed to initialize client")
|
|
246
|
-
|
|
247
|
-
# Get messages in standard format from the message manager
|
|
248
|
-
self.message_manager.messages = messages.copy()
|
|
249
|
-
prepared_messages = self.message_manager.get_messages()
|
|
250
|
-
|
|
251
|
-
# Convert messages to UI-TARS format
|
|
252
|
-
uitars_messages = self.to_uitars_format(prepared_messages)
|
|
253
|
-
|
|
254
|
-
# Log request
|
|
255
|
-
request_data = {
|
|
256
|
-
"messages": uitars_messages,
|
|
257
|
-
"max_tokens": self.max_tokens,
|
|
258
|
-
"system": system_prompt,
|
|
259
|
-
}
|
|
260
|
-
|
|
261
|
-
self._log_api_call("request", request_data)
|
|
262
|
-
|
|
263
|
-
# Make API call
|
|
264
|
-
response = await self.client.run_interleaved(
|
|
265
|
-
messages=uitars_messages,
|
|
266
|
-
system=system_prompt,
|
|
267
|
-
max_tokens=self.max_tokens,
|
|
268
|
-
)
|
|
269
|
-
|
|
270
|
-
# Log success response
|
|
271
|
-
self._log_api_call("response", request_data, response)
|
|
272
|
-
|
|
273
|
-
return response
|
|
274
|
-
|
|
275
|
-
except (ConnectError, ReadTimeout) as e:
|
|
276
|
-
last_error = e
|
|
277
|
-
logger.warning(
|
|
278
|
-
f"Connection error on attempt {attempt + 1}/{self.max_retries}: {str(e)}"
|
|
279
|
-
)
|
|
280
|
-
if attempt < self.max_retries - 1:
|
|
281
|
-
await asyncio.sleep(self.retry_delay * (attempt + 1)) # Exponential backoff
|
|
282
|
-
# Reset client on connection errors to force re-initialization
|
|
283
|
-
self.client = None
|
|
284
|
-
continue
|
|
285
|
-
|
|
286
|
-
except RuntimeError as e:
|
|
287
|
-
# Handle client initialization errors specifically
|
|
288
|
-
last_error = e
|
|
289
|
-
self._log_api_call("error", request_data, error=e)
|
|
290
|
-
logger.error(
|
|
291
|
-
f"Client initialization error (attempt {attempt + 1}/{self.max_retries}): {str(e)}"
|
|
292
|
-
)
|
|
293
|
-
if attempt < self.max_retries - 1:
|
|
294
|
-
# Reset client to force re-initialization
|
|
295
|
-
self.client = None
|
|
296
|
-
await asyncio.sleep(self.retry_delay)
|
|
297
|
-
continue
|
|
298
|
-
|
|
299
|
-
except Exception as e:
|
|
300
|
-
# Log unexpected error
|
|
301
|
-
last_error = e
|
|
302
|
-
self._log_api_call("error", request_data, error=e)
|
|
303
|
-
logger.error(f"Unexpected error in API call: {str(e)}")
|
|
304
|
-
if attempt < self.max_retries - 1:
|
|
305
|
-
await asyncio.sleep(self.retry_delay)
|
|
306
|
-
continue
|
|
307
|
-
|
|
308
|
-
# If we get here, all retries failed
|
|
309
|
-
error_message = f"API call failed after {self.max_retries} attempts"
|
|
310
|
-
if last_error:
|
|
311
|
-
error_message += f": {str(last_error)}"
|
|
312
|
-
|
|
313
|
-
logger.error(error_message)
|
|
314
|
-
raise RuntimeError(error_message)
|
|
315
|
-
|
|
316
|
-
###########################################
|
|
317
|
-
# RESPONSE AND ACTION HANDLING
|
|
318
|
-
###########################################
|
|
319
|
-
|
|
320
|
-
async def _handle_response(
    self, response: Any, messages: List[Dict[str, Any]]
) -> Tuple[bool, bool]:
    """Record the model reply and execute the actions it contains.

    Args:
        response: API response; the text is read from
            ``response["choices"][0]["message"]["content"]``.
        messages: List of messages (not read by this method).

    Returns:
        Tuple of (should_continue, action_screenshot_saved).
        ``should_continue`` is False only when a "finished" action is seen.

    Raises:
        Exception: Any error outside single-action execution is logged,
            appended to the history as an assistant message, and re-raised.
    """
    screenshot_saved = False
    # Action names that set the screenshot flag before execution.
    screenshot_actions = ("click", "left_double", "right_single", "drag", "scroll")

    try:
        # Step 1: pull the raw text out of the OpenAI-compatible response
        try:
            reply_text = response["choices"][0]["message"]["content"]
        except (KeyError, TypeError, IndexError) as err:
            logger.error(f"Invalid response format: {str(err)}")
            return True, screenshot_saved

        # Step 2: record the reply in the message history
        self.message_manager.add_assistant_message([{"type": "text", "text": reply_text}])

        # Step 3: parse the actions
        actions = parse_actions(reply_text)
        if not actions:
            logger.warning("No action found in the response")
            return True, screenshot_saved

        # Step 4: execute the actions in order
        for action in actions:
            if action.startswith("finished"):
                logger.info("Agent completed the task")
                return False, screenshot_saved

            # A failing action is logged and skipped; later actions still run.
            try:
                action_name, tool_args = parse_action_parameters(action)
                if not action_name:
                    logger.warning(f"Could not parse action: {action}")
                    continue

                if action_name in screenshot_actions:
                    screenshot_saved = True

                await self._ensure_tools_initialized()

                logger.info(f"Executing computer tool with arguments: {tool_args}")
                result = await self.tool_manager.execute_tool(name="computer", tool_input=tool_args)

                if getattr(result, "error", None):
                    logger.error(f"Error executing tool: {result.error}")
                    continue

                logger.info(f"Successfully executed {action_name}")

                # Save the screenshot when the tool returned one.
                if getattr(result, "base64_image", None):
                    self._save_screenshot(result.base64_image, action_type=action_name)
                    screenshot_saved = True

            except Exception as err:
                logger.error(f"Error executing action {action}: {str(err)}")

        # Continue the loop once all actions have been processed
        return True, screenshot_saved

    except Exception as err:
        logger.error(f"Error handling response: {str(err)}")
        # Surface the failure in the conversation before propagating it.
        self.message_manager.add_assistant_message(
            [{"type": "text", "text": f"Error: {str(err)}"}]
        )
        raise
|
|
409
|
-
|
|
410
|
-
###########################################
|
|
411
|
-
# SCREEN HANDLING
|
|
412
|
-
###########################################
|
|
413
|
-
|
|
414
|
-
async def _get_current_screen(self, save_screenshot: bool = True) -> str:
|
|
415
|
-
"""Get the current screen as a base64 encoded image.
|
|
416
|
-
|
|
417
|
-
Args:
|
|
418
|
-
save_screenshot: Whether to save the screenshot
|
|
419
|
-
|
|
420
|
-
Returns:
|
|
421
|
-
Base64 encoded screenshot
|
|
422
|
-
"""
|
|
423
|
-
try:
|
|
424
|
-
# Take a screenshot
|
|
425
|
-
screenshot = await self.computer.interface.screenshot()
|
|
426
|
-
|
|
427
|
-
# Convert to base64
|
|
428
|
-
img_base64 = base64.b64encode(screenshot).decode("utf-8")
|
|
429
|
-
|
|
430
|
-
# Process screenshot through hooks and save if needed
|
|
431
|
-
await self.handle_screenshot(img_base64, action_type="state")
|
|
432
|
-
|
|
433
|
-
# Save screenshot if requested
|
|
434
|
-
if save_screenshot and self.save_trajectory:
|
|
435
|
-
self._save_screenshot(img_base64, action_type="state")
|
|
436
|
-
|
|
437
|
-
return img_base64
|
|
438
|
-
|
|
439
|
-
except Exception as e:
|
|
440
|
-
logger.error(f"Error getting current screen: {str(e)}")
|
|
441
|
-
raise
|
|
442
|
-
|
|
443
|
-
###########################################
|
|
444
|
-
# SYSTEM PROMPT
|
|
445
|
-
###########################################
|
|
446
|
-
|
|
447
|
-
def _get_system_prompt(self) -> str:
    """Return the ``SYSTEM_PROMPT`` constant unchanged."""
    prompt = SYSTEM_PROMPT
    return prompt
|
|
450
|
-
|
|
451
|
-
###########################################
|
|
452
|
-
# MAIN LOOP - IMPLEMENTING ABSTRACT METHOD
|
|
453
|
-
###########################################
|
|
454
|
-
|
|
455
|
-
async def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[AgentResponse, None]:
    """Run the agent loop with provided messages.

    ``_run_loop`` is started as a background task that pushes responses onto
    a queue. This generator yields each queued item until the ``None`` stop
    signal arrives, then awaits the task and yields a completion message.

    If the consumer stops iterating early (the generator is closed), the
    background task is cancelled instead of being left running.

    Args:
        messages: List of messages in standard OpenAI format

    Yields:
        Agent response format
    """
    task = None
    try:
        logger.info(f"Starting UITARSLoop run with {len(messages)} messages")

        # Initialize the message manager with the provided messages
        self.message_manager.messages = messages.copy()

        # Create queue for response streaming
        queue = asyncio.Queue()

        # Start loop in background task; also stored on self for cancel()
        task = asyncio.create_task(self._run_loop(queue, messages))
        self.loop_task = task

        # Process and yield messages as they arrive
        while True:
            try:
                item = await queue.get()
                if item is None:  # Stop signal
                    break
                yield item
                queue.task_done()
            except Exception as e:
                logger.error(f"Error processing queue item: {str(e)}")
                continue

        # Wait for loop to complete
        await task

        # Send completion message
        yield {
            "role": "assistant",
            "content": "Task completed successfully.",
            "metadata": {"title": "✅ Complete"},
        }

    except Exception as e:
        logger.error(f"Error in run method: {str(e)}")
        yield {
            "role": "assistant",
            "content": f"Error: {str(e)}",
            "metadata": {"title": "❌ Error"},
        }
    finally:
        # Reached on normal completion, on error, and when the generator is
        # closed early. Only an unfinished task needs stopping.
        if task is not None and not task.done():
            task.cancel()
|
|
505
|
-
|
|
506
|
-
async def _run_loop(self, queue: asyncio.Queue, messages: List[Dict[str, Any]]) -> None:
    """Drive the screenshot -> model -> action cycle and stream results.

    Each turn captures the screen, appends it to the history as a user
    message, calls the model, handles the response, and queues the
    standardized agent response. The loop ends when the response handler
    asks to stop or after three consecutive failed turns. ``None`` is always
    queued last as the stop signal.

    Args:
        queue: Queue to put responses into
        messages: List of messages in standard OpenAI format
    """
    max_attempts = 3
    failures = 0  # consecutive failed turns; reset after every good turn
    keep_going = True
    need_turn_dir = True

    try:
        while keep_going and failures < max_attempts:
            try:
                # One turn directory per turn, reused across retries.
                if need_turn_dir:
                    self._create_turn_dir()
                    need_turn_dir = False

                # Ensure client is initialized
                if self.client is None:
                    logger.info("Initializing client...")
                    await self.initialize_client()
                    if self.client is None:
                        raise RuntimeError("Failed to initialize client")
                    logger.info("Client initialized successfully")

                # Capture the screen and add it to the history
                screen_b64 = await self._get_current_screen()
                screenshot_part = {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{screen_b64}"},
                }
                self.message_manager.add_user_message([screenshot_part])
                logger.info("Added screenshot to message history")

                system_prompt = self._get_system_prompt()

                # Make API call with retries
                response = await self._make_api_call(
                    self.message_manager.messages, system_prompt
                )

                # Handle the response (may execute actions). The second
                # element, the screenshot flag, is not used here.
                should_continue, _screenshot_saved = await self._handle_response(
                    response, self.message_manager.messages
                )

                # Standardize, log, and publish the response
                agent_response = await to_agent_response_format(
                    response,
                    messages,
                    model=self.model,
                )
                self._log_api_call("agent_response", request=None, response=agent_response)
                await queue.put(agent_response)

                keep_going = should_continue
                if keep_going:
                    # The next turn gets a fresh directory.
                    need_turn_dir = True

                failures = 0

            except Exception as err:
                failures += 1
                logger.error(
                    f"Error in run method (attempt {failures}/{max_attempts}): {str(err)}"
                )

                if failures >= max_attempts:
                    logger.error(f"Maximum retry attempts reached. Last error was: {str(err)}")

                await queue.put({
                    "role": "assistant",
                    "content": f"Error: {str(err)}",
                    "metadata": {"title": "❌ Error"},
                })

                # Brief pause before the next attempt
                await asyncio.sleep(1)
    finally:
        # Signal that we're done
        await queue.put(None)
|
|
610
|
-
|
|
611
|
-
async def cancel(self) -> None:
    """Cancel the background loop task if one is still running.

    The task stored in ``self.loop_task`` is cancelled and then awaited for
    up to two seconds. Timeouts, the resulting cancellation, and other
    errors are logged rather than raised.
    """
    task = self.loop_task
    if not task or task.done():
        logger.info("No active UITARS loop task to cancel")
        return

    logger.info("Cancelling UITARS loop task")
    task.cancel()
    try:
        # Give the task a bounded amount of time to unwind.
        await asyncio.wait_for(task, timeout=2.0)
    except asyncio.TimeoutError:
        logger.warning("Timeout while waiting for loop task to cancel")
    except asyncio.CancelledError:
        logger.info("Loop task cancelled successfully")
    except Exception as err:
        logger.error(f"Error while cancelling loop task: {str(err)}")
    finally:
        logger.info("UITARS loop task cancelled")
|
|
633
|
-
|
|
634
|
-
###########################################
|
|
635
|
-
# UTILITY METHODS
|
|
636
|
-
###########################################
|
|
637
|
-
|
|
638
|
-
async def _ensure_tools_initialized(self) -> None:
|
|
639
|
-
"""Ensure the tool manager and tools are initialized before use."""
|
|
640
|
-
if not hasattr(self.tool_manager, "tools") or self.tool_manager.tools is None:
|
|
641
|
-
logger.info("Tools not initialized. Initializing now...")
|
|
642
|
-
await self.tool_manager.initialize()
|
|
643
|
-
logger.info("Tools initialized successfully.")
|
|
644
|
-
|
|
645
|
-
async def process_model_response(self, response_text: str) -> Optional[Dict[str, Any]]:
    """Extract UI-TARS actions from a model response.

    UI-TARS does not use the standard tool call format, so the text is run
    through ``parse_actions`` instead.

    Args:
        response_text: Model response text

    Returns:
        ``{"actions": [...]}`` when at least one action was parsed, else None
    """
    actions = parse_actions(response_text)
    return {"actions": actions} if actions else None
|