cua-agent 0.1.29__py3-none-any.whl → 0.1.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

@@ -0,0 +1,598 @@
1
+ """UI-TARS-specific agent loop implementation."""
2
+
3
+ import logging
4
+ import asyncio
5
+ import re
6
+ import os
7
+ import json
8
+ import base64
9
+ import copy
10
+ from typing import Any, Dict, List, Optional, Tuple, AsyncGenerator, cast
11
+
12
+ from httpx import ConnectError, ReadTimeout
13
+
14
+ from ...core.base import BaseLoop
15
+ from ...core.messages import StandardMessageManager, ImageRetentionConfig
16
+ from ...core.types import AgentResponse, LLMProvider
17
+ from ...core.visualization import VisualizationHelper
18
+ from computer import Computer
19
+
20
+ from .utils import add_box_token, parse_actions, parse_action_parameters
21
+ from .tools.manager import ToolManager
22
+ from .tools.computer import ToolResult
23
+ from .prompts import COMPUTER_USE, SYSTEM_PROMPT, MAC_SPECIFIC_NOTES
24
+
25
+ from .clients.oaicompat import OAICompatClient
26
+
27
# NOTE(review): basicConfig() is invoked at import time, so merely importing
# this module attempts to configure the root logger at INFO level. Library
# modules usually leave root-logger configuration to the application --
# confirm this side effect is intended.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by every method of UITARSLoop below.
logger = logging.getLogger(__name__)
29
+
30
+
31
class UITARSLoop(BaseLoop):
    """UI-TARS-specific implementation of the agent loop.

    This class extends BaseLoop to provide support for the UI-TARS model
    with computer control capabilities. The model is reached through an
    ``OAICompatClient`` and its replies are expected in a
    ``Thought: ... / Action: ...`` text form, which is parsed into calls to
    the ``computer`` tool.
    """

    # Actions that are flagged as producing an action screenshot in
    # ``_handle_response`` (the flag is set before the tool is executed).
    _SCREENSHOT_ACTIONS = ("click", "left_double", "right_single", "drag", "scroll")

    ###########################################
    # INITIALIZATION AND CONFIGURATION
    ###########################################

    def __init__(
        self,
        computer: Computer,
        api_key: str,
        model: str,
        provider_base_url: Optional[str] = "http://localhost:8000/v1",
        only_n_most_recent_images: Optional[int] = 2,
        base_dir: Optional[str] = "trajectories",
        max_retries: int = 3,
        retry_delay: float = 1.0,
        save_trajectory: bool = True,
        **kwargs,
    ):
        """Initialize the loop.

        Args:
            computer: Computer instance
            api_key: API key (may not be needed for local endpoints)
            model: Model name (e.g., "ui-tars")
            provider_base_url: Base URL for the API provider
            only_n_most_recent_images: Maximum number of recent screenshots to include in API requests
            base_dir: Base directory for saving experiment data
            max_retries: Maximum number of retries for API calls
            retry_delay: Delay between retries in seconds
            save_trajectory: Whether to save trajectory data
            **kwargs: Forwarded unchanged to ``BaseLoop.__init__``
        """
        # Set provider before initializing base class
        self.provider = LLMProvider.OAICOMPAT
        self.provider_base_url = provider_base_url

        # Initialize message manager with image retention config
        self.message_manager = StandardMessageManager(
            config=ImageRetentionConfig(num_images_to_keep=only_n_most_recent_images)
        )

        # Initialize base class (which will set up experiment manager)
        super().__init__(
            computer=computer,
            model=model,
            api_key=api_key,
            max_retries=max_retries,
            retry_delay=retry_delay,
            base_dir=base_dir,
            save_trajectory=save_trajectory,
            only_n_most_recent_images=only_n_most_recent_images,
            **kwargs,
        )

        # The API client is created lazily by initialize_client()
        self.client = None
        self.retry_count = 0

        # Initialize visualization helper
        self.viz_helper = VisualizationHelper(agent=self)

        # Initialize tool manager
        self.tool_manager = ToolManager(computer=computer)

        logger.info("UITARSLoop initialized with StandardMessageManager")

    async def initialize(self) -> None:
        """Initialize the loop by setting up tools and clients.

        A tool-manager failure is logged and tolerated (tools are initialized
        again on first use via ``_ensure_tools_initialized``); a client
        failure is fatal.

        Raises:
            RuntimeError: If the API client cannot be initialized. The
                original exception is attached as ``__cause__``.
        """
        # Initialize base class
        await super().initialize()

        # Initialize tool manager with error handling (deliberately best-effort)
        try:
            logger.info("Initializing tool manager...")
            await self.tool_manager.initialize()
            logger.info("Tool manager initialized successfully.")
        except Exception as e:
            logger.error(f"Error initializing tool manager: {str(e)}")
            logger.warning("Will attempt to initialize tools on first use.")

        # Initialize client for the OAICompat provider
        try:
            await self.initialize_client()
        except Exception as e:
            logger.error(f"Error initializing client: {str(e)}")
            # Chain the cause so the original traceback is not lost
            raise RuntimeError(f"Failed to initialize client: {str(e)}") from e

    ###########################################
    # CLIENT INITIALIZATION - IMPLEMENTING ABSTRACT METHOD
    ###########################################

    async def initialize_client(self) -> None:
        """Initialize the appropriate client.

        Implements abstract method from BaseLoop to set up the specific
        provider client (OAICompat for UI-TARS).

        Raises:
            RuntimeError: If constructing the client fails; ``self.client``
                is reset to ``None`` in that case.
        """
        try:
            logger.info(f"Initializing OAICompat client for UI-TARS with model {self.model}...")

            self.client = OAICompatClient(
                api_key=self.api_key or "EMPTY",  # Local endpoints typically don't require an API key
                model=self.model,
                provider_base_url=self.provider_base_url,
            )

            logger.info(f"Initialized OAICompat client with model {self.model}")
        except Exception as e:
            logger.error(f"Error initializing client: {str(e)}")
            self.client = None
            raise RuntimeError(f"Failed to initialize client: {str(e)}") from e

    ###########################################
    # MESSAGE FORMATTING
    ###########################################

    def to_uitars_format(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Convert messages to UI-TARS compatible format.

        The text of the first user message that carries an instruction is
        replaced by the ``COMPUTER_USE`` prompt built from that instruction
        (plus ``MAC_SPECIFIC_NOTES``), and every text part of list-form
        assistant messages is passed through ``add_box_token``.

        Args:
            messages: List of messages in standard format

        Returns:
            A deep copy of ``messages`` formatted for UI-TARS; the input list
            is not modified.
        """
        # Work on a copy so the caller's history is left untouched
        uitars_messages = copy.deepcopy(messages)

        # Locate the user message that carries the instruction text
        first_user_idx = None
        instruction = ""

        for idx, msg in enumerate(uitars_messages):
            if msg.get("role") != "user":
                continue

            first_user_idx = idx
            content = msg.get("content", "")
            if isinstance(content, str):
                # A plain-string user message ends the search even when empty
                instruction = content
                break
            if isinstance(content, list):
                # Take the first text part; non-dict parts are ignored
                instruction = next(
                    (
                        item.get("text", "")
                        for item in content
                        if isinstance(item, dict) and item.get("type") == "text"
                    ),
                    "",
                )
                if instruction:
                    break

        # Only modify the user message if an instruction was found
        if first_user_idx is not None and instruction:
            user_prompt = COMPUTER_USE.format(
                instruction='\n'.join([instruction, MAC_SPECIFIC_NOTES]),
                language="English"
            )

            target = uitars_messages[first_user_idx]
            target_content = target.get("content", "")
            if isinstance(target_content, str):
                target["content"] = [{"type": "text", "text": user_prompt}]
            elif isinstance(target_content, list):
                # Replace only the first text part, keeping images
                for item in target_content:
                    if isinstance(item, dict) and item.get("type") == "text":
                        item["text"] = user_prompt
                        break

        # Add box tokens to assistant responses
        for msg in uitars_messages:
            if msg.get("role") != "assistant":
                continue
            content = msg.get("content", "")
            if not isinstance(content, list):
                continue
            for part in content:
                # Skip malformed parts instead of raising on a missing "text" key
                if isinstance(part, dict) and part.get("type") == "text" and "text" in part:
                    part["text"] = add_box_token(part["text"])

        return uitars_messages

    ###########################################
    # API CALL HANDLING
    ###########################################

    async def _make_api_call(self, messages: List[Dict[str, Any]], system_prompt: str) -> Any:
        """Make API call to provider with retry logic.

        Args:
            messages: Conversation history in standard format
            system_prompt: System prompt passed to the client

        Returns:
            The response returned by ``self.client.run_interleaved``

        Raises:
            RuntimeError: When all ``self.max_retries`` attempts fail; the
                last error is attached as ``__cause__``.
        """
        # Create new turn directory for this API call
        # NOTE(review): run() also calls _create_turn_dir() before each turn,
        # so two directories may be created per turn -- confirm this is intended.
        self._create_turn_dir()

        request_data = None
        last_error: Optional[Exception] = None

        for attempt in range(self.max_retries):
            has_retries_left = attempt < self.max_retries - 1
            try:
                # Ensure client is initialized
                if self.client is None:
                    logger.info(
                        f"Client not initialized in _make_api_call (attempt {attempt+1}), initializing now..."
                    )
                    await self.initialize_client()
                    if self.client is None:
                        raise RuntimeError("Failed to initialize client")

                # Get messages in standard format from the message manager
                self.message_manager.messages = messages.copy()
                prepared_messages = self.message_manager.get_messages()

                # Convert messages to UI-TARS format
                uitars_messages = self.to_uitars_format(prepared_messages)

                # Log request
                request_data = {
                    "messages": uitars_messages,
                    "max_tokens": self.max_tokens,
                    "system": system_prompt,
                }
                self._log_api_call("request", request_data)

                # Make API call
                response = await self.client.run_interleaved(
                    messages=uitars_messages,
                    system=system_prompt,
                    max_tokens=self.max_tokens,
                )

                # Log success response
                self._log_api_call("response", request_data, response)

                return response

            except (ConnectError, ReadTimeout) as e:
                last_error = e
                # Record the failure like the other error branches do
                self._log_api_call("error", request_data, error=e)
                logger.warning(
                    f"Connection error on attempt {attempt + 1}/{self.max_retries}: {str(e)}"
                )
                if has_retries_left:
                    # Linear backoff: the delay grows by retry_delay per attempt
                    await asyncio.sleep(self.retry_delay * (attempt + 1))
                    # Reset client on connection errors to force re-initialization
                    self.client = None

            except RuntimeError as e:
                # Handle client initialization errors specifically
                last_error = e
                self._log_api_call("error", request_data, error=e)
                logger.error(
                    f"Client initialization error (attempt {attempt + 1}/{self.max_retries}): {str(e)}"
                )
                if has_retries_left:
                    # Reset client to force re-initialization
                    self.client = None
                    await asyncio.sleep(self.retry_delay)

            except Exception as e:
                # Log unexpected error
                last_error = e
                self._log_api_call("error", request_data, error=e)
                logger.error(f"Unexpected error in API call: {str(e)}")
                if has_retries_left:
                    await asyncio.sleep(self.retry_delay)

        # If we get here, all retries failed
        error_message = f"API call failed after {self.max_retries} attempts"
        if last_error:
            error_message += f": {str(last_error)}"

        logger.error(error_message)
        raise RuntimeError(error_message) from last_error

    ###########################################
    # RESPONSE AND ACTION HANDLING
    ###########################################

    async def _handle_response(
        self, response: Any, messages: List[Dict[str, Any]]
    ) -> Tuple[bool, bool]:
        """Handle API response.

        Records the raw reply as an assistant message, parses its actions and
        executes them one by one through the ``computer`` tool. A failure of
        a single action is logged and does not stop the remaining actions.

        Args:
            response: API response (OpenAI-compatible ``choices`` structure)
            messages: List of messages to update. Currently unused; history
                is updated through ``self.message_manager``. Kept for
                interface compatibility.

        Returns:
            Tuple of (should_continue, action_screenshot_saved).
            ``should_continue`` is ``False`` only when a ``finished`` action
            is encountered.
        """
        action_screenshot_saved = False

        try:
            # Step 1: Extract the raw response text
            try:
                # OpenAI-compatible response format
                raw_text = response["choices"][0]["message"]["content"]
            except (KeyError, TypeError, IndexError) as e:
                logger.error(f"Invalid response format: {str(e)}")
                return True, action_screenshot_saved

            # Step 2: Add the response to message history
            self.message_manager.add_assistant_message([{"type": "text", "text": raw_text}])

            # Step 3: Parse actions from the response
            parsed_actions = parse_actions(raw_text)

            if not parsed_actions:
                logger.warning("No action found in the response")
                return True, action_screenshot_saved

            # Step 4: Execute each action
            for action in parsed_actions:
                # Handle "finished" action
                if action.startswith("finished"):
                    logger.info("Agent completed the task")
                    return False, action_screenshot_saved

                # Process other action types (click, type, etc.)
                try:
                    # Parse action parameters using the utility function
                    action_name, tool_args = parse_action_parameters(action)

                    if not action_name:
                        logger.warning(f"Could not parse action: {action}")
                        continue

                    # Mark actions that would create screenshots
                    if action_name in self._SCREENSHOT_ACTIONS:
                        action_screenshot_saved = True

                    # Execute the tool with prepared arguments
                    await self._ensure_tools_initialized()

                    # Let's log what we're about to execute for debugging
                    logger.info(f"Executing computer tool with arguments: {tool_args}")

                    result = await self.tool_manager.execute_tool(name="computer", tool_input=tool_args)

                    # Handle the result
                    if getattr(result, "error", None):
                        logger.error(f"Error executing tool: {result.error}")
                    else:
                        # Action was successful
                        logger.info(f"Successfully executed {action_name}")

                        # Save screenshot if one was returned
                        if getattr(result, "base64_image", None):
                            self._save_screenshot(result.base64_image, action_type=action_name)
                            action_screenshot_saved = True

                except Exception as e:
                    # Best effort: one failing action must not abort the rest
                    logger.error(f"Error executing action {action}: {str(e)}")

            # Continue the loop if there are actions to process
            return True, action_screenshot_saved

        except Exception as e:
            logger.error(f"Error handling response: {str(e)}")
            # Add error message using the message manager
            error_message = [{"type": "text", "text": f"Error: {str(e)}"}]
            self.message_manager.add_assistant_message(error_message)
            raise

    ###########################################
    # SCREEN HANDLING
    ###########################################

    async def _get_current_screen(self, save_screenshot: bool = True) -> str:
        """Get the current screen as a base64 encoded image.

        Args:
            save_screenshot: Whether to save the screenshot (only honoured
                when ``self.save_trajectory`` is also true)

        Returns:
            Base64 encoded screenshot

        Raises:
            Exception: Any error from taking or processing the screenshot is
                logged and re-raised unchanged.
        """
        try:
            # Take a screenshot
            screenshot = await self.computer.interface.screenshot()

            # Convert to base64
            img_base64 = base64.b64encode(screenshot).decode("utf-8")

            # Process screenshot through hooks and save if needed
            await self.handle_screenshot(img_base64, action_type="state")

            # Save screenshot if requested
            if save_screenshot and self.save_trajectory:
                self._save_screenshot(img_base64, action_type="state")

            return img_base64

        except Exception as e:
            logger.error(f"Error getting current screen: {str(e)}")
            raise

    ###########################################
    # SYSTEM PROMPT
    ###########################################

    def _get_system_prompt(self) -> str:
        """Get the system prompt for the model."""
        return SYSTEM_PROMPT

    ###########################################
    # MAIN LOOP - IMPLEMENTING ABSTRACT METHOD
    ###########################################

    @staticmethod
    def _extract_thought(raw_response: str) -> str:
        """Return the text following ``Thought:`` up to ``Action:`` (or the end), or ""."""
        if "Thought:" not in raw_response:
            return ""
        thought_match = re.search(r"Thought: (.*?)(?=\s*Action:|$)", raw_response, re.DOTALL)
        return thought_match.group(1).strip() if thought_match else ""

    async def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[Dict[str, Any], None]:
        """Run the agent loop with provided messages.

        Each turn takes a screenshot, appends it to the history, queries the
        model, executes the returned actions and yields the model's thoughts
        followed by its actions. The loop ends when a ``finished`` action is
        seen or after three consecutive failing turns; every failure is
        reported to the caller as an ``{"error": ...}`` item.

        Args:
            messages: List of messages in standard OpenAI format

        Yields:
            Agent response format
        """
        # Initialize the message manager with the provided messages
        self.message_manager.messages = messages.copy()
        logger.info(f"Starting UITARSLoop run with {len(self.message_manager.messages)} messages")

        # Continue running until explicitly told to stop
        running = True
        turn_created = False
        # Track if an action-specific screenshot has been saved
        # NOTE(review): this flag is accumulated but never read or reset here.
        action_screenshot_saved = False

        attempt = 0
        max_attempts = 3

        while running and attempt < max_attempts:
            try:
                # Create a new turn directory if it's not already created
                if not turn_created:
                    self._create_turn_dir()
                    turn_created = True

                # Ensure client is initialized
                if self.client is None:
                    logger.info("Initializing client...")
                    await self.initialize_client()
                    if self.client is None:
                        raise RuntimeError("Failed to initialize client")
                    logger.info("Client initialized successfully")

                # Get current screen
                base64_screenshot = await self._get_current_screen()

                # Add screenshot to message history
                self.message_manager.add_user_message(
                    [
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/png;base64,{base64_screenshot}"},
                        }
                    ]
                )
                logger.info("Added screenshot to message history")

                # Get system prompt
                system_prompt = self._get_system_prompt()

                # Make API call with retries
                response = await self._make_api_call(
                    self.message_manager.messages, system_prompt
                )

                # Handle the response (may execute actions)
                # Returns: (should_continue, action_screenshot_saved)
                should_continue, new_screenshot_saved = await self._handle_response(
                    response, self.message_manager.messages
                )

                # Update whether an action screenshot was saved this turn
                action_screenshot_saved = action_screenshot_saved or new_screenshot_saved

                # Parse actions from the raw response. A malformed response
                # raises here on purpose: it is then counted as a failed
                # attempt below, which keeps the loop bounded.
                raw_response = response["choices"][0]["message"]["content"]
                parsed_actions = parse_actions(raw_response)

                # Extract thought content if available
                thought = self._extract_thought(raw_response)

                # Create standardized thought response format
                thought_response = {
                    "role": "assistant",
                    "content": thought or raw_response,
                    "metadata": {
                        "title": "🧠 UI-TARS Thoughts"
                    }
                }

                # Create action response format
                action_response = {
                    "role": "assistant",
                    "content": str(parsed_actions),
                    "metadata": {
                        "title": "🖱️ UI-TARS Actions",
                    }
                }

                # Yield both responses to the caller (thoughts first, then actions)
                yield thought_response
                if parsed_actions:
                    yield action_response

                # Check if we should continue this conversation
                running = should_continue

                # Create a new turn directory if we're continuing
                if running:
                    turn_created = False

                # Reset attempt counter on success
                attempt = 0

            except Exception as e:
                attempt += 1
                error_msg = f"Error in run method (attempt {attempt}/{max_attempts}): {str(e)}"
                logger.error(error_msg)

                # If this is our last attempt, provide more info about the error
                if attempt >= max_attempts:
                    logger.error(f"Maximum retry attempts reached. Last error was: {str(e)}")

                yield {
                    "error": str(e),
                    "metadata": {"title": "❌ Error"},
                }

                # Brief delay before retrying; pointless once no retry follows
                if attempt < max_attempts:
                    await asyncio.sleep(1)

    ###########################################
    # UTILITY METHODS
    ###########################################

    async def _ensure_tools_initialized(self) -> None:
        """Ensure the tool manager and tools are initialized before use."""
        if getattr(self.tool_manager, "tools", None) is None:
            logger.info("Tools not initialized. Initializing now...")
            await self.tool_manager.initialize()
            logger.info("Tools initialized successfully.")

    async def process_model_response(self, response_text: str) -> Optional[Dict[str, Any]]:
        """Process model response to extract tool calls.

        Args:
            response_text: Model response text

        Returns:
            ``{"actions": [...]}`` with the parsed actions, or None if no
            action was found
        """
        # UI-TARS doesn't use the standard tool call format, so we parse its actions differently
        parsed_actions = parse_actions(response_text)
        return {"actions": parsed_actions} if parsed_actions else None
@@ -0,0 +1,63 @@
1
+ """Prompts for UI-TARS agent."""
2
+
3
# macOS shortcut hint. UITARSLoop.to_uitars_format joins it to the user's
# instruction with a newline before filling the COMPUTER_USE template.
# NOTE(review): the text ends with an unmatched ")" -- looks like a typo, but
# it is sent to the model verbatim, so confirm before changing it.
MAC_SPECIFIC_NOTES = """
(You are operating on macOS, use 'cmd' instead of 'ctrl' for most shortcuts e.g., hotkey(key='cmd c') for copy, hotkey(key='cmd v') for paste, hotkey(key='cmd t') for new tab).)
"""

# System prompt returned by UITARSLoop._get_system_prompt.
SYSTEM_PROMPT = "You are a helpful assistant."

# Desktop prompt template. It is a str.format template with two placeholders,
# {language} and {instruction}. The doubled backslashes below are Python
# escapes: the rendered prompt shows a single backslash before ', " and n.
COMPUTER_USE = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.

## Output Format
```
Thought: ...
Action: ...
```

## Action Space

click(start_box='<|box_start|>(x1,y1)<|box_end|>')
left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
hotkey(key='')
type(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \\n at the end of content.
scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
wait() #Sleep for 5s and take a screenshot to check for any changes.
finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.


## Note
- Use {language} in `Thought` part.
- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.

## User Instruction
{instruction}
"""

# Mobile prompt template with the same {language} / {instruction} placeholders
# and a mobile action space (long_press, open_app, press_home, press_back).
# NOTE(review): the UI-TARS loop shown alongside imports only COMPUTER_USE,
# SYSTEM_PROMPT and MAC_SPECIFIC_NOTES -- presumably this one is reserved for
# a mobile target; confirm whether anything uses it.
MOBILE_USE = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
## Output Format
```
Thought: ...
Action: ...
```
## Action Space

click(start_box='<|box_start|>(x1,y1)<|box_end|>')
long_press(start_box='<|box_start|>(x1,y1)<|box_end|>')
type(content='') #If you want to submit your input, use "\\n" at the end of `content`.
scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
open_app(app_name=\'\')
drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
press_home()
press_back()
finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.


## Note
- Use {language} in `Thought` part.
- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.

## User Instruction
{instruction}
"""
@@ -0,0 +1 @@
1
+ """UI-TARS tools package."""