cua-agent 0.1.29__py3-none-any.whl → 0.1.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

@@ -0,0 +1,283 @@
1
+ """Computer tool for UI-TARS."""
2
+
3
+ import asyncio
4
+ import base64
5
+ import logging
6
+ import re
7
+ from typing import Any, Dict, List, Optional, Literal, Union
8
+
9
+ from computer import Computer
10
+ from ....core.tools.base import ToolResult, ToolFailure
11
+ from ....core.tools.computer import BaseComputerTool
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class ComputerTool(BaseComputerTool):
    """
    A tool that allows the UI-TARS agent to interact with the screen, keyboard, and mouse.
    """

    name: str = "computer"
    width: Optional[int] = None
    height: Optional[int] = None
    computer: Computer

    def __init__(self, computer: Computer):
        """Initialize the computer tool.

        Args:
            computer: Computer instance
        """
        super().__init__(computer)
        self.computer = computer
        self.width = None
        self.height = None
        self.logger = logging.getLogger(__name__)

    def to_params(self) -> Dict[str, Any]:
        """Convert tool to API parameters.

        Returns:
            Dictionary with tool parameters

        Raises:
            RuntimeError: If screen dimensions have not been initialized yet.
        """
        if self.width is None or self.height is None:
            raise RuntimeError(
                "Screen dimensions not initialized. Call initialize_dimensions() first."
            )
        return {
            "type": "computer",
            "display_width": self.width,
            "display_height": self.height,
        }

    async def initialize_dimensions(self) -> None:
        """Initialize screen dimensions from the computer interface."""
        try:
            display_size = await self.computer.interface.get_screen_size()
            self.width = display_size["width"]
            self.height = display_size["height"]
            self.logger.info(f"Initialized screen dimensions to {self.width}x{self.height}")
        except Exception as e:
            # Fall back to defaults if we can't get accurate dimensions
            self.width = 1024
            self.height = 768
            self.logger.warning(
                f"Failed to get screen dimensions, using defaults: {self.width}x{self.height}. Error: {e}"
            )

    async def _result_with_screenshot(self, output: str, delay: float = 0.5) -> ToolResult:
        """Wait for the UI to settle, then build a ToolResult with a fresh screenshot.

        Args:
            output: Human-readable description of the action just performed.
            delay: Seconds to sleep before capturing, so the UI can update.

        Returns:
            ToolResult carrying the output text and a base64-encoded screenshot.
        """
        await asyncio.sleep(delay)
        screenshot = await self.computer.interface.screenshot()
        base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
        return ToolResult(output=output, base64_image=base64_screenshot)

    async def __call__(
        self,
        *,
        action: str,
        **kwargs,
    ) -> ToolResult:
        """Execute a computer action.

        Args:
            action: The action to perform (based on UI-TARS action space)
            **kwargs: Additional parameters for the action

        Returns:
            ToolResult containing action output and possibly a base64 image
        """
        try:
            # Ensure dimensions are initialized
            if self.width is None or self.height is None:
                await self.initialize_dimensions()
                if self.width is None or self.height is None:
                    return ToolFailure(error="Failed to initialize screen dimensions")

            # Handle actions defined in UI-TARS action space (from prompts.py)
            # Handle standard click (left click)
            if action == "click":
                if "x" in kwargs and "y" in kwargs:
                    x, y = kwargs["x"], kwargs["y"]
                    await self.computer.interface.left_click(x, y)
                    return await self._result_with_screenshot(f"Clicked at ({x}, {y})")
                else:
                    return ToolFailure(error="Missing coordinates for click action")

            # Handle double click
            elif action == "left_double":
                if "x" in kwargs and "y" in kwargs:
                    x, y = kwargs["x"], kwargs["y"]
                    await self.computer.interface.double_click(x, y)
                    return await self._result_with_screenshot(f"Double-clicked at ({x}, {y})")
                else:
                    return ToolFailure(error="Missing coordinates for left_double action")

            # Handle right click
            elif action == "right_single":
                if "x" in kwargs and "y" in kwargs:
                    x, y = kwargs["x"], kwargs["y"]
                    await self.computer.interface.right_click(x, y)
                    return await self._result_with_screenshot(f"Right-clicked at ({x}, {y})")
                else:
                    return ToolFailure(error="Missing coordinates for right_single action")

            # Handle typing text
            elif action == "type_text":
                if "text" in kwargs:
                    text = kwargs["text"]
                    await self.computer.interface.type_text(text)
                    # Typing settles faster than pointer actions, so wait less.
                    return await self._result_with_screenshot(f"Typed: {text}", delay=0.3)
                else:
                    return ToolFailure(error="Missing text for type action")

            # Handle hotkey
            elif action == "hotkey":
                if "keys" in kwargs:
                    keys = kwargs["keys"]

                    if len(keys) > 1:
                        await self.computer.interface.hotkey(*keys)
                    else:
                        # Single key press
                        await self.computer.interface.press_key(keys[0])

                    return await self._result_with_screenshot(
                        f"Pressed hotkey: {', '.join(keys)}", delay=0.3
                    )
                else:
                    return ToolFailure(error="Missing keys for hotkey action")

            # Handle drag action
            elif action == "drag":
                if all(k in kwargs for k in ["start_x", "start_y", "end_x", "end_y"]):
                    start_x, start_y = kwargs["start_x"], kwargs["start_y"]
                    end_x, end_y = kwargs["end_x"], kwargs["end_y"]

                    # Perform drag: move to the start point first, then drag.
                    await self.computer.interface.move_cursor(start_x, start_y)
                    await self.computer.interface.drag_to(end_x, end_y)

                    return await self._result_with_screenshot(
                        f"Dragged from ({start_x}, {start_y}) to ({end_x}, {end_y})"
                    )
                else:
                    return ToolFailure(error="Missing coordinates for drag action")

            # Handle scroll action
            elif action == "scroll":
                if all(k in kwargs for k in ["x", "y", "direction"]):
                    x, y = kwargs["x"], kwargs["y"]
                    direction = kwargs["direction"]

                    # Move cursor to position before scrolling
                    await self.computer.interface.move_cursor(x, y)

                    # Scroll based on direction
                    if direction == "down":
                        await self.computer.interface.scroll_down(5)
                    elif direction == "up":
                        await self.computer.interface.scroll_up(5)
                    elif direction == "right":
                        # Horizontal scrolling is not supported by the interface yet.
                        pass  # await self.computer.interface.scroll_right(5)
                    elif direction == "left":
                        pass  # await self.computer.interface.scroll_left(5)
                    else:
                        return ToolFailure(error=f"Invalid scroll direction: {direction}")

                    return await self._result_with_screenshot(f"Scrolled {direction} at ({x}, {y})")
                else:
                    return ToolFailure(error="Missing parameters for scroll action")

            # Handle wait action
            elif action == "wait":
                # Sleep for 5 seconds as specified in the action space,
                # then capture the resulting screen state.
                return await self._result_with_screenshot("Waited for 5 seconds", delay=5)

            # Handle finished action (task completion) — no screenshot needed.
            elif action == "finished":
                content = kwargs.get("content", "Task completed")
                return ToolResult(
                    output=f"Task finished: {content}",
                )

            else:
                return ToolFailure(error=f"Unsupported action: {action}")

        except Exception as e:
            self.logger.error(f"Error in ComputerTool.__call__: {str(e)}")
            return ToolFailure(error=f"Failed to execute {action}: {str(e)}")
@@ -0,0 +1,60 @@
1
+ """Tool manager for the UI-TARS provider."""
2
+
3
+ import logging
4
+ from typing import Any, Dict, List, Optional
5
+
6
+ from computer import Computer
7
+ from ....core.tools import BaseToolManager
8
+ from ....core.tools.collection import ToolCollection
9
+ from .computer import ComputerTool
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class ToolManager(BaseToolManager):
    """Manages UI-TARS provider tool initialization and execution."""

    def __init__(self, computer: Computer):
        """Initialize the tool manager.

        Args:
            computer: Computer instance for computer-related tools
        """
        super().__init__(computer)
        # Initialize UI-TARS-specific tools
        self.computer_tool = ComputerTool(self.computer)
        self._initialized = False

    def _initialize_tools(self) -> ToolCollection:
        """Initialize all available tools.

        Returns:
            ToolCollection containing the UI-TARS computer tool.
        """
        return ToolCollection(self.computer_tool)

    async def _initialize_tools_specific(self) -> None:
        """Initialize UI-TARS provider-specific tool requirements.

        Queries the computer for its screen size so the computer tool can
        report accurate display dimensions.
        """
        await self.computer_tool.initialize_dimensions()

    def get_tool_params(self) -> List[Dict[str, Any]]:
        """Get tool parameters for API calls.

        Returns:
            List of tool parameters for the current provider's API

        Raises:
            RuntimeError: If initialize() has not been called yet.
        """
        if self.tools is None:
            raise RuntimeError("Tools not initialized. Call initialize() first.")

        return self.tools.to_params()

    async def execute_tool(self, name: str, tool_input: Dict[str, Any]) -> Any:
        """Execute a tool with the given input.

        Args:
            name: Name of the tool to execute
            tool_input: Input parameters for the tool

        Returns:
            Result of the tool execution

        Raises:
            RuntimeError: If initialize() has not been called yet.
        """
        if self.tools is None:
            raise RuntimeError("Tools not initialized. Call initialize() first.")

        return await self.tools.run(name=name, tool_input=tool_input)
@@ -0,0 +1,153 @@
1
+ """Utility functions for the UI-TARS provider."""
2
+
3
+ import logging
4
+ import base64
5
+ import re
6
+ from typing import Any, Dict, List, Optional, Union, Tuple
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
def add_box_token(input_string: str) -> str:
    """Add box tokens to the coordinates in the model response.

    Wraps every start_box/end_box coordinate pair in <|box_start|>/<|box_end|>
    markers, e.g. start_box='(1,2)' -> start_box='<|box_start|>(1,2)<|box_end|>'.

    Args:
        input_string: Raw model response

    Returns:
        String with box tokens added (unchanged when no action/box is present)
    """
    if "Action: " not in input_string or "start_box=" not in input_string:
        return input_string

    suffix = input_string.split("Action: ")[0] + "Action: "
    actions = input_string.split("Action: ")[1:]

    # Substitute in one pass with re.sub: the previous findall + str.replace
    # approach missed coordinates written with whitespace after the comma
    # (e.g. "(10, 20)"), because the replacement target was rebuilt without it.
    box_pattern = re.compile(r"(start_box|end_box)='\((\d+),\s*(\d+)\)'")
    processed_actions = [
        box_pattern.sub(
            lambda m: f"{m.group(1)}='<|box_start|>({m.group(2)},{m.group(3)})<|box_end|>'",
            action.strip(),
        )
        for action in actions
    ]

    return suffix + "\n\n".join(processed_actions)
40
+
41
+
42
def parse_actions(response: str) -> List[str]:
    """Extract the individual action strings from a UI-TARS model response.

    Args:
        response: The raw model response text

    Returns:
        List of parsed actions (empty when no "Action:" marker is present)
    """
    if "Action:" not in response:
        return []

    # Everything after the last "Action:" marker holds the action text.
    action_text = response.split("Action:")[-1].strip()

    # Multiple actions are separated by blank lines; drop empty fragments.
    return [part.strip() for part in action_text.split("\n\n") if part.strip()]
64
+
65
+
66
def parse_action_parameters(action: str) -> Tuple[str, Dict[str, Any]]:
    """Parse an action string into a tool-invocation name and arguments.

    Args:
        action: The action string to parse, e.g. "click(start_box='(10,20)')"

    Returns:
        Tuple of (action_name, tool_args), where tool_args is ready to pass
        to the computer tool. An unparseable action yields ("", {}).
    """
    # Handle "finished" action (task completion carries no parameters here)
    if action.startswith("finished"):
        return "finished", {}

    # Parse action parameters of the form name(param='...', ...)
    action_match = re.match(r"(\w+)\((.*)\)", action)
    if not action_match:
        logger.warning(f"Could not parse action: {action}")
        return "", {}

    action_name = action_match.group(1)
    action_params_str = action_match.group(2)

    tool_args: Dict[str, Any] = {"action": action_name}

    # Extract coordinate values from the action
    if "start_box" in action_params_str:
        # Coordinates may or may not be wrapped in <|box_start|>/<|box_end|> tokens.
        box_pattern = r"(start_box|end_box)='(?:<\|box_start\|>)?\((\d+),\s*(\d+)\)(?:<\|box_end\|>)?'"
        box_matches = re.findall(box_pattern, action_params_str)

        # Handle click-type actions: take coordinates from the first start_box.
        if action_name in ["click", "left_double", "right_single"]:
            for box_type, x, y in box_matches:
                if box_type == "start_box":
                    tool_args["x"] = int(x)
                    tool_args["y"] = int(y)
                    break

        # Handle drag action: needs both endpoints.
        elif action_name == "drag":
            start_x, start_y = None, None
            end_x, end_y = None, None

            for box_type, x, y in box_matches:
                if box_type == "start_box":
                    start_x, start_y = int(x), int(y)
                elif box_type == "end_box":
                    end_x, end_y = int(x), int(y)

            # Only emit drag coordinates when both endpoints were found.
            if None not in (start_x, start_y, end_x, end_y):
                tool_args["start_x"] = start_x
                tool_args["start_y"] = start_y
                tool_args["end_x"] = end_x
                tool_args["end_y"] = end_y

        # Handle scroll action: anchor point plus a direction.
        elif action_name == "scroll":
            for box_type, x, y in box_matches:
                if box_type == "start_box":
                    tool_args["x"] = int(x)
                    tool_args["y"] = int(y)
                    break

            direction_match = re.search(r"direction='([^']+)'", action_params_str)
            if direction_match:
                tool_args["direction"] = direction_match.group(1)

    # Handle typing text (no box coordinates in this action).
    elif action_name == "type":
        content_match = re.search(r"content='([^']*)'", action_params_str)
        if content_match:
            # Unescape characters the model escapes inside the quoted content.
            text = content_match.group(1).replace("\\'", "'").replace('\\"', '"').replace("\\n", "\n")
            # The computer tool exposes this as "type_text".
            tool_args = {"action": "type_text", "text": text}

    # Handle hotkey (space-separated key combination).
    elif action_name == "hotkey":
        key_match = re.search(r"key='([^']*)'", action_params_str)
        if key_match:
            keys = key_match.group(1).split()
            tool_args = {"action": "hotkey", "keys": keys}

    return action_name, tool_args
agent/ui/gradio/app.py CHANGED
@@ -162,6 +162,10 @@ MODEL_MAPPINGS = {
162
162
  "claude-3-5-sonnet-20240620": "claude-3-5-sonnet-20240620",
163
163
  "claude-3-7-sonnet-20250219": "claude-3-7-sonnet-20250219",
164
164
  },
165
+ "uitars": {
166
+ # UI-TARS models default to custom endpoint
167
+ "default": "ByteDance-Seed/UI-TARS-1.5-7B",
168
+ },
165
169
  "ollama": {
166
170
  # For Ollama models, we keep the original name
167
171
  "default": "llama3", # A common default model
@@ -191,6 +195,7 @@ def get_provider_and_model(model_name: str, loop_provider: str) -> tuple:
191
195
  "ANTHROPIC": AgentLoop.ANTHROPIC,
192
196
  "OMNI": AgentLoop.OMNI,
193
197
  "OMNI-OLLAMA": AgentLoop.OMNI, # Special case for Ollama models with OMNI parser
198
+ "UITARS": AgentLoop.UITARS, # UI-TARS implementation
194
199
  }
195
200
  agent_loop = loop_provider_map.get(loop_provider, AgentLoop.OPENAI)
196
201
 
@@ -281,7 +286,9 @@ def get_provider_and_model(model_name: str, loop_provider: str) -> tuple:
281
286
  # Assign the determined model name
282
287
  model_name_to_use = cleaned_model_name
283
288
  # agent_loop remains AgentLoop.OMNI
284
-
289
+ elif agent_loop == AgentLoop.UITARS:
290
+ provider = LLMProvider.OAICOMPAT
291
+ model_name_to_use = MODEL_MAPPINGS["uitars"]["default"] # Default
285
292
  else:
286
293
  # Default to OpenAI if unrecognized loop
287
294
  provider = LLMProvider.OPENAI
@@ -551,6 +558,7 @@ def create_gradio_ui(
551
558
  "OPENAI": openai_models,
552
559
  "ANTHROPIC": anthropic_models,
553
560
  "OMNI": omni_models + ["Custom model..."], # Add custom model option
561
+ "UITARS": ["Custom model..."], # UI-TARS options
554
562
  }
555
563
 
556
564
  # --- Apply Saved Settings (override defaults if available) ---
@@ -692,7 +700,7 @@ def create_gradio_ui(
692
700
  with gr.Accordion("Configuration", open=True):
693
701
  # Configuration options
694
702
  agent_loop = gr.Dropdown(
695
- choices=["OPENAI", "ANTHROPIC", "OMNI"],
703
+ choices=["OPENAI", "ANTHROPIC", "OMNI", "UITARS"],
696
704
  label="Agent Loop",
697
705
  value=initial_loop,
698
706
  info="Select the agent loop provider",
@@ -807,6 +815,8 @@ def create_gradio_ui(
807
815
  provider, cleaned_model_name_from_func, agent_loop_type = (
808
816
  get_provider_and_model(model_string_to_analyze, agent_loop_choice)
809
817
  )
818
+
819
+ print(f"provider={provider} cleaned_model_name_from_func={cleaned_model_name_from_func} agent_loop_type={agent_loop_type} agent_loop_choice={agent_loop_choice}")
810
820
 
811
821
  # Determine the final model name to send to the agent
812
822
  # If custom selected, use the custom text box value, otherwise use the cleaned name
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cua-agent
3
- Version: 0.1.29
3
+ Version: 0.1.31
4
4
  Summary: CUA (Computer Use) Agent for AI-driven computer interaction
5
5
  Author-Email: TryCua <gh@trycua.com>
6
6
  Requires-Python: >=3.10
@@ -21,6 +21,8 @@ Requires-Dist: boto3<2.0.0,>=1.35.81; extra == "anthropic"
21
21
  Provides-Extra: openai
22
22
  Requires-Dist: openai<2.0.0,>=1.14.0; extra == "openai"
23
23
  Requires-Dist: httpx<0.29.0,>=0.27.0; extra == "openai"
24
+ Provides-Extra: uitars
25
+ Requires-Dist: httpx<0.29.0,>=0.27.0; extra == "uitars"
24
26
  Provides-Extra: ui
25
27
  Requires-Dist: gradio<6.0.0,>=5.23.3; extra == "ui"
26
28
  Requires-Dist: python-dotenv<2.0.0,>=1.0.1; extra == "ui"
@@ -99,6 +101,7 @@ pip install "cua-agent[all]"
99
101
  # or install specific loop providers
100
102
  pip install "cua-agent[openai]" # OpenAI Cua Loop
101
103
  pip install "cua-agent[anthropic]" # Anthropic Cua Loop
104
+ pip install "cua-agent[uitars]" # UI-Tars support
102
105
  pip install "cua-agent[omni]" # Cua Loop based on OmniParser (includes Ollama for local models)
103
106
  pip install "cua-agent[ui]" # Gradio UI for the agent
104
107
  ```
@@ -118,6 +121,9 @@ async with Computer() as macos_computer:
118
121
  # or
119
122
  # loop=AgentLoop.OMNI,
120
123
  # model=LLM(provider=LLMProvider.OLLAMA, model="gemma3")
124
+ # or
125
+ # loop=AgentLoop.UITARS,
126
+ # model=LLM(provider=LLMProvider.OAICOMPAT, model="tgi", provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1")
121
127
  )
122
128
 
123
129
  tasks = [
@@ -143,7 +149,13 @@ Refer to these notebooks for step-by-step guides on how to use the Computer-Use
143
149
 
144
150
  ## Using the Gradio UI
145
151
 
146
- The agent includes a Gradio-based user interface for easy interaction. To use it:
152
+ The agent includes a Gradio-based user interface for easier interaction.
153
+
154
+ <div align="center">
155
+ <img src="../../img/agent_gradio_ui.png"/>
156
+ </div>
157
+
158
+ To use it:
147
159
 
148
160
  ```bash
149
161
  # Install with Gradio support
@@ -192,6 +204,10 @@ The Gradio UI provides:
192
204
  - Configuration of agent parameters
193
205
  - Chat interface for interacting with the agent
194
206
 
207
+ ### Using UI-TARS
208
+
209
+ You can use UI-TARS by first following the [deployment guide](https://github.com/bytedance/UI-TARS/blob/main/README_deploy.md). This will give you a provider URL like this: `https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1` which you can use in the gradio UI.
210
+
195
211
  ## Agent Loops
196
212
 
197
213
 The `cua-agent` package provides four agent loop variations, based on different CUA model providers and techniques:
@@ -200,6 +216,7 @@ The `cua-agent` package provides three agent loops variations, based on differen
200
216
  |:-----------|:-----------------|:------------|:-------------|
201
217
  | `AgentLoop.OPENAI` | • `computer_use_preview` | Use OpenAI Operator CUA model | Not Required |
202
218
  | `AgentLoop.ANTHROPIC` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219` | Use Anthropic Computer-Use | Not Required |
219
+ | `AgentLoop.UITARS` | • `ByteDance-Seed/UI-TARS-1.5-7B` | Uses ByteDance's UI-TARS 1.5 model | Not Required |
203
220
  | `AgentLoop.OMNI` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4`<br>• `phi4`<br>• `phi4-mini`<br>• `gemma3`<br>• `...`<br>• `Any Ollama or OpenAI-compatible model` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser |
204
221
 
205
222
  ## AgentResponse
@@ -241,25 +258,9 @@ async for result in agent.run(task):
241
258
  print(output)
242
259
  ```
243
260
 
244
- ### Gradio UI
245
-
246
- You can also interact with the agent using a Gradio interface.
247
-
248
- ```python
249
- # Ensure environment variables (e.g., API keys) are loaded
250
- # You might need a helper function like load_dotenv_files() if using .env
251
- # from utils import load_dotenv_files
252
- # load_dotenv_files()
253
-
254
- from agent.ui.gradio.app import create_gradio_ui
255
-
256
- app = create_gradio_ui()
257
- app.launch(share=False)
258
- ```
259
-
260
261
  **Note on Settings Persistence:**
261
262
 
262
263
  * The Gradio UI automatically saves your configuration (Agent Loop, Model Choice, Custom Base URL, Save Trajectory state, Recent Images count) to a file named `.gradio_settings.json` in the project's root directory when you successfully run a task.
263
264
  * This allows your preferences to persist between sessions.
264
265
  * API keys entered into the custom provider field are **not** saved in this file for security reasons. Manage API keys using environment variables (e.g., `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`) or a `.env` file.
265
- * It's recommended to add `.gradio_settings.json` to your `.gitignore` file.
266
+ * It's recommended to add `.gradio_settings.json` to your `.gitignore` file.