cua-agent 0.4.22__py3-none-any.whl → 0.7.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic; see the package's registry listing for more details.

Files changed (79)
  1. agent/__init__.py +4 -10
  2. agent/__main__.py +2 -1
  3. agent/adapters/__init__.py +4 -0
  4. agent/adapters/azure_ml_adapter.py +283 -0
  5. agent/adapters/cua_adapter.py +161 -0
  6. agent/adapters/huggingfacelocal_adapter.py +67 -125
  7. agent/adapters/human_adapter.py +116 -114
  8. agent/adapters/mlxvlm_adapter.py +110 -99
  9. agent/adapters/models/__init__.py +41 -0
  10. agent/adapters/models/generic.py +78 -0
  11. agent/adapters/models/internvl.py +290 -0
  12. agent/adapters/models/opencua.py +115 -0
  13. agent/adapters/models/qwen2_5_vl.py +78 -0
  14. agent/agent.py +337 -185
  15. agent/callbacks/__init__.py +9 -4
  16. agent/callbacks/base.py +45 -31
  17. agent/callbacks/budget_manager.py +22 -10
  18. agent/callbacks/image_retention.py +54 -98
  19. agent/callbacks/logging.py +55 -42
  20. agent/callbacks/operator_validator.py +35 -33
  21. agent/callbacks/otel.py +291 -0
  22. agent/callbacks/pii_anonymization.py +19 -16
  23. agent/callbacks/prompt_instructions.py +47 -0
  24. agent/callbacks/telemetry.py +99 -61
  25. agent/callbacks/trajectory_saver.py +95 -69
  26. agent/cli.py +269 -119
  27. agent/computers/__init__.py +14 -9
  28. agent/computers/base.py +32 -19
  29. agent/computers/cua.py +52 -25
  30. agent/computers/custom.py +78 -71
  31. agent/decorators.py +23 -14
  32. agent/human_tool/__init__.py +2 -7
  33. agent/human_tool/__main__.py +6 -2
  34. agent/human_tool/server.py +48 -37
  35. agent/human_tool/ui.py +359 -235
  36. agent/integrations/hud/__init__.py +38 -99
  37. agent/integrations/hud/agent.py +369 -0
  38. agent/integrations/hud/proxy.py +166 -52
  39. agent/loops/__init__.py +44 -14
  40. agent/loops/anthropic.py +579 -492
  41. agent/loops/base.py +19 -15
  42. agent/loops/composed_grounded.py +136 -150
  43. agent/loops/fara/__init__.py +8 -0
  44. agent/loops/fara/config.py +506 -0
  45. agent/loops/fara/helpers.py +357 -0
  46. agent/loops/fara/schema.py +143 -0
  47. agent/loops/gelato.py +183 -0
  48. agent/loops/gemini.py +935 -0
  49. agent/loops/generic_vlm.py +601 -0
  50. agent/loops/glm45v.py +140 -135
  51. agent/loops/gta1.py +48 -51
  52. agent/loops/holo.py +218 -0
  53. agent/loops/internvl.py +180 -0
  54. agent/loops/moondream3.py +493 -0
  55. agent/loops/omniparser.py +326 -226
  56. agent/loops/openai.py +50 -51
  57. agent/loops/opencua.py +134 -0
  58. agent/loops/uiins.py +175 -0
  59. agent/loops/uitars.py +247 -206
  60. agent/loops/uitars2.py +951 -0
  61. agent/playground/__init__.py +5 -0
  62. agent/playground/server.py +301 -0
  63. agent/proxy/examples.py +61 -57
  64. agent/proxy/handlers.py +46 -39
  65. agent/responses.py +447 -347
  66. agent/tools/__init__.py +24 -0
  67. agent/tools/base.py +253 -0
  68. agent/tools/browser_tool.py +423 -0
  69. agent/types.py +11 -5
  70. agent/ui/__init__.py +1 -1
  71. agent/ui/__main__.py +1 -1
  72. agent/ui/gradio/app.py +25 -22
  73. agent/ui/gradio/ui_components.py +314 -167
  74. cua_agent-0.7.16.dist-info/METADATA +85 -0
  75. cua_agent-0.7.16.dist-info/RECORD +79 -0
  76. {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
  77. cua_agent-0.4.22.dist-info/METADATA +0 -436
  78. cua_agent-0.4.22.dist-info/RECORD +0 -51
  79. {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/tools/browser_tool.py ADDED
@@ -0,0 +1,423 @@
1
+ """
2
+ Browser Tool for agent interactions.
3
+ Allows agents to control a browser programmatically via Playwright.
4
+ Implements the computer_use action interface for comprehensive browser control.
5
+ """
6
+
7
+ import asyncio
8
+ import logging
9
+ from typing import TYPE_CHECKING, Optional, Union
10
+
11
+ from .base import BaseComputerTool, register_tool
12
+
13
+ if TYPE_CHECKING:
14
+ from computer.interface import GenericComputerInterface
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+ @register_tool("computer_use")
20
+ class BrowserTool(BaseComputerTool):
21
+ """
22
+ Browser tool that uses the computer SDK's interface to control a browser.
23
+ Implements a comprehensive computer_use action interface for browser control.
24
+ """
25
+
26
+ def __init__(self, interface: "GenericComputerInterface", cfg: Optional[dict] = None):
27
+ """
28
+ Initialize the BrowserTool.
29
+
30
+ Args:
31
+ interface: A GenericComputerInterface instance that provides playwright_exec
32
+ cfg: Optional configuration dictionary
33
+ """
34
+ self.interface = interface
35
+ self._facts = [] # Store memorized facts
36
+
37
+ # Get initial screenshot to determine dimensions
38
+ self.viewport_width = None
39
+ self.viewport_height = None
40
+ self.resized_width = None
41
+ self.resized_height = None
42
+
43
+ # Try to initialize dimensions synchronously
44
+ try:
45
+ import asyncio
46
+
47
+ loop = asyncio.get_event_loop()
48
+ if loop.is_running():
49
+ # If we're in an async context, dimensions will be lazy-loaded
50
+ pass
51
+ else:
52
+ loop.run_until_complete(self._initialize_dimensions())
53
+ except Exception:
54
+ # Dimensions will be lazy-loaded on first use
55
+ pass
56
+
57
+ super().__init__(cfg)
58
+
59
+ async def _initialize_dimensions(self):
60
+ """Initialize viewport and resized dimensions from screenshot."""
61
+ try:
62
+ import base64
63
+ import io
64
+
65
+ from PIL import Image
66
+ from qwen_vl_utils import smart_resize
67
+
68
+ # Take a screenshot to get actual dimensions
69
+ screenshot_b64 = await self.screenshot()
70
+ img_bytes = base64.b64decode(screenshot_b64)
71
+ im = Image.open(io.BytesIO(img_bytes))
72
+
73
+ # Store actual viewport size
74
+ self.viewport_width = im.width
75
+ self.viewport_height = im.height
76
+
77
+ # Calculate resized dimensions using smart_resize with factor=28
78
+ MIN_PIXELS = 3136
79
+ MAX_PIXELS = 12845056
80
+ rh, rw = smart_resize(
81
+ im.height, im.width, factor=28, min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS
82
+ )
83
+ self.resized_width = rw
84
+ self.resized_height = rh
85
+
86
+ except Exception as e:
87
+ # Fall back to defaults if initialization fails
88
+ logger.warning(f"Failed to initialize dimensions: {e}")
89
+ self.viewport_width = 1024
90
+ self.viewport_height = 768
91
+ self.resized_width = 1024
92
+ self.resized_height = 768
93
+
94
+ async def _proc_coords(self, x: float, y: float) -> tuple:
95
+ """
96
+ Process coordinates by converting from resized space to viewport space.
97
+
98
+ Args:
99
+ x: X coordinate in resized space (0 to resized_width)
100
+ y: Y coordinate in resized space (0 to resized_height)
101
+
102
+ Returns:
103
+ Tuple of (viewport_x, viewport_y) in actual viewport pixels
104
+ """
105
+ # Ensure dimensions are initialized
106
+ if self.resized_width is None or self.resized_height is None:
107
+ await self._initialize_dimensions()
108
+
109
+ # Convert from resized space to viewport space
110
+ # Normalize by resized dimensions, then scale to viewport dimensions
111
+ viewport_x = (x / self.resized_width) * self.viewport_width
112
+ viewport_y = (y / self.resized_height) * self.viewport_height
113
+
114
+ return int(round(viewport_x)), int(round(viewport_y))
115
+
116
+ @property
117
+ def description(self) -> str:
118
+ # Use resized dimensions if available, otherwise use defaults
119
+ width = self.resized_width if self.resized_width is not None else 1024
120
+ height = self.resized_height if self.resized_height is not None else 768
121
+
122
+ return f"Use a mouse and keyboard to interact with a computer, and take screenshots.\
123
+ * This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications.\
124
+ * Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try wait and taking another screenshot.\
125
+ * The screen's resolution is {width}x{height}.\
126
+ * Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.\
127
+ * If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.\
128
+ * Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.\
129
+ * When a separate scrollable container prominently overlays the webpage, if you want to scroll within it, you typically need to mouse_move() over it first and then scroll().\
130
+ * If a popup window appears that you want to close, if left_click() on the 'X' or close button doesn't work, try key(keys=['Escape']) to close it.\
131
+ * On some search bars, when you type(), you may need to press_enter=False and instead separately call left_click() on the search button to submit the search query. This is especially true of search bars that have auto-suggest popups for e.g. locations\
132
+ * For calendar widgets, you usually need to left_click() on arrows to move between months and left_click() on dates to select them; type() is not typically used to input dates there.".strip()
133
+
134
+ @property
135
+ def parameters(self) -> dict:
136
+ return {
137
+ "type": "object",
138
+ "properties": {
139
+ "action": {
140
+ "description": """The action to perform. The available actions are:
141
+ * key: Performs key down presses on the arguments passed in order, then performs key releases in reverse order. Includes 'Enter', 'Alt', 'Shift', 'Tab', 'Control', 'Backspace', 'Delete', 'Escape', 'ArrowUp', 'ArrowDown', 'ArrowLeft', 'ArrowRight', 'PageDown', 'PageUp', 'Shift', etc.
142
+ * type: Type a string of text on the keyboard.
143
+ * mouse_move: Move the cursor to a specified (x, y) pixel coordinate on the screen.
144
+ * left_click: Click the left mouse button.
145
+ * scroll: Performs a scroll of the mouse scroll wheel.
146
+ * visit_url: Visit a specified URL.
147
+ * web_search: Perform a web search with a specified query.
148
+ * history_back: Go back to the previous page in the browser history.
149
+ * pause_and_memorize_fact: Pause and memorize a fact for future reference.
150
+ * wait: Wait specified seconds for the change to happen.
151
+ * terminate: Terminate the current task and report its completion status.""",
152
+ "enum": [
153
+ "key",
154
+ "type",
155
+ "mouse_move",
156
+ "left_click",
157
+ "scroll",
158
+ "visit_url",
159
+ "web_search",
160
+ "history_back",
161
+ "pause_and_memorize_fact",
162
+ "wait",
163
+ "terminate",
164
+ ],
165
+ "type": "string",
166
+ },
167
+ "keys": {"description": "Required only by action=key.", "type": "array"},
168
+ "text": {"description": "Required only by action=type.", "type": "string"},
169
+ "coordinate": {
170
+ "description": "(x, y) coordinates for mouse actions. Required only by action=left_click, action=mouse_move, and action=type.",
171
+ "type": "array",
172
+ },
173
+ "pixels": {
174
+ "description": "Amount of scrolling. Positive = up, Negative = down. Required only by action=scroll.",
175
+ "type": "number",
176
+ },
177
+ "url": {
178
+ "description": "The URL to visit. Required only by action=visit_url.",
179
+ "type": "string",
180
+ },
181
+ "query": {
182
+ "description": "The query to search for. Required only by action=web_search.",
183
+ "type": "string",
184
+ },
185
+ "fact": {
186
+ "description": "The fact to remember for the future. Required only by action=pause_and_memorize_fact.",
187
+ "type": "string",
188
+ },
189
+ "time": {
190
+ "description": "Seconds to wait. Required only by action=wait.",
191
+ "type": "number",
192
+ },
193
+ "status": {
194
+ "description": "Status of the task. Required only by action=terminate.",
195
+ "type": "string",
196
+ "enum": ["success", "failure"],
197
+ },
198
+ },
199
+ "required": ["action"],
200
+ }
201
+
202
+ def call(self, params: Union[str, dict], **kwargs) -> Union[str, dict]:
203
+ """
204
+ Execute a browser action.
205
+
206
+ Args:
207
+ params: Action parameters (JSON string or dict)
208
+ **kwargs: Additional keyword arguments
209
+
210
+ Returns:
211
+ Result of the action execution
212
+ """
213
+ # Verify and parse parameters
214
+ params_dict = self._verify_json_format_args(params)
215
+ action = params_dict.get("action")
216
+
217
+ if not action:
218
+ return {"success": False, "error": "action parameter is required"}
219
+
220
+ # Execute action synchronously by running async method in event loop
221
+ try:
222
+ loop = asyncio.get_event_loop()
223
+ if loop.is_running():
224
+ # If we're already in an async context, we can't use run_until_complete
225
+ # Create a task and wait for it
226
+ import concurrent.futures
227
+
228
+ with concurrent.futures.ThreadPoolExecutor() as executor:
229
+ future = executor.submit(asyncio.run, self._execute_action(action, params_dict))
230
+ result = future.result()
231
+ else:
232
+ result = loop.run_until_complete(self._execute_action(action, params_dict))
233
+ return result
234
+ except Exception as e:
235
+ logger.error(f"Error executing action {action}: {e}")
236
+ return {"success": False, "error": str(e)}
237
+
238
+ async def _execute_action(self, action: str, params: dict) -> dict:
239
+ """Execute the specific action asynchronously."""
240
+ try:
241
+ if action == "key":
242
+ return await self._action_key(params)
243
+ elif action == "type":
244
+ return await self._action_type(params)
245
+ elif action == "mouse_move":
246
+ return await self._action_mouse_move(params)
247
+ elif action == "left_click":
248
+ return await self._action_left_click(params)
249
+ elif action == "scroll":
250
+ return await self._action_scroll(params)
251
+ elif action == "visit_url":
252
+ return await self._action_visit_url(params)
253
+ elif action == "web_search":
254
+ return await self._action_web_search(params)
255
+ elif action == "history_back":
256
+ return await self._action_history_back(params)
257
+ elif action == "pause_and_memorize_fact":
258
+ return await self._action_pause_and_memorize_fact(params)
259
+ elif action == "wait":
260
+ return await self._action_wait(params)
261
+ elif action == "terminate":
262
+ return await self._action_terminate(params)
263
+ else:
264
+ return {"success": False, "error": f"Unknown action: {action}"}
265
+ except Exception as e:
266
+ logger.error(f"Error in action {action}: {e}")
267
+ return {"success": False, "error": str(e)}
268
+
269
+ async def _action_key(self, params: dict) -> dict:
270
+ """Press keys in sequence."""
271
+ keys = params.get("keys", [])
272
+ if not keys:
273
+ return {"success": False, "error": "keys parameter is required"}
274
+
275
+ # Convert keys to proper format and press via hotkey
276
+ try:
277
+ await self.interface.interface.hotkey(*keys)
278
+ return {"success": True, "message": f"Pressed keys: {keys}"}
279
+ except Exception as e:
280
+ return {"success": False, "error": str(e)}
281
+
282
+ async def _action_type(self, params: dict) -> dict:
283
+ """Type text."""
284
+ text = params.get("text")
285
+ if not text:
286
+ return {"success": False, "error": "text parameter is required"}
287
+
288
+ # If coordinate is provided, click there first
289
+ coordinate = params.get("coordinate")
290
+ if coordinate and len(coordinate) == 2:
291
+ await self.interface.playwright_exec("click", {"x": coordinate[0], "y": coordinate[1]})
292
+
293
+ result = await self.interface.playwright_exec("type", {"text": text})
294
+ return result
295
+
296
+ async def _action_mouse_move(self, params: dict) -> dict:
297
+ """Move mouse to coordinates."""
298
+ coordinate = params.get("coordinate")
299
+ if not coordinate or len(coordinate) != 2:
300
+ return {"success": False, "error": "coordinate parameter [x, y] is required"}
301
+
302
+ await self.interface.interface.move_cursor(coordinate[0], coordinate[1])
303
+ return {"success": True, "message": f"Moved cursor to {coordinate}"}
304
+
305
+ async def _action_left_click(self, params: dict) -> dict:
306
+ """Click at coordinates."""
307
+ coordinate = params.get("coordinate")
308
+ if not coordinate or len(coordinate) != 2:
309
+ return {"success": False, "error": "coordinate parameter [x, y] is required"}
310
+
311
+ result = await self.interface.playwright_exec(
312
+ "click", {"x": coordinate[0], "y": coordinate[1]}
313
+ )
314
+ return result
315
+
316
+ async def _action_scroll(self, params: dict) -> dict:
317
+ """Scroll the page."""
318
+ pixels = params.get("pixels", 0)
319
+ if pixels == 0:
320
+ return {"success": False, "error": "pixels parameter is required"}
321
+
322
+ # Positive = up (negative delta_y), Negative = down (positive delta_y)
323
+ result = await self.interface.playwright_exec("scroll", {"delta_x": 0, "delta_y": -pixels})
324
+ return result
325
+
326
+ async def _action_visit_url(self, params: dict) -> dict:
327
+ """Visit a URL."""
328
+ url = params.get("url")
329
+ if not url:
330
+ return {"success": False, "error": "url parameter is required"}
331
+
332
+ result = await self.interface.playwright_exec("visit_url", {"url": url})
333
+ return result
334
+
335
+ async def _action_web_search(self, params: dict) -> dict:
336
+ """Perform web search."""
337
+ query = params.get("query")
338
+ if not query:
339
+ return {"success": False, "error": "query parameter is required"}
340
+
341
+ result = await self.interface.playwright_exec("web_search", {"query": query})
342
+ return result
343
+
344
+ async def _action_history_back(self, params: dict) -> dict:
345
+ """Go back in browser history."""
346
+ # Press Alt+Left arrow key combination
347
+ try:
348
+ await self.interface.interface.hotkey("Alt", "ArrowLeft")
349
+ return {"success": True, "message": "Navigated back in history"}
350
+ except Exception as e:
351
+ return {"success": False, "error": str(e)}
352
+
353
+ async def _action_pause_and_memorize_fact(self, params: dict) -> dict:
354
+ """Memorize a fact."""
355
+ fact = params.get("fact")
356
+ if not fact:
357
+ return {"success": False, "error": "fact parameter is required"}
358
+
359
+ self._facts.append(fact)
360
+ return {
361
+ "success": True,
362
+ "message": f"Memorized fact: {fact}",
363
+ "total_facts": len(self._facts),
364
+ }
365
+
366
+ async def _action_wait(self, params: dict) -> dict:
367
+ """Wait for specified seconds."""
368
+ time = params.get("time", 0)
369
+ if time <= 0:
370
+ return {"success": False, "error": "time parameter must be positive"}
371
+
372
+ await asyncio.sleep(time)
373
+ return {"success": True, "message": f"Waited {time} seconds"}
374
+
375
+ async def _action_terminate(self, params: dict) -> dict:
376
+ """Terminate and report status."""
377
+ status = params.get("status", "success")
378
+ message = f"Task terminated with status: {status}"
379
+
380
+ if self._facts:
381
+ message += f"\nMemorized facts: {self._facts}"
382
+
383
+ return {"success": True, "status": status, "message": message, "terminated": True}
384
+
385
+ # Legacy methods for backward compatibility
386
+ async def visit_url(self, url: str) -> dict:
387
+ """Navigate to a URL."""
388
+ return await self._action_visit_url({"url": url})
389
+
390
+ async def click(self, x: int, y: int) -> dict:
391
+ """Click at coordinates."""
392
+ return await self._action_left_click({"coordinate": [x, y]})
393
+
394
+ async def type(self, text: str) -> dict:
395
+ """Type text into the focused element."""
396
+ return await self._action_type({"text": text})
397
+
398
+ async def scroll(self, delta_x: int, delta_y: int) -> dict:
399
+ """Scroll the page."""
400
+ return await self._action_scroll({"pixels": -delta_y})
401
+
402
+ async def web_search(self, query: str) -> dict:
403
+ """Navigate to a Google search for the query."""
404
+ return await self._action_web_search({"query": query})
405
+
406
+ async def screenshot(self) -> str:
407
+ """Take a screenshot of the current browser page."""
408
+ result = await self.interface.playwright_exec("screenshot", {})
409
+ if result.get("success") and result.get("screenshot"):
410
+ screenshot_b64 = result["screenshot"]
411
+ return screenshot_b64
412
+ else:
413
+ error = result.get("error", "Unknown error")
414
+ raise RuntimeError(f"Failed to take screenshot: {error}")
415
+
416
+ async def get_current_url(self) -> str:
417
+ """Get the current URL of the browser page."""
418
+ result = await self.interface.playwright_exec("get_current_url", {})
419
+ if result.get("success") and result.get("url"):
420
+ return result["url"]
421
+ else:
422
+ error = result.get("error", "Unknown error")
423
+ raise RuntimeError(f"Failed to get current URL: {error}")
agent/types.py CHANGED
@@ -2,37 +2,43 @@
2
2
  Type definitions for agent
3
3
  """
4
4
 
5
- from typing import Dict, List, Any, Optional, Callable, Protocol, Literal
6
- from pydantic import BaseModel
7
5
  import re
8
- from litellm import ResponseInputParam, ResponsesAPIResponse, ToolParam
9
6
  from collections.abc import Iterable
7
+ from typing import Any, Callable, Dict, List, Literal, Optional, Protocol
8
+
9
+ from litellm import ResponseInputParam, ResponsesAPIResponse, ToolParam
10
+ from pydantic import BaseModel
10
11
 
11
12
  # Agent input types
12
13
  Messages = str | ResponseInputParam | List[Dict[str, Any]]
13
14
  Tools = Optional[Iterable[ToolParam]]
14
15
 
15
16
  # Agent output types
16
- AgentResponse = ResponsesAPIResponse
17
+ AgentResponse = ResponsesAPIResponse
17
18
  AgentCapability = Literal["step", "click"]
18
19
 
20
+
19
21
  # Exception types
20
22
  class ToolError(RuntimeError):
21
23
  """Base exception for tool-related errors"""
24
+
22
25
  pass
23
26
 
27
+
24
28
  class IllegalArgumentError(ToolError):
25
29
  """Exception raised when function arguments are invalid"""
30
+
26
31
  pass
27
32
 
28
33
 
29
34
  # Agent config registration
30
35
  class AgentConfigInfo(BaseModel):
31
36
  """Information about a registered agent config"""
37
+
32
38
  agent_class: type
33
39
  models_regex: str
34
40
  priority: int = 0
35
-
41
+
36
42
  def matches_model(self, model: str) -> bool:
37
43
  """Check if this agent config matches the given model"""
38
44
  return bool(re.match(self.models_regex, model))
agent/ui/__init__.py CHANGED
@@ -2,6 +2,6 @@
2
2
  UI components for agent
3
3
  """
4
4
 
5
- from .gradio import launch_ui, create_gradio_ui
5
+ from .gradio import create_gradio_ui, launch_ui
6
6
 
7
7
  __all__ = ["launch_ui", "create_gradio_ui"]
agent/ui/__main__.py CHANGED
@@ -1,4 +1,4 @@
1
1
  from .gradio import launch_ui
2
2
 
3
3
  if __name__ == "__main__":
4
- launch_ui()
4
+ launch_ui()
agent/ui/gradio/app.py CHANGED
@@ -6,9 +6,9 @@ with an advanced UI for model selection and configuration.
6
6
 
7
7
  Supported Agent Models:
8
8
  - OpenAI: openai/computer-use-preview
9
- - Anthropic: anthropic/claude-3-5-sonnet-20241022, anthropic/claude-3-7-sonnet-20250219
9
+ - Anthropic: anthropic/claude-sonnet-4-5-20250929, anthropic/claude-3-7-sonnet-20250219
10
10
  - UI-TARS: huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
11
- - Omniparser: omniparser+anthropic/claude-3-5-sonnet-20241022, omniparser+ollama_chat/gemma3
11
+ - Omniparser: omniparser+anthropic/claude-sonnet-4-5-20250929, omniparser+ollama_chat/gemma3
12
12
 
13
13
  Requirements:
14
14
  - Mac with Apple Silicon (M1/M2/M3/M4), Linux, or Windows
@@ -18,21 +18,21 @@ Requirements:
18
18
  - OpenAI or Anthropic API key
19
19
  """
20
20
 
21
- import os
22
21
  import asyncio
23
- import logging
24
22
  import json
23
+ import logging
24
+ import os
25
25
  import platform
26
26
  from pathlib import Path
27
- from typing import Dict, List, Optional, AsyncGenerator, Any, Tuple, Union
27
+ from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union, cast
28
+
28
29
  import gradio as gr
29
- from gradio.components.chatbot import MetadataDict
30
- from typing import cast
31
30
 
32
31
  # Import from agent package
33
32
  from agent import ComputerAgent
34
- from agent.types import Messages, AgentResponse
33
+ from agent.types import AgentResponse, Messages
35
34
  from computer import Computer
35
+ from gradio.components.chatbot import MetadataDict
36
36
 
37
37
  # Global variables
38
38
  global_agent = None
@@ -42,11 +42,13 @@ SETTINGS_FILE = Path(".gradio_settings.json")
42
42
  logging.basicConfig(level=logging.INFO)
43
43
 
44
44
  import dotenv
45
+
45
46
  if dotenv.load_dotenv():
46
47
  print(f"DEBUG - Loaded environment variables from {dotenv.find_dotenv()}")
47
48
  else:
48
49
  print("DEBUG - No .env file found")
49
50
 
51
+
50
52
  # --- Settings Load/Save Functions ---
51
53
  def load_settings() -> Dict[str, Any]:
52
54
  """Loads settings from the JSON file."""
@@ -84,7 +86,7 @@ def save_settings(settings: Dict[str, Any]):
84
86
  # async def on_screenshot(self, screenshot_base64: str, action_type: str = "") -> None:
85
87
  # """Add screenshot to chatbot when a screenshot is taken."""
86
88
  # image_markdown = f"![Screenshot after {action_type}](data:image/png;base64,{screenshot_base64})"
87
-
89
+
88
90
  # if self.chatbot_history is not None:
89
91
  # self.chatbot_history.append(
90
92
  # gr.ChatMessage(
@@ -114,14 +116,12 @@ MODEL_MAPPINGS = {
114
116
  "Anthropic: Claude 4 Opus (20250514)": "anthropic/claude-opus-4-20250514",
115
117
  "Anthropic: Claude 4 Sonnet (20250514)": "anthropic/claude-sonnet-4-20250514",
116
118
  "Anthropic: Claude 3.7 Sonnet (20250219)": "anthropic/claude-3-7-sonnet-20250219",
117
- "Anthropic: Claude 3.5 Sonnet (20241022)": "anthropic/claude-3-5-sonnet-20241022",
118
119
  },
119
120
  "omni": {
120
121
  "default": "omniparser+openai/gpt-4o",
121
122
  "OMNI: OpenAI GPT-4o": "omniparser+openai/gpt-4o",
122
123
  "OMNI: OpenAI GPT-4o mini": "omniparser+openai/gpt-4o-mini",
123
124
  "OMNI: Claude 3.7 Sonnet (20250219)": "omniparser+anthropic/claude-3-7-sonnet-20250219",
124
- "OMNI: Claude 3.5 Sonnet (20241022)": "omniparser+anthropic/claude-3-5-sonnet-20241022",
125
125
  },
126
126
  "uitars": {
127
127
  "default": "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B" if is_mac else "ui-tars",
@@ -141,7 +141,7 @@ def get_model_string(model_name: str, loop_provider: str) -> str:
141
141
  ollama_model = model_name.split("OMNI: Ollama ", 1)[1]
142
142
  return f"omniparser+ollama_chat/{ollama_model}"
143
143
  return "omniparser+ollama_chat/llama3"
144
-
144
+
145
145
  # Map based on loop provider
146
146
  mapping = MODEL_MAPPINGS.get(loop_provider.lower(), MODEL_MAPPINGS["openai"])
147
147
  return mapping.get(model_name, mapping["default"])
@@ -151,6 +151,7 @@ def get_ollama_models() -> List[str]:
151
151
  """Get available models from Ollama if installed."""
152
152
  try:
153
153
  import subprocess
154
+
154
155
  result = subprocess.run(["ollama", "list"], capture_output=True, text=True)
155
156
  if result.returncode == 0:
156
157
  lines = result.stdout.strip().split("\n")
@@ -174,16 +175,14 @@ def create_computer_instance(
174
175
  os_type: str = "macos",
175
176
  provider_type: str = "lume",
176
177
  name: Optional[str] = None,
177
- api_key: Optional[str] = None
178
+ api_key: Optional[str] = None,
178
179
  ) -> Computer:
179
180
  """Create or get the global Computer instance."""
180
181
  global global_computer
181
182
  if global_computer is None:
182
183
  if provider_type == "localhost":
183
184
  global_computer = Computer(
184
- verbosity=verbosity,
185
- os_type=os_type,
186
- use_host_computer_server=True
185
+ verbosity=verbosity, os_type=os_type, use_host_computer_server=True
187
186
  )
188
187
  else:
189
188
  global_computer = Computer(
@@ -191,7 +190,7 @@ def create_computer_instance(
191
190
  os_type=os_type,
192
191
  provider_type=provider_type,
193
192
  name=name if name else "",
194
- api_key=api_key
193
+ api_key=api_key,
195
194
  )
196
195
  return global_computer
197
196
 
@@ -217,7 +216,7 @@ def create_agent(
217
216
  os_type=computer_os,
218
217
  provider_type=computer_provider,
219
218
  name=computer_name,
220
- api_key=computer_api_key
219
+ api_key=computer_api_key,
221
220
  )
222
221
 
223
222
  # Handle custom models
@@ -233,12 +232,15 @@ def create_agent(
233
232
  "only_n_most_recent_images": only_n_most_recent_images,
234
233
  "verbosity": verbosity,
235
234
  }
236
-
235
+
237
236
  if save_trajectory:
238
237
  agent_kwargs["trajectory_dir"] = "trajectories"
239
-
238
+
240
239
  if max_trajectory_budget:
241
- agent_kwargs["max_trajectory_budget"] = {"max_budget": max_trajectory_budget, "raise_error": True}
240
+ agent_kwargs["max_trajectory_budget"] = {
241
+ "max_budget": max_trajectory_budget,
242
+ "raise_error": True,
243
+ }
242
244
 
243
245
  global_agent = ComputerAgent(**agent_kwargs)
244
246
  return global_agent
@@ -247,7 +249,8 @@ def create_agent(
247
249
  def launch_ui():
248
250
  """Standalone function to launch the Gradio app."""
249
251
  from agent.ui.gradio.ui_components import create_gradio_ui
250
- print(f"Starting Gradio app for CUA Agent...")
252
+
253
+ print("Starting Gradio app for Cua Agent...")
251
254
  demo = create_gradio_ui()
252
255
  demo.launch(share=False, inbrowser=True)
253
256