cua-agent 0.1.30__tar.gz → 0.1.32__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (84)
  1. {cua_agent-0.1.30 → cua_agent-0.1.32}/PKG-INFO +11 -4
  2. {cua_agent-0.1.30 → cua_agent-0.1.32}/README.md +10 -3
  3. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/loop.py +2 -0
  4. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/tools/computer.py +11 -9
  5. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/clients/oaicompat.py +12 -2
  6. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/loop.py +2 -0
  7. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/openai/loop.py +4 -0
  8. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/openai/tools/computer.py +44 -7
  9. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/uitars/clients/oaicompat.py +24 -16
  10. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/uitars/loop.py +18 -39
  11. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/uitars/prompts.py +5 -1
  12. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/uitars/tools/computer.py +6 -2
  13. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/uitars/utils.py +112 -1
  14. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/ui/gradio/app.py +58 -127
  15. {cua_agent-0.1.30 → cua_agent-0.1.32}/pyproject.toml +3 -3
  16. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/__init__.py +0 -0
  17. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/__init__.py +0 -0
  18. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/agent.py +0 -0
  19. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/base.py +0 -0
  20. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/callbacks.py +0 -0
  21. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/experiment.py +0 -0
  22. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/factory.py +0 -0
  23. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/messages.py +0 -0
  24. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/provider_config.py +0 -0
  25. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/telemetry.py +0 -0
  26. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/tools/__init__.py +0 -0
  27. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/tools/base.py +0 -0
  28. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/tools/bash.py +0 -0
  29. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/tools/collection.py +0 -0
  30. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/tools/computer.py +0 -0
  31. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/tools/edit.py +0 -0
  32. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/tools/manager.py +0 -0
  33. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/tools.py +0 -0
  34. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/types.py +0 -0
  35. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/visualization.py +0 -0
  36. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/__init__.py +0 -0
  37. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/__init__.py +0 -0
  38. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/api/client.py +0 -0
  39. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/api/logging.py +0 -0
  40. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/api_handler.py +0 -0
  41. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/callbacks/__init__.py +0 -0
  42. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/callbacks/manager.py +0 -0
  43. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/prompts.py +0 -0
  44. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/response_handler.py +0 -0
  45. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/tools/__init__.py +0 -0
  46. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/tools/base.py +0 -0
  47. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/tools/bash.py +0 -0
  48. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/tools/collection.py +0 -0
  49. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/tools/edit.py +0 -0
  50. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/tools/manager.py +0 -0
  51. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/tools/run.py +0 -0
  52. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/types.py +0 -0
  53. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/utils.py +0 -0
  54. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/__init__.py +0 -0
  55. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/api_handler.py +0 -0
  56. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/clients/anthropic.py +0 -0
  57. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/clients/base.py +0 -0
  58. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/clients/ollama.py +0 -0
  59. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/clients/openai.py +0 -0
  60. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/clients/utils.py +0 -0
  61. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/image_utils.py +0 -0
  62. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/parser.py +0 -0
  63. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/prompts.py +0 -0
  64. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/tools/__init__.py +0 -0
  65. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/tools/base.py +0 -0
  66. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/tools/bash.py +0 -0
  67. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/tools/computer.py +0 -0
  68. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/tools/manager.py +0 -0
  69. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/utils.py +0 -0
  70. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/openai/__init__.py +0 -0
  71. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/openai/api_handler.py +0 -0
  72. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/openai/response_handler.py +0 -0
  73. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/openai/tools/__init__.py +0 -0
  74. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/openai/tools/base.py +0 -0
  75. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/openai/tools/manager.py +0 -0
  76. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/openai/types.py +0 -0
  77. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/openai/utils.py +0 -0
  78. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/uitars/__init__.py +0 -0
  79. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/uitars/clients/base.py +0 -0
  80. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/uitars/tools/__init__.py +0 -0
  81. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/uitars/tools/manager.py +0 -0
  82. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/telemetry.py +0 -0
  83. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/ui/__init__.py +0 -0
  84. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/ui/gradio/__init__.py +0 -0

--- cua_agent-0.1.30/PKG-INFO
+++ cua_agent-0.1.32/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cua-agent
-Version: 0.1.30
+Version: 0.1.32
 Summary: CUA (Computer Use) Agent for AI-driven computer interaction
 Author-Email: TryCua <gh@trycua.com>
 Requires-Python: >=3.10
@@ -101,6 +101,7 @@ pip install "cua-agent[all]"
 # or install specific loop providers
 pip install "cua-agent[openai]" # OpenAI Cua Loop
 pip install "cua-agent[anthropic]" # Anthropic Cua Loop
+pip install "cua-agent[uitars]" # UI-Tars support
 pip install "cua-agent[omni]" # Cua Loop based on OmniParser (includes Ollama for local models)
 pip install "cua-agent[ui]" # Gradio UI for the agent
 ```
@@ -119,10 +120,10 @@ async with Computer() as macos_computer:
 # model=LLM(provider=LLMProvider.ANTHROPIC)
 # or
 # loop=AgentLoop.OMNI,
-# model=LLM(provider=LLMProvider.OLLAMA, model="gemma3")
+# model=LLM(provider=LLMProvider.OLLAMA, name="gemma3")
 # or
 # loop=AgentLoop.UITARS,
-# model=LLM(provider=LLMProvider.OAICOMPAT, model="tgi", provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1")
+# model=LLM(provider=LLMProvider.OAICOMPAT, name="ByteDance-Seed/UI-TARS-1.5-7B", provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1")
 )

 tasks = [
@@ -148,7 +149,13 @@ Refer to these notebooks for step-by-step guides on how to use the Computer-Use

 ## Using the Gradio UI

-The agent includes a Gradio-based user interface for easy interaction. To use it:
+The agent includes a Gradio-based user interface for easier interaction.
+
+<div align="center">
+<img src="../../img/agent_gradio_ui.png"/>
+</div>
+
+To use it:

 ```bash
 # Install with Gradio support

--- cua_agent-0.1.30/README.md
+++ cua_agent-0.1.32/README.md
@@ -31,6 +31,7 @@ pip install "cua-agent[all]"
 # or install specific loop providers
 pip install "cua-agent[openai]" # OpenAI Cua Loop
 pip install "cua-agent[anthropic]" # Anthropic Cua Loop
+pip install "cua-agent[uitars]" # UI-Tars support
 pip install "cua-agent[omni]" # Cua Loop based on OmniParser (includes Ollama for local models)
 pip install "cua-agent[ui]" # Gradio UI for the agent
 ```
@@ -49,10 +50,10 @@ async with Computer() as macos_computer:
 # model=LLM(provider=LLMProvider.ANTHROPIC)
 # or
 # loop=AgentLoop.OMNI,
-# model=LLM(provider=LLMProvider.OLLAMA, model="gemma3")
+# model=LLM(provider=LLMProvider.OLLAMA, name="gemma3")
 # or
 # loop=AgentLoop.UITARS,
-# model=LLM(provider=LLMProvider.OAICOMPAT, model="tgi", provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1")
+# model=LLM(provider=LLMProvider.OAICOMPAT, name="ByteDance-Seed/UI-TARS-1.5-7B", provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1")
 )

 tasks = [
@@ -78,7 +79,13 @@ Refer to these notebooks for step-by-step guides on how to use the Computer-Use

 ## Using the Gradio UI

-The agent includes a Gradio-based user interface for easy interaction. To use it:
+The agent includes a Gradio-based user interface for easier interaction.
+
+<div align="center">
+<img src="../../img/agent_gradio_ui.png"/>
+</div>
+
+To use it:

 ```bash
 # Install with Gradio support
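
Note: both the PKG-INFO and README hunks above reflect an API change in the example code: the model string is now passed to LLM as the `name` keyword rather than `model`, and the UI-TARS example targets a concrete checkpoint instead of the generic "tgi" endpoint name. A minimal sketch of the updated construction, assuming the top-level exports used in the README examples (the endpoint URL is a placeholder):

    from agent import LLM, LLMProvider  # assumed top-level exports

    # Local Ollama model: the model string is now the `name` keyword
    ollama_llm = LLM(provider=LLMProvider.OLLAMA, name="gemma3")

    # UI-TARS served from any OpenAI-compatible endpoint
    uitars_llm = LLM(
        provider=LLMProvider.OAICOMPAT,
        name="ByteDance-Seed/UI-TARS-1.5-7B",
        provider_base_url="https://<endpoint>.us-east-1.aws.endpoints.huggingface.cloud/v1",
    )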

--- cua_agent-0.1.30/agent/providers/anthropic/loop.py
+++ cua_agent-0.1.32/agent/providers/anthropic/loop.py
@@ -279,6 +279,8 @@ class AnthropicLoop(BaseLoop):
                 messages,
                 model=self.model,
             )
+            # Log standardized response for ease of parsing
+            self._log_api_call("agent_response", request=None, response=openai_compatible_response)
             await queue.put(openai_compatible_response)

             if not should_continue:

--- cua_agent-0.1.30/agent/providers/anthropic/tools/computer.py
+++ cua_agent-0.1.32/agent/providers/anthropic/tools/computer.py
@@ -161,15 +161,17 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
             self.logger.info(f"Moving cursor to ({x}, {y})")
             await self.computer.interface.move_cursor(x, y)
         elif action == "left_click_drag":
-            self.logger.info(f"Dragging from ({x}, {y})")
-            # First move to the position
-            await self.computer.interface.move_cursor(x, y)
-            # Then perform drag operation - check if drag_to exists or we need to use other methods
-            try:
-                await self.computer.interface.drag_to(x, y)
-            except Exception as e:
-                self.logger.error(f"Error during drag operation: {str(e)}")
-                raise ToolError(f"Failed to perform drag: {str(e)}")
+            # Get the start coordinate from kwargs
+            start_coordinate = kwargs.get("start_coordinate")
+            if not start_coordinate:
+                raise ToolError("start_coordinate is required for left_click_drag action")
+
+            start_x, start_y = start_coordinate
+            end_x, end_y = x, y
+
+            self.logger.info(f"Dragging from ({start_x}, {start_y}) to ({end_x}, {end_y})")
+            await self.computer.interface.move_cursor(start_x, start_y)
+            await self.computer.interface.drag_to(end_x, end_y)

         # Wait briefly for any UI changes
         await asyncio.sleep(0.5)
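
Note: the rewritten left_click_drag branch drags between two explicit points instead of moving to (x, y) and dragging to that same spot, and it now fails fast when the start point is missing. A sketch of the tool input this branch expects (the field name for the end point is inferred from the surrounding handler, so treat it as an assumption):

    # Hypothetical Anthropic tool-use input for the new drag behavior
    tool_input = {
        "action": "left_click_drag",
        "start_coordinate": (120, 300),  # now required; ToolError if missing
        "coordinate": (480, 300),        # end point, unpacked to x, y by the handler
    }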

--- cua_agent-0.1.30/agent/providers/omni/clients/oaicompat.py
+++ cua_agent-0.1.32/agent/providers/omni/clients/oaicompat.py
@@ -93,7 +93,14 @@ class OAICompatClient(BaseOmniClient):
         """
         headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}

-        final_messages = [{"role": "system", "content": system}]
+        final_messages = [
+            {
+                "role": "system",
+                "content": [
+                    { "type": "text", "text": system }
+                ]
+            }
+        ]

         # Process messages
         for item in messages:
@@ -117,7 +124,10 @@ class OAICompatClient(BaseOmniClient):
                 else:
                     message = {
                         "role": item["role"],
-                        "content": [{"type": "text", "text": item["content"]}],
+                        "content": [{
+                            "type": "text",
+                            "text": item["content"]
+                        }],
                     }
                     final_messages.append(message)
             else:

--- cua_agent-0.1.30/agent/providers/omni/loop.py
+++ cua_agent-0.1.32/agent/providers/omni/loop.py
@@ -670,6 +670,8 @@ class OmniLoop(BaseLoop):
                     parsed_screen=parsed_screen,
                     parser=self.parser
                 )
+                # Log standardized response for ease of parsing
+                self._log_api_call("agent_response", request=None, response=openai_compatible_response)

                 # Yield the response to the caller
                 yield openai_compatible_response

--- cua_agent-0.1.30/agent/providers/openai/loop.py
+++ cua_agent-0.1.32/agent/providers/openai/loop.py
@@ -276,6 +276,10 @@ class OpenAILoop(BaseLoop):
             )
             # Don't reset last_response_id to None - keep the previous value if available

+
+            # Log standardized response for ease of parsing
+            # Since this is the openAI responses format, we don't need to convert it to agent response format
+            self._log_api_call("agent_response", request=None, response=response)
             # Process API response
             await queue.put(response)


--- cua_agent-0.1.30/agent/providers/openai/tools/computer.py
+++ cua_agent-0.1.32/agent/providers/openai/tools/computer.py
@@ -44,6 +44,7 @@ Action = Literal[
     "double_click",
     "screenshot",
     "scroll",
+    "drag",
 ]


@@ -162,9 +163,14 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
             y = kwargs.get("y")
             if x is None or y is None:
                 raise ToolError("x and y coordinates are required for scroll action")
-            scroll_x = kwargs.get("scroll_x", 0) // 20
-            scroll_y = kwargs.get("scroll_y", 0) // 20
+            scroll_x = kwargs.get("scroll_x", 0) // 50
+            scroll_y = kwargs.get("scroll_y", 0) // 50
             return await self.handle_scroll(x, y, scroll_x, scroll_y)
+        elif type == "drag":
+            path = kwargs.get("path")
+            if not path or not isinstance(path, list) or len(path) < 2:
+                raise ToolError("path is required for drag action and must contain at least 2 points")
+            return await self.handle_drag(path)
         elif type == "screenshot":
             return await self.screenshot()
         elif type == "wait":
@@ -240,11 +246,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):

             if len(mapped_keys) > 1:
                 # For key combinations (like Ctrl+C)
-                for k in mapped_keys:
-                    await self.computer.interface.press_key(k)
-                    await asyncio.sleep(0.1)
-                for k in reversed(mapped_keys):
-                    await self.computer.interface.press_key(k)
+                await self.computer.interface.hotkey(*mapped_keys)
             else:
                 # Single key press
                 await self.computer.interface.press_key(mapped_keys[0])
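
Note: the old key-combination path pressed each key individually (and then again in reverse), which taps keys one after another instead of holding the chord. The replacement sends the combination atomically through the interface's hotkey method, which the hunk above confirms exists. A sketch of the difference, assuming a connected computer instance from the companion computer package:

    async def copy_selection(computer) -> None:
        # Old behavior (roughly): sequential taps, so the modifier is never held
        #   await computer.interface.press_key("ctrl")
        #   await computer.interface.press_key("c")
        # New behavior: the whole chord is sent as one combination
        await computer.interface.hotkey("ctrl", "c")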

@@ -306,6 +308,41 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
             self.logger.error(f"Error in handle_scroll: {str(e)}")
             raise ToolError(f"Failed to scroll at ({x}, {y}): {str(e)}")

+    async def handle_drag(self, path: List[Dict[str, int]]) -> ToolResult:
+        """Handle mouse drag operation using a path of coordinates.
+
+        Args:
+            path: List of coordinate points {"x": int, "y": int} defining the drag path
+
+        Returns:
+            ToolResult with the operation result and screenshot
+        """
+        try:
+            # Convert from [{"x": x, "y": y}, ...] format to [(x, y), ...] format
+            points = [(p["x"], p["y"]) for p in path]
+
+            # Perform drag action
+            if len(points) == 2:
+                await self.computer.interface.move_cursor(points[0][0], points[0][1])
+                await self.computer.interface.drag_to(points[1][0], points[1][1])
+            else:
+                await self.computer.interface.drag(points, button="left")
+
+            # Wait for UI to update
+            await asyncio.sleep(0.5)
+
+            # Take screenshot after action
+            screenshot = await self.computer.interface.screenshot()
+            base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
+
+            return ToolResult(
+                output=f"Dragged from ({path[0]['x']}, {path[0]['y']}) to ({path[-1]['x']}, {path[-1]['y']})",
+                base64_image=base64_screenshot,
+            )
+        except Exception as e:
+            self.logger.error(f"Error in handle_drag: {str(e)}")
+            raise ToolError(f"Failed to perform drag operation: {str(e)}")
+
     async def screenshot(self) -> ToolResult:
         """Take a screenshot."""
         try:
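
Note: the new drag action accepts a path of at least two points; a two-point path degrades to move_cursor plus drag_to, while longer paths go through the multi-point interface.drag call. A sketch of the payload shape handle_drag consumes:

    # Two-point drag: move to the first point, then drag_to the second
    simple_path = [{"x": 100, "y": 200}, {"x": 400, "y": 200}]

    # Multi-point drag: routed to interface.drag(points, button="left")
    freehand_path = [
        {"x": 100, "y": 200},
        {"x": 180, "y": 260},
        {"x": 260, "y": 240},
    ]
    # result = await tool.handle_drag(freehand_path)  # `tool` is a ComputerTool instance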

--- cua_agent-0.1.30/agent/providers/uitars/clients/oaicompat.py
+++ cua_agent-0.1.32/agent/providers/uitars/clients/oaicompat.py
@@ -94,8 +94,15 @@ class OAICompatClient(BaseUITarsClient):
         """
         headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}

-        final_messages = [{"role": "system", "content": system}]
-
+        final_messages = [
+            {
+                "role": "system",
+                "content": [
+                    { "type": "text", "text": system }
+                ]
+            }
+        ]
+
         # Process messages
         for item in messages:
             if isinstance(item, dict):
@@ -138,8 +145,13 @@ class OAICompatClient(BaseUITarsClient):
                 message = {"role": "user", "content": [{"type": "text", "text": item}]}
                 final_messages.append(message)

-        payload = {"model": self.model, "messages": final_messages, "temperature": self.temperature}
-        payload["max_tokens"] = max_tokens or self.max_tokens
+        payload = {
+            "model": self.model,
+            "messages": final_messages,
+            "max_tokens": max_tokens or self.max_tokens,
+            "temperature": self.temperature,
+            "top_p": 0.7,
+        }

        try:
            async with aiohttp.ClientSession() as session:
@@ -178,25 +190,21 @@ class OAICompatClient(BaseUITarsClient):
                    response_text = await response.text()
                    logger.debug(f"Response content: {response_text}")

+                    # if 503, then the endpoint is still warming up
+                    if response.status == 503:
+                        logger.error(f"Endpoint is still warming up, please try again later")
+                        raise Exception(f"Endpoint is still warming up: {response_text}")
+
                    # Try to parse as JSON if the content type is appropriate
                    if "application/json" in response.headers.get('Content-Type', ''):
                        response_json = await response.json()
                    else:
                        raise Exception(f"Response is not JSON format")
-                        # # Optionally try to parse it anyway
-                        # try:
-                        #     import json
-                        #     response_json = json.loads(response_text)
-                        # except json.JSONDecodeError as e:
-                        #     print(f"Failed to parse response as JSON: {e}")

                    if response.status != 200:
-                        error_msg = response_json.get("error", {}).get(
-                            "message", str(response_json)
-                        )
-                        logger.error(f"Error in API call: {error_msg}")
-                        raise Exception(f"API error: {error_msg}")
-
+                        logger.error(f"Error in API call: {response_text}")
+                        raise Exception(f"API error: {response_text}")
+
                    return response_json

        except Exception as e:
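
Note: the client now surfaces HTTP 503 as a distinct "warming up" failure, which is common while a serverless endpoint scales from zero. Callers can treat it as retryable rather than fatal; a minimal sketch, assuming any async callable that raises the exception shown above:

    import asyncio

    async def call_when_warm(make_request, attempts: int = 5, delay: float = 30.0):
        """Retry `make_request` while the endpoint reports it is warming up."""
        for attempt in range(attempts):
            try:
                return await make_request()
            except Exception as e:
                # Matches the message raised by the client on HTTP 503
                if "warming up" in str(e) and attempt < attempts - 1:
                    await asyncio.sleep(delay)
                    continue
                raise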

--- cua_agent-0.1.30/agent/providers/uitars/loop.py
+++ cua_agent-0.1.32/agent/providers/uitars/loop.py
@@ -17,10 +17,10 @@ from ...core.types import AgentResponse, LLMProvider
 from ...core.visualization import VisualizationHelper
 from computer import Computer

-from .utils import add_box_token, parse_actions, parse_action_parameters
+from .utils import add_box_token, parse_actions, parse_action_parameters, to_agent_response_format
 from .tools.manager import ToolManager
 from .tools.computer import ToolResult
-from .prompts import COMPUTER_USE, SYSTEM_PROMPT
+from .prompts import COMPUTER_USE, SYSTEM_PROMPT, MAC_SPECIFIC_NOTES

 from .clients.oaicompat import OAICompatClient

@@ -184,7 +184,7 @@ class UITARSLoop(BaseLoop):
         if first_user_idx is not None and instruction:
             # Create the computer use prompt
             user_prompt = COMPUTER_USE.format(
-                instruction=instruction,
+                instruction='\n'.join([instruction, MAC_SPECIFIC_NOTES]),
                 language="English"
             )

@@ -232,8 +232,11 @@ class UITARSLoop(BaseLoop):
         if self.client is None:
             raise RuntimeError("Failed to initialize client")

-        # Convert messages to UI-TARS format
+        # Get messages in standard format from the message manager
+        self.message_manager.messages = messages.copy()
         prepared_messages = self.message_manager.get_messages()
+
+        # Convert messages to UI-TARS format
         uitars_messages = self.to_uitars_format(prepared_messages)

         # Log request
@@ -437,7 +440,7 @@ class UITARSLoop(BaseLoop):
     # MAIN LOOP - IMPLEMENTING ABSTRACT METHOD
     ###########################################

-    async def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[Dict[str, Any], None]:
+    async def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[AgentResponse, None]:
         """Run the agent loop with provided messages.

         Args:
@@ -504,41 +507,16 @@ class UITARSLoop(BaseLoop):

                 # Update whether an action screenshot was saved this turn
                 action_screenshot_saved = action_screenshot_saved or new_screenshot_saved
-
-                # Parse actions from the raw response
-                raw_response = response["choices"][0]["message"]["content"]
-                parsed_actions = parse_actions(raw_response)

-                # Extract thought content if available
-                thought = ""
-                if "Thought:" in raw_response:
-                    thought_match = re.search(r"Thought: (.*?)(?=\s*Action:|$)", raw_response, re.DOTALL)
-                    if thought_match:
-                        thought = thought_match.group(1).strip()
-
-                # Create standardized thought response format
-                thought_response = {
-                    "role": "assistant",
-                    "content": thought or raw_response,
-                    "metadata": {
-                        "title": "🧠 UI-TARS Thoughts"
-                    }
-                }
+                agent_response = await to_agent_response_format(
+                    response,
+                    messages,
+                    model=self.model,
+                )
+                # Log standardized response for ease of parsing
+                self._log_api_call("agent_response", request=None, response=agent_response)
+                yield agent_response

-                # Create action response format
-                action_response = {
-                    "role": "assistant",
-                    "content": str(parsed_actions),
-                    "metadata": {
-                        "title": "🖱️ UI-TARS Actions",
-                    }
-                }
-
-                # Yield both responses to the caller (thoughts first, then actions)
-                yield thought_response
-                if parsed_actions:
-                    yield action_response
-
                 # Check if we should continue this conversation
                 running = should_continue

@@ -559,7 +537,8 @@ class UITARSLoop(BaseLoop):
                 logger.error(f"Maximum retry attempts reached. Last error was: {str(e)}")

                 yield {
-                    "error": str(e),
+                    "role": "assistant",
+                    "content": f"Error: {str(e)}",
                     "metadata": {"title": "❌ Error"},
                 }


--- cua_agent-0.1.30/agent/providers/uitars/prompts.py
+++ cua_agent-0.1.32/agent/providers/uitars/prompts.py
@@ -1,5 +1,9 @@
 """Prompts for UI-TARS agent."""

+MAC_SPECIFIC_NOTES = """
+(You are operating on macOS, use 'cmd' instead of 'ctrl' for most shortcuts e.g., hotkey(key='cmd c') for copy, hotkey(key='cmd v') for paste, hotkey(key='cmd t') for new tab).)
+"""
+
 SYSTEM_PROMPT = "You are a helpful assistant."

 COMPUTER_USE = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
@@ -56,4 +60,4 @@ finished(content='xxx') # Use escape characters \\', \\", and \\n in content par

 ## User Instruction
 {instruction}
-"""
+"""

--- cua_agent-0.1.30/agent/providers/uitars/tools/computer.py
+++ cua_agent-0.1.32/agent/providers/uitars/tools/computer.py
@@ -173,9 +173,13 @@ class ComputerTool(BaseComputerTool):
         elif action == "hotkey":
             if "keys" in kwargs:
                 keys = kwargs["keys"]
-                for key in keys:
-                    await self.computer.interface.press_key(key)

+                if len(keys) > 1:
+                    await self.computer.interface.hotkey(*keys)
+                else:
+                    # Single key press
+                    await self.computer.interface.press_key(keys[0])
+
             # Wait for UI to update
             await asyncio.sleep(0.3)


--- cua_agent-0.1.30/agent/providers/uitars/utils.py
+++ cua_agent-0.1.32/agent/providers/uitars/utils.py
@@ -4,9 +4,114 @@ import logging
 import base64
 import re
 from typing import Any, Dict, List, Optional, Union, Tuple
+from datetime import datetime

 logger = logging.getLogger(__name__)

+from ...core.types import AgentResponse
+
+async def to_agent_response_format(
+    response: Dict[str, Any],
+    messages: List[Dict[str, Any]],
+    model: Optional[str] = None,
+) -> AgentResponse:
+    """Convert raw UI-TARS response to agent response format.
+
+    Args:
+        response: Raw UI-TARS response
+        messages: List of messages in standard format
+        model: Optional model name
+
+    Returns:
+        AgentResponse: Standardized agent response format
+    """
+    # Create unique IDs for this response
+    response_id = f"resp_{datetime.now().strftime('%Y%m%d%H%M%S')}_{id(response)}"
+    reasoning_id = f"rs_{response_id}"
+    action_id = f"cu_{response_id}"
+    call_id = f"call_{response_id}"
+
+    # Parse actions from the raw response
+    content = response["choices"][0]["message"]["content"]
+    actions = parse_actions(content)
+
+    # Extract thought content if available
+    reasoning_text = ""
+    if "Thought:" in content:
+        thought_match = re.search(r"Thought: (.*?)(?=\s*Action:|$)", content, re.DOTALL)
+        if thought_match:
+            reasoning_text = thought_match.group(1).strip()
+
+    # Create output items
+    output_items = []
+    if reasoning_text:
+        output_items.append({
+            "type": "reasoning",
+            "id": reasoning_id,
+            "text": reasoning_text
+        })
+    if actions:
+        for i, action in enumerate(actions):
+            action_name, tool_args = parse_action_parameters(action)
+            if action_name == "finished":
+                output_items.append({
+                    "type": "message",
+                    "role": "assistant",
+                    "content": [{
+                        "type": "output_text",
+                        "text": tool_args["content"]
+                    }],
+                    "id": f"action_{i}_{action_id}",
+                    "status": "completed"
+                })
+            else:
+                if tool_args.get("action") == action_name:
+                    del tool_args["action"]
+                output_items.append({
+                    "type": "computer_call",
+                    "id": f"{action}_{i}_{action_id}",
+                    "call_id": f"call_{i}_{action_id}",
+                    "action": { "type": action_name, **tool_args },
+                    "pending_safety_checks": [],
+                    "status": "completed"
+                })
+
+    # Create agent response
+    agent_response = AgentResponse(
+        id=response_id,
+        object="response",
+        created_at=int(datetime.now().timestamp()),
+        status="completed",
+        error=None,
+        incomplete_details=None,
+        instructions=None,
+        max_output_tokens=None,
+        model=model or response["model"],
+        output=output_items,
+        parallel_tool_calls=True,
+        previous_response_id=None,
+        reasoning={"effort": "medium"},
+        store=True,
+        temperature=0.0,
+        top_p=0.7,
+        text={"format": {"type": "text"}},
+        tool_choice="auto",
+        tools=[
+            {
+                "type": "computer_use_preview",
+                "display_height": 768,
+                "display_width": 1024,
+                "environment": "mac",
+            }
+        ],
+        truncation="auto",
+        usage=response["usage"],
+        user=None,
+        metadata={},
+        response=response
+    )
+    return agent_response
+

 def add_box_token(input_string: str) -> str:
     """Add box tokens to the coordinates in the model response.
@@ -74,7 +179,13 @@ def parse_action_parameters(action: str) -> Tuple[str, Dict[str, Any]]:
     """
     # Handle "finished" action
     if action.startswith("finished"):
-        return "finished", {}
+        # Parse content if it exists
+        content_match = re.search(r"content='([^']*)'", action)
+        if content_match:
+            content = content_match.group(1)
+            return "finished", {"content": content}
+        else:
+            return "finished", {}

     # Parse action parameters
     action_match = re.match(r'(\w+)\((.*)\)', action)
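
Note: with UI-TARS responses normalized through to_agent_response_format, downstream code can walk a single output list instead of re-parsing the Thought:/Action: text. A sketch of a consumer over the item shapes built above, treating the response as a plain dict:

    from typing import Any, Dict

    def summarize_output(agent_response: Dict[str, Any]) -> str:
        """Flatten the standardized output items into readable lines."""
        lines = []
        for item in agent_response.get("output", []):
            if item.get("type") == "reasoning":
                lines.append(f"thought: {item.get('text', '')}")
            elif item.get("type") == "computer_call":
                action = item.get("action", {})
                lines.append(f"action: {action.get('type', '')} {action}")
            elif item.get("type") == "message":
                for part in item.get("content", []):
                    if part.get("type") == "output_text":
                        lines.append(f"message: {part.get('text', '')}")
        return "\n".join(lines)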

--- cua_agent-0.1.30/agent/ui/gradio/app.py
+++ cua_agent-0.1.32/agent/ui/gradio/app.py
@@ -35,6 +35,7 @@ from pathlib import Path
 from typing import Dict, List, Optional, AsyncGenerator, Any, Tuple, Union
 import gradio as gr
 from gradio.components.chatbot import MetadataDict
+from typing import cast

 # Import from agent package
 from agent.core.types import AgentResponse
@@ -322,63 +323,6 @@ def get_ollama_models() -> List[str]:
         logging.error(f"Error getting Ollama models: {e}")
         return []

-
-def extract_synthesized_text(
-    result: Union[AgentResponse, Dict[str, Any]],
-) -> Tuple[str, MetadataDict]:
-    """Extract synthesized text from the agent result."""
-    synthesized_text = ""
-    metadata = MetadataDict()
-
-    if "output" in result and result["output"]:
-        for output in result["output"]:
-            if output.get("type") == "reasoning":
-                metadata["title"] = "🧠 Reasoning"
-                content = output.get("content", "")
-                if content:
-                    synthesized_text += f"{content}\n"
-            elif output.get("type") == "message":
-                # Handle message type outputs - can contain rich content
-                content = output.get("content", [])
-
-                # Content is usually an array of content blocks
-                if isinstance(content, list):
-                    for block in content:
-                        if isinstance(block, dict) and block.get("type") == "output_text":
-                            text_value = block.get("text", "")
-                            if text_value:
-                                synthesized_text += f"{text_value}\n"

-            elif output.get("type") == "computer_call":
-                action = output.get("action", {})
-                action_type = action.get("type", "")
-
-                # Create a descriptive text about the action
-                if action_type == "click":
-                    button = action.get("button", "")
-                    x = action.get("x", "")
-                    y = action.get("y", "")
-                    synthesized_text += f"Clicked {button} at position ({x}, {y}).\n"
-                elif action_type == "type":
-                    text = action.get("text", "")
-                    synthesized_text += f"Typed: {text}.\n"
-                elif action_type == "keypress":
-                    # Extract key correctly from either keys array or key field
-                    if isinstance(action.get("keys"), list):
-                        key = ", ".join(action.get("keys"))
-                    else:
-                        key = action.get("key", "")
-
-                    synthesized_text += f"Pressed key: {key}\n"
-                else:
-                    synthesized_text += f"Performed {action_type} action.\n"
-
-        metadata["status"] = "done"
-        metadata["title"] = f"🛠️ {synthesized_text.strip().splitlines()[-1]}"
-
-    return synthesized_text.strip(), metadata
-
-
 def create_computer_instance(verbosity: int = logging.INFO) -> Computer:
     """Create or get the global Computer instance."""
     global global_computer
@@ -447,66 +391,6 @@ def create_agent(

     return global_agent

-
-def process_agent_result(result: Union[AgentResponse, Dict[str, Any]]) -> Tuple[str, MetadataDict]:
-    """Process agent results for the Gradio UI."""
-    # Extract text content
-    text_obj = result.get("text", {})
-    metadata = result.get("metadata", {})
-
-    # Create a properly typed MetadataDict
-    metadata_dict = MetadataDict()
-    metadata_dict["title"] = metadata.get("title", "")
-    metadata_dict["status"] = "done"
-    metadata = metadata_dict
-
-    # For OpenAI's Computer-Use Agent, text field is an object with format property
-    if (
-        text_obj
-        and isinstance(text_obj, dict)
-        and "format" in text_obj
-        and not text_obj.get("value", "")
-    ):
-        content, metadata = extract_synthesized_text(result)
-    else:
-        if not text_obj:
-            text_obj = result
-
-        # For other types of results, try to get text directly
-        if isinstance(text_obj, dict):
-            if "value" in text_obj:
-                content = text_obj["value"]
-            elif "text" in text_obj:
-                content = text_obj["text"]
-            elif "content" in text_obj:
-                content = text_obj["content"]
-            else:
-                content = ""
-        else:
-            content = str(text_obj) if text_obj else ""
-
-        # If still no content but we have outputs, create a summary
-        if not content and "output" in result and result["output"]:
-            output = result["output"]
-            for out in output:
-                if out.get("type") == "reasoning":
-                    content = out.get("content", "")
-                    if content:
-                        break
-                elif out.get("type") == "computer_call":
-                    action = out.get("action", {})
-                    action_type = action.get("type", "")
-                    if action_type:
-                        content = f"Performing action: {action_type}"
-                        break
-
-    # Clean up the text - ensure content is a string
-    if not isinstance(content, str):
-        content = str(content) if content else ""
-
-    return content, metadata
-
-
 def create_gradio_ui(
     provider_name: str = "openai",
     model_name: str = "gpt-4o",
@@ -907,17 +791,64 @@

                 # Stream responses from the agent
                 async for result in global_agent.run(last_user_message):
-                    # Process result
-                    content, metadata = process_agent_result(result)
-
-                    # Skip empty content
-                    if content or metadata.get("title"):
-                        history.append(
-                            gr.ChatMessage(
-                                role="assistant", content=content, metadata=metadata
+                    print(f"DEBUG - Agent response ------- START")
+                    from pprint import pprint
+                    pprint(result)
+                    print(f"DEBUG - Agent response ------- END")
+
+                    def generate_gradio_messages():
+                        if result.get("content"):
+                            yield gr.ChatMessage(
+                                role="assistant",
+                                content=result.get("content", ""),
+                                metadata=cast(MetadataDict, result.get("metadata", {}))
                             )
-                        )
-                    yield history
+                        else:
+                            outputs = result.get("output", [])
+                            for output in outputs:
+                                if output.get("type") == "message":
+                                    content = output.get("content", [])
+                                    for content_part in content:
+                                        if content_part.get("text"):
+                                            yield gr.ChatMessage(
+                                                role=output.get("role", "assistant"),
+                                                content=content_part.get("text", ""),
+                                                metadata=content_part.get("metadata", {})
+                                            )
+                                elif output.get("type") == "reasoning":
+                                    # if it's openAI, we only have access to a summary of the reasoning
+                                    summary_content = output.get("summary", [])
+                                    if summary_content:
+                                        for summary_part in summary_content:
+                                            if summary_part.get("type") == "summary_text":
+                                                yield gr.ChatMessage(
+                                                    role="assistant",
+                                                    content=summary_part.get("text", "")
+                                                )
+                                    else:
+                                        summary_content = output.get("text", "")
+                                        if summary_content:
+                                            yield gr.ChatMessage(
+                                                role="assistant",
+                                                content=summary_content,
+                                            )
+                                elif output.get("type") == "computer_call":
+                                    action = output.get("action", {})
+                                    action_type = action.get("type", "")
+                                    if action_type:
+                                        action_title = f"🛠️ Performing {action_type}"
+                                        if action.get("x") and action.get("y"):
+                                            action_title += f" at ({action['x']}, {action['y']})"
+                                        yield gr.ChatMessage(
+                                            role="assistant",
+                                            content=f"```json\n{json.dumps(action)}\n```",
+                                            metadata={"title": action_title}
+                                        )
+
+                    for message in generate_gradio_messages():
+                        history.append(message)
+                        yield history
+
             except Exception as e:
                 import traceback

--- cua_agent-0.1.30/pyproject.toml
+++ cua_agent-0.1.32/pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "pdm.backend"

 [project]
 name = "cua-agent"
-version = "0.1.30"
+version = "0.1.32"
 description = "CUA (Computer Use) Agent for AI-driven computer interaction"
 readme = "README.md"
 authors = [
@@ -108,7 +108,7 @@ target-version = [

 [tool.ruff]
 line-length = 100
-target-version = "0.1.30"
+target-version = "0.1.32"
 select = [
     "E",
     "F",
@@ -122,7 +122,7 @@ docstring-code-format = true

 [tool.mypy]
 strict = true
-python_version = "0.1.30"
+python_version = "0.1.32"
 ignore_missing_imports = true
 disallow_untyped_defs = true
 check_untyped_defs = true