cua-agent 0.1.31.tar.gz → 0.1.33.tar.gz

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

This release of cua-agent was flagged as potentially problematic by the registry's automated analysis.

Files changed (84)
  1. {cua_agent-0.1.31 → cua_agent-0.1.33}/PKG-INFO +3 -3
  2. {cua_agent-0.1.31 → cua_agent-0.1.33}/README.md +2 -2
  3. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/anthropic/loop.py +2 -0
  4. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/anthropic/tools/computer.py +11 -9
  5. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/omni/loop.py +2 -0
  6. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/openai/loop.py +7 -10
  7. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/openai/tools/computer.py +41 -0
  8. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/uitars/clients/oaicompat.py +8 -12
  9. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/uitars/loop.py +12 -36
  10. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/uitars/utils.py +112 -1
  11. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/ui/gradio/app.py +58 -127
  12. {cua_agent-0.1.31 → cua_agent-0.1.33}/pyproject.toml +3 -3
  13. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/__init__.py +0 -0
  14. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/core/__init__.py +0 -0
  15. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/core/agent.py +0 -0
  16. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/core/base.py +0 -0
  17. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/core/callbacks.py +0 -0
  18. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/core/experiment.py +0 -0
  19. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/core/factory.py +0 -0
  20. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/core/messages.py +0 -0
  21. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/core/provider_config.py +0 -0
  22. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/core/telemetry.py +0 -0
  23. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/core/tools/__init__.py +0 -0
  24. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/core/tools/base.py +0 -0
  25. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/core/tools/bash.py +0 -0
  26. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/core/tools/collection.py +0 -0
  27. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/core/tools/computer.py +0 -0
  28. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/core/tools/edit.py +0 -0
  29. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/core/tools/manager.py +0 -0
  30. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/core/tools.py +0 -0
  31. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/core/types.py +0 -0
  32. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/core/visualization.py +0 -0
  33. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/__init__.py +0 -0
  34. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/anthropic/__init__.py +0 -0
  35. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/anthropic/api/client.py +0 -0
  36. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/anthropic/api/logging.py +0 -0
  37. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/anthropic/api_handler.py +0 -0
  38. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/anthropic/callbacks/__init__.py +0 -0
  39. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/anthropic/callbacks/manager.py +0 -0
  40. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/anthropic/prompts.py +0 -0
  41. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/anthropic/response_handler.py +0 -0
  42. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/anthropic/tools/__init__.py +0 -0
  43. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/anthropic/tools/base.py +0 -0
  44. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/anthropic/tools/bash.py +0 -0
  45. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/anthropic/tools/collection.py +0 -0
  46. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/anthropic/tools/edit.py +0 -0
  47. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/anthropic/tools/manager.py +0 -0
  48. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/anthropic/tools/run.py +0 -0
  49. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/anthropic/types.py +0 -0
  50. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/anthropic/utils.py +0 -0
  51. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/omni/__init__.py +0 -0
  52. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/omni/api_handler.py +0 -0
  53. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/omni/clients/anthropic.py +0 -0
  54. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/omni/clients/base.py +0 -0
  55. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/omni/clients/oaicompat.py +0 -0
  56. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/omni/clients/ollama.py +0 -0
  57. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/omni/clients/openai.py +0 -0
  58. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/omni/clients/utils.py +0 -0
  59. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/omni/image_utils.py +0 -0
  60. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/omni/parser.py +0 -0
  61. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/omni/prompts.py +0 -0
  62. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/omni/tools/__init__.py +0 -0
  63. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/omni/tools/base.py +0 -0
  64. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/omni/tools/bash.py +0 -0
  65. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/omni/tools/computer.py +0 -0
  66. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/omni/tools/manager.py +0 -0
  67. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/omni/utils.py +0 -0
  68. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/openai/__init__.py +0 -0
  69. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/openai/api_handler.py +0 -0
  70. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/openai/response_handler.py +0 -0
  71. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/openai/tools/__init__.py +0 -0
  72. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/openai/tools/base.py +0 -0
  73. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/openai/tools/manager.py +0 -0
  74. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/openai/types.py +0 -0
  75. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/openai/utils.py +0 -0
  76. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/uitars/__init__.py +0 -0
  77. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/uitars/clients/base.py +0 -0
  78. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/uitars/prompts.py +0 -0
  79. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/uitars/tools/__init__.py +0 -0
  80. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/uitars/tools/computer.py +0 -0
  81. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/providers/uitars/tools/manager.py +0 -0
  82. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/telemetry.py +0 -0
  83. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/ui/__init__.py +0 -0
  84. {cua_agent-0.1.31 → cua_agent-0.1.33}/agent/ui/gradio/__init__.py +0 -0
--- cua_agent-0.1.31/PKG-INFO
+++ cua_agent-0.1.33/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cua-agent
-Version: 0.1.31
+Version: 0.1.33
 Summary: CUA (Computer Use) Agent for AI-driven computer interaction
 Author-Email: TryCua <gh@trycua.com>
 Requires-Python: >=3.10
@@ -120,10 +120,10 @@ async with Computer() as macos_computer:
 # model=LLM(provider=LLMProvider.ANTHROPIC)
 # or
 # loop=AgentLoop.OMNI,
-# model=LLM(provider=LLMProvider.OLLAMA, model="gemma3")
+# model=LLM(provider=LLMProvider.OLLAMA, name="gemma3")
 # or
 # loop=AgentLoop.UITARS,
-# model=LLM(provider=LLMProvider.OAICOMPAT, model="tgi", provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1")
+# model=LLM(provider=LLMProvider.OAICOMPAT, name="ByteDance-Seed/UI-TARS-1.5-7B", provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1")
 )

 tasks = [
--- cua_agent-0.1.31/README.md
+++ cua_agent-0.1.33/README.md
@@ -50,10 +50,10 @@ async with Computer() as macos_computer:
 # model=LLM(provider=LLMProvider.ANTHROPIC)
 # or
 # loop=AgentLoop.OMNI,
-# model=LLM(provider=LLMProvider.OLLAMA, model="gemma3")
+# model=LLM(provider=LLMProvider.OLLAMA, name="gemma3")
 # or
 # loop=AgentLoop.UITARS,
-# model=LLM(provider=LLMProvider.OAICOMPAT, model="tgi", provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1")
+# model=LLM(provider=LLMProvider.OAICOMPAT, name="ByteDance-Seed/UI-TARS-1.5-7B", provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1")
 )

 tasks = [
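Note: the README hunks above rename the LLM keyword that carries the model identifier from model= to name=. A minimal sketch of an agent built against the new signature, assuming the ComputerAgent constructor and import paths implied by the surrounding README example (the endpoint URL stays masked as in the diff):

    from computer import Computer
    from agent import ComputerAgent, AgentLoop, LLM, LLMProvider

    async def main():
        async with Computer() as macos_computer:
            # Sketch only: `name=` now carries the model identifier.
            agent = ComputerAgent(
                computer=macos_computer,
                loop=AgentLoop.OMNI,
                model=LLM(provider=LLMProvider.OLLAMA, name="gemma3"),
            )
            async for result in agent.run("Open Safari"):
                print(result)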
--- cua_agent-0.1.31/agent/providers/anthropic/loop.py
+++ cua_agent-0.1.33/agent/providers/anthropic/loop.py
@@ -279,6 +279,8 @@ class AnthropicLoop(BaseLoop):
                 messages,
                 model=self.model,
             )
+            # Log standardized response for ease of parsing
+            self._log_api_call("agent_response", request=None, response=openai_compatible_response)
             await queue.put(openai_compatible_response)

             if not should_continue:
--- cua_agent-0.1.31/agent/providers/anthropic/tools/computer.py
+++ cua_agent-0.1.33/agent/providers/anthropic/tools/computer.py
@@ -161,15 +161,17 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
             self.logger.info(f"Moving cursor to ({x}, {y})")
             await self.computer.interface.move_cursor(x, y)
         elif action == "left_click_drag":
-            self.logger.info(f"Dragging from ({x}, {y})")
-            # First move to the position
-            await self.computer.interface.move_cursor(x, y)
-            # Then perform drag operation - check if drag_to exists or we need to use other methods
-            try:
-                await self.computer.interface.drag_to(x, y)
-            except Exception as e:
-                self.logger.error(f"Error during drag operation: {str(e)}")
-                raise ToolError(f"Failed to perform drag: {str(e)}")
+            # Get the start coordinate from kwargs
+            start_coordinate = kwargs.get("start_coordinate")
+            if not start_coordinate:
+                raise ToolError("start_coordinate is required for left_click_drag action")
+
+            start_x, start_y = start_coordinate
+            end_x, end_y = x, y
+
+            self.logger.info(f"Dragging from ({start_x}, {start_y}) to ({end_x}, {end_y})")
+            await self.computer.interface.move_cursor(start_x, start_y)
+            await self.computer.interface.drag_to(end_x, end_y)

         # Wait briefly for any UI changes
         await asyncio.sleep(0.5)
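Note: the reworked left_click_drag branch now reads the drag origin from a start_coordinate kwarg and raises ToolError when it is missing. A hedged sketch of the call shape (the tool instance and the coordinate kwarg name are assumptions based on the Anthropic computer-use tool convention; values illustrative):

    # `tool` is assumed to be an initialized ComputerTool.
    result = await tool(
        action="left_click_drag",
        coordinate=[640, 400],        # drag end point, mapped to (x, y)
        start_coordinate=[120, 300],  # now required; omitting it raises ToolError
    )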
--- cua_agent-0.1.31/agent/providers/omni/loop.py
+++ cua_agent-0.1.33/agent/providers/omni/loop.py
@@ -670,6 +670,8 @@ class OmniLoop(BaseLoop):
                 parsed_screen=parsed_screen,
                 parser=self.parser
             )
+            # Log standardized response for ease of parsing
+            self._log_api_call("agent_response", request=None, response=openai_compatible_response)

             # Yield the response to the caller
             yield openai_compatible_response
--- cua_agent-0.1.31/agent/providers/openai/loop.py
+++ cua_agent-0.1.33/agent/providers/openai/loop.py
@@ -201,16 +201,7 @@ class OpenAILoop(BaseLoop):

         # Emit screenshot callbacks
         await self.handle_screenshot(screenshot_base64, action_type="initial_state")
-
-        # Save screenshot if requested
-        if self.save_trajectory:
-            # Ensure screenshot_base64 is a string
-            if not isinstance(screenshot_base64, str):
-                logger.warning(
-                    "Converting non-string screenshot_base64 to string for _save_screenshot"
-                )
-            self._save_screenshot(screenshot_base64, action_type="state")
-            logger.info("Screenshot saved to trajectory")
+        self._save_screenshot(screenshot_base64, action_type="state")

         # First add any existing user messages that were passed to run()
         user_query = None
@@ -276,6 +267,10 @@ class OpenAILoop(BaseLoop):
         )
         # Don't reset last_response_id to None - keep the previous value if available

+
+        # Log standardized response for ease of parsing
+        # Since this is the openAI responses format, we don't need to convert it to agent response format
+        self._log_api_call("agent_response", request=None, response=response)
         # Process API response
         await queue.put(response)

@@ -347,6 +342,7 @@ class OpenAILoop(BaseLoop):
         # Process screenshot through hooks
         action_type = f"after_{action.get('type', 'action')}"
         await self.handle_screenshot(screenshot_base64, action_type=action_type)
+        self._save_screenshot(screenshot_base64, action_type=action_type)

         # Create computer_call_output
         computer_call_output = {
@@ -393,6 +389,7 @@ class OpenAILoop(BaseLoop):

         # Process the response
         # await self.response_handler.process_response(response, queue)
+        self._log_api_call("agent_response", request=None, response=response)
         await queue.put(response)
     except Exception as e:
         logger.error(f"Error executing computer action: {str(e)}")
--- cua_agent-0.1.31/agent/providers/openai/tools/computer.py
+++ cua_agent-0.1.33/agent/providers/openai/tools/computer.py
@@ -44,6 +44,7 @@ Action = Literal[
     "double_click",
     "screenshot",
     "scroll",
+    "drag",
 ]


@@ -165,6 +166,11 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
             scroll_x = kwargs.get("scroll_x", 0) // 50
             scroll_y = kwargs.get("scroll_y", 0) // 50
             return await self.handle_scroll(x, y, scroll_x, scroll_y)
+        elif type == "drag":
+            path = kwargs.get("path")
+            if not path or not isinstance(path, list) or len(path) < 2:
+                raise ToolError("path is required for drag action and must contain at least 2 points")
+            return await self.handle_drag(path)
         elif type == "screenshot":
             return await self.screenshot()
         elif type == "wait":
@@ -302,6 +308,41 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
             self.logger.error(f"Error in handle_scroll: {str(e)}")
             raise ToolError(f"Failed to scroll at ({x}, {y}): {str(e)}")

+    async def handle_drag(self, path: List[Dict[str, int]]) -> ToolResult:
+        """Handle mouse drag operation using a path of coordinates.
+
+        Args:
+            path: List of coordinate points {"x": int, "y": int} defining the drag path
+
+        Returns:
+            ToolResult with the operation result and screenshot
+        """
+        try:
+            # Convert from [{"x": x, "y": y}, ...] format to [(x, y), ...] format
+            points = [(p["x"], p["y"]) for p in path]
+
+            # Perform drag action
+            if len(points) == 2:
+                await self.computer.interface.move_cursor(points[0][0], points[0][1])
+                await self.computer.interface.drag_to(points[1][0], points[1][1])
+            else:
+                await self.computer.interface.drag(points, button="left")
+
+            # Wait for UI to update
+            await asyncio.sleep(0.5)
+
+            # Take screenshot after action
+            screenshot = await self.computer.interface.screenshot()
+            base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
+
+            return ToolResult(
+                output=f"Dragged from ({path[0]['x']}, {path[0]['y']}) to ({path[-1]['x']}, {path[-1]['y']})",
+                base64_image=base64_screenshot,
+            )
+        except Exception as e:
+            self.logger.error(f"Error in handle_drag: {str(e)}")
+            raise ToolError(f"Failed to perform drag operation: {str(e)}")
+
     async def screenshot(self) -> ToolResult:
         """Take a screenshot."""
         try:
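Note: the new "drag" branch accepts a path of at least two points and routes it to handle_drag, which falls back to a plain move-then-drag when exactly two points are given. A hedged sketch of the payload and dispatch (values illustrative; the tool instance and keyword-call shape are assumptions based on the dispatcher above):

    # An OpenAI computer_call-style drag action with a three-point path.
    drag_path = [
        {"x": 100, "y": 200},
        {"x": 300, "y": 200},
        {"x": 300, "y": 400},
    ]
    # Fewer than 2 points would raise ToolError in the dispatcher above.
    result = await tool(type="drag", path=drag_path)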
--- cua_agent-0.1.31/agent/providers/uitars/clients/oaicompat.py
+++ cua_agent-0.1.33/agent/providers/uitars/clients/oaicompat.py
@@ -190,25 +190,21 @@ class OAICompatClient(BaseUITarsClient):
                 response_text = await response.text()
                 logger.debug(f"Response content: {response_text}")

+                # if 503, then the endpoint is still warming up
+                if response.status == 503:
+                    logger.error(f"Endpoint is still warming up, please try again later")
+                    raise Exception(f"Endpoint is still warming up: {response_text}")
+
                 # Try to parse as JSON if the content type is appropriate
                 if "application/json" in response.headers.get('Content-Type', ''):
                     response_json = await response.json()
                 else:
                     raise Exception(f"Response is not JSON format")
-                    # # Optionally try to parse it anyway
-                    # try:
-                    #     import json
-                    #     response_json = json.loads(response_text)
-                    # except json.JSONDecodeError as e:
-                    #     print(f"Failed to parse response as JSON: {e}")

                 if response.status != 200:
-                    error_msg = response_json.get("error", {}).get(
-                        "message", str(response_json)
-                    )
-                    logger.error(f"Error in API call: {error_msg}")
-                    raise Exception(f"API error: {error_msg}")
-
+                    logger.error(f"Error in API call: {response_text}")
+                    raise Exception(f"API error: {response_text}")
+
                 return response_json

         except Exception as e:
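Note: because a 503 now raises immediately instead of falling through to the JSON check, callers can distinguish a cold serverless endpoint from a real failure and retry. A minimal retry sketch under that assumption (make_request is a hypothetical stand-in for whatever coroutine issues the API call):

    import asyncio

    async def call_with_warmup_retry(make_request, attempts: int = 5, delay: float = 30.0):
        """Retry while the endpoint reports it is still warming up."""
        for attempt in range(attempts):
            try:
                return await make_request()
            except Exception as e:
                # Matches the message raised by the 503 branch above.
                if "warming up" in str(e) and attempt < attempts - 1:
                    await asyncio.sleep(delay)
                    continue
                raise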
--- cua_agent-0.1.31/agent/providers/uitars/loop.py
+++ cua_agent-0.1.33/agent/providers/uitars/loop.py
@@ -17,7 +17,7 @@ from ...core.types import AgentResponse, LLMProvider
 from ...core.visualization import VisualizationHelper
 from computer import Computer

-from .utils import add_box_token, parse_actions, parse_action_parameters
+from .utils import add_box_token, parse_actions, parse_action_parameters, to_agent_response_format
 from .tools.manager import ToolManager
 from .tools.computer import ToolResult
 from .prompts import COMPUTER_USE, SYSTEM_PROMPT, MAC_SPECIFIC_NOTES
@@ -440,7 +440,7 @@ class UITARSLoop(BaseLoop):
     # MAIN LOOP - IMPLEMENTING ABSTRACT METHOD
     ###########################################

-    async def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[Dict[str, Any], None]:
+    async def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[AgentResponse, None]:
         """Run the agent loop with provided messages.

         Args:
@@ -507,41 +507,16 @@ class UITARSLoop(BaseLoop):

             # Update whether an action screenshot was saved this turn
             action_screenshot_saved = action_screenshot_saved or new_screenshot_saved
-
-            # Parse actions from the raw response
-            raw_response = response["choices"][0]["message"]["content"]
-            parsed_actions = parse_actions(raw_response)
-
-            # Extract thought content if available
-            thought = ""
-            if "Thought:" in raw_response:
-                thought_match = re.search(r"Thought: (.*?)(?=\s*Action:|$)", raw_response, re.DOTALL)
-                if thought_match:
-                    thought = thought_match.group(1).strip()

-            # Create standardized thought response format
-            thought_response = {
-                "role": "assistant",
-                "content": thought or raw_response,
-                "metadata": {
-                    "title": "🧠 UI-TARS Thoughts"
-                }
-            }
+            agent_response = await to_agent_response_format(
+                response,
+                messages,
+                model=self.model,
+            )
+            # Log standardized response for ease of parsing
+            self._log_api_call("agent_response", request=None, response=agent_response)
+            yield agent_response

-            # Create action response format
-            action_response = {
-                "role": "assistant",
-                "content": str(parsed_actions),
-                "metadata": {
-                    "title": "🖱️ UI-TARS Actions",
-                }
-            }
-
-            # Yield both responses to the caller (thoughts first, then actions)
-            yield thought_response
-            if parsed_actions:
-                yield action_response
-
             # Check if we should continue this conversation
             running = should_continue
@@ -562,7 +537,8 @@ class UITARSLoop(BaseLoop):
             logger.error(f"Maximum retry attempts reached. Last error was: {str(e)}")

             yield {
-                "error": str(e),
+                "role": "assistant",
+                "content": f"Error: {str(e)}",
                 "metadata": {"title": "❌ Error"},
             }
--- cua_agent-0.1.31/agent/providers/uitars/utils.py
+++ cua_agent-0.1.33/agent/providers/uitars/utils.py
@@ -4,9 +4,114 @@ import logging
 import base64
 import re
 from typing import Any, Dict, List, Optional, Union, Tuple
+from datetime import datetime

 logger = logging.getLogger(__name__)

+from ...core.types import AgentResponse
+
+async def to_agent_response_format(
+    response: Dict[str, Any],
+    messages: List[Dict[str, Any]],
+    model: Optional[str] = None,
+) -> AgentResponse:
+    """Convert raw UI-TARS response to agent response format.
+
+    Args:
+        response: Raw UI-TARS response
+        messages: List of messages in standard format
+        model: Optional model name
+
+    Returns:
+        AgentResponse: Standardized agent response format
+    """
+    # Create unique IDs for this response
+    response_id = f"resp_{datetime.now().strftime('%Y%m%d%H%M%S')}_{id(response)}"
+    reasoning_id = f"rs_{response_id}"
+    action_id = f"cu_{response_id}"
+    call_id = f"call_{response_id}"
+
+    # Parse actions from the raw response
+    content = response["choices"][0]["message"]["content"]
+    actions = parse_actions(content)
+
+    # Extract thought content if available
+    reasoning_text = ""
+    if "Thought:" in content:
+        thought_match = re.search(r"Thought: (.*?)(?=\s*Action:|$)", content, re.DOTALL)
+        if thought_match:
+            reasoning_text = thought_match.group(1).strip()
+
+    # Create output items
+    output_items = []
+    if reasoning_text:
+        output_items.append({
+            "type": "reasoning",
+            "id": reasoning_id,
+            "text": reasoning_text
+        })
+    if actions:
+        for i, action in enumerate(actions):
+            action_name, tool_args = parse_action_parameters(action)
+            if action_name == "finished":
+                output_items.append({
+                    "type": "message",
+                    "role": "assistant",
+                    "content": [{
+                        "type": "output_text",
+                        "text": tool_args["content"]
+                    }],
+                    "id": f"action_{i}_{action_id}",
+                    "status": "completed"
+                })
+            else:
+                if tool_args.get("action") == action_name:
+                    del tool_args["action"]
+                output_items.append({
+                    "type": "computer_call",
+                    "id": f"{action}_{i}_{action_id}",
+                    "call_id": f"call_{i}_{action_id}",
+                    "action": { "type": action_name, **tool_args },
+                    "pending_safety_checks": [],
+                    "status": "completed"
+                })
+
+    # Create agent response
+    agent_response = AgentResponse(
+        id=response_id,
+        object="response",
+        created_at=int(datetime.now().timestamp()),
+        status="completed",
+        error=None,
+        incomplete_details=None,
+        instructions=None,
+        max_output_tokens=None,
+        model=model or response["model"],
+        output=output_items,
+        parallel_tool_calls=True,
+        previous_response_id=None,
+        reasoning={"effort": "medium"},
+        store=True,
+        temperature=0.0,
+        top_p=0.7,
+        text={"format": {"type": "text"}},
+        tool_choice="auto",
+        tools=[
+            {
+                "type": "computer_use_preview",
+                "display_height": 768,
+                "display_width": 1024,
+                "environment": "mac",
+            }
+        ],
+        truncation="auto",
+        usage=response["usage"],
+        user=None,
+        metadata={},
+        response=response
+    )
+    return agent_response
+

 def add_box_token(input_string: str) -> str:
     """Add box tokens to the coordinates in the model response.
@@ -74,7 +179,13 @@ def parse_action_parameters(action: str) -> Tuple[str, Dict[str, Any]]:
     """
     # Handle "finished" action
     if action.startswith("finished"):
-        return "finished", {}
+        # Parse content if it exists
+        content_match = re.search(r"content='([^']*)'", action)
+        if content_match:
+            content = content_match.group(1)
+            return "finished", {"content": content}
+        else:
+            return "finished", {}

     # Parse action parameters
     action_match = re.match(r'(\w+)\((.*)\)', action)
--- cua_agent-0.1.31/agent/ui/gradio/app.py
+++ cua_agent-0.1.33/agent/ui/gradio/app.py
@@ -35,6 +35,7 @@ from pathlib import Path
 from typing import Dict, List, Optional, AsyncGenerator, Any, Tuple, Union
 import gradio as gr
 from gradio.components.chatbot import MetadataDict
+from typing import cast

 # Import from agent package
 from agent.core.types import AgentResponse
@@ -322,63 +323,6 @@ def get_ollama_models() -> List[str]:
         logging.error(f"Error getting Ollama models: {e}")
     return []

-
-def extract_synthesized_text(
-    result: Union[AgentResponse, Dict[str, Any]],
-) -> Tuple[str, MetadataDict]:
-    """Extract synthesized text from the agent result."""
-    synthesized_text = ""
-    metadata = MetadataDict()
-
-    if "output" in result and result["output"]:
-        for output in result["output"]:
-            if output.get("type") == "reasoning":
-                metadata["title"] = "🧠 Reasoning"
-                content = output.get("content", "")
-                if content:
-                    synthesized_text += f"{content}\n"
-            elif output.get("type") == "message":
-                # Handle message type outputs - can contain rich content
-                content = output.get("content", [])
-
-                # Content is usually an array of content blocks
-                if isinstance(content, list):
-                    for block in content:
-                        if isinstance(block, dict) and block.get("type") == "output_text":
-                            text_value = block.get("text", "")
-                            if text_value:
-                                synthesized_text += f"{text_value}\n"
-
-            elif output.get("type") == "computer_call":
-                action = output.get("action", {})
-                action_type = action.get("type", "")
-
-                # Create a descriptive text about the action
-                if action_type == "click":
-                    button = action.get("button", "")
-                    x = action.get("x", "")
-                    y = action.get("y", "")
-                    synthesized_text += f"Clicked {button} at position ({x}, {y}).\n"
-                elif action_type == "type":
-                    text = action.get("text", "")
-                    synthesized_text += f"Typed: {text}.\n"
-                elif action_type == "keypress":
-                    # Extract key correctly from either keys array or key field
-                    if isinstance(action.get("keys"), list):
-                        key = ", ".join(action.get("keys"))
-                    else:
-                        key = action.get("key", "")
-
-                    synthesized_text += f"Pressed key: {key}\n"
-                else:
-                    synthesized_text += f"Performed {action_type} action.\n"
-
-        metadata["status"] = "done"
-        metadata["title"] = f"🛠️ {synthesized_text.strip().splitlines()[-1]}"
-
-    return synthesized_text.strip(), metadata
-
-
 def create_computer_instance(verbosity: int = logging.INFO) -> Computer:
     """Create or get the global Computer instance."""
     global global_computer
@@ -447,66 +391,6 @@ def create_agent(

     return global_agent

-
-def process_agent_result(result: Union[AgentResponse, Dict[str, Any]]) -> Tuple[str, MetadataDict]:
-    """Process agent results for the Gradio UI."""
-    # Extract text content
-    text_obj = result.get("text", {})
-    metadata = result.get("metadata", {})
-
-    # Create a properly typed MetadataDict
-    metadata_dict = MetadataDict()
-    metadata_dict["title"] = metadata.get("title", "")
-    metadata_dict["status"] = "done"
-    metadata = metadata_dict
-
-    # For OpenAI's Computer-Use Agent, text field is an object with format property
-    if (
-        text_obj
-        and isinstance(text_obj, dict)
-        and "format" in text_obj
-        and not text_obj.get("value", "")
-    ):
-        content, metadata = extract_synthesized_text(result)
-    else:
-        if not text_obj:
-            text_obj = result
-
-        # For other types of results, try to get text directly
-        if isinstance(text_obj, dict):
-            if "value" in text_obj:
-                content = text_obj["value"]
-            elif "text" in text_obj:
-                content = text_obj["text"]
-            elif "content" in text_obj:
-                content = text_obj["content"]
-            else:
-                content = ""
-        else:
-            content = str(text_obj) if text_obj else ""
-
-        # If still no content but we have outputs, create a summary
-        if not content and "output" in result and result["output"]:
-            output = result["output"]
-            for out in output:
-                if out.get("type") == "reasoning":
-                    content = out.get("content", "")
-                    if content:
-                        break
-                elif out.get("type") == "computer_call":
-                    action = out.get("action", {})
-                    action_type = action.get("type", "")
-                    if action_type:
-                        content = f"Performing action: {action_type}"
-                        break
-
-    # Clean up the text - ensure content is a string
-    if not isinstance(content, str):
-        content = str(content) if content else ""
-
-    return content, metadata
-
-
 def create_gradio_ui(
     provider_name: str = "openai",
     model_name: str = "gpt-4o",
@@ -907,17 +791,64 @@

                 # Stream responses from the agent
                 async for result in global_agent.run(last_user_message):
-                    # Process result
-                    content, metadata = process_agent_result(result)
-
-                    # Skip empty content
-                    if content or metadata.get("title"):
-                        history.append(
-                            gr.ChatMessage(
-                                role="assistant", content=content, metadata=metadata
-                            )
-                        )
-                        yield history
+                    print(f"DEBUG - Agent response ------- START")
+                    from pprint import pprint
+                    pprint(result)
+                    print(f"DEBUG - Agent response ------- END")
+
+                    def generate_gradio_messages():
+                        if result.get("content"):
+                            yield gr.ChatMessage(
+                                role="assistant",
+                                content=result.get("content", ""),
+                                metadata=cast(MetadataDict, result.get("metadata", {}))
+                            )
+                        else:
+                            outputs = result.get("output", [])
+                            for output in outputs:
+                                if output.get("type") == "message":
+                                    content = output.get("content", [])
+                                    for content_part in content:
+                                        if content_part.get("text"):
+                                            yield gr.ChatMessage(
+                                                role=output.get("role", "assistant"),
+                                                content=content_part.get("text", ""),
+                                                metadata=content_part.get("metadata", {})
+                                            )
+                                elif output.get("type") == "reasoning":
+                                    # if it's openAI, we only have access to a summary of the reasoning
+                                    summary_content = output.get("summary", [])
+                                    if summary_content:
+                                        for summary_part in summary_content:
+                                            if summary_part.get("type") == "summary_text":
+                                                yield gr.ChatMessage(
+                                                    role="assistant",
+                                                    content=summary_part.get("text", "")
+                                                )
+                                    else:
+                                        summary_content = output.get("text", "")
+                                        if summary_content:
+                                            yield gr.ChatMessage(
+                                                role="assistant",
+                                                content=summary_content,
+                                            )
+                                elif output.get("type") == "computer_call":
+                                    action = output.get("action", {})
+                                    action_type = action.get("type", "")
+                                    if action_type:
+                                        action_title = f"🛠️ Performing {action_type}"
+                                        if action.get("x") and action.get("y"):
+                                            action_title += f" at ({action['x']}, {action['y']})"
+                                        yield gr.ChatMessage(
+                                            role="assistant",
+                                            content=f"```json\n{json.dumps(action)}\n```",
+                                            metadata={"title": action_title}
+                                        )
+
+                    for message in generate_gradio_messages():
+                        history.append(message)
+                        yield history
+
             except Exception as e:
                 import traceback
--- cua_agent-0.1.31/pyproject.toml
+++ cua_agent-0.1.33/pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "pdm.backend"

 [project]
 name = "cua-agent"
-version = "0.1.31"
+version = "0.1.33"
 description = "CUA (Computer Use) Agent for AI-driven computer interaction"
 readme = "README.md"
 authors = [
@@ -108,7 +108,7 @@ target-version = [

 [tool.ruff]
 line-length = 100
-target-version = "0.1.31"
+target-version = "0.1.33"
 select = [
     "E",
     "F",
@@ -122,7 +122,7 @@ docstring-code-format = true

 [tool.mypy]
 strict = true
-python_version = "0.1.31"
+python_version = "0.1.33"
 ignore_missing_imports = true
 disallow_untyped_defs = true
 check_untyped_defs = true
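Note: while bumping [project] version is correct, ruff's target-version and mypy's python_version take Python version identifiers, not package versions; the "0.1.33" values above look like a blanket search-and-replace. For comparison, a conventional configuration for a >=3.10 package would resemble:

    [tool.ruff]
    line-length = 100
    target-version = "py310"   # Python syntax target, not the package version

    [tool.mypy]
    strict = true
    python_version = "3.10"    # Python version mypy type-checks against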
The remaining files listed above (13-84) are unchanged.