PyPI - cua-agent - Versions diffs - 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl - Mend

cua-agent: 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cua-agent might be problematic. Click here for more details.

Files changed (7)

agent/callbacks/telemetry.py +1 -1
agent/cli.py +85 -23
agent/loops/anthropic.py +659 -18
{cua_agent-0.4.1.dist-info → cua_agent-0.4.3.dist-info}/METADATA +2 -2
{cua_agent-0.4.1.dist-info → cua_agent-0.4.3.dist-info}/RECORD +7 -7
{cua_agent-0.4.1.dist-info → cua_agent-0.4.3.dist-info}/WHEEL +0 -0
{cua_agent-0.4.1.dist-info → cua_agent-0.4.3.dist-info}/entry_points.txt +0 -0

agent/callbacks/telemetry.py CHANGED Viewed

@@ -60,7 +60,7 @@ class TelemetryCallback(AsyncCallbackHandler):
         """Record agent type/model and session initialization."""
         agent_info = {
             "session_id": self.session_id,
-            "agent_type": self.agent.agent_loop.__name__,
+            "agent_type": self.agent.agent_loop.__name__ if hasattr(self.agent, 'agent_loop') else 'unknown',
             "model": getattr(self.agent, 'model', 'unknown'),
             **SYSTEM_INFO
         }

agent/cli.py CHANGED Viewed

@@ -51,9 +51,8 @@ class Colors:
     BG_YELLOW = '\033[43m'
     BG_BLUE = '\033[44m'
-def print_colored(text: str, color: str = "", bold: bool = False, dim: bool = False, end: str = "\n"):
-    """Print colored text to terminal."""
+def print_colored(text: str, color: str = "", bold: bool = False, dim: bool = False, end: str = "\n", right: str = ""):
+    """Print colored text to terminal with optional right-aligned text."""
     prefix = ""
     if bold:
         prefix += Colors.BOLD
@@ -62,10 +61,35 @@ def print_colored(text: str, color: str = "", bold: bool = False, dim: bool = Fa
     if color:
         prefix += color
-    print(f"{prefix}{text}{Colors.RESET}", end=end)
+    if right:
+        # Get terminal width (default to 80 if unable to determine)
+        try:
+            import shutil
+            terminal_width = shutil.get_terminal_size().columns
+        except:
+            terminal_width = 80
+        # Add right margin
+        terminal_width -= 1
+        # Calculate padding needed
+        # Account for ANSI escape codes not taking visual space
+        visible_left_len = len(text)
+        visible_right_len = len(right)
+        padding = terminal_width - visible_left_len - visible_right_len
+        if padding > 0:
+            output = f"{prefix}{text}{' ' * padding}{right}{Colors.RESET}"
+        else:
+            # If not enough space, just put a single space between
+            output = f"{prefix}{text} {right}{Colors.RESET}"
+    else:
+        output = f"{prefix}{text}{Colors.RESET}"
+    print(output, end=end)
-def print_action(action_type: str, details: Dict[str, Any]):
+def print_action(action_type: str, details: Dict[str, Any], total_cost: float):
     """Print computer action with nice formatting."""
     # Format action details
     args_str = ""
@@ -81,8 +105,10 @@ def print_action(action_type: str, details: Dict[str, Any]):
     elif action_type == "scroll" and "x" in details and "y" in details:
         args_str = f"({details['x']}, {details['y']})"
-    print_colored(f"🛠️  {action_type}{args_str}", dim=True)
+    if total_cost > 0:
+        print_colored(f"🛠️  {action_type}{args_str}", dim=True, right=f"💸 ${total_cost:.2f}")
+    else:
+        print_colored(f"🛠️  {action_type}{args_str}", dim=True)
 def print_welcome(model: str, agent_loop: str, container_name: str):
     """Print welcome message."""
@@ -92,26 +118,32 @@ def print_welcome(model: str, agent_loop: str, container_name: str):
 async def ainput(prompt: str = ""):
     return await asyncio.to_thread(input, prompt)
-async def chat_loop(agent, model: str, container_name: str):
+async def chat_loop(agent, model: str, container_name: str, initial_prompt: str = "", show_usage: bool = True):
     """Main chat loop with the agent."""
     print_welcome(model, agent.agent_loop.__name__, container_name)
     history = []
+    if initial_prompt:
+        history.append({"role": "user", "content": initial_prompt})
+    total_cost = 0
     while True:
-        # Get user input with prompt
-        print_colored("> ", end="")
-        user_input = await ainput()
-        if user_input.lower() in ['exit', 'quit', 'q']:
-            print_colored("\n👋 Goodbye!")
-            break
+        if history[-1].get("role") != "user":
+            # Get user input with prompt
+            print_colored("> ", end="")
+            user_input = await ainput()
-        if not user_input:
-            continue
-        # Add user message to history
-        history.append({"role": "user", "content": user_input})
+            if user_input.lower() in ['exit', 'quit', 'q']:
+                print_colored("\n👋 Goodbye!")
+                break
+            if not user_input:
+                continue
+            # Add user message to history
+            history.append({"role": "user", "content": user_input})
         # Stream responses from the agent with spinner
         with yaspin(text="Thinking...", spinner="line", attrs=["dark"]) as spinner:
@@ -120,6 +152,9 @@ async def chat_loop(agent, model: str, container_name: str):
             async for result in agent.run(history):
                 # Add agent responses to history
                 history.extend(result.get("output", []))
+                if show_usage:
+                    total_cost += result.get("usage", {}).get("response_cost", 0)
                 # Process and display the output
                 for item in result.get("output", []):
@@ -139,7 +174,7 @@ async def chat_loop(agent, model: str, container_name: str):
                         action_type = action.get("type", "")
                         if action_type:
                             spinner.hide()
-                            print_action(action_type, action)
+                            print_action(action_type, action, total_cost)
                             spinner.text = f"Performing {action_type}..."
                             spinner.show()
@@ -159,6 +194,8 @@ async def chat_loop(agent, model: str, container_name: str):
                             print_colored(f"📤 {output}", dim=True)
             spinner.hide()
+            if show_usage and total_cost > 0:
+                print_colored(f"Total cost: ${total_cost:.2f}", dim=True)
 async def main():
@@ -204,6 +241,26 @@ Examples:
         action="store_true",
         help="Enable verbose logging"
     )
+    parser.add_argument(
+        "-p", "--prompt",
+        type=str,
+        help="Initial prompt to send to the agent. Leave blank for interactive mode."
+    )
+    parser.add_argument(
+        "-c", "--cache",
+        action="store_true",
+        help="Tell the API to enable caching"
+    )
+    parser.add_argument(
+        "-u", "--usage",
+        action="store_true",
+        help="Show total cost of the agent runs"
+    )
     args = parser.parse_args()
@@ -269,9 +326,11 @@ Examples:
         agent_kwargs = {
             "model": args.model,
             "tools": [computer],
-            "only_n_most_recent_images": args.images,
             "verbosity": 20 if args.verbose else 30,  # DEBUG vs WARNING
         }
+        if args.images > 0:
+            agent_kwargs["only_n_most_recent_images"] = args.images
         if args.trajectory:
             agent_kwargs["trajectory_dir"] = "trajectories"
@@ -282,11 +341,14 @@ Examples:
                 "raise_error": True,
                 "reset_after_each_run": False
             }
+        if args.cache:
+            agent_kwargs["use_prompt_caching"] = True
         agent = ComputerAgent(**agent_kwargs)
         # Start chat loop
-        await chat_loop(agent, args.model, container_name)
+        await chat_loop(agent, args.model, container_name, args.prompt, args.usage)

agent/loops/anthropic.py CHANGED Viewed

@@ -193,17 +193,98 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
             tool_use_content = []
+            # Basic actions (all versions)
             if action_type == "click":
+                # Input:
+                # {
+                #     "type": "computer_call",
+                #     "call_id": "call_1",
+                #     "action": {
+                #         "type": "click",
+                #         "x": 100,
+                #         "y": 200
+                #     }
+                # }
+                # Output:
+                # {
+                #     "function": {
+                #         "name": "computer",
+                #         "arguments": json.dumps({
+                #             "action": "click",
+                #             "coordinate": [100, 200]
+                #         })
+                #     },
+                #     "id": "call_1",
+                #     "type": "function"
+                # }
+                button = action.get("button", "left")
+                action_name = "right_click" if button == "right" else "middle_click" if button == "wheel" else "left_click"
                 tool_use_content.append({
                     "type": "tool_use",
                     "id": call_id,
                     "name": "computer",
                     "input": {
-                        "action": "click",
+                        "action": action_name,
+                        "coordinate": [action.get("x", 0), action.get("y", 0)]
+                    }
+                })
+            elif action_type == "double_click":
+                # Input:
+                # {
+                #     "type": "computer_call",
+                #     "call_id": "call_1",
+                #     "action": {
+                #         "type": "double_click",
+                #         "x": 160,
+                #         "y": 240
+                #     }
+                # }
+                # Output:
+                # {
+                #     "function": {
+                #         "name": "computer",
+                #         "arguments": json.dumps({
+                #             "action": "double_click",
+                #             "coordinate": [160, 240]
+                #         })
+                #     },
+                #     "id": "call_1",
+                #     "type": "function"
+                # }
+                tool_use_content.append({
+                    "type": "tool_use",
+                    "id": call_id,
+                    "name": "computer",
+                    "input": {
+                        "action": "double_click",
                         "coordinate": [action.get("x", 0), action.get("y", 0)]
                     }
                 })
             elif action_type == "type":
+                # Input:
+                # {
+                #     "type": "computer_call",
+                #     "call_id": "call_1",
+                #     "action": {
+                #         "type": "type",
+                #         "text": "Hello World"
+                #     }
+                # }
+                # Output:
+                # {
+                #     "function": {
+                #         "name": "computer",
+                #         "arguments": json.dumps({
+                #             "action": "type",
+                #             "text": "Hello World"
+                #         })
+                #     },
+                #     "id": "call_1",
+                #     "type": "function"
+                # }
                 tool_use_content.append({
                     "type": "tool_use",
                     "id": call_id,
@@ -213,26 +294,223 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
                         "text": action.get("text", "")
                     }
                 })
-            elif action_type == "key":
+            elif action_type == "keypress":
+                # Input:
+                # {
+                #     "type": "computer_call",
+                #     "call_id": "call_1",
+                #     "action": {
+                #         "type": "keypress",
+                #         "keys": ["ctrl", "c"]
+                #     }
+                # }
+                # Output:
+                # {
+                #     "function": {
+                #         "name": "computer",
+                #         "arguments": json.dumps({
+                #             "action": "key",
+                #             "text": "ctrl+c"
+                #         })
+                #     },
+                #     "id": "call_1",
+                #     "type": "function"
+                # }
                 tool_use_content.append({
                     "type": "tool_use",
                     "id": call_id,
                     "name": "computer",
                     "input": {
                         "action": "key",
-                        "key": action.get("key", "")
+                        "text": "+".join(action.get("keys", []))
+                    }
+                })
+            elif action_type == "mouse_move":
+                # Input:
+                # {
+                #     "type": "computer_call",
+                #     "call_id": "call_1",
+                #     "action": {
+                #         "type": "mouse_move",
+                #         "x": 150,
+                #         "y": 250
+                #     }
+                # }
+                # Output:
+                # {
+                #     "function": {
+                #         "name": "computer",
+                #         "arguments": json.dumps({
+                #             "action": "mouse_move",
+                #             "coordinate": [150, 250]
+                #         })
+                #     },
+                #     "id": "call_1",
+                #     "type": "function"
+                # }
+                tool_use_content.append({
+                    "type": "tool_use",
+                    "id": call_id,
+                    "name": "computer",
+                    "input": {
+                        "action": "mouse_move",
+                        "coordinate": [action.get("x", 0), action.get("y", 0)]
+                    }
+                })
+            elif action_type == "scroll":
+                # Input:
+                # {
+                #     "type": "computer_call",
+                #     "call_id": "call_1",
+                #     "action": {
+                #         "type": "scroll",
+                #         "x": 300,
+                #         "y": 400,
+                #         "scroll_x": 0,
+                #         "scroll_y": -5
+                #     }
+                # }
+                # Output:
+                # {
+                #     "function": {
+                #         "name": "computer",
+                #         "arguments": json.dumps({
+                #             "action": "scroll",
+                #             "coordinate": [300, 400],
+                #             "scroll_direction": "down",
+                #             "scroll_amount": 5
+                #         })
+                #     },
+                #     "id": "call_1",
+                #     "type": "function"
+                # }
+                scroll_x = action.get("scroll_x", 0)
+                scroll_y = action.get("scroll_y", 0)
+                # Determine direction and amount from scroll values
+                if scroll_x > 0:
+                    direction = "left"
+                    amount = scroll_x
+                elif scroll_x < 0:
+                    direction = "right"
+                    amount = -scroll_x
+                elif scroll_y > 0:
+                    direction = "up"
+                    amount = scroll_y
+                elif scroll_y < 0:
+                    direction = "down"
+                    amount = -scroll_y
+                else:
+                    direction = "down"
+                    amount = 3
+                tool_use_content.append({
+                    "type": "tool_use",
+                    "id": call_id,
+                    "name": "computer",
+                    "input": {
+                        "action": "scroll",
+                        "coordinate": [action.get("x", 0), action.get("y", 0)],
+                        "scroll_direction": direction,
+                        "scroll_amount": amount
+                    }
+                })
+            elif action_type == "drag":
+                # Input:
+                # {
+                #     "type": "computer_call",
+                #     "call_id": "call_1",
+                #     "action": {
+                #         "type": "drag",
+                #         "path": [
+                #             {"x": 100, "y": 150},
+                #             {"x": 200, "y": 250}
+                #         ]
+                #     }
+                # }
+                # Output:
+                # {
+                #     "function": {
+                #         "name": "computer",
+                #         "arguments": json.dumps({
+                #             "action": "left_click_drag",
+                #             "start_coordinate": [100, 150],
+                #             "end_coordinate": [200, 250]
+                #         })
+                #     },
+                #     "id": "call_1",
+                #     "type": "function"
+                # }
+                path = action.get("path", [])
+                start_coord = [0, 0]
+                end_coord = [0, 0]
+                if isinstance(path, list) and len(path) >= 2:
+                    start_coord = [path[0].get("x", 0), path[0].get("y", 0)]
+                    end_coord = [path[-1].get("x", 0), path[-1].get("y", 0)]
+                tool_use_content.append({
+                    "type": "tool_use",
+                    "id": call_id,
+                    "name": "computer",
+                    "input": {
+                        "action": "left_click_drag",
+                        "start_coordinate": start_coord,
+                        "end_coordinate": end_coord
                     }
                 })
             elif action_type == "wait":
+                # Input:
+                # {
+                #     "type": "computer_call",
+                #     "call_id": "call_1",
+                #     "action": {
+                #         "type": "wait"
+                #     }
+                # }
+                # Output:
+                # {
+                #     "function": {
+                #         "name": "computer",
+                #         "arguments": json.dumps({
+                #             "action": "wait"
+                #         })
+                #     },
+                #     "id": "call_1",
+                #     "type": "function"
+                # }
                 tool_use_content.append({
                     "type": "tool_use",
                     "id": call_id,
                     "name": "computer",
                     "input": {
-                        "action": "screenshot"
+                        "action": "wait"
                     }
                 })
             elif action_type == "screenshot":
+                # Input:
+                # {
+                #     "type": "computer_call",
+                #     "call_id": "call_1",
+                #     "action": {
+                #         "type": "screenshot"
+                #     }
+                # }
+                # Output:
+                # {
+                #     "function": {
+                #         "name": "computer",
+                #         "arguments": json.dumps({
+                #             "action": "screenshot"
+                #         })
+                #     },
+                #     "id": "call_1",
+                #     "type": "function"
+                # }
                 tool_use_content.append({
                     "type": "tool_use",
                     "id": call_id,
@@ -342,7 +620,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                             ))
                         elif action_type == "key":
                             responses_items.append(make_keypress_item(
-                                key=tool_input.get("key", ""),
+                                keys=tool_input.get("text", "").replace("+", "-").split("-"),
                                 call_id=call_id
                             ))
                         elif action_type == "mouse_move":
@@ -361,21 +639,32 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                         # Enhanced actions (computer_20250124) Available in Claude 4 and Claude Sonnet 3.7
                         elif action_type == "scroll":
                             coordinate = tool_input.get("coordinate", [0, 0])
+                            scroll_amount = tool_input.get("scroll_amount", 3)
+                            scroll_x = scroll_amount if tool_input.get("scroll_direction", "down") == "right" else \
+                                -scroll_amount if tool_input.get("scroll_direction", "down") == "left" else 0
+                            scroll_y = scroll_amount if tool_input.get("scroll_direction", "down") == "down" else \
+                                -scroll_amount if tool_input.get("scroll_direction", "down") == "up" else 0
                             responses_items.append(make_scroll_item(
                                 x=coordinate[0] if len(coordinate) > 0 else 0,
                                 y=coordinate[1] if len(coordinate) > 1 else 0,
-                                direction=tool_input.get("scroll_direction", "down"),
-                                amount=tool_input.get("scroll_amount", 3),
+                                scroll_x=scroll_x,
+                                scroll_y=scroll_y,
                                 call_id=call_id
                             ))
                         elif action_type == "left_click_drag":
                             start_coord = tool_input.get("start_coordinate", [0, 0])
                             end_coord = tool_input.get("end_coordinate", [0, 0])
                             responses_items.append(make_drag_item(
-                                start_x=start_coord[0] if len(start_coord) > 0 else 0,
-                                start_y=start_coord[1] if len(start_coord) > 1 else 0,
-                                end_x=end_coord[0] if len(end_coord) > 0 else 0,
-                                end_y=end_coord[1] if len(end_coord) > 1 else 0,
+                                path=[
+                                    {
+                                        "x": start_coord[0] if len(start_coord) > 0 else 0,
+                                        "y": start_coord[1] if len(start_coord) > 1 else 0
+                                    },
+                                    {
+                                        "x": end_coord[0] if len(end_coord) > 0 else 0,
+                                        "y": end_coord[1] if len(end_coord) > 1 else 0
+                                    }
+                                ],
                                 call_id=call_id
                             ))
                         elif action_type == "right_click":
@@ -459,7 +748,6 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
     # Handle tool calls (alternative format)
     if hasattr(message, 'tool_calls') and message.tool_calls:
         for tool_call in message.tool_calls:
-            print(tool_call)
             if tool_call.function.name == "computer":
                 try:
                     args = json.loads(tool_call.function.arguments)
@@ -468,10 +756,53 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                     # Basic actions (all versions)
                     if action_type == "screenshot":
+                        # Input:
+                        # {
+                        #     "function": {
+                        #         "name": "computer",
+                        #         "arguments": json.dumps({
+                        #             "action": "screenshot"
+                        #         })
+                        #     },
+                        #     "id": "call_1",
+                        #     "type": "function"
+                        # }
+                        # Output:
+                        # {
+                        #     "type": "computer_call",
+                        #     "call_id": "call_1",
+                        #     "action": {
+                        #         "type": "screenshot"
+                        #     }
+                        # }
                         responses_items.append(make_screenshot_item(
                             call_id=call_id
                         ))
                     elif action_type in ["click", "left_click"]:
+                        # Input:
+                        # {
+                        #     "function": {
+                        #         "name": "computer",
+                        #         "arguments": json.dumps({
+                        #             "action": "click",
+                        #             "coordinate": [100, 200]
+                        #         })
+                        #     },
+                        #     "id": "call_1",
+                        #     "type": "function"
+                        # }
+                        # Output:
+                        # {
+                        #     "type": "computer_call",
+                        #     "call_id": "call_1",
+                        #     "action": {
+                        #         "type": "click",
+                        #         "x": 100,
+                        #         "y": 200
+                        #     }
+                        # }
                         coordinate = args.get("coordinate", [0, 0])
                         responses_items.append(make_click_item(
                             x=coordinate[0] if len(coordinate) > 0 else 0,
@@ -479,16 +810,83 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                             call_id=call_id
                         ))
                     elif action_type == "type":
+                        # Input:
+                        # {
+                        #     "function": {
+                        #         "name": "computer",
+                        #         "arguments": json.dumps({
+                        #             "action": "type",
+                        #             "text": "Hello World"
+                        #         })
+                        #     },
+                        #     "id": "call_1",
+                        #     "type": "function"
+                        # }
+                        # Output:
+                        # {
+                        #     "type": "computer_call",
+                        #     "call_id": "call_1",
+                        #     "action": {
+                        #         "type": "type",
+                        #         "text": "Hello World"
+                        #     }
+                        # }
                         responses_items.append(make_type_item(
                             text=args.get("text", ""),
                             call_id=call_id
                         ))
                     elif action_type == "key":
+                        # Input:
+                        # {
+                        #     "function": {
+                        #         "name": "computer",
+                        #         "arguments": json.dumps({
+                        #             "action": "key",
+                        #             "text": "ctrl+c"
+                        #         })
+                        #     },
+                        #     "id": "call_1",
+                        #     "type": "function"
+                        # }
+                        # Output:
+                        # {
+                        #     "type": "computer_call",
+                        #     "call_id": "call_1",
+                        #     "action": {
+                        #         "type": "keypress",
+                        #         "keys": ["ctrl", "c"]
+                        #     }
+                        # }
                         responses_items.append(make_keypress_item(
-                            key=args.get("key", ""),
+                            keys=args.get("text", "").replace("+", "-").split("-"),
                             call_id=call_id
                         ))
                     elif action_type == "mouse_move":
+                        # Input:
+                        # {
+                        #     "function": {
+                        #         "name": "computer",
+                        #         "arguments": json.dumps({
+                        #             "action": "mouse_move",
+                        #             "coordinate": [150, 250]
+                        #         })
+                        #     },
+                        #     "id": "call_1",
+                        #     "type": "function"
+                        # }
+                        # Output:
+                        # {
+                        #     "type": "computer_call",
+                        #     "call_id": "call_1",
+                        #     "action": {
+                        #         "type": "mouse_move",
+                        #         "x": 150,
+                        #         "y": 250
+                        #     }
+                        # }
                         coordinate = args.get("coordinate", [0, 0])
                         responses_items.append(make_move_item(
                             x=coordinate[0] if len(coordinate) > 0 else 0,
@@ -498,6 +896,33 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                     # Enhanced actions (computer_20250124) Available in Claude 4 and Claude Sonnet 3.7
                     elif action_type == "scroll":
+                        # Input:
+                        # {
+                        #     "function": {
+                        #         "name": "computer",
+                        #         "arguments": json.dumps({
+                        #             "action": "scroll",
+                        #             "coordinate": [300, 400],
+                        #             "scroll_direction": "down",
+                        #             "scroll_amount": 5
+                        #         })
+                        #     },
+                        #     "id": "call_1",
+                        #     "type": "function"
+                        # }
+                        # Output:
+                        # {
+                        #     "type": "computer_call",
+                        #     "call_id": "call_1",
+                        #     "action": {
+                        #         "type": "scroll",
+                        #         "x": 300,
+                        #         "y": 400,
+                        #         "scroll_x": 0,
+                        #         "scroll_y": -5
+                        #     }
+                        # }
                         coordinate = args.get("coordinate", [0, 0])
                         direction = args.get("scroll_direction", "down")
                         amount = args.get("scroll_amount", 3)
@@ -513,16 +938,72 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                             call_id=call_id
                         ))
                     elif action_type == "left_click_drag":
+                        # Input:
+                        # {
+                        #     "function": {
+                        #         "name": "computer",
+                        #         "arguments": json.dumps({
+                        #             "action": "left_click_drag",
+                        #             "start_coordinate": [100, 150],
+                        #             "end_coordinate": [200, 250]
+                        #         })
+                        #     },
+                        #     "id": "call_1",
+                        #     "type": "function"
+                        # }
+                        # Output:
+                        # {
+                        #     "type": "computer_call",
+                        #     "call_id": "call_1",
+                        #     "action": {
+                        #         "type": "drag",
+                        #         "path": [
+                        #             {"x": 100, "y": 150},
+                        #             {"x": 200, "y": 250}
+                        #         ]
+                        #     }
+                        # }
                         start_coord = args.get("start_coordinate", [0, 0])
                         end_coord = args.get("end_coordinate", [0, 0])
                         responses_items.append(make_drag_item(
-                            start_x=start_coord[0] if len(start_coord) > 0 else 0,
-                            start_y=start_coord[1] if len(start_coord) > 1 else 0,
-                            end_x=end_coord[0] if len(end_coord) > 0 else 0,
-                            end_y=end_coord[1] if len(end_coord) > 1 else 0,
+                            path=[
+                                {
+                                    "x": start_coord[0] if len(start_coord) > 0 else 0,
+                                    "y": start_coord[1] if len(start_coord) > 1 else 0
+                                },
+                                {
+                                    "x": end_coord[0] if len(end_coord) > 0 else 0,
+                                    "y": end_coord[1] if len(end_coord) > 1 else 0
+                                }
+                            ],
                             call_id=call_id
                         ))
                     elif action_type == "right_click":
+                        # Input:
+                        # {
+                        #     "function": {
+                        #         "name": "computer",
+                        #         "arguments": json.dumps({
+                        #             "action": "right_click",
+                        #             "coordinate": [120, 180]
+                        #         })
+                        #     },
+                        #     "id": "call_1",
+                        #     "type": "function"
+                        # }
+                        # Output:
+                        # {
+                        #     "type": "computer_call",
+                        #     "call_id": "call_1",
+                        #     "action": {
+                        #         "type": "click",
+                        #         "x": 120,
+                        #         "y": 180,
+                        #         "button": "right"
+                        #     }
+                        # }
                         coordinate = args.get("coordinate", [0, 0])
                         responses_items.append(make_click_item(
                             x=coordinate[0] if len(coordinate) > 0 else 0,
@@ -531,14 +1012,61 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                             call_id=call_id
                         ))
                     elif action_type == "middle_click":
+                        # Input:
+                        # {
+                        #     "function": {
+                        #         "name": "computer",
+                        #         "arguments": json.dumps({
+                        #             "action": "middle_click",
+                        #             "coordinate": [140, 220]
+                        #         })
+                        #     },
+                        #     "id": "call_1",
+                        #     "type": "function"
+                        # }
+                        # Output:
+                        # {
+                        #     "type": "computer_call",
+                        #     "call_id": "call_1",
+                        #     "action": {
+                        #         "type": "click",
+                        #         "x": 140,
+                        #         "y": 220,
+                        #         "button": "wheel"
+                        #     }
+                        # }
                         coordinate = args.get("coordinate", [0, 0])
                         responses_items.append(make_click_item(
                             x=coordinate[0] if len(coordinate) > 0 else 0,
                             y=coordinate[1] if len(coordinate) > 1 else 0,
-                            button="scroll",
+                            button="wheel",
                             call_id=call_id
                         ))
                     elif action_type == "double_click":
+                        # Input:
+                        # {
+                        #     "function": {
+                        #         "name": "computer",
+                        #         "arguments": json.dumps({
+                        #             "action": "double_click",
+                        #             "coordinate": [160, 240]
+                        #         })
+                        #     },
+                        #     "id": "call_1",
+                        #     "type": "function"
+                        # }
+                        # Output:
+                        # {
+                        #     "type": "computer_call",
+                        #     "call_id": "call_1",
+                        #     "action": {
+                        #         "type": "double_click",
+                        #         "x": 160,
+                        #         "y": 240
+                        #     }
+                        # }
                         coordinate = args.get("coordinate", [0, 0])
                         responses_items.append(make_double_click_item(
                             x=coordinate[0] if len(coordinate) > 0 else 0,
@@ -546,14 +1074,127 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                             call_id=call_id
                         ))
                     elif action_type == "triple_click":
+                        # Input:
+                        # {
+                        #     "function": {
+                        #         "name": "computer",
+                        #         "arguments": json.dumps({
+                        #             "action": "triple_click",
+                        #             "coordinate": [180, 260]
+                        #         })
+                        #     },
+                        #     "id": "call_1",
+                        #     "type": "function"
+                        # }
+                        # Output:
+                        # {
+                        #     "type": "computer_call",
+                        #     "call_id": "call_1",
+                        #     "action": {
+                        #         "type": "triple_click",
+                        #         "x": 180,
+                        #         "y": 260
+                        #     }
+                        # }
                         raise NotImplementedError("triple_click")
                     elif action_type == "left_mouse_down":
+                        # Input:
+                        # {
+                        #     "function": {
+                        #         "name": "computer",
+                        #         "arguments": json.dumps({
+                        #             "action": "left_mouse_down",
+                        #             "coordinate": [200, 280]
+                        #         })
+                        #     },
+                        #     "id": "call_1",
+                        #     "type": "function"
+                        # }
+                        # Output:
+                        # {
+                        #     "type": "computer_call",
+                        #     "call_id": "call_1",
+                        #     "action": {
+                        #         "type": "mouse_down",
+                        #         "button": "left",
+                        #         "x": 200,
+                        #         "y": 280
+                        #     }
+                        # }
                         raise NotImplementedError("left_mouse_down")
                     elif action_type == "left_mouse_up":
+                        # Input:
+                        # {
+                        #     "function": {
+                        #         "name": "computer",
+                        #         "arguments": json.dumps({
+                        #             "action": "left_mouse_up",
+                        #             "coordinate": [220, 300]
+                        #         })
+                        #     },
+                        #     "id": "call_1",
+                        #     "type": "function"
+                        # }
+                        # Output:
+                        # {
+                        #     "type": "computer_call",
+                        #     "call_id": "call_1",
+                        #     "action": {
+                        #         "type": "mouse_up",
+                        #         "button": "left",
+                        #         "x": 220,
+                        #         "y": 300
+                        #     }
+                        # }
                         raise NotImplementedError("left_mouse_up")
                     elif action_type == "hold_key":
+                        # Input:
+                        # {
+                        #     "function": {
+                        #         "name": "computer",
+                        #         "arguments": json.dumps({
+                        #             "action": "hold_key",
+                        #             "key": "shift"
+                        #         })
+                        #     },
+                        #     "id": "call_1",
+                        #     "type": "function"
+                        # }
+                        # Output:
+                        # {
+                        #     "type": "computer_call",
+                        #     "call_id": "call_1",
+                        #     "action": {
+                        #         "type": "key_hold",
+                        #         "key": "shift"
+                        #     }
+                        # }
                         raise NotImplementedError("hold_key")
                     elif action_type == "wait":
+                        # Input:
+                        # {
+                        #     "function": {
+                        #         "name": "computer",
+                        #         "arguments": json.dumps({
+                        #             "action": "wait"
+                        #         })
+                        #     },
+                        #     "id": "call_1",
+                        #     "type": "function"
+                        # }
+                        # Output:
+                        # {
+                        #     "type": "computer_call",
+                        #     "call_id": "call_1",
+                        #     "action": {
+                        #         "type": "wait"
+                        #     }
+                        # }
                         responses_items.append(make_wait_item(
                             call_id=call_id
                         ))

{cua_agent-0.4.1.dist-info → cua_agent-0.4.3.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cua-agent
-Version: 0.4.1
+Version: 0.4.3
 Summary: CUA (Computer Use) Agent for AI-driven computer interaction
 Author-Email: TryCua <gh@trycua.com>
 Requires-Python: >=3.11
@@ -13,7 +13,7 @@ Requires-Dist: pydantic>=2.6.4
 Requires-Dist: rich>=13.7.1
 Requires-Dist: python-dotenv>=1.0.1
 Requires-Dist: cua-computer<0.5.0,>=0.3.0
-Requires-Dist: cua-core<0.2.0,>=0.1.0
+Requires-Dist: cua-core<0.2.0,>=0.1.8
 Requires-Dist: certifi>=2024.2.2
 Requires-Dist: litellm>=1.74.8
 Provides-Extra: openai

{cua_agent-0.4.1.dist-info → cua_agent-0.4.3.dist-info}/RECORD RENAMED Viewed

@@ -9,13 +9,13 @@ agent/callbacks/budget_manager.py,sha256=RyKM-7iXQcDotYvrw3eURzeEHEXvQjID-NobtvQ
 agent/callbacks/image_retention.py,sha256=tiuRT5ke9xXTb2eP8Gz-2ITyAMY29LURUH6AbjX3RP8,6165
 agent/callbacks/logging.py,sha256=OOxU97EzrxlnUAtiEnvy9FB7SwCUK90-rdpDFA2Ae4E,10921
 agent/callbacks/pii_anonymization.py,sha256=UKAqNacHG3z92_6uocVzOIl8gJoqyofldCoCmB4UVIE,10268
-agent/callbacks/telemetry.py,sha256=sYsE_-tnZkt1ydIRbp_GfCETlz7QG9DNbawq6hM4Bqw,7445
+agent/callbacks/telemetry.py,sha256=PU7pkK7W1v1xjDN-9gA30lGvn4-WhqK3BPHGW3HpTOc,7497
 agent/callbacks/trajectory_saver.py,sha256=POE8aPT-MBzfW873wr6C7iiVUHtp483KwvLPxC1S3EY,11626
-agent/cli.py,sha256=WZFyhmTbFnA7QgZmqKO5tGoWsKeO12-GVlBab314o9Q,10002
+agent/cli.py,sha256=odI7cdl1psOGK-mEQzezsPzbRcLFwDbi7A2ukvYq8dk,12130
 agent/computer_handler.py,sha256=2gfFBeDk9Vd54x9mOqnswMo8BdjUduLo5I0RbBPLovY,3964
 agent/decorators.py,sha256=bCmcCjP31WEjWg1D91OE2jo7AZTfGa9cNgCnYUvjiyw,2832
 agent/loops/__init__.py,sha256=_qpP_--3ePdFkTZP8qmUEFlBsy6m4h8fj0gGLDKA7zw,217
-agent/loops/anthropic.py,sha256=w5s_zvkXdcHt0DgBMYjDQGDMBXK4bPu-SyeIMhA1Rrs,32243
+agent/loops/anthropic.py,sha256=Za_Qzf4q37CO4QZ0jTnSjHj7RIgaoTLNdrxfPYEysCg,58155
 agent/loops/omniparser.py,sha256=m3bDNQ0Igc_HHVoAbjVNj599uRoC9Eap3DCALg6RZ54,11422
 agent/loops/openai.py,sha256=ArTqadeJY8F9N8ZLKfswlzgHV_54HbWJgLd4l6ele9w,3010
 agent/loops/uitars.py,sha256=L0NYxKoIiMfIHbyomnaiK3ZGLmLv3QMx9nX57GruAk0,26323
@@ -27,7 +27,7 @@ agent/ui/__main__.py,sha256=vudWXYvGM0aNT5aZ94HPtGW8YXOZ4cLXepHyhUM_k1g,73
 agent/ui/gradio/__init__.py,sha256=yv4Mrfo-Sj2U5sVn_UJHAuwYCezo-5O4ItR2C9jzNko,145
 agent/ui/gradio/app.py,sha256=X7he4jzyFqWJDP1y_M8yfZvfdy6GHNuclLn4k9iIwAw,8824
 agent/ui/gradio/ui_components.py,sha256=WxFE-4wvdEgj7FPLNXUrs118sXJ9vN3kLkZxtto-weo,34474
-cua_agent-0.4.1.dist-info/METADATA,sha256=Yf2tVl9529nOxprqpjmvqTtqPOnYWmDtDjuo6UuFddg,12060
-cua_agent-0.4.1.dist-info/WHEEL,sha256=9P2ygRxDrTJz3gsagc0Z96ukrxjr-LFBGOgv3AuKlCA,90
-cua_agent-0.4.1.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
-cua_agent-0.4.1.dist-info/RECORD,,
+cua_agent-0.4.3.dist-info/METADATA,sha256=x8zulOSGVabWb_SjdI08AphtSUU0XBaOT0B2ULmQtik,12060
+cua_agent-0.4.3.dist-info/WHEEL,sha256=9P2ygRxDrTJz3gsagc0Z96ukrxjr-LFBGOgv3AuKlCA,90
+cua_agent-0.4.3.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
+cua_agent-0.4.3.dist-info/RECORD,,

{cua_agent-0.4.1.dist-info → cua_agent-0.4.3.dist-info}/WHEEL RENAMED Viewed

File without changes

{cua_agent-0.4.1.dist-info → cua_agent-0.4.3.dist-info}/entry_points.txt RENAMED Viewed

File without changes

cua-agent 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl

Potentially problematic release.

cua-agent 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl