PyPI - cua-agent - Versions diffs - 0.4.14__py3-none-any.whl → 0.7.16__py3-none-any.whl - Mend

cua-agent: 0.4.14 (py3-none-any.whl) → 0.7.16 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cua-agent might be problematic. Click here for more details.

Files changed (82):

agent/__init__.py +4 -19
agent/__main__.py +2 -1
agent/adapters/__init__.py +6 -0
agent/adapters/azure_ml_adapter.py +283 -0
agent/adapters/cua_adapter.py +161 -0
agent/adapters/huggingfacelocal_adapter.py +67 -125
agent/adapters/human_adapter.py +116 -114
agent/adapters/mlxvlm_adapter.py +370 -0
agent/adapters/models/__init__.py +41 -0
agent/adapters/models/generic.py +78 -0
agent/adapters/models/internvl.py +290 -0
agent/adapters/models/opencua.py +115 -0
agent/adapters/models/qwen2_5_vl.py +78 -0
agent/agent.py +431 -241
agent/callbacks/__init__.py +10 -3
agent/callbacks/base.py +45 -31
agent/callbacks/budget_manager.py +22 -10
agent/callbacks/image_retention.py +54 -98
agent/callbacks/logging.py +55 -42
agent/callbacks/operator_validator.py +140 -0
agent/callbacks/otel.py +291 -0
agent/callbacks/pii_anonymization.py +19 -16
agent/callbacks/prompt_instructions.py +47 -0
agent/callbacks/telemetry.py +106 -69
agent/callbacks/trajectory_saver.py +178 -70
agent/cli.py +269 -119
agent/computers/__init__.py +14 -9
agent/computers/base.py +32 -19
agent/computers/cua.py +52 -25
agent/computers/custom.py +78 -71
agent/decorators.py +23 -14
agent/human_tool/__init__.py +2 -7
agent/human_tool/__main__.py +6 -2
agent/human_tool/server.py +48 -37
agent/human_tool/ui.py +359 -235
agent/integrations/hud/__init__.py +164 -74
agent/integrations/hud/agent.py +338 -342
agent/integrations/hud/proxy.py +297 -0
agent/loops/__init__.py +44 -14
agent/loops/anthropic.py +590 -492
agent/loops/base.py +19 -15
agent/loops/composed_grounded.py +142 -144
agent/loops/fara/__init__.py +8 -0
agent/loops/fara/config.py +506 -0
agent/loops/fara/helpers.py +357 -0
agent/loops/fara/schema.py +143 -0
agent/loops/gelato.py +183 -0
agent/loops/gemini.py +935 -0
agent/loops/generic_vlm.py +601 -0
agent/loops/glm45v.py +140 -135
agent/loops/gta1.py +48 -51
agent/loops/holo.py +218 -0
agent/loops/internvl.py +180 -0
agent/loops/moondream3.py +493 -0
agent/loops/omniparser.py +326 -226
agent/loops/openai.py +63 -56
agent/loops/opencua.py +134 -0
agent/loops/uiins.py +175 -0
agent/loops/uitars.py +262 -212
agent/loops/uitars2.py +951 -0
agent/playground/__init__.py +5 -0
agent/playground/server.py +301 -0
agent/proxy/examples.py +196 -0
agent/proxy/handlers.py +255 -0
agent/responses.py +486 -339
agent/tools/__init__.py +24 -0
agent/tools/base.py +253 -0
agent/tools/browser_tool.py +423 -0
agent/types.py +20 -5
agent/ui/__init__.py +1 -1
agent/ui/__main__.py +1 -1
agent/ui/gradio/app.py +25 -22
agent/ui/gradio/ui_components.py +314 -167
cua_agent-0.7.16.dist-info/METADATA +85 -0
cua_agent-0.7.16.dist-info/RECORD +79 -0
{cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
agent/integrations/hud/adapter.py +0 -121
agent/integrations/hud/computer_handler.py +0 -187
agent/telemetry.py +0 -142
cua_agent-0.4.14.dist-info/METADATA +0 -436
cua_agent-0.4.14.dist-info/RECORD +0 -50
{cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0

agent/human_tool/ui.py (changed)

@@ -1,21 +1,29 @@
-import gradio as gr
+import base64
+import io
 import json
 import time
-from typing import List, Dict, Any, Optional
 from datetime import datetime
+from typing import Any, Dict, List, Optional
+import gradio as gr
 import requests
-from .server import completion_queue
-import base64
-import io
 from PIL import Image
+from .server import completion_queue
 class HumanCompletionUI:
     def __init__(self, server_url: str = "http://localhost:8002"):
         self.server_url = server_url
         self.current_call_id: Optional[str] = None
         self.refresh_interval = 2.0  # seconds
         self.last_image = None  # Store the last image for display
+        # Track current interactive action controls
+        self.current_action_type: str = "click"
+        self.current_button: str = "left"
+        self.current_scroll_x: int = 0
+        self.current_scroll_y: int = -120
     def format_messages_for_chatbot(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         """Format messages for display in gr.Chatbot with type='messages'."""
         formatted = []
@@ -23,7 +31,7 @@ class HumanCompletionUI:
             role = msg.get("role", "user")
             content = msg.get("content", "")
             tool_calls = msg.get("tool_calls", [])
             # Handle different content formats
             if isinstance(content, list):
                 # Multi-modal content - can include text and images
@@ -50,7 +58,7 @@ class HumanCompletionUI:
                             else:
                                 # For URL images, create gr.Image with URL
                                 formatted_content.append(gr.Image(value=image_url))
                 # Determine final content format
                 if len(formatted_content) == 1:
                     content = formatted_content[0]
@@ -58,28 +66,28 @@ class HumanCompletionUI:
                     content = formatted_content
                 else:
                     content = "[Empty content]"
             # Ensure role is valid for Gradio Chatbot
             if role not in ["user", "assistant"]:
                 role = "assistant" if role == "system" else "user"
             # Invert roles for better display in human UI context
             # (what the AI says becomes "user", what human should respond becomes "assistant")
             if role == "user":
                 role = "assistant"
             else:
                 role = "user"
             # Add the main message if it has content
             if content and str(content).strip():
                 formatted.append({"role": role, "content": content})
             # Handle tool calls - create separate messages for each tool call
             if tool_calls:
                 for tool_call in tool_calls:
                     function_name = tool_call.get("function", {}).get("name", "unknown")
                     arguments_str = tool_call.get("function", {}).get("arguments", "{}")
                     try:
                         # Parse arguments to format them nicely
                         arguments = json.loads(arguments_str)
@@ -87,18 +95,20 @@ class HumanCompletionUI:
                     except json.JSONDecodeError:
                         # If parsing fails, use the raw string
                         formatted_args = arguments_str
                     # Create a formatted message for the tool call
                     tool_call_content = f"```json\n{formatted_args}\n```"
-                    formatted.append({
-                        "role": role,
-                        "content": tool_call_content,
-                        "metadata": {"title": f"🛠️ Used {function_name}"}
-                    })
+                    formatted.append(
+                        {
+                            "role": role,
+                            "content": tool_call_content,
+                            "metadata": {"title": f"🛠️ Used {function_name}"},
+                        }
+                    )
         return formatted
     def get_pending_calls(self) -> List[Dict[str, Any]]:
         """Get pending calls from the server."""
         try:
@@ -108,38 +118,39 @@ class HumanCompletionUI:
         except Exception as e:
             print(f"Error fetching pending calls: {e}")
         return []
     def complete_call_with_response(self, call_id: str, response: str) -> bool:
         """Complete a call with a text response."""
         try:
             response_data = {"response": response}
             response_obj = requests.post(
-                f"{self.server_url}/complete/{call_id}",
-                json=response_data,
-                timeout=10
+                f"{self.server_url}/complete/{call_id}", json=response_data, timeout=10
             )
             response_obj.raise_for_status()
             return True
         except requests.RequestException as e:
             print(f"Error completing call: {e}")
             return False
     def complete_call_with_tool_calls(self, call_id: str, tool_calls: List[Dict[str, Any]]) -> bool:
         """Complete a call with tool calls."""
         try:
             response_data = {"tool_calls": tool_calls}
             response_obj = requests.post(
-                f"{self.server_url}/complete/{call_id}",
-                json=response_data,
-                timeout=10
+                f"{self.server_url}/complete/{call_id}", json=response_data, timeout=10
             )
             response_obj.raise_for_status()
             return True
         except requests.RequestException as e:
             print(f"Error completing call: {e}")
             return False
-    def complete_call(self, call_id: str, response: Optional[str] = None, tool_calls: Optional[List[Dict[str, Any]]] = None) -> bool:
+    def complete_call(
+        self,
+        call_id: str,
+        response: Optional[str] = None,
+        tool_calls: Optional[List[Dict[str, Any]]] = None,
+    ) -> bool:
         """Complete a call with either a response or tool calls."""
         try:
             response_data = {}
@@ -147,25 +158,23 @@ class HumanCompletionUI:
                 response_data["response"] = response
             if tool_calls:
                 response_data["tool_calls"] = tool_calls
             response_obj = requests.post(
-                f"{self.server_url}/complete/{call_id}",
-                json=response_data,
-                timeout=10
+                f"{self.server_url}/complete/{call_id}", json=response_data, timeout=10
             )
             response_obj.raise_for_status()
             return True
         except requests.RequestException as e:
             print(f"Error completing call: {e}")
             return False
     def get_last_image_from_messages(self, messages: List[Dict[str, Any]]) -> Optional[Any]:
         """Extract the last image from the messages for display above conversation."""
         last_image = None
         for msg in reversed(messages):  # Start from the last message
             content = msg.get("content", "")
             if isinstance(content, list):
                 for item in reversed(content):  # Get the last image in the message
                     if item.get("type") == "image_url":
@@ -184,41 +193,43 @@ class HumanCompletionUI:
                             else:
                                 # For URL images, return the URL
                                 return image_url
         return last_image
     def refresh_pending_calls(self):
         """Refresh the list of pending calls."""
         pending_calls = self.get_pending_calls()
         if not pending_calls:
             return (
                 gr.update(choices=["latest"], value="latest"),  # dropdown
                 gr.update(value=None),  # image (no image)
                 gr.update(value=[]),  # chatbot (empty messages)
-                gr.update(interactive=False)  # submit button
+                gr.update(interactive=False),  # submit button
+                gr.update(visible=False),  # click_actions_group hidden
+                gr.update(visible=False),  # actions_group hidden
             )
         # Sort pending calls by created_at to get oldest first
         sorted_calls = sorted(pending_calls, key=lambda x: x.get("created_at", ""))
         # Create choices for dropdown
         choices = [("latest", "latest")]  # Add "latest" option first
         for call in sorted_calls:
             call_id = call["id"]
             model = call.get("model", "unknown")
             created_at = call.get("created_at", "")
             # Format timestamp
             try:
-                dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
+                dt = datetime.fromisoformat(created_at.replace("Z", "+00:00"))
                 time_str = dt.strftime("%H:%M:%S")
             except:
                 time_str = created_at
             choice_label = f"{call_id[:8]}... ({model}) - {time_str}"
             choices.append((choice_label, call_id))
         # Default to "latest" which shows the oldest pending conversation
         selected_call_id = "latest"
         if selected_call_id == "latest" and sorted_calls:
@@ -232,31 +243,37 @@ class HumanCompletionUI:
             conversation = []
             self.current_call_id = None
             self.last_image = None
         return (
             gr.update(choices=choices, value="latest"),
             gr.update(value=self.last_image),
             gr.update(value=conversation),
-            gr.update(interactive=bool(choices))
+            gr.update(interactive=bool(choices)),
+            gr.update(visible=True),  # click_actions_group visible when there is a call
+            gr.update(visible=True),  # actions_group visible when there is a call
         )
     def on_call_selected(self, selected_choice):
         """Handle when a call is selected from the dropdown."""
         if not selected_choice:
             return (
                 gr.update(value=None),  # no image
                 gr.update(value=[]),  # empty chatbot
-                gr.update(interactive=False)
+                gr.update(interactive=False),
+                gr.update(visible=False),  # click_actions_group hidden
+                gr.update(visible=False),  # actions_group hidden
             )
         pending_calls = self.get_pending_calls()
         if not pending_calls:
             return (
                 gr.update(value=None),  # no image
                 gr.update(value=[]),  # empty chatbot
-                gr.update(interactive=False)
+                gr.update(interactive=False),
+                gr.update(visible=False),  # click_actions_group hidden
+                gr.update(visible=False),  # actions_group hidden
             )
         # Handle "latest" option
         if selected_choice == "latest":
             # Sort calls by created_at to get oldest first
@@ -271,134 +288,143 @@ class HumanCompletionUI:
                 if call_id_short in selected_choice:
                     call_id = call["id"]
                     break
             if not call_id:
                 return (
                     gr.update(value=None),  # no image
                     gr.update(value=[]),  # empty chatbot
-                    gr.update(interactive=False)
+                    gr.update(interactive=False),
                 )
             # Find the selected call
             selected_call = next((c for c in pending_calls if c["id"] == call_id), None)
         if not selected_call:
             return (
                 gr.update(value=None),  # no image
                 gr.update(value=[]),  # empty chatbot
-                gr.update(interactive=False)
+                gr.update(interactive=False),
+                gr.update(visible=False),  # click_actions_group hidden
+                gr.update(visible=False),  # actions_group hidden
             )
         conversation = self.format_messages_for_chatbot(selected_call.get("messages", []))
         self.current_call_id = call_id
         # Get the last image from messages
         self.last_image = self.get_last_image_from_messages(selected_call.get("messages", []))
         return (
             gr.update(value=self.last_image),
             gr.update(value=conversation),
-            gr.update(interactive=True)
+            gr.update(interactive=True),
+            gr.update(visible=True),  # click_actions_group visible
+            gr.update(visible=True),  # actions_group visible
         )
     def submit_response(self, response_text: str):
         """Submit a text response to the current call."""
         if not self.current_call_id:
             return (
                 gr.update(value=response_text),  # keep response text
-                gr.update(value="❌ No call selected")  # status
+                gr.update(value="❌ No call selected"),  # status
             )
         if not response_text.strip():
             return (
                 gr.update(value=response_text),  # keep response text
-                gr.update(value="❌ Response cannot be empty")  # status
+                gr.update(value="❌ Response cannot be empty"),  # status
             )
         success = self.complete_call_with_response(self.current_call_id, response_text)
         if success:
             status_msg = "✅ Response submitted successfully!"
             return (
                 gr.update(value=""),  # clear response text
-                gr.update(value=status_msg)  # status
+                gr.update(value=status_msg),  # status
             )
         else:
             return (
                 gr.update(value=response_text),  # keep response text
-                gr.update(value="❌ Failed to submit response")  # status
+                gr.update(value="❌ Failed to submit response"),  # status
             )
     def submit_action(self, action_type: str, **kwargs) -> str:
         """Submit a computer action as a tool call."""
         if not self.current_call_id:
             return "❌ No call selected"
         import uuid
         # Create tool call structure
         action_data = {"type": action_type, **kwargs}
         tool_call = {
             "id": f"call_{uuid.uuid4().hex[:24]}",
             "type": "function",
-            "function": {
-                "name": "computer",
-                "arguments": json.dumps(action_data)
-            }
+            "function": {"name": "computer", "arguments": json.dumps(action_data)},
         }
         success = self.complete_call_with_tool_calls(self.current_call_id, [tool_call])
         if success:
             return f"✅ {action_type.capitalize()} action submitted as tool call"
         else:
             return f"❌ Failed to submit {action_type} action"
-    def submit_click_action(self, x: int, y: int, action_type: str = "click", button: str = "left") -> str:
+    def submit_click_action(
+        self, x: int, y: int, action_type: str = "click", button: str = "left"
+    ) -> str:
         """Submit a coordinate-based action."""
         if action_type == "click":
             return self.submit_action(action_type, x=x, y=y, button=button)
         else:
             return self.submit_action(action_type, x=x, y=y)
     def submit_type_action(self, text: str) -> str:
         """Submit a type action."""
         return self.submit_action("type", text=text)
     def submit_hotkey_action(self, keys: str) -> str:
         """Submit a hotkey action."""
         return self.submit_action("keypress", keys=keys)
-    def submit_description_click(self, description: str, action_type: str = "click", button: str = "left") -> str:
+    def submit_wait_action(self) -> str:
+        """Submit a wait action with no kwargs."""
+        return self.submit_action("wait")
+    def submit_description_click(
+        self, description: str, action_type: str = "click", button: str = "left"
+    ) -> str:
         """Submit a description-based action."""
         if action_type == "click":
             return self.submit_action(action_type, element_description=description, button=button)
         else:
             return self.submit_action(action_type, element_description=description)
     def wait_for_pending_calls(self, max_seconds: float = 10.0, check_interval: float = 0.2):
         """Wait for pending calls to appear or until max_seconds elapsed.
         This method loops and checks for pending calls at regular intervals,
         returning as soon as a pending call is found or the maximum wait time is reached.
         Args:
             max_seconds: Maximum number of seconds to wait
             check_interval: How often to check for pending calls (in seconds)
         """
         import time
         start_time = time.time()
         while time.time() - start_time < max_seconds:
             # Check if there are any pending calls
             pending_calls = self.get_pending_calls()
             if pending_calls:
                 # Found pending calls, return immediately
                 return self.refresh_pending_calls()
             # Wait before checking again
             time.sleep(check_interval)
         # Max wait time reached, return current state
         return self.refresh_pending_calls()
@@ -406,199 +432,261 @@ class HumanCompletionUI:
 def create_ui():
     """Create the Gradio interface."""
     ui_handler = HumanCompletionUI()
-    with gr.Blocks(title="Human-in-the-Loop Agent Tool") as demo:
+    with gr.Blocks(title="Human-in-the-Loop Agent Tool", fill_width=True) as demo:
         gr.Markdown("# 🤖 Human-in-the-Loop Agent Tool")
         gr.Markdown("Review AI conversation requests and provide human responses.")
         with gr.Row():
             with gr.Column(scale=2):
                 with gr.Group():
                     screenshot_image = gr.Image(
-                        label="Screenshot",
-                        interactive=False,
-                        height=600
+                        label="Interactive Screenshot", interactive=False, height=600
                     )
-                    # Action type selection for image clicks
-                    with gr.Row():
-                        action_type_radio = gr.Radio(
-                            label="Action Type",
-                            choices=["click", "double_click", "move", "left_mouse_up", "left_mouse_down"],
-                            value="click",
-                            scale=2
-                        )
-                        action_button_radio = gr.Radio(
-                            label="Button (for click only)",
-                            choices=["left", "right", "wheel", "back", "forward"],
-                            value="left",
-                            visible=True,
-                            scale=1
-                        )
+                    # Action type selection for image clicks (wrapped for visibility control)
+                    with gr.Group(visible=False) as click_actions_group:
+                        with gr.Row():
+                            action_type_radio = gr.Dropdown(
+                                label="Interactive Action",
+                                choices=[
+                                    "click",
+                                    "double_click",
+                                    "move",
+                                    "left_mouse_up",
+                                    "left_mouse_down",
+                                    "scroll",
+                                ],
+                                value="click",
+                                scale=2,
+                            )
+                            action_button_radio = gr.Dropdown(
+                                label="Button",
+                                choices=["left", "right", "wheel", "back", "forward"],
+                                value="left",
+                                visible=True,
+                                scale=1,
+                            )
+                            scroll_x_input = gr.Number(
+                                label="scroll_x", value=0, visible=False, scale=1
+                            )
+                            scroll_y_input = gr.Number(
+                                label="scroll_y", value=-120, visible=False, scale=1
+                            )
                     conversation_chatbot = gr.Chatbot(
-                        label="Messages",
-                        type="messages",
-                        height=500,
-                        show_copy_button=True
+                        label="Conversation", height=500, buttons=["copy"]
                     )
             with gr.Column(scale=1):
                 with gr.Group():
                     call_dropdown = gr.Dropdown(
-                        label="Select a pending call",
+                        label="Select a pending conversation request",
                         choices=["latest"],
                         interactive=True,
-                        value="latest"
+                        value="latest",
                     )
                     refresh_btn = gr.Button("🔄 Refresh", variant="secondary")
+                    status_display = gr.Textbox(
+                        label="Status", interactive=False, value="Ready to receive requests..."
+                    )
                 with gr.Group():
                     response_text = gr.Textbox(
-                        label="Response",
-                        lines=3,
-                        placeholder="Enter your response here..."
+                        label="Message", lines=3, placeholder="Enter your message here..."
                     )
-                    submit_btn = gr.Button("📤 Submit Response", variant="primary", interactive=False)
-                # Action Accordions
-                with gr.Accordion("🖱️ Click Actions", open=False):
-                    with gr.Group():
-                        with gr.Row():
-                            click_x = gr.Number(label="X", value=0, minimum=0)
-                            click_y = gr.Number(label="Y", value=0, minimum=0)
-                        with gr.Row():
-                            click_action_type = gr.Dropdown(
-                                label="Action Type",
-                                choices=["click", "double_click", "move", "left_mouse_up", "left_mouse_down"],
-                                value="click"
-                            )
-                            click_button = gr.Dropdown(
-                                label="Button (for click only)",
-                                choices=["left", "right", "wheel", "back", "forward"],
-                                value="left"
-                            )
-                        click_submit_btn = gr.Button("Submit Action")
-                with gr.Accordion("📝 Type Action", open=False):
-                    with gr.Group():
-                        type_text = gr.Textbox(
-                            label="Text to Type",
-                            placeholder="Enter text to type..."
-                        )
-                        type_submit_btn = gr.Button("Submit Type")
-                with gr.Accordion("⌨️ Keypress Action", open=False):
-                    with gr.Group():
-                        keypress_text = gr.Textbox(
-                            label="Keys",
-                            placeholder="e.g., ctrl+c, alt+tab"
-                        )
-                        keypress_submit_btn = gr.Button("Submit Keypress")
-                with gr.Accordion("🎯 Description Action", open=False):
-                    with gr.Group():
-                        description_text = gr.Textbox(
-                            label="Element Description",
-                            placeholder="e.g., 'Privacy and security option in left sidebar'"
-                        )
-                        with gr.Row():
-                            description_action_type = gr.Dropdown(
-                                label="Action Type",
-                                choices=["click", "double_click", "move", "left_mouse_up", "left_mouse_down"],
-                                value="click"
-                            )
-                            description_button = gr.Radio(
-                                label="Button (for click only)",
-                                choices=["left", "right", "wheel", "back", "forward"],
-                                value="left"
-                            )
-                        description_submit_btn = gr.Button("Submit Description Action")
-                status_display = gr.Textbox(
-                    label="Status",
-                    interactive=False,
-                    value="Ready to receive calls..."
-                )
+                    submit_btn = gr.Button(
+                        "📤 Submit Message", variant="primary", interactive=False
+                    )
+                # Action Accordions (wrapped for visibility control)
+                with gr.Group(visible=False) as actions_group:
+                    with gr.Tabs():
+                        with gr.Tab("🖱️ Click Actions"):
+                            with gr.Group():
+                                description_text = gr.Textbox(
+                                    label="Element Description",
+                                    placeholder="e.g., 'Privacy and security option in left sidebar'",
+                                )
+                                with gr.Row():
+                                    description_action_type = gr.Dropdown(
+                                        label="Action",
+                                        choices=[
+                                            "click",
+                                            "double_click",
+                                            "move",
+                                            "left_mouse_up",
+                                            "left_mouse_down",
+                                        ],
+                                        value="click",
+                                    )
+                                    description_button = gr.Dropdown(
+                                        label="Button",
+                                        choices=["left", "right", "wheel", "back", "forward"],
+                                        value="left",
+                                    )
+                                description_submit_btn = gr.Button("Submit Click Action")
+                        with gr.Tab("📝 Type Action"):
+                            with gr.Group():
+                                type_text = gr.Textbox(
+                                    label="Text to Type", placeholder="Enter text to type..."
+                                )
+                                type_submit_btn = gr.Button("Submit Type")
+                        with gr.Tab("⌨️ Keypress Action"):
+                            with gr.Group():
+                                keypress_text = gr.Textbox(
+                                    label="Keys", placeholder="e.g., ctrl+c, alt+tab"
+                                )
+                                keypress_submit_btn = gr.Button("Submit Keypress")
+                        with gr.Tab("🧰 Misc Actions"):
+                            with gr.Group():
+                                misc_action_dropdown = gr.Dropdown(
+                                    label="Action", choices=["wait"], value="wait"
+                                )
+                                misc_submit_btn = gr.Button("Submit Action")
         # Event handlers
         refresh_btn.click(
             fn=ui_handler.refresh_pending_calls,
-            outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn]
+            outputs=[
+                call_dropdown,
+                screenshot_image,
+                conversation_chatbot,
+                submit_btn,
+                click_actions_group,
+                actions_group,
+            ],
         )
         call_dropdown.change(
             fn=ui_handler.on_call_selected,
             inputs=[call_dropdown],
-            outputs=[screenshot_image, conversation_chatbot, submit_btn]
+            outputs=[
+                screenshot_image,
+                conversation_chatbot,
+                submit_btn,
+                click_actions_group,
+                actions_group,
+            ],
         )
         def handle_image_click(evt: gr.SelectData):
             if evt.index is not None:
                 x, y = evt.index
-                action_type = action_type_radio.value or "click"
-                button = action_button_radio.value or "left"
-                result = ui_handler.submit_click_action(x, y, action_type, button)
+                action_type = ui_handler.current_action_type or "click"
+                button = ui_handler.current_button or "left"
+                if action_type == "scroll":
+                    sx_i = int(ui_handler.current_scroll_x or 0)
+                    sy_i = int(ui_handler.current_scroll_y or 0)
+                    # Submit a scroll action with x,y position and scroll deltas
+                    result = ui_handler.submit_action(
+                        "scroll", x=x, y=y, scroll_x=sx_i, scroll_y=sy_i
+                    )
+                else:
+                    result = ui_handler.submit_click_action(x, y, action_type, button)
                 ui_handler.wait_for_pending_calls()
                 return result
             return "No coordinates selected"
-        screenshot_image.select(
-            fn=handle_image_click,
-            outputs=[status_display]
-        ).then(
+        screenshot_image.select(fn=handle_image_click, outputs=[status_display]).then(
             fn=ui_handler.wait_for_pending_calls,
-            outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn]
+            outputs=[
+                call_dropdown,
+                screenshot_image,
+                conversation_chatbot,
+                submit_btn,
+                click_actions_group,
+                actions_group,
+            ],
         )
         # Response submission
         submit_btn.click(
             fn=ui_handler.submit_response,
             inputs=[response_text],
-            outputs=[response_text, status_display]
+            outputs=[response_text, status_display],
         ).then(
             fn=ui_handler.refresh_pending_calls,
-            outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn]
+            outputs=[
+                call_dropdown,
+                screenshot_image,
+                conversation_chatbot,
+                submit_btn,
+                click_actions_group,
+                actions_group,
+            ],
         )
-        # Toggle button radio visibility based on action type
-        def toggle_button_visibility(action_type):
-            return gr.update(visible=(action_type == "click"))
+        # Toggle visibility of controls based on action type
+        def toggle_action_controls(action_type):
+            # Button visible only for click
+            button_vis = gr.update(visible=(action_type == "click"))
+            # Scroll inputs visible only for scroll
+            scroll_x_vis = gr.update(visible=(action_type == "scroll"))
+            scroll_y_vis = gr.update(visible=(action_type == "scroll"))
+            # Update state
+            ui_handler.current_action_type = action_type or "click"
+            return button_vis, scroll_x_vis, scroll_y_vis
         action_type_radio.change(
-            fn=toggle_button_visibility,
+            fn=toggle_action_controls,
             inputs=[action_type_radio],
-            outputs=[action_button_radio]
+            outputs=[action_button_radio, scroll_x_input, scroll_y_input],
         )
-        # Action accordion handlers
-        click_submit_btn.click(
-            fn=ui_handler.submit_click_action,
-            inputs=[click_x, click_y, click_action_type, click_button],
-            outputs=[status_display]
-        ).then(
-            fn=ui_handler.wait_for_pending_calls,
-            outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn]
-        )
+        # Keep other control values in ui_handler state
+        def on_button_change(val):
+            ui_handler.current_button = val or "left"
+        action_button_radio.change(fn=on_button_change, inputs=[action_button_radio])
+        def on_scroll_x_change(val):
+            try:
+                ui_handler.current_scroll_x = int(val) if val is not None else 0
+            except Exception:
+                ui_handler.current_scroll_x = 0
+        scroll_x_input.change(fn=on_scroll_x_change, inputs=[scroll_x_input])
+        def on_scroll_y_change(val):
+            try:
+                ui_handler.current_scroll_y = int(val) if val is not None else 0
+            except Exception:
+                ui_handler.current_scroll_y = 0
+        scroll_y_input.change(fn=on_scroll_y_change, inputs=[scroll_y_input])
         type_submit_btn.click(
-            fn=ui_handler.submit_type_action,
-            inputs=[type_text],
-            outputs=[status_display]
+            fn=ui_handler.submit_type_action, inputs=[type_text], outputs=[status_display]
         ).then(
             fn=ui_handler.wait_for_pending_calls,
-            outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn]
+            outputs=[
+                call_dropdown,
+                screenshot_image,
+                conversation_chatbot,
+                submit_btn,
+                click_actions_group,
+                actions_group,
+            ],
         )
         keypress_submit_btn.click(
-            fn=ui_handler.submit_hotkey_action,
-            inputs=[keypress_text],
-            outputs=[status_display]
+            fn=ui_handler.submit_hotkey_action, inputs=[keypress_text], outputs=[status_display]
         ).then(
             fn=ui_handler.wait_for_pending_calls,
-            outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn]
+            outputs=[
+                call_dropdown,
+                screenshot_image,
+                conversation_chatbot,
+                submit_btn,
+                click_actions_group,
+                actions_group,
+            ],
         )
         def handle_description_submit(description, action_type, button):
             if description:
                 result = ui_handler.submit_description_click(description, action_type, button)
@@ -609,18 +697,54 @@ def create_ui():
         description_submit_btn.click(
             fn=handle_description_submit,
             inputs=[description_text, description_action_type, description_button],
-            outputs=[status_display]
+            outputs=[status_display],
         ).then(
             fn=ui_handler.wait_for_pending_calls,
-            outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn]
+            outputs=[
+                call_dropdown,
+                screenshot_image,
+                conversation_chatbot,
+                submit_btn,
+                click_actions_group,
+                actions_group,
+            ],
         )
+        # Misc action handler
+        def handle_misc_submit(selected_action):
+            if selected_action == "wait":
+                result = ui_handler.submit_wait_action()
+                ui_handler.wait_for_pending_calls()
+                return result
+            return f"Unsupported misc action: {selected_action}"
+        misc_submit_btn.click(
+            fn=handle_misc_submit, inputs=[misc_action_dropdown], outputs=[status_display]
+        ).then(
+            fn=ui_handler.wait_for_pending_calls,
+            outputs=[
+                call_dropdown,
+                screenshot_image,
+                conversation_chatbot,
+                submit_btn,
+                click_actions_group,
+                actions_group,
+            ],
+        )
         # Load initial data
         demo.load(
             fn=ui_handler.refresh_pending_calls,
-            outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn]
+            outputs=[
+                call_dropdown,
+                screenshot_image,
+                conversation_chatbot,
+                submit_btn,
+                click_actions_group,
+                actions_group,
+            ],
         )
     return demo

cua-agent 0.4.14__py3-none-any.whl → 0.7.16__py3-none-any.whl

Potentially problematic release.

cua-agent 0.4.14__py3-none-any.whl → 0.7.16__py3-none-any.whl