cua-agent 0.4.14__py3-none-any.whl → 0.7.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of cua-agent has been flagged as possibly problematic.

Files changed (82)
  1. agent/__init__.py +4 -19
  2. agent/__main__.py +2 -1
  3. agent/adapters/__init__.py +6 -0
  4. agent/adapters/azure_ml_adapter.py +283 -0
  5. agent/adapters/cua_adapter.py +161 -0
  6. agent/adapters/huggingfacelocal_adapter.py +67 -125
  7. agent/adapters/human_adapter.py +116 -114
  8. agent/adapters/mlxvlm_adapter.py +370 -0
  9. agent/adapters/models/__init__.py +41 -0
  10. agent/adapters/models/generic.py +78 -0
  11. agent/adapters/models/internvl.py +290 -0
  12. agent/adapters/models/opencua.py +115 -0
  13. agent/adapters/models/qwen2_5_vl.py +78 -0
  14. agent/agent.py +431 -241
  15. agent/callbacks/__init__.py +10 -3
  16. agent/callbacks/base.py +45 -31
  17. agent/callbacks/budget_manager.py +22 -10
  18. agent/callbacks/image_retention.py +54 -98
  19. agent/callbacks/logging.py +55 -42
  20. agent/callbacks/operator_validator.py +140 -0
  21. agent/callbacks/otel.py +291 -0
  22. agent/callbacks/pii_anonymization.py +19 -16
  23. agent/callbacks/prompt_instructions.py +47 -0
  24. agent/callbacks/telemetry.py +106 -69
  25. agent/callbacks/trajectory_saver.py +178 -70
  26. agent/cli.py +269 -119
  27. agent/computers/__init__.py +14 -9
  28. agent/computers/base.py +32 -19
  29. agent/computers/cua.py +52 -25
  30. agent/computers/custom.py +78 -71
  31. agent/decorators.py +23 -14
  32. agent/human_tool/__init__.py +2 -7
  33. agent/human_tool/__main__.py +6 -2
  34. agent/human_tool/server.py +48 -37
  35. agent/human_tool/ui.py +359 -235
  36. agent/integrations/hud/__init__.py +164 -74
  37. agent/integrations/hud/agent.py +338 -342
  38. agent/integrations/hud/proxy.py +297 -0
  39. agent/loops/__init__.py +44 -14
  40. agent/loops/anthropic.py +590 -492
  41. agent/loops/base.py +19 -15
  42. agent/loops/composed_grounded.py +142 -144
  43. agent/loops/fara/__init__.py +8 -0
  44. agent/loops/fara/config.py +506 -0
  45. agent/loops/fara/helpers.py +357 -0
  46. agent/loops/fara/schema.py +143 -0
  47. agent/loops/gelato.py +183 -0
  48. agent/loops/gemini.py +935 -0
  49. agent/loops/generic_vlm.py +601 -0
  50. agent/loops/glm45v.py +140 -135
  51. agent/loops/gta1.py +48 -51
  52. agent/loops/holo.py +218 -0
  53. agent/loops/internvl.py +180 -0
  54. agent/loops/moondream3.py +493 -0
  55. agent/loops/omniparser.py +326 -226
  56. agent/loops/openai.py +63 -56
  57. agent/loops/opencua.py +134 -0
  58. agent/loops/uiins.py +175 -0
  59. agent/loops/uitars.py +262 -212
  60. agent/loops/uitars2.py +951 -0
  61. agent/playground/__init__.py +5 -0
  62. agent/playground/server.py +301 -0
  63. agent/proxy/examples.py +196 -0
  64. agent/proxy/handlers.py +255 -0
  65. agent/responses.py +486 -339
  66. agent/tools/__init__.py +24 -0
  67. agent/tools/base.py +253 -0
  68. agent/tools/browser_tool.py +423 -0
  69. agent/types.py +20 -5
  70. agent/ui/__init__.py +1 -1
  71. agent/ui/__main__.py +1 -1
  72. agent/ui/gradio/app.py +25 -22
  73. agent/ui/gradio/ui_components.py +314 -167
  74. cua_agent-0.7.16.dist-info/METADATA +85 -0
  75. cua_agent-0.7.16.dist-info/RECORD +79 -0
  76. {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
  77. agent/integrations/hud/adapter.py +0 -121
  78. agent/integrations/hud/computer_handler.py +0 -187
  79. agent/telemetry.py +0 -142
  80. cua_agent-0.4.14.dist-info/METADATA +0 -436
  81. cua_agent-0.4.14.dist-info/RECORD +0 -50
  82. {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/loops/omniparser.py CHANGED
@@ -5,100 +5,108 @@ Code: https://github.com/microsoft/OmniParser
  """

  import asyncio
+ import base64
+ import inspect
  import json
- from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
+ from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
+
  import litellm
- import inspect
- import base64

  from ..decorators import register_agent
- from ..types import Messages, AgentResponse, Tools, AgentCapability
  from ..loops.base import AsyncAgentConfig
+ from ..responses import (
+ convert_completion_messages_to_responses_items,
+ convert_responses_items_to_completion_messages,
+ )
+ from ..types import AgentCapability, AgentResponse, Messages, Tools

  SOM_TOOL_SCHEMA = {
- "type": "function",
- "name": "computer",
- "description": "Control a computer by taking screenshots and interacting with UI elements. This tool shows screenshots with numbered elements overlaid on them. Each UI element has been assigned a unique ID number that you can see in the image. Use the element's ID number to interact with any element instead of pixel coordinates.",
- "parameters": {
- "type": "object",
- "properties": {
- "action": {
- "type": "string",
- "enum": [
- "screenshot",
- "click",
- "double_click",
- "drag",
- "type",
- "keypress",
- "scroll",
- "move",
- "wait",
- "get_current_url",
- "get_dimensions",
- "get_environment"
- ],
- "description": "The action to perform"
- },
- "element_id": {
- "type": "integer",
- "description": "The ID of the element to interact with (required for click, double_click, move, scroll actions, and as start/end for drag)"
- },
- "start_element_id": {
- "type": "integer",
- "description": "The ID of the element to start dragging from (required for drag action)"
- },
- "end_element_id": {
- "type": "integer",
- "description": "The ID of the element to drag to (required for drag action)"
- },
- "text": {
- "type": "string",
- "description": "The text to type (required for type action)"
- },
- "keys": {
- "type": "string",
- "description": "Key combination to press (required for keypress action). Single key for individual key press, multiple keys for combinations (e.g., 'ctrl+c')"
- },
- "button": {
- "type": "string",
- "description": "The mouse button to use for click action (left, right, wheel, back, forward) Default: left",
- },
- "scroll_x": {
- "type": "integer",
- "description": "Horizontal scroll amount for scroll action (positive for right, negative for left)",
- },
- "scroll_y": {
- "type": "integer",
- "description": "Vertical scroll amount for scroll action (positive for down, negative for up)",
- },
+ "type": "function",
+ "function": {
+ "name": "computer",
+ "description": "Control a computer by taking screenshots and interacting with UI elements. This tool shows screenshots with numbered elements overlaid on them. Each UI element has been assigned a unique ID number that you can see in the image. Use the element's ID number to interact with any element instead of pixel coordinates.",
+ "parameters": {
+ "type": "object",
+ "properties": {
+ "action": {
+ "type": "string",
+ "enum": [
+ "screenshot",
+ "click",
+ "double_click",
+ "drag",
+ "type",
+ "keypress",
+ "scroll",
+ "move",
+ "wait",
+ "get_current_url",
+ "get_dimensions",
+ "get_environment",
+ ],
+ "description": "The action to perform",
+ },
+ "element_id": {
+ "type": "integer",
+ "description": "The ID of the element to interact with (required for click, double_click, move, scroll actions, and as start/end for drag)",
+ },
+ "start_element_id": {
+ "type": "integer",
+ "description": "The ID of the element to start dragging from (required for drag action)",
+ },
+ "end_element_id": {
+ "type": "integer",
+ "description": "The ID of the element to drag to (required for drag action)",
+ },
+ "text": {
+ "type": "string",
+ "description": "The text to type (required for type action)",
+ },
+ "keys": {
+ "type": "string",
+ "description": "Key combination to press (required for keypress action). Single key for individual key press, multiple keys for combinations (e.g., 'ctrl+c')",
+ },
+ "button": {
+ "type": "string",
+ "description": "The mouse button to use for click action (left, right, wheel, back, forward) Default: left",
+ },
+ "scroll_x": {
+ "type": "integer",
+ "description": "Horizontal scroll amount for scroll action (positive for right, negative for left)",
+ },
+ "scroll_y": {
+ "type": "integer",
+ "description": "Vertical scroll amount for scroll action (positive for down, negative for up)",
+ },
+ },
+ "required": ["action", "element_id"],
+ },
  },
- "required": [
- "action"
- ]
- }
  }

  OMNIPARSER_AVAILABLE = False
  try:
  from som import OmniParser
+
  OMNIPARSER_AVAILABLE = True
  except ImportError:
  pass
  OMNIPARSER_SINGLETON = None

+
  def get_parser():
  global OMNIPARSER_SINGLETON
  if OMNIPARSER_SINGLETON is None:
  OMNIPARSER_SINGLETON = OmniParser()
  return OMNIPARSER_SINGLETON
-
+
+
  def get_last_computer_call_output(messages: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
  """Get the last computer_call_output message from a messages list.
-
+
  Args:
  messages: List of messages to search through
-
+
  Returns:
  The last computer_call_output message dict, or None if not found
  """
@@ -107,11 +115,12 @@ def get_last_computer_call_output(messages: List[Dict[str, Any]]) -> Optional[Di
  return message
  return None

+
  def _prepare_tools_for_omniparser(tool_schemas: List[Dict[str, Any]]) -> Tuple[Tools, dict]:
  """Prepare tools for OpenAI API format"""
  omniparser_tools = []
  id2xy = dict()
-
+
  for schema in tool_schemas:
  if schema["type"] == "computer":
  omniparser_tools.append(SOM_TOOL_SCHEMA)
@@ -122,72 +131,80 @@ def _prepare_tools_for_omniparser(tool_schemas: List[Dict[str, Any]]) -> Tuple[T
  elif schema["type"] == "function":
  # Function tools use OpenAI-compatible schema directly (liteLLM expects this format)
  # Schema should be: {type, name, description, parameters}
- omniparser_tools.append({ "type": "function", **schema["function"] })
-
+ omniparser_tools.append({"type": "function", **schema["function"]})
+
  return omniparser_tools, id2xy

- async def replace_function_with_computer_call(item: Dict[str, Any], id2xy: Dict[int, Tuple[float, float]]):
- item_type = item.get("type")
-
- def _get_xy(element_id: Optional[int]) -> Union[Tuple[float, float], Tuple[None, None]]:
- if element_id is None:
- return (None, None)
- return id2xy.get(element_id, (None, None))
-
- if item_type == "function_call":
- fn_name = item.get("name")
- fn_args = json.loads(item.get("arguments", "{}"))
-
- item_id = item.get("id")
- call_id = item.get("call_id")
-
- if fn_name == "computer":
- action = fn_args.get("action")
- element_id = fn_args.get("element_id")
- start_element_id = fn_args.get("start_element_id")
- end_element_id = fn_args.get("end_element_id")
- text = fn_args.get("text")
- keys = fn_args.get("keys")
- button = fn_args.get("button")
- scroll_x = fn_args.get("scroll_x")
- scroll_y = fn_args.get("scroll_y")
-
- x, y = _get_xy(element_id)
- start_x, start_y = _get_xy(start_element_id)
- end_x, end_y = _get_xy(end_element_id)
-
- action_args = {
- "type": action,
- "x": x,
- "y": y,
- "start_x": start_x,
- "start_y": start_y,
- "end_x": end_x,
- "end_y": end_y,
- "text": text,
- "keys": keys,
- "button": button,
- "scroll_x": scroll_x,
- "scroll_y": scroll_y
- }
- # Remove None values to keep the JSON clean
- action_args = {k: v for k, v in action_args.items() if v is not None}

- return [{
- "type": "computer_call",
- "action": action_args,
- "id": item_id,
- "call_id": call_id,
- "status": "completed"
- }]
+ async def replace_function_with_computer_call(
+ item: Dict[str, Any], id2xy: Dict[int, Tuple[float, float]]
+ ):
+ item_type = item.get("type")
+
+ def _get_xy(element_id: Optional[int]) -> Union[Tuple[float, float], Tuple[None, None]]:
+ if element_id is None:
+ return (None, None)
+ return id2xy.get(element_id, (None, None))
+
+ if item_type == "function_call":
+ fn_name = item.get("name")
+ fn_args = json.loads(item.get("arguments", "{}"))

- return [item]
+ item_id = item.get("id")
+ call_id = item.get("call_id")

- async def replace_computer_call_with_function(item: Dict[str, Any], xy2id: Dict[Tuple[float, float], int]):
+ if fn_name == "computer":
+ action = fn_args.get("action")
+ element_id = fn_args.get("element_id")
+ start_element_id = fn_args.get("start_element_id")
+ end_element_id = fn_args.get("end_element_id")
+ text = fn_args.get("text")
+ keys = fn_args.get("keys")
+ button = fn_args.get("button")
+ scroll_x = fn_args.get("scroll_x")
+ scroll_y = fn_args.get("scroll_y")
+
+ x, y = _get_xy(element_id)
+ start_x, start_y = _get_xy(start_element_id)
+ end_x, end_y = _get_xy(end_element_id)
+
+ action_args = {
+ "type": action,
+ "x": x,
+ "y": y,
+ "start_x": start_x,
+ "start_y": start_y,
+ "end_x": end_x,
+ "end_y": end_y,
+ "text": text,
+ "keys": keys,
+ "button": button,
+ "scroll_x": scroll_x,
+ "scroll_y": scroll_y,
+ }
+ # Remove None values to keep the JSON clean
+ action_args = {k: v for k, v in action_args.items() if v is not None}
+
+ return [
+ {
+ "type": "computer_call",
+ "action": action_args,
+ "id": item_id,
+ "call_id": call_id,
+ "status": "completed",
+ }
+ ]
+
+ return [item]
+
+
+ async def replace_computer_call_with_function(
+ item: Dict[str, Any], xy2id: Dict[Tuple[float, float], int]
+ ):
  """
  Convert computer_call back to function_call format.
  Also handles computer_call_output -> function_call_output conversion.
-
+
  Args:
  item: The item to convert
  xy2id: Mapping from (x, y) coordinates to element IDs
@@ -202,12 +219,12 @@ async def replace_computer_call_with_function(item: Dict[str, Any], xy2id: Dict[

  if item_type == "computer_call":
  action_data = item.get("action", {})
-
+
  # Extract coordinates and convert back to element IDs
  element_id = _get_element_id(action_data.get("x"), action_data.get("y"))
  start_element_id = _get_element_id(action_data.get("start_x"), action_data.get("start_y"))
  end_element_id = _get_element_id(action_data.get("end_x"), action_data.get("end_y"))
-
+
  # Build function arguments
  fn_args = {
  "action": action_data.get("type"),
@@ -218,33 +235,38 @@ async def replace_computer_call_with_function(item: Dict[str, Any], xy2id: Dict[
  "keys": action_data.get("keys"),
  "button": action_data.get("button"),
  "scroll_x": action_data.get("scroll_x"),
- "scroll_y": action_data.get("scroll_y")
+ "scroll_y": action_data.get("scroll_y"),
  }
-
+
  # Remove None values to keep the JSON clean
  fn_args = {k: v for k, v in fn_args.items() if v is not None}
-
- return [{
- "type": "function_call",
- "name": "computer",
- "arguments": json.dumps(fn_args),
- "id": item.get("id"),
- "call_id": item.get("call_id"),
- "status": "completed",
-
- # Fall back to string representation
- "content": f"Used tool: {action_data.get("type")}({json.dumps(fn_args)})"
- }]
-
+
+ return [
+ {
+ "type": "function_call",
+ "name": "computer",
+ "arguments": json.dumps(fn_args),
+ "id": item.get("id"),
+ "call_id": item.get("call_id"),
+ "status": "completed",
+ }
+ ]
+
  elif item_type == "computer_call_output":
- # Simple conversion: computer_call_output -> function_call_output
- return [{
- "type": "function_call_output",
- "call_id": item.get("call_id"),
- "content": [item.get("output")],
- "id": item.get("id"),
- "status": "completed"
- }]
+ output = item.get("output")
+
+ if isinstance(output, dict):
+ output = [output]
+
+ return [
+ {
+ "type": "function_call_output",
+ "call_id": item.get("call_id"),
+ "output": item.get("output"),
+ "id": item.get("id"),
+ "status": "completed",
+ }
+ ]

  return [item]

@@ -252,7 +274,7 @@ async def replace_computer_call_with_function(item: Dict[str, Any], xy2id: Dict[
  @register_agent(models=r"omniparser\+.*|omni\+.*", priority=2)
  class OmniparserConfig(AsyncAgentConfig):
  """Omniparser agent configuration implementing AsyncAgentConfig protocol."""
-
+
  async def predict_step(
  self,
  messages: List[Dict[str, Any]],
@@ -266,63 +288,124 @@ class OmniparserConfig(AsyncAgentConfig):
  _on_api_end=None,
  _on_usage=None,
  _on_screenshot=None,
- **kwargs
+ **kwargs,
  ) -> Dict[str, Any]:
  """
  OpenAI computer-use-preview agent loop using liteLLM responses.
-
+
  Supports OpenAI's computer use preview models.
  """
  if not OMNIPARSER_AVAILABLE:
- raise ValueError("omniparser loop requires som to be installed. Install it with `pip install cua-som`.")
-
+ raise ValueError(
+ "omniparser loop requires som to be installed. Install it with `pip install cua-som`."
+ )
+
  tools = tools or []
-
- llm_model = model.split('+')[-1]
+
+ llm_model = model.split("+")[-1]
+
+ # Get screen dimensions from computer handler
+ try:
+ width, height = await computer_handler.get_dimensions()
+ except Exception:
+ # Fallback to default dimensions if method fails
+ width, height = 1024, 768

  # Prepare tools for OpenAI API
  openai_tools, id2xy = _prepare_tools_for_omniparser(tools)

- # Find last computer_call_output
- last_computer_call_output = get_last_computer_call_output(messages) # type: ignore
- if last_computer_call_output:
- image_url = last_computer_call_output.get("output", {}).get("image_url", "")
- image_data = image_url.split(",")[-1]
- if image_data:
- parser = get_parser()
+ # Build per-screenshot element mappings for historical consistency
+ screenshot_mappings = [] # (message_index, xy2id)
+
+ parser = get_parser()
+
+ for idx, message in enumerate(messages):
+ if not isinstance(message, dict):
+ message = message.__dict__
+
+ if message.get("type") == "computer_call_output":
+ image_url = message.get("output", {}).get("image_url", "")
+ if not image_url:
+ continue
+
+ image_data = image_url.split(",")[-1]
+ if not image_data:
+ continue
+
  result = parser.parse(image_data)
+
  if _on_screenshot:
  await _on_screenshot(result.annotated_image_base64, "annotated_image")
+
+ local_id2xy = {}
+
  for element in result.elements:
- id2xy[element.id] = ((element.bbox.x1 + element.bbox.x2) / 2, (element.bbox.y1 + element.bbox.y2) / 2)
-
- # handle computer calls -> function calls
- new_messages = []
- for message in messages:
+ norm_x = (element.bbox.x1 + element.bbox.x2) / 2
+ norm_y = (element.bbox.y1 + element.bbox.y2) / 2
+ pixel_x = int(norm_x * width)
+ pixel_y = int(norm_y * height)
+ local_id2xy[element.id] = (pixel_x, pixel_y)
+
+ xy2id = {v: k for k, v in local_id2xy.items()}
+ screenshot_mappings.append((idx, xy2id))
+
+ # Replace screenshot with annotated image
+ message["output"]["image_url"] = (
+ f"data:image/png;base64,{result.annotated_image_base64}"
+ )
+
+ def get_mapping_for_index(index):
+ applicable = [m for i, m in screenshot_mappings if i <= index]
+ return applicable[-1] if applicable else {}
+
+ messages_with_element_ids = []
+
+ for i, message in enumerate(messages):
  if not isinstance(message, dict):
  message = message.__dict__
- new_messages += await replace_computer_call_with_function(message, id2xy) # type: ignore
- messages = new_messages
+
+ xy2id = get_mapping_for_index(i)
+ converted = await replace_computer_call_with_function(message, xy2id)
+ messages_with_element_ids.extend(converted)
+
+ completion_messages = convert_responses_items_to_completion_messages(
+ messages_with_element_ids, allow_images_in_tool_results=False
+ )

  # Prepare API call kwargs
  api_kwargs = {
  "model": llm_model,
- "input": messages,
+ "messages": completion_messages,
  "tools": openai_tools if openai_tools else None,
  "stream": stream,
- "truncation": "auto",
  "num_retries": max_retries,
- **kwargs
+ **kwargs,
  }
-
+
+ # Add Vertex AI specific parameters if using vertex_ai models
+ if llm_model.startswith("vertex_ai/"):
+ import os
+
+ # Pass vertex_project and vertex_location to liteLLM
+ if "vertex_project" not in api_kwargs:
+ api_kwargs["vertex_project"] = os.getenv("GOOGLE_CLOUD_PROJECT")
+ if "vertex_location" not in api_kwargs:
+ api_kwargs["vertex_location"] = "global"
+
+ # Pass through Gemini 3-specific parameters if provided
+ if "thinking_level" in kwargs:
+ api_kwargs["thinking_level"] = kwargs["thinking_level"]
+ if "media_resolution" in kwargs:
+ api_kwargs["media_resolution"] = kwargs["media_resolution"]
+
  # Call API start hook
  if _on_api_start:
  await _on_api_start(api_kwargs)
-
+
  print(str(api_kwargs)[:1000])

- # Use liteLLM responses
- response = await litellm.aresponses(**api_kwargs)
+ # Use liteLLM completion
+ response = await litellm.acompletion(**api_kwargs)

  # Call API end hook
  if _on_api_end:
@@ -330,60 +413,83 @@ class OmniparserConfig(AsyncAgentConfig):

  # Extract usage information
  usage = {
- **response.usage.model_dump(), # type: ignore
- "response_cost": response._hidden_params.get("response_cost", 0.0), # type: ignore
+ **response.usage.model_dump(), # type: ignore
+ "response_cost": response._hidden_params.get("response_cost", 0.0), # type: ignore
  }
  if _on_usage:
  await _on_usage(usage)

- # handle som function calls -> xy computer calls
- new_output = []
- for i in range(len(response.output)): # type: ignore
- new_output += await replace_function_with_computer_call(response.output[i].model_dump(), id2xy) # type: ignore
-
- return {
- "output": new_output,
- "usage": usage
- }
-
+ response_dict = response.model_dump() # type: ignore
+ choice_messages = [choice["message"] for choice in response_dict["choices"]]
+ responses_items = []
+ for choice_message in choice_messages:
+ responses_items.extend(convert_completion_messages_to_responses_items([choice_message]))
+
+ # Convert element_id → x,y (similar to moondream's convert_computer_calls_desc2xy)
+ final_output = []
+ for item in responses_items:
+ if item.get("type") == "computer_call" and "action" in item:
+ action = item["action"].copy()
+
+ # Handle single element_id
+ if "element_id" in action:
+ element_id = action["element_id"]
+ if element_id in id2xy:
+ x, y = id2xy[element_id]
+ action["x"] = x
+ action["y"] = y
+ del action["element_id"]
+
+ # Handle start_element_id and end_element_id for drag operations
+ elif "start_element_id" in action and "end_element_id" in action:
+ start_id = action["start_element_id"]
+ end_id = action["end_element_id"]
+ if start_id in id2xy and end_id in id2xy:
+ start_x, start_y = id2xy[start_id]
+ end_x, end_y = id2xy[end_id]
+ action["path"] = [{"x": start_x, "y": start_y}, {"x": end_x, "y": end_y}]
+ del action["start_element_id"]
+ del action["end_element_id"]
+
+ converted_item = item.copy()
+ converted_item["action"] = action
+ final_output.append(converted_item)
+ else:
+ final_output.append(item)
+
+ return {"output": final_output, "usage": usage}
+
  async def predict_click(
- self,
- model: str,
- image_b64: str,
- instruction: str,
- **kwargs
+ self, model: str, image_b64: str, instruction: str, **kwargs
  ) -> Optional[Tuple[float, float]]:
  """
  Predict click coordinates using OmniParser and LLM.
-
+
  Uses OmniParser to annotate the image with element IDs, then uses LLM
  to identify the correct element ID based on the instruction.
  """
  if not OMNIPARSER_AVAILABLE:
  return None
-
+
  # Parse the image with OmniParser to get annotated image and elements
  parser = get_parser()
  result = parser.parse(image_b64)
-
+
  # Extract the LLM model from composed model string
- llm_model = model.split('+')[-1]
-
+ llm_model = model.split("+")[-1]
+
  # Create system prompt for element ID prediction
- SYSTEM_PROMPT = f'''
+ SYSTEM_PROMPT = """
  You are an expert UI element locator. Given a GUI image annotated with numerical IDs over each interactable element, along with a user's element description, provide the ID of the specified element.

  The image shows UI elements with numbered overlays. Each number corresponds to a clickable/interactable element.

  Output only the element ID as a single integer.
- '''.strip()
-
+ """.strip()
+
  # Prepare messages for LLM
  messages = [
- {
- "role": "system",
- "content": SYSTEM_PROMPT
- },
+ {"role": "system", "content": SYSTEM_PROMPT},
  {
  "role": "user",
  "content": [
@@ -391,31 +497,25 @@ Output only the element ID as a single integer.
  "type": "image_url",
  "image_url": {
  "url": f"data:image/png;base64,{result.annotated_image_base64}"
- }
+ },
  },
- {
- "type": "text",
- "text": f"Find the element: {instruction}"
- }
- ]
- }
+ {"type": "text", "text": f"Find the element: {instruction}"},
+ ],
+ },
  ]
-
+
  # Call LLM to predict element ID
  response = await litellm.acompletion(
- model=llm_model,
- messages=messages,
- max_tokens=10,
- temperature=0.1
+ model=llm_model, messages=messages, max_tokens=10, temperature=0.1
  )
-
+
  # Extract element ID from response
- response_text = response.choices[0].message.content.strip() # type: ignore
-
+ response_text = response.choices[0].message.content.strip() # type: ignore
+
  # Try to parse the element ID
  try:
  element_id = int(response_text)
-
+
  # Find the element with this ID and return its center coordinates
  for element in result.elements:
  if element.id == element_id:
@@ -425,9 +525,9 @@ Output only the element ID as a single integer.
  except ValueError:
  # If we can't parse the ID, return None
  pass
-
+
  return None
-
+
  def get_capabilities(self) -> List[AgentCapability]:
  """Return the capabilities supported by this agent."""
  return ["step"]
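For orientation, here is a minimal sketch (not part of the diff) of the calling convention the reworked loop now relies on: the computer tool is declared with the nested {"type": "function", "function": {...}} schema, the request goes through litellm.acompletion rather than litellm.aresponses, and element IDs returned by the model are translated back to pixel coordinates through the id2xy table the loop builds from OmniParser detections. The model name, messages, tool schema (a trimmed-down stand-in for SOM_TOOL_SCHEMA), and the hard-coded id2xy values below are illustrative assumptions.

import asyncio
import json

import litellm

# Trimmed-down stand-in for SOM_TOOL_SCHEMA: the definition is nested under
# "function", the shape litellm.acompletion expects for tool definitions.
COMPUTER_TOOL = {
    "type": "function",
    "function": {
        "name": "computer",
        "description": "Interact with numbered UI elements on an annotated screenshot.",
        "parameters": {
            "type": "object",
            "properties": {
                "action": {"type": "string", "enum": ["screenshot", "click", "type"]},
                "element_id": {"type": "integer"},
                "text": {"type": "string"},
            },
            "required": ["action", "element_id"],
        },
    },
}


async def main() -> None:
    # In the real loop this mapping comes from OmniParser detections scaled to
    # screen pixels; here it is a hand-written example value.
    id2xy = {3: (412, 227)}

    response = await litellm.acompletion(
        model="gpt-4o",  # assumed model; the loop uses whatever follows the "omniparser+" prefix
        messages=[{"role": "user", "content": "Click the Submit button (element 3)."}],
        tools=[COMPUTER_TOOL],
    )

    for call in response.choices[0].message.tool_calls or []:
        args = json.loads(call.function.arguments)  # e.g. {"action": "click", "element_id": 3}
        # Mirror of the loop's post-processing: map the element ID back to pixel coordinates.
        x, y = id2xy.get(args.get("element_id"), (None, None))
        print(args.get("action"), x, y)


asyncio.run(main())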