cua-agent 0.4.34__py3-none-any.whl → 0.4.36__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of cua-agent might be problematic.
- agent/__init__.py +4 -10
- agent/__main__.py +2 -1
- agent/adapters/huggingfacelocal_adapter.py +54 -61
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +110 -99
- agent/adapters/models/__init__.py +14 -6
- agent/adapters/models/generic.py +7 -4
- agent/adapters/models/internvl.py +66 -30
- agent/adapters/models/opencua.py +23 -8
- agent/adapters/models/qwen2_5_vl.py +7 -4
- agent/agent.py +184 -158
- agent/callbacks/__init__.py +4 -4
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +18 -13
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +3 -1
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/telemetry.py +67 -61
- agent/callbacks/trajectory_saver.py +90 -70
- agent/cli.py +115 -110
- agent/computers/__init__.py +13 -8
- agent/computers/base.py +32 -19
- agent/computers/cua.py +33 -25
- agent/computers/custom.py +78 -71
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +235 -185
- agent/integrations/hud/__init__.py +15 -21
- agent/integrations/hud/agent.py +101 -83
- agent/integrations/hud/proxy.py +90 -57
- agent/loops/__init__.py +25 -21
- agent/loops/anthropic.py +537 -483
- agent/loops/base.py +13 -14
- agent/loops/composed_grounded.py +135 -149
- agent/loops/gemini.py +31 -12
- agent/loops/glm45v.py +135 -133
- agent/loops/gta1.py +47 -50
- agent/loops/holo.py +4 -2
- agent/loops/internvl.py +6 -11
- agent/loops/moondream3.py +36 -12
- agent/loops/omniparser.py +215 -210
- agent/loops/openai.py +49 -50
- agent/loops/opencua.py +29 -41
- agent/loops/qwen.py +510 -0
- agent/loops/uitars.py +237 -202
- agent/proxy/examples.py +54 -50
- agent/proxy/handlers.py +27 -34
- agent/responses.py +330 -330
- agent/types.py +11 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +23 -18
- agent/ui/gradio/ui_components.py +310 -161
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/METADATA +18 -10
- cua_agent-0.4.36.dist-info/RECORD +64 -0
- cua_agent-0.4.34.dist-info/RECORD +0 -63
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/WHEEL +0 -0
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/entry_points.txt +0 -0
agent/adapters/mlxvlm_adapter.py
CHANGED
@@ -1,24 +1,26 @@
 import asyncio
+import base64
 import functools
-import warnings
 import io
-import base64
 import math
 import re
+import warnings
 from concurrent.futures import ThreadPoolExecutor
-from typing import
-
-from litellm
+from typing import Any, AsyncIterator, Dict, Iterator, List, Optional, Tuple, cast
+
+from litellm import acompletion, completion
 from litellm.llms.custom_llm import CustomLLM
-from litellm import
+from litellm.types.utils import GenericStreamingChunk, ModelResponse
+from PIL import Image
 
 # Try to import MLX dependencies
 try:
     import mlx.core as mx
-    from mlx_vlm import
+    from mlx_vlm import generate, load
     from mlx_vlm.prompt_utils import apply_chat_template
     from mlx_vlm.utils import load_config
     from transformers.tokenization_utils import PreTrainedTokenizer
+
     MLX_AVAILABLE = True
 except ImportError:
     MLX_AVAILABLE = False
@@ -29,20 +31,28 @@ MIN_PIXELS = 100 * 28 * 28
 MAX_PIXELS = 16384 * 28 * 28
 MAX_RATIO = 200
 
+
 def round_by_factor(number: float, factor: int) -> int:
     """Returns the closest integer to 'number' that is divisible by 'factor'."""
     return round(number / factor) * factor
 
+
 def ceil_by_factor(number: float, factor: int) -> int:
     """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
     return math.ceil(number / factor) * factor
 
+
 def floor_by_factor(number: float, factor: int) -> int:
     """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
     return math.floor(number / factor) * factor
 
+
 def smart_resize(
-    height: int,
+    height: int,
+    width: int,
+    factor: int = IMAGE_FACTOR,
+    min_pixels: int = MIN_PIXELS,
+    max_pixels: int = MAX_PIXELS,
 ) -> tuple[int, int]:
     """
     Rescales the image so that the following conditions are met:
@@ -70,61 +80,62 @@ def smart_resize(
 
 class MLXVLMAdapter(CustomLLM):
     """MLX VLM Adapter for running vision-language models locally using MLX."""
-
+
     def __init__(self, **kwargs):
         """Initialize the adapter.
-
+
         Args:
             **kwargs: Additional arguments
         """
         super().__init__()
-
+
         self.models = {}  # Cache for loaded models
         self.processors = {}  # Cache for loaded processors
         self.configs = {}  # Cache for loaded configs
         self._executor = ThreadPoolExecutor(max_workers=1)  # Single thread pool
-
+
     def _load_model_and_processor(self, model_name: str):
         """Load model and processor if not already cached.
-
+
         Args:
             model_name: Name of the model to load
-
+
         Returns:
             Tuple of (model, processor, config)
         """
         if not MLX_AVAILABLE:
             raise ImportError("MLX VLM dependencies not available. Please install mlx-vlm.")
-
+
         if model_name not in self.models:
             # Load model and processor
             model_obj, processor = load(
-                model_name,
-                processor_kwargs={"min_pixels": MIN_PIXELS, "max_pixels": MAX_PIXELS}
+                model_name, processor_kwargs={"min_pixels": MIN_PIXELS, "max_pixels": MAX_PIXELS}
             )
             config = load_config(model_name)
-
+
             # Cache them
             self.models[model_name] = model_obj
             self.processors[model_name] = processor
             self.configs[model_name] = config
-
+
         return self.models[model_name], self.processors[model_name], self.configs[model_name]
-
-    def _process_coordinates(
+
+    def _process_coordinates(
+        self, text: str, original_size: Tuple[int, int], model_size: Tuple[int, int]
+    ) -> str:
         """Process coordinates in box tokens based on image resizing using smart_resize approach.
-
+
         Args:
             text: Text containing box tokens
            original_size: Original image size (width, height)
            model_size: Model processed image size (width, height)
-
+
         Returns:
             Text with processed coordinates
         """
         # Find all box tokens
         box_pattern = r"<\|box_start\|>\((\d+),\s*(\d+)\)<\|box_end\|>"
-
+
         def process_coords(match):
             model_x, model_y = int(match.group(1)), int(match.group(2))
             # Scale coordinates from model space to original image space
@@ -132,15 +143,20 @@ class MLXVLMAdapter(CustomLLM):
             new_x = int(model_x * original_size[0] / model_size[0])  # Width
             new_y = int(model_y * original_size[1] / model_size[1])  # Height
             return f"<|box_start|>({new_x},{new_y})<|box_end|>"
-
+
         return re.sub(box_pattern, process_coords, text)
-
-    def _convert_messages(self, messages: List[Dict[str, Any]]) -> Tuple[
+
+    def _convert_messages(self, messages: List[Dict[str, Any]]) -> Tuple[
+        List[Dict[str, Any]],
+        List[Image.Image],
+        Dict[int, Tuple[int, int]],
+        Dict[int, Tuple[int, int]],
+    ]:
         """Convert OpenAI format messages to MLX VLM format and extract images.
-
+
         Args:
             messages: Messages in OpenAI format
-
+
         Returns:
             Tuple of (processed_messages, images, original_sizes, model_sizes)
         """
@@ -149,13 +165,10 @@ class MLXVLMAdapter(CustomLLM):
         original_sizes = {}  # Track original sizes of images for coordinate mapping
         model_sizes = {}  # Track model processed sizes
         image_index = 0
-
+
         for message in messages:
-            processed_message = {
-                "role": message["role"],
-                "content": []
-            }
-
+            processed_message = {"role": message["role"], "content": []}
+
             content = message.get("content", [])
             if isinstance(content, str):
                 # Simple text content
@@ -165,164 +178,163 @@ class MLXVLMAdapter(CustomLLM):
                 processed_content = []
                 for item in content:
                     if item.get("type") == "text":
-                        processed_content.append({
-                            "type": "text",
-                            "text": item.get("text", "")
-                        })
+                        processed_content.append({"type": "text", "text": item.get("text", "")})
                     elif item.get("type") == "image_url":
                         image_url = item.get("image_url", {}).get("url", "")
                         pil_image = None
-
+
                         if image_url.startswith("data:image/"):
                             # Extract base64 data
-                            base64_data = image_url.split(
+                            base64_data = image_url.split(",")[1]
                             # Convert base64 to PIL Image
                             image_data = base64.b64decode(base64_data)
                             pil_image = Image.open(io.BytesIO(image_data))
                         else:
                             # Handle file path or URL
                             pil_image = Image.open(image_url)
-
+
                         # Store original image size for coordinate mapping
                         original_size = pil_image.size
                         original_sizes[image_index] = original_size
-
+
                         # Use smart_resize to determine model size
                         # Note: smart_resize expects (height, width) but PIL gives (width, height)
                         height, width = original_size[1], original_size[0]
                         new_height, new_width = smart_resize(height, width)
                         # Store model size in (width, height) format for consistent coordinate processing
                         model_sizes[image_index] = (new_width, new_height)
-
+
                         # Resize the image using the calculated dimensions from smart_resize
                         resized_image = pil_image.resize((new_width, new_height))
                         images.append(resized_image)
-
+
                         # Add image placeholder to content
-                        processed_content.append({
-                            "type": "image"
-                        })
-
+                        processed_content.append({"type": "image"})
+
                         image_index += 1
-
+
                 processed_message["content"] = processed_content
-
+
             processed_messages.append(processed_message)
-
+
         return processed_messages, images, original_sizes, model_sizes
-
+
     def _generate(self, **kwargs) -> str:
         """Generate response using the local MLX VLM model.
-
+
         Args:
             **kwargs: Keyword arguments containing messages and model info
-
+
         Returns:
             Generated text response
         """
-        messages = kwargs.get(
-        model_name = kwargs.get(
-        max_tokens = kwargs.get(
-
+        messages = kwargs.get("messages", [])
+        model_name = kwargs.get("model", "mlx-community/UI-TARS-1.5-7B-4bit")
+        max_tokens = kwargs.get("max_tokens", 128)
+
         # Warn about ignored kwargs
-        ignored_kwargs = set(kwargs.keys()) - {
+        ignored_kwargs = set(kwargs.keys()) - {"messages", "model", "max_tokens"}
         if ignored_kwargs:
             warnings.warn(f"Ignoring unsupported kwargs: {ignored_kwargs}")
-
+
         # Load model and processor
         model, processor, config = self._load_model_and_processor(model_name)
-
+
         # Convert messages and extract images
         processed_messages, images, original_sizes, model_sizes = self._convert_messages(messages)
-
+
         # Process user text input with box coordinates after image processing
         # Swap original_size and model_size arguments for inverse transformation
         for msg_idx, msg in enumerate(processed_messages):
             if msg.get("role") == "user" and isinstance(msg.get("content"), str):
                 content = msg.get("content", "")
-                if
+                if (
+                    "<|box_start|>" in content
+                    and original_sizes
+                    and model_sizes
+                    and 0 in original_sizes
+                    and 0 in model_sizes
+                ):
                     orig_size = original_sizes[0]
                     model_size = model_sizes[0]
                     # Swap arguments to perform inverse transformation for user input
-                    processed_messages[msg_idx]["content"] = self._process_coordinates(
-                        content, model_size, orig_size)
+                    processed_messages[msg_idx]["content"] = self._process_coordinates(
+                        content, model_size, orig_size
+                    )
+
         try:
             # Format prompt according to model requirements using the processor directly
             prompt = processor.apply_chat_template(
-                processed_messages,
-                tokenize=False,
-                add_generation_prompt=True,
-                return_tensors='pt'
+                processed_messages, tokenize=False, add_generation_prompt=True, return_tensors="pt"
             )
             tokenizer = cast(PreTrainedTokenizer, processor)
-
+
             # Generate response
             text_content, usage = generate(
-                model,
-                tokenizer,
-                str(prompt),
-                images,
+                model,
+                tokenizer,
+                str(prompt),
+                images,  # type: ignore
                 verbose=False,
-                max_tokens=max_tokens
+                max_tokens=max_tokens,
             )
-
+
         except Exception as e:
             raise RuntimeError(f"Error generating response: {str(e)}") from e
-
+
         # Process coordinates in the response back to original image space
         if original_sizes and model_sizes and 0 in original_sizes and 0 in model_sizes:
             # Get original image size and model size (using the first image)
             orig_size = original_sizes[0]
             model_size = model_sizes[0]
-
+
             # Check if output contains box tokens that need processing
             if "<|box_start|>" in text_content:
                 # Process coordinates from model space back to original image space
                 text_content = self._process_coordinates(text_content, orig_size, model_size)
-
+
         return text_content
-
+
     def completion(self, *args, **kwargs) -> ModelResponse:
         """Synchronous completion method.
-
+
         Returns:
             ModelResponse with generated text
         """
         generated_text = self._generate(**kwargs)
-
+
         result = completion(
             model=f"mlx/{kwargs.get('model', 'mlx-community/UI-TARS-1.5-7B-4bit')}",
             mock_response=generated_text,
         )
         return cast(ModelResponse, result)
-
+
     async def acompletion(self, *args, **kwargs) -> ModelResponse:
         """Asynchronous completion method.
-
+
         Returns:
             ModelResponse with generated text
         """
         # Run _generate in thread pool to avoid blocking
         loop = asyncio.get_event_loop()
         generated_text = await loop.run_in_executor(
-            self._executor,
-            functools.partial(self._generate, **kwargs)
+            self._executor, functools.partial(self._generate, **kwargs)
         )
-
+
         result = await acompletion(
             model=f"mlx/{kwargs.get('model', 'mlx-community/UI-TARS-1.5-7B-4bit')}",
             mock_response=generated_text,
         )
         return cast(ModelResponse, result)
-
+
     def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]:
         """Synchronous streaming method.
-
+
         Returns:
             Iterator of GenericStreamingChunk
         """
         generated_text = self._generate(**kwargs)
-
+
         generic_streaming_chunk: GenericStreamingChunk = {
             "finish_reason": "stop",
             "index": 0,
@@ -331,22 +343,21 @@ class MLXVLMAdapter(CustomLLM):
             "tool_use": None,
             "usage": {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0},
         }
-
+
         yield generic_streaming_chunk
-
+
     async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]:
         """Asynchronous streaming method.
-
+
         Returns:
             AsyncIterator of GenericStreamingChunk
         """
         # Run _generate in thread pool to avoid blocking
         loop = asyncio.get_event_loop()
         generated_text = await loop.run_in_executor(
-            self._executor,
-            functools.partial(self._generate, **kwargs)
+            self._executor, functools.partial(self._generate, **kwargs)
        )
-
+
         generic_streaming_chunk: GenericStreamingChunk = {
             "finish_reason": "stop",
             "index": 0,
@@ -355,5 +366,5 @@ class MLXVLMAdapter(CustomLLM):
             "tool_use": None,
             "usage": {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0},
         }
-
-        yield generic_streaming_chunk
+
+        yield generic_streaming_chunk
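The coordinate handling in this adapter comes down to proportional scaling: smart_resize picks the model-space image size, and _process_coordinates multiplies each <|box_start|>(x,y)<|box_end|> point by the ratio between the target and source dimensions. A minimal sketch of that mapping (scale_point is an illustrative helper, not part of the package):

def scale_point(x: int, y: int, src_size: tuple[int, int], dst_size: tuple[int, int]) -> tuple[int, int]:
    # src_size and dst_size are (width, height) pairs, matching _process_coordinates
    return int(x * dst_size[0] / src_size[0]), int(y * dst_size[1] / src_size[1])

# Example: a model-space point (500, 350) on a 1000x700 resized image maps back
# to (960, 540) on the original 1920x1080 screenshot.
print(scale_point(500, 350, (1000, 700), (1920, 1080)))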
agent/adapters/models/__init__.py
CHANGED

@@ -2,32 +2,40 @@ from typing import Optional
 
 try:
     from transformers import AutoConfig
+
     HF_AVAILABLE = True
 except ImportError:
     HF_AVAILABLE = False
 
 from .generic import GenericHFModel
+from .internvl import InternVLModel
 from .opencua import OpenCUAModel
 from .qwen2_5_vl import Qwen2_5_VLModel
-
+
 
 def load_model(model_name: str, device: str = "auto", trust_remote_code: bool = False):
     """Factory function to load and return the right model handler instance.
-
+
     - If the underlying transformers config class matches OpenCUA, return OpenCUAModel
     - Otherwise, return GenericHFModel
     """
     if not HF_AVAILABLE:
         raise ImportError(
-
+            'HuggingFace transformers dependencies not found. Install with: pip install "cua-agent[uitars-hf]"'
         )
     cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code)
     cls = cfg.__class__.__name__
     print(f"cls: {cls}")
     if "OpenCUA" in cls:
-        return OpenCUAModel(
+        return OpenCUAModel(
+            model_name=model_name, device=device, trust_remote_code=trust_remote_code
+        )
     elif "Qwen2_5_VL" in cls:
-        return Qwen2_5_VLModel(
+        return Qwen2_5_VLModel(
+            model_name=model_name, device=device, trust_remote_code=trust_remote_code
+        )
     elif "InternVL" in cls:
-        return InternVLModel(
+        return InternVLModel(
+            model_name=model_name, device=device, trust_remote_code=trust_remote_code
+        )
     return GenericHFModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code)
agent/adapters/models/generic.py
CHANGED
@@ -1,9 +1,10 @@
-from typing import
+from typing import Any, Dict, List, Optional
 
 # Hugging Face imports are local to avoid hard dependency at module import
 try:
     import torch  # type: ignore
     from transformers import AutoModel, AutoProcessor  # type: ignore
+
     HF_AVAILABLE = True
 except Exception:
     HF_AVAILABLE = False
@@ -14,10 +15,12 @@ class GenericHFModel:
     Loads an AutoModelForImageTextToText and AutoProcessor and generates text.
     """
 
-    def __init__(
+    def __init__(
+        self, model_name: str, device: str = "auto", trust_remote_code: bool = False
+    ) -> None:
         if not HF_AVAILABLE:
             raise ImportError(
-
+                'HuggingFace transformers dependencies not found. Install with: pip install "cua-agent[uitars-hf]"'
             )
         self.model_name = model_name
         self.device = device
@@ -64,7 +67,7 @@ class GenericHFModel:
         generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
         # Trim prompt tokens from output
         generated_ids_trimmed = [
-            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+            out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
         ]
         # Decode
         output_text = self.processor.batch_decode(