cua-agent 0.4.34__py3-none-any.whl → 0.4.36__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this version of cua-agent has been flagged as a potentially problematic release.
- agent/__init__.py +4 -10
- agent/__main__.py +2 -1
- agent/adapters/huggingfacelocal_adapter.py +54 -61
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +110 -99
- agent/adapters/models/__init__.py +14 -6
- agent/adapters/models/generic.py +7 -4
- agent/adapters/models/internvl.py +66 -30
- agent/adapters/models/opencua.py +23 -8
- agent/adapters/models/qwen2_5_vl.py +7 -4
- agent/agent.py +184 -158
- agent/callbacks/__init__.py +4 -4
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +18 -13
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +3 -1
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/telemetry.py +67 -61
- agent/callbacks/trajectory_saver.py +90 -70
- agent/cli.py +115 -110
- agent/computers/__init__.py +13 -8
- agent/computers/base.py +32 -19
- agent/computers/cua.py +33 -25
- agent/computers/custom.py +78 -71
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +235 -185
- agent/integrations/hud/__init__.py +15 -21
- agent/integrations/hud/agent.py +101 -83
- agent/integrations/hud/proxy.py +90 -57
- agent/loops/__init__.py +25 -21
- agent/loops/anthropic.py +537 -483
- agent/loops/base.py +13 -14
- agent/loops/composed_grounded.py +135 -149
- agent/loops/gemini.py +31 -12
- agent/loops/glm45v.py +135 -133
- agent/loops/gta1.py +47 -50
- agent/loops/holo.py +4 -2
- agent/loops/internvl.py +6 -11
- agent/loops/moondream3.py +36 -12
- agent/loops/omniparser.py +215 -210
- agent/loops/openai.py +49 -50
- agent/loops/opencua.py +29 -41
- agent/loops/qwen.py +510 -0
- agent/loops/uitars.py +237 -202
- agent/proxy/examples.py +54 -50
- agent/proxy/handlers.py +27 -34
- agent/responses.py +330 -330
- agent/types.py +11 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +23 -18
- agent/ui/gradio/ui_components.py +310 -161
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/METADATA +18 -10
- cua_agent-0.4.36.dist-info/RECORD +64 -0
- cua_agent-0.4.34.dist-info/RECORD +0 -63
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/WHEEL +0 -0
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/entry_points.txt +0 -0
agent/adapters/models/internvl.py
CHANGED
@@ -1,19 +1,22 @@
 from __future__ import annotations
-
+
+from typing import Any, Dict, List, Optional
 
 # Hugging Face imports are local to avoid hard dependency at module import
 try:
-    import
-    from
+    import base64  # type: ignore
+    from io import BytesIO  # type: ignore
+
     # Attempt to import InternVL's model dependencies
     import einops as _  # type: ignore
+    import requests  # type: ignore
     import timm as _  # type: ignore
-
+    import torch  # type: ignore
     import torchvision.transforms as T  # type: ignore
+    from PIL import Image  # type: ignore
     from torchvision.transforms.functional import InterpolationMode  # type: ignore
-    import
-
-    import requests  # type: ignore
+    from transformers import AutoModel, AutoTokenizer  # type: ignore
+
     HF_AVAILABLE = True
 except Exception:
     HF_AVAILABLE = False
@@ -25,10 +28,12 @@ class InternVLModel:
     Provides preprocessing to support multi-turn conversations with multiple images.
     """
 
-    def __init__(
+    def __init__(
+        self, model_name: str, device: str = "auto", trust_remote_code: bool = False
+    ) -> None:
         if not HF_AVAILABLE:
             raise ImportError(
-
+                'InternVL dependencies not found. Install with: pip install "cua-agent[internvl-hf]"'
             )
         self.model_name = model_name
         self.device = device
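For readers skimming the diff: the two hunks above follow this package's usual optional-dependency pattern. The heavy imports run inside a try/except that sets a module-level availability flag, and the constructor raises an ImportError naming the matching pip extra when the flag is false. A minimal sketch of that pattern follows; the names VISION_AVAILABLE, VisionAdapter, and the "my-pkg[vision]" extra are illustrative and not part of cua-agent.

# Minimal sketch of the guarded-import pattern; names here are illustrative only.
try:
    import torch  # noqa: F401  # heavy optional dependency
    from PIL import Image  # noqa: F401

    VISION_AVAILABLE = True
except Exception:
    VISION_AVAILABLE = False


class VisionAdapter:
    def __init__(self, model_name: str) -> None:
        if not VISION_AVAILABLE:
            # Point users at the extra that installs the optional dependencies.
            raise ImportError(
                'Vision dependencies not found. Install with: pip install "my-pkg[vision]"'
            )
        self.model_name = model_name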
@@ -60,16 +65,25 @@ class InternVLModel:
 
     def _build_transform(self, input_size: int) -> T.Compose:
         MEAN, STD = self.IMAGENET_MEAN, self.IMAGENET_STD
-        transform = T.Compose(
-
-
-
-
-
+        transform = T.Compose(
+            [
+                T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
+                T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
+                T.ToTensor(),
+                T.Normalize(mean=MEAN, std=STD),
+            ]
+        )
         return transform
 
-    def _find_closest_aspect_ratio(
-
+    def _find_closest_aspect_ratio(
+        self,
+        aspect_ratio: float,
+        target_ratios: List[tuple],
+        width: int,
+        height: int,
+        image_size: int,
+    ):
+        best_ratio_diff = float("inf")
         best_ratio = (1, 1)
         area = width * height
         for ratio in target_ratios:
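The rebuilt _build_transform above is a standard torchvision pipeline: force RGB, resize to a square tile, convert to a tensor, normalize with ImageNet statistics. A self-contained sketch of applying such a pipeline follows; the mean/std constants are the usual ImageNet values and are assumed to match the class's IMAGENET_MEAN/IMAGENET_STD, and the sample image is generated in memory.

# Sketch of applying a _build_transform-style pipeline to one tile; illustrative only.
import torchvision.transforms as T
from PIL import Image
from torchvision.transforms.functional import InterpolationMode

IMAGENET_MEAN = (0.485, 0.456, 0.406)  # assumed to match the class constants
IMAGENET_STD = (0.229, 0.224, 0.225)

transform = T.Compose(
    [
        T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
        T.Resize((448, 448), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
)

tile = Image.new("RGB", (640, 360), color=(120, 180, 240))  # sample input image
tensor = transform(tile)
print(tensor.shape)  # torch.Size([3, 448, 448])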
@@ -83,17 +97,29 @@ class InternVLModel:
                 best_ratio = ratio
         return best_ratio
 
-    def _dynamic_preprocess(
+    def _dynamic_preprocess(
+        self,
+        image: Image.Image,
+        min_num: int = 1,
+        max_num: int = 12,
+        image_size: int = 448,
+        use_thumbnail: bool = True,
+    ) -> List[Image.Image]:
         orig_width, orig_height = image.size
         aspect_ratio = orig_width / orig_height
 
         target_ratios = set(
-            (i, j)
-
+            (i, j)
+            for n in range(min_num, max_num + 1)
+            for i in range(1, n + 1)
+            for j in range(1, n + 1)
+            if i * j <= max_num and i * j >= min_num
+        )
         target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
 
         target_aspect_ratio = self._find_closest_aspect_ratio(
-            aspect_ratio, target_ratios, orig_width, orig_height, image_size
+            aspect_ratio, target_ratios, orig_width, orig_height, image_size
+        )
 
         target_width = image_size * target_aspect_ratio[0]
         target_height = image_size * target_aspect_ratio[1]
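The _dynamic_preprocess rewrite above keeps the same tiling logic: enumerate candidate (columns, rows) grids within a tile budget and pick the one whose aspect ratio is closest to the image's. A standalone sketch of that selection step follows; the closest_grid function name and the simplified selection (no tie-breaking) are illustrative, while the real method lives on the class as shown in the diff.

# Standalone sketch of the grid-selection idea behind _dynamic_preprocess; illustrative only.
def closest_grid(width: int, height: int, min_num: int = 1, max_num: int = 12) -> tuple:
    aspect = width / height
    candidates = {
        (i, j)
        for n in range(min_num, max_num + 1)
        for i in range(1, n + 1)
        for j in range(1, n + 1)
        if min_num <= i * j <= max_num
    }
    # Pick the (cols, rows) grid whose aspect ratio is closest to the image's.
    return min(candidates, key=lambda r: abs(aspect - r[0] / r[1]))


# A 1920x1080 (16:9) screenshot resolves to a 2:1 grid such as (2, 1) here,
# because an exact 16:9 grid would need more than 12 tiles.
print(closest_grid(1920, 1080))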
@@ -106,7 +132,7 @@ class InternVLModel:
                 (i % (target_width // image_size)) * image_size,
                 (i // (target_width // image_size)) * image_size,
                 ((i % (target_width // image_size)) + 1) * image_size,
-                ((i // (target_width // image_size)) + 1) * image_size
+                ((i // (target_width // image_size)) + 1) * image_size,
             )
             split_img = resized_img.crop(box)
             processed_images.append(split_img)
@@ -122,20 +148,24 @@ class InternVLModel:
             # data URL base64
             header, b64data = src.split(",", 1)
             img_bytes = base64.b64decode(b64data)
-            return Image.open(BytesIO(img_bytes)).convert(
+            return Image.open(BytesIO(img_bytes)).convert("RGB")
         if src.startswith("http://") or src.startswith("https://"):
             resp = requests.get(src, timeout=10)
             resp.raise_for_status()
-            return Image.open(BytesIO(resp.content)).convert(
+            return Image.open(BytesIO(resp.content)).convert("RGB")
         # Assume local file path
-        return Image.open(src).convert(
+        return Image.open(src).convert("RGB")
 
-    def _images_to_pixel_values(
+    def _images_to_pixel_values(
+        self, images: List[Image.Image], input_size: int = 448, max_num: int = 12
+    ):
         transform = self._build_transform(input_size=input_size)
         pixel_values_list = []
         num_patches_list: List[int] = []
         for img in images:
-            tiles = self._dynamic_preprocess(
+            tiles = self._dynamic_preprocess(
+                img, image_size=input_size, use_thumbnail=True, max_num=max_num
+            )
             pv = [transform(tile) for tile in tiles]
             pv = torch.stack(pv)
             num_patches_list.append(pv.shape[0])
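The image-loading hunk above handles three sources: base64 data URLs, http(s) URLs, and local paths. A small self-contained sketch of the data-URL branch follows, using an in-memory image as sample input; the sample image and variable names are illustrative.

# Sketch of decoding a base64 data URL into a PIL image, as in the hunk above.
import base64
from io import BytesIO

from PIL import Image

# Build a sample data URL from an in-memory image (illustrative input).
buf = BytesIO()
Image.new("RGB", (8, 8), color=(200, 30, 30)).save(buf, format="PNG")
data_url = "data:image/png;base64," + base64.b64encode(buf.getvalue()).decode("ascii")

# Mirror the decoding branch: strip the header, decode, force RGB.
header, b64data = data_url.split(",", 1)
img = Image.open(BytesIO(base64.b64decode(b64data))).convert("RGB")
print(img.size)  # (8, 8)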
@@ -191,7 +221,9 @@ class InternVLModel:
                 last_user_text_parts = parts_text or last_user_text_parts
             elif role == "assistant":
                 # Only keep text content for history
-                parts_text = [
+                parts_text = [
+                    item.get("text", "") for item in content_items if item.get("type") == "text"
+                ]
                 text = "\n".join(parts_text).strip()
                 if text:
                     context_lines.append(f"Assistant: {text}")
@@ -200,7 +232,9 @@ class InternVLModel:
         pixel_values = None
         num_patches_list: List[int] = []
         if all_images:
-            pixel_values, num_patches_list = self._images_to_pixel_values(
+            pixel_values, num_patches_list = self._images_to_pixel_values(
+                all_images, input_size=448, max_num=12
+            )
         if pixel_values is not None:
             # Convert dtype/device as in docs
             pixel_values = pixel_values.to(torch.bfloat16)
@@ -246,7 +280,9 @@ class InternVLModel:
                     num_patches_list=num_patches_list,
                 )
             else:
-                response = self.model.chat(
+                response = self.model.chat(
+                    self.tokenizer, pixel_values, question, generation_config
+                )
         except Exception as e:
             # Fallback: return empty string to avoid crashing the adapter
             return ""
agent/adapters/models/opencua.py
CHANGED
@@ -1,13 +1,18 @@
-from typing import List, Dict, Any
-import re
 import base64
+import re
 from io import BytesIO
+from typing import Any, Dict, List
 
 try:
+    import blobfile as _  # assert blobfile is installed
     import torch  # type: ignore
-    from transformers import AutoTokenizer, AutoModel, AutoImageProcessor  # type: ignore
     from PIL import Image  # type: ignore
-
+    from transformers import (  # type: ignore
+        AutoImageProcessor,
+        AutoModel,
+        AutoTokenizer,
+    )
+
     OPENCUA_AVAILABLE = True
 except Exception:
     OPENCUA_AVAILABLE = False
@@ -16,10 +21,12 @@ except Exception:
 class OpenCUAModel:
     """OpenCUA model handler using AutoTokenizer, AutoModel and AutoImageProcessor."""
 
-    def __init__(
+    def __init__(
+        self, model_name: str, device: str = "auto", trust_remote_code: bool = False
+    ) -> None:
         if not OPENCUA_AVAILABLE:
             raise ImportError(
-
+                'OpenCUA requirements not found. Install with: pip install "cua-agent[opencua-hf]"'
             )
         self.model_name = model_name
         self.device = device
@@ -56,7 +63,11 @@ class OpenCUAModel:
         return ""
 
     def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 512) -> str:
-        assert
+        assert (
+            self.model is not None
+            and self.tokenizer is not None
+            and self.image_processor is not None
+        )
 
         # Tokenize text side using chat template
         input_ids = self.tokenizer.apply_chat_template(
@@ -74,7 +85,11 @@ class OpenCUAModel:
         pixel_values = torch.tensor(image_info["pixel_values"]).to(
             dtype=torch.bfloat16, device=self.model.device
         )
-        grid_thws =
+        grid_thws = (
+            torch.tensor(image_info["image_grid_thw"])
+            if "image_grid_thw" in image_info
+            else None
+        )
 
         gen_kwargs: Dict[str, Any] = {
             "max_new_tokens": max_new_tokens,
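The grid_thws change above wraps the optional image_grid_thw lookup in a conditional expression so processors that omit the key do not raise. A toy sketch of that optional-field guard follows; the image_info dict is fabricated sample data, not real processor output.

# Toy sketch of guarding an optional key before building a tensor; sample data only.
import torch

image_info = {"pixel_values": [[0.0, 0.5, 1.0]], "image_grid_thw": [[1, 2, 2]]}

pixel_values = torch.tensor(image_info["pixel_values"])
grid_thws = (
    torch.tensor(image_info["image_grid_thw"])
    if "image_grid_thw" in image_info
    else None
)
print(pixel_values.shape, None if grid_thws is None else grid_thws.shape)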
agent/adapters/models/qwen2_5_vl.py
CHANGED
@@ -1,9 +1,10 @@
-from typing import
+from typing import Any, Dict, List, Optional
 
 # Hugging Face imports are local to avoid hard dependency at module import
 try:
     import torch  # type: ignore
     from transformers import AutoModelForImageTextToText, AutoProcessor  # type: ignore
+
     HF_AVAILABLE = True
 except Exception:
     HF_AVAILABLE = False
@@ -14,10 +15,12 @@ class Qwen2_5_VLModel:
     Loads an AutoModelForImageTextToText and AutoProcessor and generates text.
     """
 
-    def __init__(
+    def __init__(
+        self, model_name: str, device: str = "auto", trust_remote_code: bool = False
+    ) -> None:
         if not HF_AVAILABLE:
             raise ImportError(
-
+                'HuggingFace transformers dependencies not found. Install with: pip install "cua-agent[uitars-hf]"'
             )
         self.model_name = model_name
         self.device = device
@@ -64,7 +67,7 @@ class Qwen2_5_VLModel:
         generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
         # Trim prompt tokens from output
         generated_ids_trimmed = [
-            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+            out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
         ]
         # Decode
         output_text = self.processor.batch_decode(
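The last hunk only reformats the slice (out_ids[len(in_ids) :]) that drops prompt tokens from each generated sequence before decoding. A toy illustration of that trimming step follows, using made-up token ids and no model or processor.

# Toy illustration of trimming prompt tokens from generated sequences; token ids are made up.
import torch

input_ids = [torch.tensor([101, 7592]), torch.tensor([101, 2129, 2024])]
generated_ids = [
    torch.tensor([101, 7592, 999, 102]),
    torch.tensor([101, 2129, 2024, 2204, 102]),
]

# Keep only the newly generated tail of each sequence, as in the diff.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(input_ids, generated_ids)
]
print([t.tolist() for t in generated_ids_trimmed])  # [[999, 102], [2204, 102]]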