PyPI - cua-agent - Versions diffs - 0.4.14__py3-none-any.whl → 0.7.16__py3-none-any.whl - Mend

cua-agent 0.4.14__py3-none-any.whl → 0.7.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cua-agent might be problematic. Click here for more details.

Files changed (82)

agent/__init__.py +4 -19
agent/__main__.py +2 -1
agent/adapters/__init__.py +6 -0
agent/adapters/azure_ml_adapter.py +283 -0
agent/adapters/cua_adapter.py +161 -0
agent/adapters/huggingfacelocal_adapter.py +67 -125
agent/adapters/human_adapter.py +116 -114
agent/adapters/mlxvlm_adapter.py +370 -0
agent/adapters/models/__init__.py +41 -0
agent/adapters/models/generic.py +78 -0
agent/adapters/models/internvl.py +290 -0
agent/adapters/models/opencua.py +115 -0
agent/adapters/models/qwen2_5_vl.py +78 -0
agent/agent.py +431 -241
agent/callbacks/__init__.py +10 -3
agent/callbacks/base.py +45 -31
agent/callbacks/budget_manager.py +22 -10
agent/callbacks/image_retention.py +54 -98
agent/callbacks/logging.py +55 -42
agent/callbacks/operator_validator.py +140 -0
agent/callbacks/otel.py +291 -0
agent/callbacks/pii_anonymization.py +19 -16
agent/callbacks/prompt_instructions.py +47 -0
agent/callbacks/telemetry.py +106 -69
agent/callbacks/trajectory_saver.py +178 -70
agent/cli.py +269 -119
agent/computers/__init__.py +14 -9
agent/computers/base.py +32 -19
agent/computers/cua.py +52 -25
agent/computers/custom.py +78 -71
agent/decorators.py +23 -14
agent/human_tool/__init__.py +2 -7
agent/human_tool/__main__.py +6 -2
agent/human_tool/server.py +48 -37
agent/human_tool/ui.py +359 -235
agent/integrations/hud/__init__.py +164 -74
agent/integrations/hud/agent.py +338 -342
agent/integrations/hud/proxy.py +297 -0
agent/loops/__init__.py +44 -14
agent/loops/anthropic.py +590 -492
agent/loops/base.py +19 -15
agent/loops/composed_grounded.py +142 -144
agent/loops/fara/__init__.py +8 -0
agent/loops/fara/config.py +506 -0
agent/loops/fara/helpers.py +357 -0
agent/loops/fara/schema.py +143 -0
agent/loops/gelato.py +183 -0
agent/loops/gemini.py +935 -0
agent/loops/generic_vlm.py +601 -0
agent/loops/glm45v.py +140 -135
agent/loops/gta1.py +48 -51
agent/loops/holo.py +218 -0
agent/loops/internvl.py +180 -0
agent/loops/moondream3.py +493 -0
agent/loops/omniparser.py +326 -226
agent/loops/openai.py +63 -56
agent/loops/opencua.py +134 -0
agent/loops/uiins.py +175 -0
agent/loops/uitars.py +262 -212
agent/loops/uitars2.py +951 -0
agent/playground/__init__.py +5 -0
agent/playground/server.py +301 -0
agent/proxy/examples.py +196 -0
agent/proxy/handlers.py +255 -0
agent/responses.py +486 -339
agent/tools/__init__.py +24 -0
agent/tools/base.py +253 -0
agent/tools/browser_tool.py +423 -0
agent/types.py +20 -5
agent/ui/__init__.py +1 -1
agent/ui/__main__.py +1 -1
agent/ui/gradio/app.py +25 -22
agent/ui/gradio/ui_components.py +314 -167
cua_agent-0.7.16.dist-info/METADATA +85 -0
cua_agent-0.7.16.dist-info/RECORD +79 -0
{cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
agent/integrations/hud/adapter.py +0 -121
agent/integrations/hud/computer_handler.py +0 -187
agent/telemetry.py +0 -142
cua_agent-0.4.14.dist-info/METADATA +0 -436
cua_agent-0.4.14.dist-info/RECORD +0 -50
{cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0

agent/callbacks/trajectory_saver.py CHANGED Viewed

@@ -2,24 +2,28 @@
 Trajectory saving callback handler for ComputerAgent.
 """
-import os
+import base64
+import io
 import json
+import os
 import uuid
+from copy import deepcopy
 from datetime import datetime
-import base64
 from pathlib import Path
-from typing import List, Dict, Any, Optional, Union, override
+from typing import Any, Dict, List, Optional, Union, override
 from PIL import Image, ImageDraw
-import io
 from .base import AsyncCallbackHandler
 def sanitize_image_urls(data: Any) -> Any:
     """
     Recursively search for 'image_url' keys and set their values to '[omitted]'.
     Args:
         data: Any data structure (dict, list, or primitive type)
     Returns:
         A deep copy of the data with all 'image_url' values replaced with '[omitted]'
     """
@@ -33,28 +37,91 @@ def sanitize_image_urls(data: Any) -> Any:
                 # Recursively sanitize the value
                 sanitized[key] = sanitize_image_urls(value)
         return sanitized
     elif isinstance(data, list):
         # Recursively sanitize each item in the list
         return [sanitize_image_urls(item) for item in data]
     else:
         # For primitive types (str, int, bool, None, etc.), return as-is
         return data
+def extract_computer_call_outputs(
+    items: List[Dict[str, Any]], screenshot_dir: Optional[Path]
+) -> List[Dict[str, Any]]:
+    """
+    Save any base64-encoded screenshots from computer_call_output entries to files and
+    replace their image_url with the saved file path when a call_id is present.
+    Only operates if screenshot_dir is provided and exists; otherwise returns items unchanged.
+    Args:
+        items: List of message/result dicts potentially containing computer_call_output entries
+        screenshot_dir: Directory to write screenshots into
+    Returns:
+        A new list with updated image_url fields when applicable.
+    """
+    if not items:
+        return items
+    if not screenshot_dir or not screenshot_dir.exists():
+        return items
+    updated: List[Dict[str, Any]] = []
+    for item in items:
+        # work on a shallow copy; deep copy nested 'output' if we modify it
+        msg = dict(item)
+        try:
+            if msg.get("type") == "computer_call_output":
+                call_id = msg.get("call_id")
+                output = msg.get("output", {})
+                image_url = output.get("image_url")
+                if call_id and isinstance(image_url, str) and image_url.startswith("data:"):
+                    # derive extension from MIME type e.g. data:image/png;base64,
+                    try:
+                        ext = image_url.split(";", 1)[0].split("/")[-1]
+                        if not ext:
+                            ext = "png"
+                    except Exception:
+                        ext = "png"
+                    out_path = screenshot_dir / f"{call_id}.{ext}"
+                    # write file if it doesn't exist
+                    if not out_path.exists():
+                        try:
+                            b64_payload = image_url.split(",", 1)[1]
+                            img_bytes = base64.b64decode(b64_payload)
+                            out_path.parent.mkdir(parents=True, exist_ok=True)
+                            with open(out_path, "wb") as f:
+                                f.write(img_bytes)
+                        except Exception:
+                            # if anything fails, skip modifying this message
+                            pass
+                    # update image_url to file path
+                    new_output = dict(output)
+                    new_output["image_url"] = str(out_path)
+                    msg["output"] = new_output
+        except Exception:
+            # do not block on malformed entries; keep original
+            pass
+        updated.append(msg)
+    return updated
 class TrajectorySaverCallback(AsyncCallbackHandler):
     """
     Callback handler that saves agent trajectories to disk.
     Saves each run as a separate trajectory with unique ID, and each turn
     within the trajectory gets its own folder with screenshots and responses.
     """
-    def __init__(self, trajectory_dir: str, reset_on_run: bool = True):
+    def __init__(
+        self, trajectory_dir: str, reset_on_run: bool = True, screenshot_dir: Optional[str] = None
+    ):
         """
         Initialize trajectory saver.
         Args:
             trajectory_dir: Base directory to save trajectories
             reset_on_run: If True, reset trajectory_id/turn/artifact on each run.
@@ -67,15 +134,21 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
         self.model: Optional[str] = None
         self.total_usage: Dict[str, Any] = {}
         self.reset_on_run = reset_on_run
+        # Optional directory to store extracted screenshots from metadata/new_items
+        self.screenshot_dir: Optional[Path] = Path(screenshot_dir) if screenshot_dir else None
         # Ensure trajectory directory exists
         self.trajectory_dir.mkdir(parents=True, exist_ok=True)
+        # Ensure screenshot directory exists if specified
+        if self.screenshot_dir:
+            self.screenshot_dir.mkdir(parents=True, exist_ok=True)
     def _get_turn_dir(self) -> Path:
         """Get the directory for the current turn."""
         if not self.trajectory_id:
             raise ValueError("Trajectory not initialized - call _on_run_start first")
         # format: trajectory_id/turn_000
         turn_dir = self.trajectory_dir / self.trajectory_id / f"turn_{self.current_turn:03d}"
         turn_dir.mkdir(parents=True, exist_ok=True)
@@ -94,12 +167,17 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
             # format: turn_000/0000_name.json
             artifact_filename = f"{self.current_artifact:04d}_{name}"
             artifact_path = turn_dir / f"{artifact_filename}.json"
+            # add created_at
+            if isinstance(artifact, dict):
+                artifact = artifact.copy()
+                artifact["created_at"] = str(uuid.uuid1().time)
             with open(artifact_path, "w") as f:
                 json.dump(sanitize_image_urls(artifact), f, indent=2)
         self.current_artifact += 1
     def _update_usage(self, usage: Dict[str, Any]) -> None:
         """Update total usage statistics."""
         def add_dicts(target: Dict[str, Any], source: Dict[str, Any]) -> None:
             for key, value in source.items():
                 if isinstance(value, dict):
@@ -110,18 +188,21 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
                     if key not in target:
                         target[key] = 0
                     target[key] += value
         add_dicts(self.total_usage, usage)
     @override
     async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None:
         """Initialize trajectory tracking for a new run."""
         model = kwargs.get("model", "unknown")
         # Only reset trajectory state if reset_on_run is True or no trajectory exists
         if self.reset_on_run or not self.trajectory_id:
             model_name_short = model.split("+")[-1].split("/")[-1].lower()[:16]
             if "+" in model:
                 model_name_short = model.split("+")[0].lower()[:4] + "_" + model_name_short
+            # strip non-alphanumeric characters from model_name_short
+            model_name_short = "".join(c for c in model_name_short if c.isalnum() or c == "_")
             # id format: yyyy-mm-dd_model_hhmmss_uuid[:4]
             now = datetime.now()
@@ -130,19 +211,28 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
             self.current_artifact = 0
             self.model = model
             self.total_usage = {}
             # Create trajectory directory
             trajectory_path = self.trajectory_dir / self.trajectory_id
             trajectory_path.mkdir(parents=True, exist_ok=True)
-            # Save trajectory metadata
+            # Save trajectory metadata (optionally extract screenshots to screenshot_dir)
+            kwargs_to_save = kwargs.copy()
+            try:
+                if "messages" in kwargs_to_save:
+                    kwargs_to_save["messages"] = extract_computer_call_outputs(
+                        kwargs_to_save["messages"], self.screenshot_dir
+                    )
+            except Exception:
+                # If extraction fails, fall back to original messages
+                pass
             metadata = {
                 "trajectory_id": self.trajectory_id,
                 "created_at": str(uuid.uuid1().time),
                 "status": "running",
-                "kwargs": kwargs,
+                "kwargs": kwargs_to_save,
             }
             with open(trajectory_path / "metadata.json", "w") as f:
                 json.dump(metadata, f, indent=2)
         else:
@@ -150,49 +240,63 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
             self.model = model
     @override
-    async def on_run_end(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> None:
+    async def on_run_end(
+        self,
+        kwargs: Dict[str, Any],
+        old_items: List[Dict[str, Any]],
+        new_items: List[Dict[str, Any]],
+    ) -> None:
         """Finalize run tracking by updating metadata with completion status, usage, and new items."""
         if not self.trajectory_id:
             return
         # Update metadata with completion status, total usage, and new items
         trajectory_path = self.trajectory_dir / self.trajectory_id
         metadata_path = trajectory_path / "metadata.json"
         # Read existing metadata
         if metadata_path.exists():
             with open(metadata_path, "r") as f:
                 metadata = json.load(f)
         else:
             metadata = {}
         # Update metadata with completion info
-        metadata.update({
-            "status": "completed",
-            "completed_at": str(uuid.uuid1().time),
-            "total_usage": self.total_usage,
-            "new_items": sanitize_image_urls(new_items),
-            "total_turns": self.current_turn
-        })
+        # Optionally extract screenshots from new_items before persisting
+        new_items_to_save = new_items
+        try:
+            new_items_to_save = extract_computer_call_outputs(new_items, self.screenshot_dir)
+        except Exception:
+            pass
+        metadata.update(
+            {
+                "status": "completed",
+                "completed_at": str(uuid.uuid1().time),
+                "total_usage": self.total_usage,
+                "new_items": new_items_to_save,
+                "total_turns": self.current_turn,
+            }
+        )
         # Save updated metadata
         with open(metadata_path, "w") as f:
             json.dump(metadata, f, indent=2)
-    @override
+    @override
     async def on_api_start(self, kwargs: Dict[str, Any]) -> None:
         if not self.trajectory_id:
             return
-        self._save_artifact("api_start", { "kwargs": kwargs })
+        self._save_artifact("api_start", {"kwargs": kwargs})
     @override
     async def on_api_end(self, kwargs: Dict[str, Any], result: Any) -> None:
         """Save API call result."""
         if not self.trajectory_id:
             return
-        self._save_artifact("api_result", { "kwargs": kwargs, "result": result })
+        self._save_artifact("api_result", {"kwargs": kwargs, "result": result})
     @override
     async def on_screenshot(self, screenshot: Union[str, bytes], name: str = "screenshot") -> None:
@@ -211,77 +315,83 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
         """Save responses to the current turn directory and update usage statistics."""
         if not self.trajectory_id:
             return
         # Save responses
         turn_dir = self._get_turn_dir()
         response_data = {
             "timestamp": str(uuid.uuid1().time),
             "model": self.model,
             "kwargs": kwargs,
-            "response": responses
+            "response": responses,
         }
         self._save_artifact("agent_response", response_data)
         # Increment turn counter
         self.current_turn += 1
     def _draw_crosshair_on_image(self, image_bytes: bytes, x: int, y: int) -> bytes:
         """
         Draw a red dot and crosshair at the specified coordinates on the image.
         Args:
             image_bytes: The original image as bytes
             x: X coordinate for the crosshair
             y: Y coordinate for the crosshair
         Returns:
             Modified image as bytes with red dot and crosshair
         """
         # Open the image
         image = Image.open(io.BytesIO(image_bytes))
         draw = ImageDraw.Draw(image)
         # Draw crosshair lines (red, 2px thick)
         crosshair_size = 20
         line_width = 2
         color = "red"
         # Horizontal line
         draw.line([(x - crosshair_size, y), (x + crosshair_size, y)], fill=color, width=line_width)
         # Vertical line
         draw.line([(x, y - crosshair_size), (x, y + crosshair_size)], fill=color, width=line_width)
         # Draw center dot (filled circle)
         dot_radius = 3
-        draw.ellipse([(x - dot_radius, y - dot_radius), (x + dot_radius, y + dot_radius)], fill=color)
+        draw.ellipse(
+            [(x - dot_radius, y - dot_radius), (x + dot_radius, y + dot_radius)], fill=color
+        )
         # Convert back to bytes
         output = io.BytesIO()
-        image.save(output, format='PNG')
+        image.save(output, format="PNG")
         return output.getvalue()
     @override
-    async def on_computer_call_end(self, item: Dict[str, Any], result: List[Dict[str, Any]]) -> None:
+    async def on_computer_call_end(
+        self, item: Dict[str, Any], result: List[Dict[str, Any]]
+    ) -> None:
         """
         Called when a computer call has completed.
         Saves screenshots and computer call output.
         """
         if not self.trajectory_id:
             return
-        self._save_artifact("computer_call_result", { "item": item, "result": result })
+        self._save_artifact("computer_call_result", {"item": item, "result": result})
         # Check if action has x/y coordinates and there's a screenshot in the result
         action = item.get("action", {})
         if "x" in action and "y" in action:
             # Look for screenshot in the result
             for result_item in result:
-                if (result_item.get("type") == "computer_call_output" and
-                    result_item.get("output", {}).get("type") == "input_image"):
+                if (
+                    result_item.get("type") == "computer_call_output"
+                    and result_item.get("output", {}).get("type") == "input_image"
+                ):
                     image_url = result_item["output"]["image_url"]
                     # Extract base64 image data
                     if image_url.startswith("data:image/"):
                         # Format: data:image/png;base64,<base64_data>
@@ -289,26 +399,24 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
                     else:
                         # Assume it's just base64 data
                         base64_data = image_url
                     try:
                         # Decode the image
                         image_bytes = base64.b64decode(base64_data)
                         # Draw crosshair at the action coordinates
                         annotated_image = self._draw_crosshair_on_image(
-                            image_bytes,
-                            int(action["x"]),
-                            int(action["y"])
+                            image_bytes, int(action["x"]), int(action["y"])
                         )
                         # Save as screenshot_action
                         self._save_artifact("screenshot_action", annotated_image)
                     except Exception as e:
                         # If annotation fails, just log and continue
                         print(f"Failed to annotate screenshot: {e}")
                     break  # Only process the first screenshot found
         # Increment turn counter
-        self.current_turn += 1
+        self.current_turn += 1

cua-agent 0.4.14__py3-none-any.whl → 0.7.16__py3-none-any.whl

Potentially problematic release.

cua-agent 0.4.14__py3-none-any.whl → 0.7.16__py3-none-any.whl