PyPI - cua-agent - Versions diffs - 0.1.5__py3-none-any.whl → 0.1.17__py3-none-any.whl - Mend

cua-agent 0.1.5__py3-none-any.whl → 0.1.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cua-agent might be problematic. Click here for more details.

Files changed (52)

agent/__init__.py +3 -4
agent/core/__init__.py +3 -10
agent/core/computer_agent.py +207 -32
agent/core/experiment.py +20 -3
agent/core/loop.py +78 -120
agent/core/messages.py +279 -125
agent/core/telemetry.py +44 -32
agent/core/types.py +35 -0
agent/core/visualization.py +197 -0
agent/providers/anthropic/api/client.py +142 -1
agent/providers/anthropic/api_handler.py +140 -0
agent/providers/anthropic/callbacks/__init__.py +5 -0
agent/providers/anthropic/loop.py +224 -209
agent/providers/anthropic/messages/manager.py +3 -1
agent/providers/anthropic/response_handler.py +229 -0
agent/providers/anthropic/tools/base.py +1 -1
agent/providers/anthropic/tools/bash.py +0 -97
agent/providers/anthropic/tools/collection.py +2 -2
agent/providers/anthropic/tools/computer.py +34 -24
agent/providers/anthropic/tools/manager.py +2 -2
agent/providers/anthropic/utils.py +370 -0
agent/providers/omni/__init__.py +1 -20
agent/providers/omni/api_handler.py +42 -0
agent/providers/omni/clients/anthropic.py +4 -0
agent/providers/omni/image_utils.py +0 -72
agent/providers/omni/loop.py +497 -607
agent/providers/omni/parser.py +60 -5
agent/providers/omni/tools/__init__.py +25 -8
agent/providers/omni/tools/base.py +29 -0
agent/providers/omni/tools/bash.py +43 -38
agent/providers/omni/tools/computer.py +144 -181
agent/providers/omni/tools/manager.py +26 -48
agent/providers/omni/types.py +0 -4
agent/providers/omni/utils.py +225 -144
{cua_agent-0.1.5.dist-info → cua_agent-0.1.17.dist-info}/METADATA +6 -36
cua_agent-0.1.17.dist-info/RECORD +63 -0
agent/core/agent.py +0 -252
agent/core/base_agent.py +0 -164
agent/core/factory.py +0 -102
agent/providers/omni/callbacks.py +0 -78
agent/providers/omni/clients/groq.py +0 -101
agent/providers/omni/experiment.py +0 -273
agent/providers/omni/messages.py +0 -171
agent/providers/omni/tool_manager.py +0 -91
agent/providers/omni/visualization.py +0 -130
agent/types/__init__.py +0 -26
agent/types/base.py +0 -53
agent/types/messages.py +0 -36
cua_agent-0.1.5.dist-info/RECORD +0 -67
/agent/{types → core}/tools.py +0 -0
{cua_agent-0.1.5.dist-info → cua_agent-0.1.17.dist-info}/WHEEL +0 -0
{cua_agent-0.1.5.dist-info → cua_agent-0.1.17.dist-info}/entry_points.txt +0 -0

agent/providers/anthropic/response_handler.py ADDED Viewed

@@ -0,0 +1,229 @@
+"""Response and tool handling for Anthropic provider."""
+import logging
+from typing import Any, Dict, List, Optional, Tuple, cast
+from anthropic.types.beta import (
+    BetaMessage,
+    BetaMessageParam,
+    BetaTextBlock,
+    BetaTextBlockParam,
+    BetaToolUseBlockParam,
+    BetaContentBlockParam,
+)
+from .tools import ToolResult
+logger = logging.getLogger(__name__)
+class AnthropicResponseHandler:
+    """Handles Anthropic API responses and tool execution results."""
+    def __init__(self, loop):
+        """Initialize the response handler.
+        Args:
+            loop: Reference to the parent loop instance that provides context
+        """
+        self.loop = loop
+    async def handle_response(
+        self, response: BetaMessage, messages: List[Dict[str, Any]]
+    ) -> Tuple[List[Dict[str, Any]], bool]:
+        """Handle the Anthropic API response.
+        Args:
+            response: API response
+            messages: List of messages for context
+        Returns:
+            Tuple containing:
+            - List of new messages to be added
+            - Boolean indicating if the loop should continue
+        """
+        try:
+            new_messages = []
+            # Convert response to parameter format
+            response_params = self.response_to_params(response)
+            # Collect all existing tool_use IDs from previous messages for validation
+            existing_tool_use_ids = set()
+            for msg in messages:
+                if msg.get("role") == "assistant" and isinstance(msg.get("content"), list):
+                    for block in msg.get("content", []):
+                        if (
+                            isinstance(block, dict)
+                            and block.get("type") == "tool_use"
+                            and "id" in block
+                        ):
+                            existing_tool_use_ids.add(block["id"])
+            # Also add new tool_use IDs from the current response
+            current_tool_use_ids = set()
+            for block in response_params:
+                if isinstance(block, dict) and block.get("type") == "tool_use" and "id" in block:
+                    current_tool_use_ids.add(block["id"])
+                    existing_tool_use_ids.add(block["id"])
+            logger.info(f"Existing tool_use IDs in conversation: {existing_tool_use_ids}")
+            logger.info(f"New tool_use IDs in current response: {current_tool_use_ids}")
+            # Create assistant message
+            new_messages.append(
+                {
+                    "role": "assistant",
+                    "content": response_params,
+                }
+            )
+            if self.loop.callback_manager is None:
+                raise RuntimeError(
+                    "Callback manager not initialized. Call initialize_client() first."
+                )
+            # Handle tool use blocks and collect results
+            tool_result_content = []
+            for content_block in response_params:
+                # Notify callback of content
+                self.loop.callback_manager.on_content(cast(BetaContentBlockParam, content_block))
+                # Handle tool use
+                if content_block.get("type") == "tool_use":
+                    if self.loop.tool_manager is None:
+                        raise RuntimeError(
+                            "Tool manager not initialized. Call initialize_client() first."
+                        )
+                    # Execute the tool
+                    result = await self.loop.tool_manager.execute_tool(
+                        name=content_block["name"],
+                        tool_input=cast(Dict[str, Any], content_block["input"]),
+                    )
+                    # Verify the tool_use ID exists in the conversation (which it should now)
+                    tool_use_id = content_block["id"]
+                    if tool_use_id in existing_tool_use_ids:
+                        # Create tool result and add to content
+                        tool_result = self.make_tool_result(cast(ToolResult, result), tool_use_id)
+                        tool_result_content.append(tool_result)
+                        # Notify callback of tool result
+                        self.loop.callback_manager.on_tool_result(
+                            cast(ToolResult, result), content_block["id"]
+                        )
+                    else:
+                        logger.warning(
+                            f"Tool use ID {tool_use_id} not found in previous messages. Skipping tool result."
+                        )
+            # If no tool results, we're done
+            if not tool_result_content:
+                # Signal completion
+                self.loop.callback_manager.on_content({"type": "text", "text": "<DONE>"})
+                return new_messages, False
+            # Add tool results as user message
+            new_messages.append({"content": tool_result_content, "role": "user"})
+            return new_messages, True
+        except Exception as e:
+            logger.error(f"Error handling response: {str(e)}")
+            new_messages.append(
+                {
+                    "role": "assistant",
+                    "content": f"Error: {str(e)}",
+                }
+            )
+            return new_messages, False
+    def response_to_params(
+        self,
+        response: BetaMessage,
+    ) -> List[Dict[str, Any]]:
+        """Convert API response to message parameters.
+        Args:
+            response: API response message
+        Returns:
+            List of content blocks
+        """
+        result = []
+        for block in response.content:
+            if isinstance(block, BetaTextBlock):
+                result.append({"type": "text", "text": block.text})
+            else:
+                result.append(cast(Dict[str, Any], block.model_dump()))
+        return result
+    def make_tool_result(self, result: ToolResult, tool_use_id: str) -> Dict[str, Any]:
+        """Convert a tool result to API format.
+        Args:
+            result: Tool execution result
+            tool_use_id: ID of the tool use
+        Returns:
+            Formatted tool result
+        """
+        if result.content:
+            return {
+                "type": "tool_result",
+                "content": result.content,
+                "tool_use_id": tool_use_id,
+                "is_error": bool(result.error),
+            }
+        tool_result_content = []
+        is_error = False
+        if result.error:
+            is_error = True
+            tool_result_content = [
+                {
+                    "type": "text",
+                    "text": self.maybe_prepend_system_tool_result(result, result.error),
+                }
+            ]
+        else:
+            if result.output:
+                tool_result_content.append(
+                    {
+                        "type": "text",
+                        "text": self.maybe_prepend_system_tool_result(result, result.output),
+                    }
+                )
+            if result.base64_image:
+                tool_result_content.append(
+                    {
+                        "type": "image",
+                        "source": {
+                            "type": "base64",
+                            "media_type": "image/png",
+                            "data": result.base64_image,
+                        },
+                    }
+                )
+        return {
+            "type": "tool_result",
+            "content": tool_result_content,
+            "tool_use_id": tool_use_id,
+            "is_error": is_error,
+        }
+    def maybe_prepend_system_tool_result(self, result: ToolResult, result_text: str) -> str:
+        """Prepend system information to tool result if available.
+        Args:
+            result: Tool execution result
+            result_text: Text to prepend to
+        Returns:
+            Text with system information prepended if available
+        """
+        if result.system:
+            result_text = f"<s>{result.system}</s>\n{result_text}"
+        return result_text

agent/providers/anthropic/tools/base.py CHANGED Viewed

@@ -6,7 +6,7 @@ from typing import Any, Dict
 from anthropic.types.beta import BetaToolUnionParam
-from ....core.tools.base import BaseTool, ToolError, ToolResult, ToolFailure, CLIResult
+from ....core.tools.base import BaseTool
 class BaseAnthropicTool(BaseTool, metaclass=ABCMeta):

agent/providers/anthropic/tools/bash.py CHANGED Viewed

@@ -7,102 +7,6 @@ from .base import BaseAnthropicTool, CLIResult, ToolError, ToolResult
 from ....core.tools.bash import BaseBashTool
-class _BashSession:
-    """A session of a bash shell."""
-    _started: bool
-    _process: asyncio.subprocess.Process
-    command: str = "/bin/bash"
-    _output_delay: float = 0.2  # seconds
-    _timeout: float = 120.0  # seconds
-    _sentinel: str = "<<exit>>"
-    def __init__(self):
-        self._started = False
-        self._timed_out = False
-    async def start(self):
-        if self._started:
-            return
-        self._process = await asyncio.create_subprocess_shell(
-            self.command,
-            preexec_fn=os.setsid,
-            shell=True,
-            bufsize=0,
-            stdin=asyncio.subprocess.PIPE,
-            stdout=asyncio.subprocess.PIPE,
-            stderr=asyncio.subprocess.PIPE,
-        )
-        self._started = True
-    def stop(self):
-        """Terminate the bash shell."""
-        if not self._started:
-            raise ToolError("Session has not started.")
-        if self._process.returncode is not None:
-            return
-        self._process.terminate()
-    async def run(self, command: str):
-        """Execute a command in the bash shell."""
-        if not self._started:
-            raise ToolError("Session has not started.")
-        if self._process.returncode is not None:
-            return ToolResult(
-                system="tool must be restarted",
-                error=f"bash has exited with returncode {self._process.returncode}",
-            )
-        if self._timed_out:
-            raise ToolError(
-                f"timed out: bash has not returned in {self._timeout} seconds and must be restarted",
-            )
-        # we know these are not None because we created the process with PIPEs
-        assert self._process.stdin
-        assert self._process.stdout
-        assert self._process.stderr
-        # send command to the process
-        self._process.stdin.write(command.encode() + f"; echo '{self._sentinel}'\n".encode())
-        await self._process.stdin.drain()
-        # read output from the process, until the sentinel is found
-        try:
-            async with asyncio.timeout(self._timeout):
-                while True:
-                    await asyncio.sleep(self._output_delay)
-                    # if we read directly from stdout/stderr, it will wait forever for
-                    # EOF. use the StreamReader buffer directly instead.
-                    output = (
-                        self._process.stdout._buffer.decode()
-                    )  # pyright: ignore[reportAttributeAccessIssue]
-                    if self._sentinel in output:
-                        # strip the sentinel and break
-                        output = output[: output.index(self._sentinel)]
-                        break
-        except asyncio.TimeoutError:
-            self._timed_out = True
-            raise ToolError(
-                f"timed out: bash has not returned in {self._timeout} seconds and must be restarted",
-            ) from None
-        if output.endswith("\n"):
-            output = output[:-1]
-        error = self._process.stderr._buffer.decode()  # pyright: ignore[reportAttributeAccessIssue]
-        if error.endswith("\n"):
-            error = error[:-1]
-        # clear the buffers so that the next output can be read correctly
-        self._process.stdout._buffer.clear()  # pyright: ignore[reportAttributeAccessIssue]
-        self._process.stderr._buffer.clear()  # pyright: ignore[reportAttributeAccessIssue]
-        return CLIResult(output=output, error=error)
 class BashTool(BaseBashTool, BaseAnthropicTool):
     """
     A tool that allows the agent to run bash commands.
@@ -124,7 +28,6 @@ class BashTool(BaseBashTool, BaseAnthropicTool):
         # Then initialize the Anthropic tool
         BaseAnthropicTool.__init__(self)
         # Initialize bash session
-        self._session = _BashSession()
     async def __call__(self, command: str | None = None, restart: bool = False, **kwargs):
         """Execute a bash command.

agent/providers/anthropic/tools/collection.py CHANGED Viewed

@@ -1,6 +1,6 @@
 """Collection classes for managing multiple tools."""
-from typing import Any
+from typing import Any, cast
 from anthropic.types.beta import BetaToolUnionParam
@@ -22,7 +22,7 @@ class ToolCollection:
     def to_params(
         self,
     ) -> list[BetaToolUnionParam]:
-        return [tool.to_params() for tool in self.tools]
+        return cast(list[BetaToolUnionParam], [tool.to_params() for tool in self.tools])
     async def run(self, *, name: str, tool_input: dict[str, Any]) -> ToolResult:
         tool = self.tool_map.get(name)

agent/providers/anthropic/tools/computer.py CHANGED Viewed

@@ -61,9 +61,9 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
     name: Literal["computer"] = "computer"
     api_type: Literal["computer_20250124"] = "computer_20250124"
-    width: int | None
-    height: int | None
-    display_num: int | None
+    width: int | None = None
+    height: int | None = None
+    display_num: int | None = None
     computer: Computer  # The CUA Computer instance
     logger = logging.getLogger(__name__)
@@ -106,6 +106,7 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
         display_size = await self.computer.interface.get_screen_size()
         self.width = display_size["width"]
         self.height = display_size["height"]
+        assert isinstance(self.width, int) and isinstance(self.height, int)
         self.logger.info(f"Initialized screen dimensions to {self.width}x{self.height}")
     async def __call__(
@@ -120,6 +121,8 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
             # Ensure dimensions are initialized
             if self.width is None or self.height is None:
                 await self.initialize_dimensions()
+                if self.width is None or self.height is None:
+                    raise ToolError("Failed to initialize screen dimensions")
         except Exception as e:
             raise ToolError(f"Failed to initialize dimensions: {e}")
@@ -147,7 +150,10 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
                     self.logger.info(
                         f"Scaling image from {pre_img.size} to {self.width}x{self.height} to match screen dimensions"
                     )
-                    pre_img = pre_img.resize((self.width, self.height), Image.Resampling.LANCZOS)
+                    if not isinstance(self.width, int) or not isinstance(self.height, int):
+                        raise ToolError("Screen dimensions must be integers")
+                    size = (int(self.width), int(self.height))
+                    pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
                 self.logger.info(f"  Current dimensions: {pre_img.width}x{pre_img.height}")
@@ -160,15 +166,7 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
                     await self.computer.interface.move_cursor(x, y)
                     # Then perform drag operation - check if drag_to exists or we need to use other methods
                     try:
-                        if hasattr(self.computer.interface, "drag_to"):
-                            await self.computer.interface.drag_to(x, y)
-                        else:
-                            # Alternative approach: press mouse down, move, release
-                            await self.computer.interface.mouse_down()
-                            await asyncio.sleep(0.2)
-                            await self.computer.interface.move_cursor(x, y)
-                            await asyncio.sleep(0.2)
-                            await self.computer.interface.mouse_up()
+                        await self.computer.interface.drag_to(x, y)
                     except Exception as e:
                         self.logger.error(f"Error during drag operation: {str(e)}")
                         raise ToolError(f"Failed to perform drag: {str(e)}")
@@ -214,9 +212,10 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
                         self.logger.info(
                             f"Scaling image from {pre_img.size} to {self.width}x{self.height} to match screen dimensions"
                         )
-                        pre_img = pre_img.resize(
-                            (self.width, self.height), Image.Resampling.LANCZOS
-                        )
+                        if not isinstance(self.width, int) or not isinstance(self.height, int):
+                            raise ToolError("Screen dimensions must be integers")
+                        size = (int(self.width), int(self.height))
+                        pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
                         # Save the scaled image back to bytes
                         buffer = io.BytesIO()
                         pre_img.save(buffer, format="PNG")
@@ -275,9 +274,10 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
                         self.logger.info(
                             f"Scaling image from {pre_img.size} to {self.width}x{self.height}"
                         )
-                        pre_img = pre_img.resize(
-                            (self.width, self.height), Image.Resampling.LANCZOS
-                        )
+                        if not isinstance(self.width, int) or not isinstance(self.height, int):
+                            raise ToolError("Screen dimensions must be integers")
+                        size = (int(self.width), int(self.height))
+                        pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
                     # Perform the click action
                     if action == "left_click":
@@ -335,7 +335,10 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
                     self.logger.info(
                         f"Scaling image from {pre_img.size} to {self.width}x{self.height}"
                     )
-                    pre_img = pre_img.resize((self.width, self.height), Image.Resampling.LANCZOS)
+                    if not isinstance(self.width, int) or not isinstance(self.height, int):
+                        raise ToolError("Screen dimensions must be integers")
+                    size = (int(self.width), int(self.height))
+                    pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
                 if action == "key":
                     # Special handling for page up/down on macOS
@@ -365,7 +368,7 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
                         # Handle single key press
                         self.logger.info(f"Pressing key: {text}")
                         try:
-                            await self.computer.interface.press(text)
+                            await self.computer.interface.press_key(text)
                             output_text = text
                         except ValueError as e:
                             raise ToolError(f"Invalid key: {text}. {str(e)}")
@@ -442,7 +445,10 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
                         self.logger.info(
                             f"Scaling image from {img.size} to {self.width}x{self.height}"
                         )
-                        img = img.resize((self.width, self.height), Image.Resampling.LANCZOS)
+                        if not isinstance(self.width, int) or not isinstance(self.height, int):
+                            raise ToolError("Screen dimensions must be integers")
+                        size = (int(self.width), int(self.height))
+                        img = img.resize(size, Image.Resampling.LANCZOS)
                         buffer = io.BytesIO()
                         img.save(buffer, format="PNG")
                         screenshot = buffer.getvalue()
@@ -451,7 +457,8 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
                 elif action == "cursor_position":
                     pos = await self.computer.interface.get_cursor_position()
-                    return ToolResult(output=f"X={int(pos[0])},Y={int(pos[1])}")
+                    x, y = pos  # Unpack the tuple
+                    return ToolResult(output=f"X={int(x)},Y={int(y)}")
             except Exception as e:
                 self.logger.error(f"Error during {action} action: {str(e)}")
@@ -517,7 +524,10 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
             # Scale image if needed
             if img.size != (self.width, self.height):
                 self.logger.info(f"Scaling image from {img.size} to {self.width}x{self.height}")
-                img = img.resize((self.width, self.height), Image.Resampling.LANCZOS)
+                if not isinstance(self.width, int) or not isinstance(self.height, int):
+                    raise ToolError("Screen dimensions must be integers")
+                size = (int(self.width), int(self.height))
+                img = img.resize(size, Image.Resampling.LANCZOS)
                 buffer = io.BytesIO()
                 img.save(buffer, format="PNG")
                 screenshot = buffer.getvalue()

agent/providers/anthropic/tools/manager.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from typing import Any, Dict, List
+from typing import Any, Dict, List, cast
 from anthropic.types.beta import BetaToolUnionParam
 from computer.computer import Computer
@@ -37,7 +37,7 @@ class ToolManager(BaseToolManager):
         """Get tool parameters for Anthropic API calls."""
         if self.tools is None:
             raise RuntimeError("Tools not initialized. Call initialize() first.")
-        return self.tools.to_params()
+        return cast(List[BetaToolUnionParam], self.tools.to_params())
     async def execute_tool(self, name: str, tool_input: dict[str, Any]) -> ToolResult:
         """Execute a tool with the given input.

cua-agent 0.1.5__py3-none-any.whl → 0.1.17__py3-none-any.whl

Potentially problematic release.

cua-agent 0.1.5__py3-none-any.whl → 0.1.17__py3-none-any.whl