cua-agent 0.1.22__py3-none-any.whl → 0.1.24__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- agent/__init__.py +1 -1
- agent/core/agent.py +9 -4
- agent/core/factory.py +3 -5
- agent/core/provider_config.py +4 -2
- agent/core/types.py +41 -1
- agent/providers/omni/__init__.py +1 -1
- agent/providers/omni/clients/oaicompat.py +177 -0
- agent/providers/omni/loop.py +25 -1
- agent/providers/omni/tools/manager.py +1 -1
- agent/ui/__init__.py +1 -0
- agent/ui/gradio/__init__.py +21 -0
- agent/ui/gradio/app.py +872 -0
- {cua_agent-0.1.22.dist-info → cua_agent-0.1.24.dist-info}/METADATA +74 -2
- {cua_agent-0.1.22.dist-info → cua_agent-0.1.24.dist-info}/RECORD +16 -14
- agent/core/README.md +0 -101
- agent/providers/omni/types.py +0 -47
- {cua_agent-0.1.22.dist-info → cua_agent-0.1.24.dist-info}/WHEEL +0 -0
- {cua_agent-0.1.22.dist-info → cua_agent-0.1.24.dist-info}/entry_points.txt +0 -0
agent/__init__.py
CHANGED
@@ -48,7 +48,7 @@ except Exception as e:
     # Other issues with telemetry
     logger.warning(f"Error initializing telemetry: {e}")
 
-from .
+from .core.types import LLMProvider, LLM
 from .core.factory import AgentLoop
 from .core.agent import ComputerAgent
 
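The net effect is that the provider types are re-exported from the package root rather than a provider-specific module. A minimal sketch of the import surface this enables, assuming the names imported above are reachable from the top-level `agent` package:

```python
# Sketch only: these names are imported in agent/__init__.py above,
# so they should be importable from the package root.
from agent import ComputerAgent, AgentLoop, LLM, LLMProvider

model = LLM(provider=LLMProvider.OAICOMPAT)  # defaults filled in by __post_init__
```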
agent/core/agent.py
CHANGED
@@ -6,8 +6,7 @@ import os
 from typing import AsyncGenerator, Optional
 
 from computer import Computer
-from
-from .. import AgentLoop
+from .types import LLM, AgentLoop
 from .types import AgentResponse
 from .factory import LoopFactory
 from .provider_config import DEFAULT_MODELS, ENV_VARS
@@ -75,6 +74,7 @@ class ComputerAgent:
             # Use the provided LLM object
             self.provider = model.provider
             actual_model_name = model.name or DEFAULT_MODELS.get(self.provider, "")
+            self.provider_base_url = getattr(model, "provider_base_url", None)
 
         # Ensure we have a valid model name
         if not actual_model_name:
@@ -86,8 +86,12 @@
 
         # Get API key from environment if not provided
         actual_api_key = api_key or os.environ.get(ENV_VARS[self.provider], "")
-        # Ollama
-        if
+        # Ollama and OpenAI-compatible APIs typically don't require an API key
+        if (
+            not actual_api_key
+            and str(self.provider) not in ["ollama", "oaicompat"]
+            and ENV_VARS[self.provider] != "none"
+        ):
             raise ValueError(f"No API key provided for {self.provider}")
 
         # Create the appropriate loop using the factory
@@ -102,6 +106,7 @@
                 save_trajectory=save_trajectory,
                 trajectory_dir=trajectory_dir,
                 only_n_most_recent_images=only_n_most_recent_images,
+                provider_base_url=self.provider_base_url,
             )
         except ValueError as e:
             logger.error(f"Failed to create loop: {str(e)}")
agent/core/factory.py
CHANGED
@@ -8,10 +8,6 @@ from computer import Computer
 from .types import AgentLoop
 from .base import BaseLoop
 
-# For type checking only
-if TYPE_CHECKING:
-    from ..providers.omni.types import LLMProvider
-
 logger = logging.getLogger(__name__)
 
 
@@ -33,6 +29,7 @@ class LoopFactory:
         trajectory_dir: str = "trajectories",
         only_n_most_recent_images: Optional[int] = None,
         acknowledge_safety_check_callback: Optional[Callable[[str], Awaitable[bool]]] = None,
+        provider_base_url: Optional[str] = None,
     ) -> BaseLoop:
         """Create and return an appropriate loop instance based on type."""
         if loop_type == AgentLoop.ANTHROPIC:
@@ -77,7 +74,7 @@
             try:
                 from ..providers.omni.loop import OmniLoop
                 from ..providers.omni.parser import OmniParser
-                from
+                from .types import LLMProvider
             except ImportError:
                 raise ImportError(
                     "The 'omni' provider is not installed. "
@@ -99,6 +96,7 @@
                 base_dir=trajectory_dir,
                 only_n_most_recent_images=only_n_most_recent_images,
                 parser=OmniParser(),
+                provider_base_url=provider_base_url,
             )
         else:
             raise ValueError(f"Unsupported loop type: {loop_type}")
agent/core/provider_config.py
CHANGED
@@ -1,17 +1,19 @@
 """Provider-specific configurations and constants."""
 
-from
+from .types import LLMProvider
 
 # Default models for different providers
 DEFAULT_MODELS = {
     LLMProvider.OPENAI: "gpt-4o",
     LLMProvider.ANTHROPIC: "claude-3-7-sonnet-20250219",
     LLMProvider.OLLAMA: "gemma3:4b-it-q4_K_M",
+    LLMProvider.OAICOMPAT: "Qwen2.5-VL-7B-Instruct",
 }
 
 # Map providers to their environment variable names
 ENV_VARS = {
     LLMProvider.OPENAI: "OPENAI_API_KEY",
     LLMProvider.ANTHROPIC: "ANTHROPIC_API_KEY",
-    LLMProvider.OLLAMA: "
+    LLMProvider.OLLAMA: "none",
+    LLMProvider.OAICOMPAT: "none",  # OpenAI-compatible API typically doesn't require an API key
 }
agent/core/types.py
CHANGED
@@ -1,7 +1,8 @@
 """Core type definitions."""
 
 from typing import Any, Dict, List, Optional, TypedDict, Union
-from enum import Enum, auto
+from enum import Enum, StrEnum, auto
+from dataclasses import dataclass
 
 
 class AgentLoop(Enum):
@@ -14,6 +15,45 @@ class AgentLoop(Enum):
     # Add more loop types as needed
 
 
+class LLMProvider(StrEnum):
+    """Supported LLM providers."""
+
+    ANTHROPIC = "anthropic"
+    OPENAI = "openai"
+    OLLAMA = "ollama"
+    OAICOMPAT = "oaicompat"
+
+
+@dataclass
+class LLM:
+    """Configuration for LLM model and provider."""
+
+    provider: LLMProvider
+    name: Optional[str] = None
+    provider_base_url: Optional[str] = None
+
+    def __post_init__(self):
+        """Set default model name if not provided."""
+        if self.name is None:
+            from .provider_config import DEFAULT_MODELS
+
+            self.name = DEFAULT_MODELS.get(self.provider)
+
+        # Set default provider URL if none provided
+        if self.provider_base_url is None and self.provider == LLMProvider.OAICOMPAT:
+            # Default for vLLM
+            self.provider_base_url = "http://localhost:8000/v1"
+            # Common alternatives:
+            # - LM Studio: "http://localhost:1234/v1"
+            # - LocalAI: "http://localhost:8080/v1"
+            # - Ollama with OpenAI compatible API: "http://localhost:11434/v1"
+
+
+# For backward compatibility
+LLMModel = LLM
+Model = LLM
+
+
 class AgentResponse(TypedDict, total=False):
     """Agent response format."""
 
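Note that `enum.StrEnum` only exists on Python 3.11+, so this import fails on older interpreters. Construction with and without explicit overrides would look like this, with the defaults coming from the `__post_init__` above (the LM Studio model id below is hypothetical):

```python
from agent.core.types import LLM, LLMProvider

local = LLM(provider=LLMProvider.OAICOMPAT)
print(local.name)               # "Qwen2.5-VL-7B-Instruct" (from DEFAULT_MODELS)
print(local.provider_base_url)  # "http://localhost:8000/v1" (vLLM default)

lmstudio = LLM(
    provider=LLMProvider.OAICOMPAT,
    name="qwen2.5-vl-7b-instruct",                 # hypothetical model id
    provider_base_url="http://localhost:1234/v1",  # LM Studio
)
```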
agent/providers/omni/clients/oaicompat.py
ADDED
@@ -0,0 +1,177 @@
+"""OpenAI-compatible client implementation."""
+
+import os
+import logging
+from typing import Dict, List, Optional, Any
+import aiohttp
+import re
+from .base import BaseOmniClient
+
+logger = logging.getLogger(__name__)
+
+
+# OpenAI-compatible client for the OmniLoop
+class OAICompatClient(BaseOmniClient):
+    """OpenAI-compatible API client implementation.
+
+    This client can be used with any service that implements the OpenAI API protocol, including:
+    - vLLM
+    - LM Studio
+    - LocalAI
+    - Ollama (with OpenAI compatibility)
+    - Text Generation WebUI
+    - Any other service with OpenAI API compatibility
+    """
+
+    def __init__(
+        self,
+        api_key: Optional[str] = None,
+        model: str = "Qwen2.5-VL-7B-Instruct",
+        provider_base_url: Optional[str] = "http://localhost:8000/v1",
+        max_tokens: int = 4096,
+        temperature: float = 0.0,
+    ):
+        """Initialize the OpenAI-compatible client.
+
+        Args:
+            api_key: Not used for local endpoints, usually set to "EMPTY"
+            model: Model name to use
+            provider_base_url: API base URL. Typically in the format "http://localhost:PORT/v1"
+                Examples:
+                - vLLM: "http://localhost:8000/v1"
+                - LM Studio: "http://localhost:1234/v1"
+                - LocalAI: "http://localhost:8080/v1"
+                - Ollama: "http://localhost:11434/v1"
+            max_tokens: Maximum tokens to generate
+            temperature: Generation temperature
+        """
+        super().__init__(api_key="EMPTY", model=model)
+        self.api_key = "EMPTY"  # Local endpoints typically don't require an API key
+        self.model = model
+        self.provider_base_url = (
+            provider_base_url or "http://localhost:8000/v1"
+        )  # Use default if None
+        self.max_tokens = max_tokens
+        self.temperature = temperature
+
+    def _extract_base64_image(self, text: str) -> Optional[str]:
+        """Extract base64 image data from an HTML img tag."""
+        pattern = r'data:image/[^;]+;base64,([^"]+)'
+        match = re.search(pattern, text)
+        return match.group(1) if match else None
+
+    def _get_loggable_messages(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Create a loggable version of messages with image data truncated."""
+        loggable_messages = []
+        for msg in messages:
+            if isinstance(msg.get("content"), list):
+                new_content = []
+                for content in msg["content"]:
+                    if content.get("type") == "image":
+                        new_content.append(
+                            {"type": "image", "image_url": {"url": "[BASE64_IMAGE_DATA]"}}
+                        )
+                    else:
+                        new_content.append(content)
+                loggable_messages.append({"role": msg["role"], "content": new_content})
+            else:
+                loggable_messages.append(msg)
+        return loggable_messages
+
+    async def run_interleaved(
+        self, messages: List[Dict[str, Any]], system: str, max_tokens: Optional[int] = None
+    ) -> Dict[str, Any]:
+        """Run interleaved chat completion.
+
+        Args:
+            messages: List of message dicts
+            system: System prompt
+            max_tokens: Optional max tokens override
+
+        Returns:
+            Response dict
+        """
+        headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}
+
+        final_messages = [{"role": "system", "content": system}]
+
+        # Process messages
+        for item in messages:
+            if isinstance(item, dict):
+                if isinstance(item["content"], list):
+                    # Content is already in the correct format
+                    final_messages.append(item)
+                else:
+                    # Single string content, check for image
+                    base64_img = self._extract_base64_image(item["content"])
+                    if base64_img:
+                        message = {
+                            "role": item["role"],
+                            "content": [
+                                {
+                                    "type": "image_url",
+                                    "image_url": {"url": f"data:image/jpeg;base64,{base64_img}"},
+                                }
+                            ],
+                        }
+                    else:
+                        message = {
+                            "role": item["role"],
+                            "content": [{"type": "text", "text": item["content"]}],
+                        }
+                    final_messages.append(message)
+            else:
+                # String content, check for image
+                base64_img = self._extract_base64_image(item)
+                if base64_img:
+                    message = {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "image_url",
+                                "image_url": {"url": f"data:image/jpeg;base64,{base64_img}"},
+                            }
+                        ],
+                    }
+                else:
+                    message = {"role": "user", "content": [{"type": "text", "text": item}]}
+                final_messages.append(message)
+
+        payload = {"model": self.model, "messages": final_messages, "temperature": self.temperature}
+        payload["max_tokens"] = max_tokens or self.max_tokens
+
+        try:
+            async with aiohttp.ClientSession() as session:
+                # Use default base URL if none provided
+                base_url = self.provider_base_url or "http://localhost:8000/v1"
+
+                # Check if the base URL already includes the chat/completions endpoint
+                endpoint_url = base_url
+                if not endpoint_url.endswith("/chat/completions"):
+                    # If the URL ends with /v1, append /chat/completions
+                    if endpoint_url.endswith("/v1"):
+                        endpoint_url = f"{endpoint_url}/chat/completions"
+                    # If the URL doesn't end with /v1, make sure it has a proper structure
+                    elif not endpoint_url.endswith("/"):
+                        endpoint_url = f"{endpoint_url}/chat/completions"
+                    else:
+                        endpoint_url = f"{endpoint_url}chat/completions"
+
+                # Log the endpoint URL for debugging
+                logger.debug(f"Using endpoint URL: {endpoint_url}")
+
+                async with session.post(endpoint_url, headers=headers, json=payload) as response:
+                    response_json = await response.json()
+
+                    if response.status != 200:
+                        error_msg = response_json.get("error", {}).get(
+                            "message", str(response_json)
+                        )
+                        logger.error(f"Error in API call: {error_msg}")
+                        raise Exception(f"API error: {error_msg}")
+
+                    return response_json
+
+        except Exception as e:
+            logger.error(f"Error in API call: {str(e)}")
+            raise
agent/providers/omni/loop.py
CHANGED
@@ -16,10 +16,11 @@ from ...core.messages import StandardMessageManager, ImageRetentionConfig
 from .utils import to_openai_agent_response_format
 from ...core.types import AgentResponse
 from computer import Computer
-from .types import LLMProvider
+from ...core.types import LLMProvider
 from .clients.openai import OpenAIClient
 from .clients.anthropic import AnthropicClient
 from .clients.ollama import OllamaClient
+from .clients.oaicompat import OAICompatClient
 from .prompts import SYSTEM_PROMPT
 from .api_handler import OmniAPIHandler
 from .tools.manager import ToolManager
@@ -60,6 +61,7 @@ class OmniLoop(BaseLoop):
         max_retries: int = 3,
         retry_delay: float = 1.0,
         save_trajectory: bool = True,
+        provider_base_url: Optional[str] = None,
         **kwargs,
     ):
         """Initialize the loop.
@@ -75,10 +77,12 @@
             max_retries: Maximum number of retries for API calls
             retry_delay: Delay between retries in seconds
             save_trajectory: Whether to save trajectory data
+            provider_base_url: Base URL for the API provider (used for OAICOMPAT)
         """
         # Set parser and provider before initializing base class
         self.parser = parser
         self.provider = provider
+        self.provider_base_url = provider_base_url
 
         # Initialize message manager with image retention config
         self.message_manager = StandardMessageManager(
@@ -141,6 +145,12 @@
                 api_key=self.api_key,
                 model=self.model,
             )
+        elif self.provider == LLMProvider.OAICOMPAT:
+            self.client = OAICompatClient(
+                api_key="EMPTY",  # Local endpoints typically don't require an API key
+                model=self.model,
+                provider_base_url=self.provider_base_url,
+            )
         else:
             raise ValueError(f"Unsupported provider: {self.provider}")
 
@@ -171,6 +181,12 @@
                 api_key=self.api_key,
                 model=self.model,
             )
+        elif self.provider == LLMProvider.OAICOMPAT:
+            self.client = OAICompatClient(
+                api_key="EMPTY",  # Local endpoints typically don't require an API key
+                model=self.model,
+                provider_base_url=self.provider_base_url,
+            )
         else:
             raise ValueError(f"Unsupported provider: {self.provider}")
 
@@ -388,6 +404,14 @@
             except (KeyError, TypeError, IndexError) as e:
                 logger.error(f"Invalid response format: {str(e)}")
                 return True, action_screenshot_saved
+        elif self.provider == LLMProvider.OAICOMPAT:
+            try:
+                # OpenAI-compatible response format
+                raw_text = response["choices"][0]["message"]["content"]
+                standard_content = [{"type": "text", "text": raw_text}]
+            except (KeyError, TypeError, IndexError) as e:
+                logger.error(f"Invalid response format: {str(e)}")
+                return True, action_screenshot_saved
         else:
             # Assume OpenAI or compatible format
             try:
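The client selection added in the two middle hunks is the same dispatch repeated at two initialization points. A condensed sketch of just the new branch (the real method also constructs the OpenAI/Anthropic/Ollama clients shown in context; `make_client` is an illustrative name, not a method in the package):

```python
from agent.core.types import LLMProvider
from agent.providers.omni.clients.oaicompat import OAICompatClient


def make_client(provider: LLMProvider, model: str, provider_base_url: str | None):
    """Condensed sketch of the OAICOMPAT branch added above."""
    if provider == LLMProvider.OAICOMPAT:
        return OAICompatClient(
            api_key="EMPTY",  # local endpoints typically ignore the key
            model=model,
            provider_base_url=provider_base_url,
        )
    raise ValueError(f"Unsupported provider: {provider}")
```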
agent/providers/omni/tools/manager.py
CHANGED

@@ -7,7 +7,7 @@ from ....core.tools import BaseToolManager, ToolResult
 from ....core.tools.collection import ToolCollection
 from .computer import ComputerTool
 from .bash import BashTool
-from
+from ....core.types import LLMProvider
 
 
 class ToolManager(BaseToolManager):
agent/ui/__init__.py
ADDED
@@ -0,0 +1 @@
+"""UI modules for the Computer-Use Agent."""
agent/ui/gradio/__init__.py
ADDED

@@ -0,0 +1,21 @@
+"""Gradio UI for Computer-Use Agent."""
+
+import gradio as gr
+from typing import Optional
+
+from .app import create_gradio_ui
+
+
+def registry(name: str = "cua:gpt-4o") -> gr.Blocks:
+    """Create and register a Gradio UI for the Computer-Use Agent.
+
+    Args:
+        name: The name to use for the Gradio app, in format 'provider:model'
+
+    Returns:
+        A Gradio Blocks application
+    """
+    provider, model = name.split(":", 1) if ":" in name else ("openai", name)
+
+    # Create and return the Gradio UI
+    return create_gradio_ui(provider_name=provider, model_name=model)
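Launching the UI is then a one-liner; `launch()` is the standard `gr.Blocks` entry point. Which provider strings `create_gradio_ui` accepts is defined in the 872-line `app.py`, which this view does not show, so the provider name below is an assumption based on the `'provider:model'` convention documented above:

```python
from agent.ui.gradio import registry

demo = registry("openai:gpt-4o")  # "provider:model"; bare names default to "openai"
demo.launch()
```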
|