cua-agent 0.1.20__tar.gz → 0.1.22__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- {cua_agent-0.1.20 → cua_agent-0.1.22}/PKG-INFO +17 -3
- {cua_agent-0.1.20 → cua_agent-0.1.22}/README.md +2 -2
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/agent.py +2 -1
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/provider_config.py +2 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/types.py +1 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/omni/clients/base.py +8 -17
- cua_agent-0.1.22/agent/providers/omni/clients/ollama.py +122 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/omni/clients/openai.py +0 -4
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/omni/loop.py +18 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/omni/types.py +3 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/openai/api_handler.py +3 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/pyproject.toml +18 -3
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/__init__.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/README.md +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/__init__.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/base.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/callbacks.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/experiment.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/factory.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/messages.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/telemetry.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/tools/__init__.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/tools/base.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/tools/bash.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/tools/collection.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/tools/computer.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/tools/edit.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/tools/manager.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/tools.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/visualization.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/__init__.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/anthropic/__init__.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/anthropic/api/client.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/anthropic/api/logging.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/anthropic/api_handler.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/anthropic/callbacks/__init__.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/anthropic/callbacks/manager.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/anthropic/loop.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/anthropic/prompts.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/anthropic/response_handler.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/anthropic/tools/__init__.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/anthropic/tools/base.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/anthropic/tools/bash.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/anthropic/tools/collection.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/anthropic/tools/computer.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/anthropic/tools/edit.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/anthropic/tools/manager.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/anthropic/tools/run.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/anthropic/types.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/anthropic/utils.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/omni/__init__.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/omni/api_handler.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/omni/clients/anthropic.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/omni/clients/utils.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/omni/image_utils.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/omni/parser.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/omni/prompts.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/omni/tools/__init__.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/omni/tools/base.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/omni/tools/bash.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/omni/tools/computer.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/omni/tools/manager.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/omni/utils.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/openai/__init__.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/openai/loop.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/openai/response_handler.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/openai/tools/__init__.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/openai/tools/base.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/openai/tools/computer.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/openai/tools/manager.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/openai/types.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/openai/utils.py +0 -0
- {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/telemetry.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: cua-agent
|
|
3
|
-
Version: 0.1.20
|
|
3
|
+
Version: 0.1.22
|
|
4
4
|
Summary: CUA (Computer Use) Agent for AI-driven computer interaction
|
|
5
5
|
Author-Email: TryCua <gh@trycua.com>
|
|
6
6
|
Requires-Python: <3.13,>=3.10
|
|
@@ -33,6 +33,19 @@ Requires-Dist: openai<2.0.0,>=1.14.0; extra == "som"
|
|
|
33
33
|
Requires-Dist: groq<0.5.0,>=0.4.0; extra == "som"
|
|
34
34
|
Requires-Dist: dashscope<2.0.0,>=1.13.0; extra == "som"
|
|
35
35
|
Requires-Dist: requests<3.0.0,>=2.31.0; extra == "som"
|
|
36
|
+
Provides-Extra: omni
|
|
37
|
+
Requires-Dist: torch>=2.2.1; extra == "omni"
|
|
38
|
+
Requires-Dist: torchvision>=0.17.1; extra == "omni"
|
|
39
|
+
Requires-Dist: ultralytics>=8.0.0; extra == "omni"
|
|
40
|
+
Requires-Dist: transformers>=4.38.2; extra == "omni"
|
|
41
|
+
Requires-Dist: cua-som<0.2.0,>=0.1.0; extra == "omni"
|
|
42
|
+
Requires-Dist: anthropic<0.47.0,>=0.46.0; extra == "omni"
|
|
43
|
+
Requires-Dist: boto3<2.0.0,>=1.35.81; extra == "omni"
|
|
44
|
+
Requires-Dist: openai<2.0.0,>=1.14.0; extra == "omni"
|
|
45
|
+
Requires-Dist: groq<0.5.0,>=0.4.0; extra == "omni"
|
|
46
|
+
Requires-Dist: dashscope<2.0.0,>=1.13.0; extra == "omni"
|
|
47
|
+
Requires-Dist: requests<3.0.0,>=2.31.0; extra == "omni"
|
|
48
|
+
Requires-Dist: ollama<0.5.0,>=0.4.7; extra == "omni"
|
|
36
49
|
Provides-Extra: all
|
|
37
50
|
Requires-Dist: torch>=2.2.1; extra == "all"
|
|
38
51
|
Requires-Dist: torchvision>=0.17.1; extra == "all"
|
|
@@ -45,6 +58,7 @@ Requires-Dist: openai<2.0.0,>=1.14.0; extra == "all"
|
|
|
45
58
|
Requires-Dist: groq<0.5.0,>=0.4.0; extra == "all"
|
|
46
59
|
Requires-Dist: dashscope<2.0.0,>=1.13.0; extra == "all"
|
|
47
60
|
Requires-Dist: requests<3.0.0,>=2.31.0; extra == "all"
|
|
61
|
+
Requires-Dist: ollama<0.5.0,>=0.4.7; extra == "all"
|
|
48
62
|
Description-Content-Type: text/markdown
|
|
49
63
|
|
|
50
64
|
<div align="center">
|
|
@@ -80,7 +94,7 @@ pip install "cua-agent[all]"
|
|
|
80
94
|
# or install specific loop providers
|
|
81
95
|
pip install "cua-agent[openai]" # OpenAI Cua Loop
|
|
82
96
|
pip install "cua-agent[anthropic]" # Anthropic Cua Loop
|
|
83
|
-
pip install "cua-agent[omni]" # Cua Loop based on OmniParser
|
|
97
|
+
pip install "cua-agent[omni]" # Cua Loop based on OmniParser (includes Ollama for local models)
|
|
84
98
|
```
|
|
85
99
|
|
|
86
100
|
## Run
|
|
@@ -123,7 +137,7 @@ The `cua-agent` package provides three agent loops variations, based on differen
|
|
|
123
137
|
|:-----------|:-----------------|:------------|:-------------|
|
|
124
138
|
| `AgentLoop.OPENAI` | • `computer_use_preview` | Use OpenAI Operator CUA model | Not Required |
|
|
125
139
|
| `AgentLoop.ANTHROPIC` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219` | Use Anthropic Computer-Use | Not Required |
|
|
126
|
-
| `AgentLoop.OMNI` <br>(
|
|
140
|
+
| `AgentLoop.OMNI` <br>(experimental) | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser |
|
|
127
141
|
|
|
128
142
|
## AgentResponse
|
|
129
143
|
The `AgentResponse` class represents the structured output returned after each agent turn. It contains the agent's response, reasoning, tool usage, and other metadata. The response format aligns with the new [OpenAI Agent SDK specification](https://platform.openai.com/docs/api-reference/responses) for better consistency across different agent loops.
|
|
@@ -31,7 +31,7 @@ pip install "cua-agent[all]"
|
|
|
31
31
|
# or install specific loop providers
|
|
32
32
|
pip install "cua-agent[openai]" # OpenAI Cua Loop
|
|
33
33
|
pip install "cua-agent[anthropic]" # Anthropic Cua Loop
|
|
34
|
-
pip install "cua-agent[omni]" # Cua Loop based on OmniParser
|
|
34
|
+
pip install "cua-agent[omni]" # Cua Loop based on OmniParser (includes Ollama for local models)
|
|
35
35
|
```
|
|
36
36
|
|
|
37
37
|
## Run
|
|
@@ -74,7 +74,7 @@ The `cua-agent` package provides three agent loops variations, based on differen
|
|
|
74
74
|
|:-----------|:-----------------|:------------|:-------------|
|
|
75
75
|
| `AgentLoop.OPENAI` | • `computer_use_preview` | Use OpenAI Operator CUA model | Not Required |
|
|
76
76
|
| `AgentLoop.ANTHROPIC` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219` | Use Anthropic Computer-Use | Not Required |
|
|
77
|
-
| `AgentLoop.OMNI` <br>(
|
|
77
|
+
| `AgentLoop.OMNI` <br>(experimental) | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser |
|
|
78
78
|
|
|
79
79
|
## AgentResponse
|
|
80
80
|
The `AgentResponse` class represents the structured output returned after each agent turn. It contains the agent's response, reasoning, tool usage, and other metadata. The response format aligns with the new [OpenAI Agent SDK specification](https://platform.openai.com/docs/api-reference/responses) for better consistency across different agent loops.
|
|
@@ -86,7 +86,8 @@ class ComputerAgent:
|
|
|
86
86
|
|
|
87
87
|
# Get API key from environment if not provided
|
|
88
88
|
actual_api_key = api_key or os.environ.get(ENV_VARS[self.provider], "")
|
|
89
|
-
|
|
89
|
+
# Ollama is local and doesn't require an API key
|
|
90
|
+
if not actual_api_key and str(self.provider) != "ollama":
|
|
90
91
|
raise ValueError(f"No API key provided for {self.provider}")
|
|
91
92
|
|
|
92
93
|
# Create the appropriate loop using the factory
|
|
@@ -6,10 +6,12 @@ from ..providers.omni.types import LLMProvider
|
|
|
6
6
|
DEFAULT_MODELS = {
|
|
7
7
|
LLMProvider.OPENAI: "gpt-4o",
|
|
8
8
|
LLMProvider.ANTHROPIC: "claude-3-7-sonnet-20250219",
|
|
9
|
+
LLMProvider.OLLAMA: "gemma3:4b-it-q4_K_M",
|
|
9
10
|
}
|
|
10
11
|
|
|
11
12
|
# Map providers to their environment variable names
|
|
12
13
|
ENV_VARS = {
|
|
13
14
|
LLMProvider.OPENAI: "OPENAI_API_KEY",
|
|
14
15
|
LLMProvider.ANTHROPIC: "ANTHROPIC_API_KEY",
|
|
16
|
+
LLMProvider.OLLAMA: "OLLAMA_API_KEY",
|
|
15
17
|
}
|
|
@@ -1,43 +1,34 @@
|
|
|
1
1
|
"""Base client implementation for Omni providers."""
|
|
2
2
|
|
|
3
|
-
import os
|
|
4
3
|
import logging
|
|
5
4
|
from typing import Dict, List, Optional, Any, Tuple
|
|
6
|
-
import aiohttp
|
|
7
|
-
import json
|
|
8
5
|
|
|
9
6
|
logger = logging.getLogger(__name__)
|
|
10
7
|
|
|
8
|
+
|
|
11
9
|
class BaseOmniClient:
|
|
12
10
|
"""Base class for provider-specific clients."""
|
|
13
|
-
|
|
14
|
-
def __init__(
|
|
15
|
-
self,
|
|
16
|
-
api_key: Optional[str] = None,
|
|
17
|
-
model: Optional[str] = None
|
|
18
|
-
):
|
|
11
|
+
|
|
12
|
+
def __init__(self, api_key: Optional[str] = None, model: Optional[str] = None):
|
|
19
13
|
"""Initialize base client.
|
|
20
|
-
|
|
14
|
+
|
|
21
15
|
Args:
|
|
22
16
|
api_key: Optional API key
|
|
23
17
|
model: Optional model name
|
|
24
18
|
"""
|
|
25
19
|
self.api_key = api_key
|
|
26
20
|
self.model = model
|
|
27
|
-
|
|
21
|
+
|
|
28
22
|
async def run_interleaved(
|
|
29
|
-
self,
|
|
30
|
-
messages: List[Dict[str, Any]],
|
|
31
|
-
system: str,
|
|
32
|
-
max_tokens: Optional[int] = None
|
|
23
|
+
self, messages: List[Dict[str, Any]], system: str, max_tokens: Optional[int] = None
|
|
33
24
|
) -> Dict[str, Any]:
|
|
34
25
|
"""Run interleaved chat completion.
|
|
35
|
-
|
|
26
|
+
|
|
36
27
|
Args:
|
|
37
28
|
messages: List of message dicts
|
|
38
29
|
system: System prompt
|
|
39
30
|
max_tokens: Optional max tokens override
|
|
40
|
-
|
|
31
|
+
|
|
41
32
|
Returns:
|
|
42
33
|
Response dict
|
|
43
34
|
"""
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
"""Ollama API client implementation."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Any, Dict, List, Optional, Tuple, cast
|
|
5
|
+
import asyncio
|
|
6
|
+
from httpx import ConnectError, ReadTimeout
|
|
7
|
+
|
|
8
|
+
from ollama import AsyncClient, Options
|
|
9
|
+
from ollama import Message
|
|
10
|
+
from .base import BaseOmniClient
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class OllamaClient(BaseOmniClient):
|
|
16
|
+
"""Client for making calls to Ollama API."""
|
|
17
|
+
|
|
18
|
+
def __init__(self, api_key: str, model: str, max_retries: int = 3, retry_delay: float = 1.0):
|
|
19
|
+
"""Initialize the Ollama client.
|
|
20
|
+
|
|
21
|
+
Args:
|
|
22
|
+
api_key: Not used
|
|
23
|
+
model: Ollama model name (e.g. "gemma3:4b-it-q4_K_M")
|
|
24
|
+
max_retries: Maximum number of retries for API calls
|
|
25
|
+
retry_delay: Base delay between retries in seconds
|
|
26
|
+
"""
|
|
27
|
+
if not model:
|
|
28
|
+
raise ValueError("Model name must be provided")
|
|
29
|
+
|
|
30
|
+
self.client = AsyncClient(
|
|
31
|
+
host="http://localhost:11434",
|
|
32
|
+
)
|
|
33
|
+
self.model: str = model # Add explicit type annotation
|
|
34
|
+
self.max_retries = max_retries
|
|
35
|
+
self.retry_delay = retry_delay
|
|
36
|
+
|
|
37
|
+
def _convert_message_format(self, system: str, messages: List[Dict[str, Any]]) -> List[Any]:
|
|
38
|
+
"""Convert messages from standard format to Ollama format.
|
|
39
|
+
|
|
40
|
+
Args:
|
|
41
|
+
messages: Messages in standard format
|
|
42
|
+
|
|
43
|
+
Returns:
|
|
44
|
+
Messages in Ollama format
|
|
45
|
+
"""
|
|
46
|
+
ollama_messages = []
|
|
47
|
+
|
|
48
|
+
# Add system message
|
|
49
|
+
ollama_messages.append(
|
|
50
|
+
{
|
|
51
|
+
"role": "system",
|
|
52
|
+
"content": system,
|
|
53
|
+
}
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
for message in messages:
|
|
57
|
+
# Skip messages with empty content
|
|
58
|
+
if not message.get("content"):
|
|
59
|
+
continue
|
|
60
|
+
content = message.get("content", [{}])[0]
|
|
61
|
+
isImage = content.get("type", "") == "image_url"
|
|
62
|
+
isText = content.get("type", "") == "text"
|
|
63
|
+
if isText:
|
|
64
|
+
data = content.get("text", "")
|
|
65
|
+
ollama_messages.append({"role": message["role"], "content": data})
|
|
66
|
+
if isImage:
|
|
67
|
+
data = content.get("image_url", {}).get("url", "")
|
|
68
|
+
# remove header
|
|
69
|
+
data = data.removeprefix("data:image/png;base64,")
|
|
70
|
+
ollama_messages.append(
|
|
71
|
+
{"role": message["role"], "content": "Use this image", "images": [data]}
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
# Cast the list to the correct type expected by Ollama
|
|
75
|
+
return cast(List[Any], ollama_messages)
|
|
76
|
+
|
|
77
|
+
async def run_interleaved(
|
|
78
|
+
self, messages: List[Dict[str, Any]], system: str, max_tokens: int
|
|
79
|
+
) -> Any:
|
|
80
|
+
"""Run model with interleaved conversation format.
|
|
81
|
+
|
|
82
|
+
Args:
|
|
83
|
+
messages: List of messages to process
|
|
84
|
+
system: System prompt
|
|
85
|
+
max_tokens: Not used
|
|
86
|
+
|
|
87
|
+
Returns:
|
|
88
|
+
Model response
|
|
89
|
+
"""
|
|
90
|
+
last_error = None
|
|
91
|
+
|
|
92
|
+
for attempt in range(self.max_retries):
|
|
93
|
+
try:
|
|
94
|
+
# Convert messages to Ollama format
|
|
95
|
+
ollama_messages = self._convert_message_format(system, messages)
|
|
96
|
+
|
|
97
|
+
response = await self.client.chat(
|
|
98
|
+
model=self.model,
|
|
99
|
+
options=Options(
|
|
100
|
+
temperature=0,
|
|
101
|
+
),
|
|
102
|
+
messages=ollama_messages,
|
|
103
|
+
format="json",
|
|
104
|
+
)
|
|
105
|
+
|
|
106
|
+
return response
|
|
107
|
+
|
|
108
|
+
except (ConnectError, ReadTimeout) as e:
|
|
109
|
+
last_error = e
|
|
110
|
+
logger.warning(
|
|
111
|
+
f"Connection error on attempt {attempt + 1}/{self.max_retries}: {str(e)}"
|
|
112
|
+
)
|
|
113
|
+
if attempt < self.max_retries - 1:
|
|
114
|
+
await asyncio.sleep(self.retry_delay * (attempt + 1)) # Exponential backoff
|
|
115
|
+
continue
|
|
116
|
+
|
|
117
|
+
except Exception as e:
|
|
118
|
+
logger.error(f"Unexpected error in Ollama API call: {str(e)}")
|
|
119
|
+
raise RuntimeError(f"Ollama API call failed: {str(e)}")
|
|
120
|
+
|
|
121
|
+
# If we get here, all retries failed
|
|
122
|
+
raise RuntimeError(f"Connection error after {self.max_retries} retries: {str(last_error)}")
|
|
@@ -19,6 +19,7 @@ from computer import Computer
|
|
|
19
19
|
from .types import LLMProvider
|
|
20
20
|
from .clients.openai import OpenAIClient
|
|
21
21
|
from .clients.anthropic import AnthropicClient
|
|
22
|
+
from .clients.ollama import OllamaClient
|
|
22
23
|
from .prompts import SYSTEM_PROMPT
|
|
23
24
|
from .api_handler import OmniAPIHandler
|
|
24
25
|
from .tools.manager import ToolManager
|
|
@@ -135,6 +136,11 @@ class OmniLoop(BaseLoop):
|
|
|
135
136
|
api_key=self.api_key,
|
|
136
137
|
model=self.model,
|
|
137
138
|
)
|
|
139
|
+
elif self.provider == LLMProvider.OLLAMA:
|
|
140
|
+
self.client = OllamaClient(
|
|
141
|
+
api_key=self.api_key,
|
|
142
|
+
model=self.model,
|
|
143
|
+
)
|
|
138
144
|
else:
|
|
139
145
|
raise ValueError(f"Unsupported provider: {self.provider}")
|
|
140
146
|
|
|
@@ -160,6 +166,11 @@ class OmniLoop(BaseLoop):
|
|
|
160
166
|
max_retries=self.max_retries,
|
|
161
167
|
retry_delay=self.retry_delay,
|
|
162
168
|
)
|
|
169
|
+
elif self.provider == LLMProvider.OLLAMA:
|
|
170
|
+
self.client = OllamaClient(
|
|
171
|
+
api_key=self.api_key,
|
|
172
|
+
model=self.model,
|
|
173
|
+
)
|
|
163
174
|
else:
|
|
164
175
|
raise ValueError(f"Unsupported provider: {self.provider}")
|
|
165
176
|
|
|
@@ -370,6 +381,13 @@ class OmniLoop(BaseLoop):
|
|
|
370
381
|
else:
|
|
371
382
|
logger.warning("Invalid Anthropic response format")
|
|
372
383
|
return True, action_screenshot_saved
|
|
384
|
+
elif self.provider == LLMProvider.OLLAMA:
|
|
385
|
+
try:
|
|
386
|
+
raw_text = response["message"]["content"]
|
|
387
|
+
standard_content = [{"type": "text", "text": raw_text}]
|
|
388
|
+
except (KeyError, TypeError, IndexError) as e:
|
|
389
|
+
logger.error(f"Invalid response format: {str(e)}")
|
|
390
|
+
return True, action_screenshot_saved
|
|
373
391
|
else:
|
|
374
392
|
# Assume OpenAI or compatible format
|
|
375
393
|
try:
|
|
@@ -11,6 +11,7 @@ class LLMProvider(StrEnum):
|
|
|
11
11
|
ANTHROPIC = "anthropic"
|
|
12
12
|
OMNI = "omni"
|
|
13
13
|
OPENAI = "openai"
|
|
14
|
+
OLLAMA = "ollama"
|
|
14
15
|
|
|
15
16
|
|
|
16
17
|
@dataclass
|
|
@@ -35,10 +36,12 @@ Model = LLM
|
|
|
35
36
|
PROVIDER_TO_DEFAULT_MODEL: Dict[LLMProvider, str] = {
|
|
36
37
|
LLMProvider.ANTHROPIC: "claude-3-7-sonnet-20250219",
|
|
37
38
|
LLMProvider.OPENAI: "gpt-4o",
|
|
39
|
+
LLMProvider.OLLAMA: "gemma3:4b-it-q4_K_M",
|
|
38
40
|
}
|
|
39
41
|
|
|
40
42
|
# Environment variable names for each provider
|
|
41
43
|
PROVIDER_TO_ENV_VAR: Dict[LLMProvider, str] = {
|
|
42
44
|
LLMProvider.ANTHROPIC: "ANTHROPIC_API_KEY",
|
|
43
45
|
LLMProvider.OPENAI: "OPENAI_API_KEY",
|
|
46
|
+
LLMProvider.OLLAMA: "none",
|
|
44
47
|
}
|
|
@@ -6,7 +6,7 @@ build-backend = "pdm.backend"
|
|
|
6
6
|
|
|
7
7
|
[project]
|
|
8
8
|
name = "cua-agent"
|
|
9
|
-
version = "0.1.20"
|
|
9
|
+
version = "0.1.22"
|
|
10
10
|
description = "CUA (Computer Use) Agent for AI-driven computer interaction"
|
|
11
11
|
readme = "README.md"
|
|
12
12
|
authors = [
|
|
@@ -49,6 +49,20 @@ som = [
|
|
|
49
49
|
"dashscope>=1.13.0,<2.0.0",
|
|
50
50
|
"requests>=2.31.0,<3.0.0",
|
|
51
51
|
]
|
|
52
|
+
omni = [
|
|
53
|
+
"torch>=2.2.1",
|
|
54
|
+
"torchvision>=0.17.1",
|
|
55
|
+
"ultralytics>=8.0.0",
|
|
56
|
+
"transformers>=4.38.2",
|
|
57
|
+
"cua-som>=0.1.0,<0.2.0",
|
|
58
|
+
"anthropic>=0.46.0,<0.47.0",
|
|
59
|
+
"boto3>=1.35.81,<2.0.0",
|
|
60
|
+
"openai>=1.14.0,<2.0.0",
|
|
61
|
+
"groq>=0.4.0,<0.5.0",
|
|
62
|
+
"dashscope>=1.13.0,<2.0.0",
|
|
63
|
+
"requests>=2.31.0,<3.0.0",
|
|
64
|
+
"ollama>=0.4.7,<0.5.0",
|
|
65
|
+
]
|
|
52
66
|
all = [
|
|
53
67
|
"torch>=2.2.1",
|
|
54
68
|
"torchvision>=0.17.1",
|
|
@@ -61,6 +75,7 @@ all = [
|
|
|
61
75
|
"groq>=0.4.0,<0.5.0",
|
|
62
76
|
"dashscope>=1.13.0,<2.0.0",
|
|
63
77
|
"requests>=2.31.0,<3.0.0",
|
|
78
|
+
"ollama>=0.4.7,<0.5.0",
|
|
64
79
|
]
|
|
65
80
|
|
|
66
81
|
[tool.pdm]
|
|
@@ -84,7 +99,7 @@ target-version = [
|
|
|
84
99
|
|
|
85
100
|
[tool.ruff]
|
|
86
101
|
line-length = 100
|
|
87
|
-
target-version = "0.1.20"
|
|
102
|
+
target-version = "0.1.22"
|
|
88
103
|
select = [
|
|
89
104
|
"E",
|
|
90
105
|
"F",
|
|
@@ -98,7 +113,7 @@ docstring-code-format = true
|
|
|
98
113
|
|
|
99
114
|
[tool.mypy]
|
|
100
115
|
strict = true
|
|
101
|
-
python_version = "0.1.20"
|
|
116
|
+
python_version = "0.1.22"
|
|
102
117
|
ignore_missing_imports = true
|
|
103
118
|
disallow_untyped_defs = true
|
|
104
119
|
check_untyped_defs = true
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|