cua-agent 0.1.20__tar.gz → 0.1.22__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic; see the registry's advisory page for more details.

Files changed (73)
  1. {cua_agent-0.1.20 → cua_agent-0.1.22}/PKG-INFO +17 -3
  2. {cua_agent-0.1.20 → cua_agent-0.1.22}/README.md +2 -2
  3. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/agent.py +2 -1
  4. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/provider_config.py +2 -0
  5. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/types.py +1 -0
  6. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/omni/clients/base.py +8 -17
  7. cua_agent-0.1.22/agent/providers/omni/clients/ollama.py +122 -0
  8. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/omni/clients/openai.py +0 -4
  9. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/omni/loop.py +18 -0
  10. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/omni/types.py +3 -0
  11. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/openai/api_handler.py +3 -0
  12. {cua_agent-0.1.20 → cua_agent-0.1.22}/pyproject.toml +18 -3
  13. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/__init__.py +0 -0
  14. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/README.md +0 -0
  15. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/__init__.py +0 -0
  16. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/base.py +0 -0
  17. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/callbacks.py +0 -0
  18. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/experiment.py +0 -0
  19. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/factory.py +0 -0
  20. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/messages.py +0 -0
  21. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/telemetry.py +0 -0
  22. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/tools/__init__.py +0 -0
  23. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/tools/base.py +0 -0
  24. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/tools/bash.py +0 -0
  25. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/tools/collection.py +0 -0
  26. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/tools/computer.py +0 -0
  27. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/tools/edit.py +0 -0
  28. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/tools/manager.py +0 -0
  29. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/tools.py +0 -0
  30. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/core/visualization.py +0 -0
  31. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/__init__.py +0 -0
  32. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/anthropic/__init__.py +0 -0
  33. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/anthropic/api/client.py +0 -0
  34. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/anthropic/api/logging.py +0 -0
  35. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/anthropic/api_handler.py +0 -0
  36. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/anthropic/callbacks/__init__.py +0 -0
  37. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/anthropic/callbacks/manager.py +0 -0
  38. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/anthropic/loop.py +0 -0
  39. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/anthropic/prompts.py +0 -0
  40. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/anthropic/response_handler.py +0 -0
  41. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/anthropic/tools/__init__.py +0 -0
  42. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/anthropic/tools/base.py +0 -0
  43. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/anthropic/tools/bash.py +0 -0
  44. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/anthropic/tools/collection.py +0 -0
  45. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/anthropic/tools/computer.py +0 -0
  46. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/anthropic/tools/edit.py +0 -0
  47. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/anthropic/tools/manager.py +0 -0
  48. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/anthropic/tools/run.py +0 -0
  49. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/anthropic/types.py +0 -0
  50. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/anthropic/utils.py +0 -0
  51. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/omni/__init__.py +0 -0
  52. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/omni/api_handler.py +0 -0
  53. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/omni/clients/anthropic.py +0 -0
  54. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/omni/clients/utils.py +0 -0
  55. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/omni/image_utils.py +0 -0
  56. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/omni/parser.py +0 -0
  57. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/omni/prompts.py +0 -0
  58. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/omni/tools/__init__.py +0 -0
  59. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/omni/tools/base.py +0 -0
  60. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/omni/tools/bash.py +0 -0
  61. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/omni/tools/computer.py +0 -0
  62. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/omni/tools/manager.py +0 -0
  63. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/omni/utils.py +0 -0
  64. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/openai/__init__.py +0 -0
  65. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/openai/loop.py +0 -0
  66. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/openai/response_handler.py +0 -0
  67. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/openai/tools/__init__.py +0 -0
  68. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/openai/tools/base.py +0 -0
  69. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/openai/tools/computer.py +0 -0
  70. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/openai/tools/manager.py +0 -0
  71. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/openai/types.py +0 -0
  72. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/providers/openai/utils.py +0 -0
  73. {cua_agent-0.1.20 → cua_agent-0.1.22}/agent/telemetry.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cua-agent
3
- Version: 0.1.20
3
+ Version: 0.1.22
4
4
  Summary: CUA (Computer Use) Agent for AI-driven computer interaction
5
5
  Author-Email: TryCua <gh@trycua.com>
6
6
  Requires-Python: <3.13,>=3.10
@@ -33,6 +33,19 @@ Requires-Dist: openai<2.0.0,>=1.14.0; extra == "som"
33
33
  Requires-Dist: groq<0.5.0,>=0.4.0; extra == "som"
34
34
  Requires-Dist: dashscope<2.0.0,>=1.13.0; extra == "som"
35
35
  Requires-Dist: requests<3.0.0,>=2.31.0; extra == "som"
36
+ Provides-Extra: omni
37
+ Requires-Dist: torch>=2.2.1; extra == "omni"
38
+ Requires-Dist: torchvision>=0.17.1; extra == "omni"
39
+ Requires-Dist: ultralytics>=8.0.0; extra == "omni"
40
+ Requires-Dist: transformers>=4.38.2; extra == "omni"
41
+ Requires-Dist: cua-som<0.2.0,>=0.1.0; extra == "omni"
42
+ Requires-Dist: anthropic<0.47.0,>=0.46.0; extra == "omni"
43
+ Requires-Dist: boto3<2.0.0,>=1.35.81; extra == "omni"
44
+ Requires-Dist: openai<2.0.0,>=1.14.0; extra == "omni"
45
+ Requires-Dist: groq<0.5.0,>=0.4.0; extra == "omni"
46
+ Requires-Dist: dashscope<2.0.0,>=1.13.0; extra == "omni"
47
+ Requires-Dist: requests<3.0.0,>=2.31.0; extra == "omni"
48
+ Requires-Dist: ollama<0.5.0,>=0.4.7; extra == "omni"
36
49
  Provides-Extra: all
37
50
  Requires-Dist: torch>=2.2.1; extra == "all"
38
51
  Requires-Dist: torchvision>=0.17.1; extra == "all"
@@ -45,6 +58,7 @@ Requires-Dist: openai<2.0.0,>=1.14.0; extra == "all"
45
58
  Requires-Dist: groq<0.5.0,>=0.4.0; extra == "all"
46
59
  Requires-Dist: dashscope<2.0.0,>=1.13.0; extra == "all"
47
60
  Requires-Dist: requests<3.0.0,>=2.31.0; extra == "all"
61
+ Requires-Dist: ollama<0.5.0,>=0.4.7; extra == "all"
48
62
  Description-Content-Type: text/markdown
49
63
 
50
64
  <div align="center">
@@ -80,7 +94,7 @@ pip install "cua-agent[all]"
80
94
  # or install specific loop providers
81
95
  pip install "cua-agent[openai]" # OpenAI Cua Loop
82
96
  pip install "cua-agent[anthropic]" # Anthropic Cua Loop
83
- pip install "cua-agent[omni]" # Cua Loop based on OmniParser
97
+ pip install "cua-agent[omni]" # Cua Loop based on OmniParser (includes Ollama for local models)
84
98
  ```
85
99
 
86
100
  ## Run
@@ -123,7 +137,7 @@ The `cua-agent` package provides three agent loops variations, based on differen
123
137
  |:-----------|:-----------------|:------------|:-------------|
124
138
  | `AgentLoop.OPENAI` | • `computer_use_preview` | Use OpenAI Operator CUA model | Not Required |
125
139
  | `AgentLoop.ANTHROPIC` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219` | Use Anthropic Computer-Use | Not Required |
126
- | `AgentLoop.OMNI` <br>(preview) | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4`<br>• `gpt-3.5-turbo` | Use OmniParser for element pixel-detection (SoM) and any VLMs | OmniParser |
140
+ | `AgentLoop.OMNI` <br>(experimental) | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser |
127
141
 
128
142
  ## AgentResponse
129
143
  The `AgentResponse` class represents the structured output returned after each agent turn. It contains the agent's response, reasoning, tool usage, and other metadata. The response format aligns with the new [OpenAI Agent SDK specification](https://platform.openai.com/docs/api-reference/responses) for better consistency across different agent loops.
@@ -31,7 +31,7 @@ pip install "cua-agent[all]"
31
31
  # or install specific loop providers
32
32
  pip install "cua-agent[openai]" # OpenAI Cua Loop
33
33
  pip install "cua-agent[anthropic]" # Anthropic Cua Loop
34
- pip install "cua-agent[omni]" # Cua Loop based on OmniParser
34
+ pip install "cua-agent[omni]" # Cua Loop based on OmniParser (includes Ollama for local models)
35
35
  ```
36
36
 
37
37
  ## Run
@@ -74,7 +74,7 @@ The `cua-agent` package provides three agent loops variations, based on differen
74
74
  |:-----------|:-----------------|:------------|:-------------|
75
75
  | `AgentLoop.OPENAI` | • `computer_use_preview` | Use OpenAI Operator CUA model | Not Required |
76
76
  | `AgentLoop.ANTHROPIC` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219` | Use Anthropic Computer-Use | Not Required |
77
- | `AgentLoop.OMNI` <br>(preview) | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4`<br>• `gpt-3.5-turbo` | Use OmniParser for element pixel-detection (SoM) and any VLMs | OmniParser |
77
+ | `AgentLoop.OMNI` <br>(experimental) | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser |
78
78
 
79
79
  ## AgentResponse
80
80
  The `AgentResponse` class represents the structured output returned after each agent turn. It contains the agent's response, reasoning, tool usage, and other metadata. The response format aligns with the new [OpenAI Agent SDK specification](https://platform.openai.com/docs/api-reference/responses) for better consistency across different agent loops.
@@ -86,7 +86,8 @@ class ComputerAgent:
86
86
 
87
87
  # Get API key from environment if not provided
88
88
  actual_api_key = api_key or os.environ.get(ENV_VARS[self.provider], "")
89
- if not actual_api_key:
89
+ # Ollama is local and doesn't require an API key
90
+ if not actual_api_key and str(self.provider) != "ollama":
90
91
  raise ValueError(f"No API key provided for {self.provider}")
91
92
 
92
93
  # Create the appropriate loop using the factory
@@ -6,10 +6,12 @@ from ..providers.omni.types import LLMProvider
6
6
  DEFAULT_MODELS = {
7
7
  LLMProvider.OPENAI: "gpt-4o",
8
8
  LLMProvider.ANTHROPIC: "claude-3-7-sonnet-20250219",
9
+ LLMProvider.OLLAMA: "gemma3:4b-it-q4_K_M",
9
10
  }
10
11
 
11
12
  # Map providers to their environment variable names
12
13
  ENV_VARS = {
13
14
  LLMProvider.OPENAI: "OPENAI_API_KEY",
14
15
  LLMProvider.ANTHROPIC: "ANTHROPIC_API_KEY",
16
+ LLMProvider.OLLAMA: "OLLAMA_API_KEY",
15
17
  }
@@ -10,6 +10,7 @@ class AgentLoop(Enum):
10
10
  ANTHROPIC = auto() # Anthropic implementation
11
11
  OMNI = auto() # OmniLoop implementation
12
12
  OPENAI = auto() # OpenAI implementation
13
+ OLLAMA = auto() # OLLAMA implementation
13
14
  # Add more loop types as needed
14
15
 
15
16
 
"""Base client implementation for Omni providers."""

import logging
from typing import Dict, List, Optional, Any, Tuple

logger = logging.getLogger(__name__)


class BaseOmniClient:
    """Base class for provider-specific clients.

    Concrete providers (OpenAI, Anthropic, Ollama, ...) subclass this and
    override ``run_interleaved``.
    """

    def __init__(self, api_key: Optional[str] = None, model: Optional[str] = None):
        """Initialize base client.

        Args:
            api_key: Optional API key
            model: Optional model name
        """
        self.api_key = api_key
        self.model = model

    async def run_interleaved(
        self, messages: List[Dict[str, Any]], system: str, max_tokens: Optional[int] = None
    ) -> Dict[str, Any]:
        """Run interleaved chat completion.

        Args:
            messages: List of message dicts
            system: System prompt
            max_tokens: Optional max tokens override

        Returns:
            Response dict

        Raises:
            NotImplementedError: Always, in the base class. Previously this
                method had no body and silently returned ``None`` when a
                subclass forgot to override it; failing loudly is safer.
        """
        raise NotImplementedError("Subclasses must implement run_interleaved")
"""Ollama API client implementation."""

import logging
from typing import Any, Dict, List, Optional, Tuple, cast
import asyncio
from httpx import ConnectError, ReadTimeout

from ollama import AsyncClient, Options
from ollama import Message
from .base import BaseOmniClient

logger = logging.getLogger(__name__)


class OllamaClient(BaseOmniClient):
    """Client for making calls to a locally running Ollama server."""

    def __init__(self, api_key: str, model: str, max_retries: int = 3, retry_delay: float = 1.0):
        """Initialize the Ollama client.

        Args:
            api_key: Not used; Ollama runs locally and needs no key. Kept for
                interface compatibility with the other Omni clients.
            model: Ollama model name (e.g. "gemma3:4b-it-q4_K_M")
            max_retries: Maximum number of retries for API calls
            retry_delay: Base delay between retries in seconds

        Raises:
            ValueError: If no model name is provided.
        """
        if not model:
            raise ValueError("Model name must be provided")

        # Populate the base-class contract (api_key / model attributes).
        super().__init__(api_key=api_key, model=model)

        # Default local Ollama endpoint.
        self.client = AsyncClient(
            host="http://localhost:11434",
        )
        self.model: str = model
        self.max_retries = max_retries
        self.retry_delay = retry_delay

    def _convert_message_format(self, system: str, messages: List[Dict[str, Any]]) -> List[Any]:
        """Convert messages from the standard (OpenAI-style) format to Ollama format.

        Args:
            system: System prompt, emitted first as a "system"-role message.
            messages: Messages in standard format.

        Returns:
            Messages in Ollama format.
        """
        ollama_messages: List[Dict[str, Any]] = [{"role": "system", "content": system}]

        for message in messages:
            content_items = message.get("content")
            # Skip messages with empty content
            if not content_items:
                continue
            # Tolerate plain-string content alongside the structured list form;
            # the original indexed [0] and would have taken the first character.
            if isinstance(content_items, str):
                ollama_messages.append({"role": message["role"], "content": content_items})
                continue
            # Convert every content part (the original dropped all parts after
            # the first one).
            for content in content_items:
                part_type = content.get("type", "")
                if part_type == "text":
                    ollama_messages.append(
                        {"role": message["role"], "content": content.get("text", "")}
                    )
                elif part_type == "image_url":
                    data = content.get("image_url", {}).get("url", "")
                    # Strip the data-URI header; Ollama expects raw base64.
                    data = data.removeprefix("data:image/png;base64,")
                    ollama_messages.append(
                        {"role": message["role"], "content": "Use this image", "images": [data]}
                    )

        # Cast the list to the type expected by Ollama's chat API.
        return cast(List[Any], ollama_messages)

    async def run_interleaved(
        self, messages: List[Dict[str, Any]], system: str, max_tokens: int
    ) -> Any:
        """Run model with interleaved conversation format.

        Args:
            messages: List of messages to process
            system: System prompt
            max_tokens: Not used; output length is left to the model options.

        Returns:
            Model response

        Raises:
            RuntimeError: On unexpected errors (with the original exception
                chained as the cause), or after exhausting retries on
                connection errors.
        """
        last_error: Optional[Exception] = None

        for attempt in range(self.max_retries):
            try:
                # Convert messages to Ollama format
                ollama_messages = self._convert_message_format(system, messages)

                # temperature=0 for deterministic output; format="json" because
                # the omni loop parses the reply as JSON.
                return await self.client.chat(
                    model=self.model,
                    options=Options(
                        temperature=0,
                    ),
                    messages=ollama_messages,
                    format="json",
                )

            except (ConnectError, ReadTimeout) as e:
                last_error = e
                logger.warning(
                    f"Connection error on attempt {attempt + 1}/{self.max_retries}: {str(e)}"
                )
                if attempt < self.max_retries - 1:
                    # True exponential backoff; the original scaled linearly
                    # despite the comment claiming exponential.
                    await asyncio.sleep(self.retry_delay * (2 ** attempt))
                    continue

            except Exception as e:
                logger.error(f"Unexpected error in Ollama API call: {str(e)}")
                # Chain the cause so the original traceback is preserved.
                raise RuntimeError(f"Ollama API call failed: {str(e)}") from e

        # If we get here, all retries failed
        raise RuntimeError(
            f"Connection error after {self.max_retries} retries: {str(last_error)}"
        ) from last_error
@@ -4,11 +4,7 @@ import os
4
4
  import logging
5
5
  from typing import Dict, List, Optional, Any
6
6
  import aiohttp
7
- import base64
8
7
  import re
9
- import json
10
- import ssl
11
- import certifi
12
8
  from datetime import datetime
13
9
  from .base import BaseOmniClient
14
10
 
@@ -19,6 +19,7 @@ from computer import Computer
19
19
  from .types import LLMProvider
20
20
  from .clients.openai import OpenAIClient
21
21
  from .clients.anthropic import AnthropicClient
22
+ from .clients.ollama import OllamaClient
22
23
  from .prompts import SYSTEM_PROMPT
23
24
  from .api_handler import OmniAPIHandler
24
25
  from .tools.manager import ToolManager
@@ -135,6 +136,11 @@ class OmniLoop(BaseLoop):
135
136
  api_key=self.api_key,
136
137
  model=self.model,
137
138
  )
139
+ elif self.provider == LLMProvider.OLLAMA:
140
+ self.client = OllamaClient(
141
+ api_key=self.api_key,
142
+ model=self.model,
143
+ )
138
144
  else:
139
145
  raise ValueError(f"Unsupported provider: {self.provider}")
140
146
 
@@ -160,6 +166,11 @@ class OmniLoop(BaseLoop):
160
166
  max_retries=self.max_retries,
161
167
  retry_delay=self.retry_delay,
162
168
  )
169
+ elif self.provider == LLMProvider.OLLAMA:
170
+ self.client = OllamaClient(
171
+ api_key=self.api_key,
172
+ model=self.model,
173
+ )
163
174
  else:
164
175
  raise ValueError(f"Unsupported provider: {self.provider}")
165
176
 
@@ -370,6 +381,13 @@ class OmniLoop(BaseLoop):
370
381
  else:
371
382
  logger.warning("Invalid Anthropic response format")
372
383
  return True, action_screenshot_saved
384
+ elif self.provider == LLMProvider.OLLAMA:
385
+ try:
386
+ raw_text = response["message"]["content"]
387
+ standard_content = [{"type": "text", "text": raw_text}]
388
+ except (KeyError, TypeError, IndexError) as e:
389
+ logger.error(f"Invalid response format: {str(e)}")
390
+ return True, action_screenshot_saved
373
391
  else:
374
392
  # Assume OpenAI or compatible format
375
393
  try:
@@ -11,6 +11,7 @@ class LLMProvider(StrEnum):
11
11
  ANTHROPIC = "anthropic"
12
12
  OMNI = "omni"
13
13
  OPENAI = "openai"
14
+ OLLAMA = "ollama"
14
15
 
15
16
 
16
17
  @dataclass
@@ -35,10 +36,12 @@ Model = LLM
35
36
  PROVIDER_TO_DEFAULT_MODEL: Dict[LLMProvider, str] = {
36
37
  LLMProvider.ANTHROPIC: "claude-3-7-sonnet-20250219",
37
38
  LLMProvider.OPENAI: "gpt-4o",
39
+ LLMProvider.OLLAMA: "gemma3:4b-it-q4_K_M",
38
40
  }
39
41
 
40
42
  # Environment variable names for each provider
41
43
  PROVIDER_TO_ENV_VAR: Dict[LLMProvider, str] = {
42
44
  LLMProvider.ANTHROPIC: "ANTHROPIC_API_KEY",
43
45
  LLMProvider.OPENAI: "OPENAI_API_KEY",
46
+ LLMProvider.OLLAMA: "none",
44
47
  }
@@ -132,6 +132,9 @@ class OpenAIAPIHandler:
132
132
  }
133
133
  ],
134
134
  "input": input_array,
135
+ "reasoning": {
136
+ "generate_summary": "concise",
137
+ },
135
138
  "truncation": "auto",
136
139
  }
137
140
 
@@ -6,7 +6,7 @@ build-backend = "pdm.backend"
6
6
 
7
7
  [project]
8
8
  name = "cua-agent"
9
- version = "0.1.20"
9
+ version = "0.1.22"
10
10
  description = "CUA (Computer Use) Agent for AI-driven computer interaction"
11
11
  readme = "README.md"
12
12
  authors = [
@@ -49,6 +49,20 @@ som = [
49
49
  "dashscope>=1.13.0,<2.0.0",
50
50
  "requests>=2.31.0,<3.0.0",
51
51
  ]
52
+ omni = [
53
+ "torch>=2.2.1",
54
+ "torchvision>=0.17.1",
55
+ "ultralytics>=8.0.0",
56
+ "transformers>=4.38.2",
57
+ "cua-som>=0.1.0,<0.2.0",
58
+ "anthropic>=0.46.0,<0.47.0",
59
+ "boto3>=1.35.81,<2.0.0",
60
+ "openai>=1.14.0,<2.0.0",
61
+ "groq>=0.4.0,<0.5.0",
62
+ "dashscope>=1.13.0,<2.0.0",
63
+ "requests>=2.31.0,<3.0.0",
64
+ "ollama>=0.4.7,<0.5.0",
65
+ ]
52
66
  all = [
53
67
  "torch>=2.2.1",
54
68
  "torchvision>=0.17.1",
@@ -61,6 +75,7 @@ all = [
61
75
  "groq>=0.4.0,<0.5.0",
62
76
  "dashscope>=1.13.0,<2.0.0",
63
77
  "requests>=2.31.0,<3.0.0",
78
+ "ollama>=0.4.7,<0.5.0",
64
79
  ]
65
80
 
66
81
  [tool.pdm]
@@ -84,7 +99,7 @@ target-version = [
84
99
 
85
100
  [tool.ruff]
86
101
  line-length = 100
87
- target-version = "0.1.20"
102
+ target-version = "0.1.22"
88
103
  select = [
89
104
  "E",
90
105
  "F",
@@ -98,7 +113,7 @@ docstring-code-format = true
98
113
 
99
114
  [tool.mypy]
100
115
  strict = true
101
- python_version = "0.1.20"
116
+ python_version = "0.1.22"
102
117
  ignore_missing_imports = true
103
118
  disallow_untyped_defs = true
104
119
  check_untyped_defs = true
File without changes