cua-agent 0.1.0__tar.gz → 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of cua-agent was flagged as possibly problematic by the registry scanner.
Files changed (67)
  1. {cua_agent-0.1.0 → cua_agent-0.1.1}/PKG-INFO +1 -1
  2. cua_agent-0.1.1/README.md +126 -0
  3. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/__init__.py +2 -2
  4. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/core/agent.py +74 -28
  5. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/core/messages.py +15 -0
  6. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/loop.py +5 -1
  7. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/messages.py +3 -0
  8. cua_agent-0.1.1/agent/providers/omni/types.py +53 -0
  9. {cua_agent-0.1.0 → cua_agent-0.1.1}/pyproject.toml +3 -3
  10. cua_agent-0.1.0/README.md +0 -213
  11. cua_agent-0.1.0/agent/providers/omni/types.py +0 -30
  12. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/README.md +0 -0
  13. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/core/README.md +0 -0
  14. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/core/__init__.py +0 -0
  15. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/core/base_agent.py +0 -0
  16. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/core/callbacks.py +0 -0
  17. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/core/computer_agent.py +0 -0
  18. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/core/experiment.py +0 -0
  19. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/core/factory.py +0 -0
  20. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/core/loop.py +0 -0
  21. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/core/tools/__init__.py +0 -0
  22. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/core/tools/base.py +0 -0
  23. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/core/tools/bash.py +0 -0
  24. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/core/tools/collection.py +0 -0
  25. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/core/tools/computer.py +0 -0
  26. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/core/tools/edit.py +0 -0
  27. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/core/tools/manager.py +0 -0
  28. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/__init__.py +0 -0
  29. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/anthropic/__init__.py +0 -0
  30. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/anthropic/api/client.py +0 -0
  31. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/anthropic/api/logging.py +0 -0
  32. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/anthropic/callbacks/manager.py +0 -0
  33. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/anthropic/loop.py +0 -0
  34. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/anthropic/messages/manager.py +0 -0
  35. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/anthropic/prompts.py +0 -0
  36. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/anthropic/tools/__init__.py +0 -0
  37. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/anthropic/tools/base.py +0 -0
  38. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/anthropic/tools/bash.py +0 -0
  39. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/anthropic/tools/collection.py +0 -0
  40. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/anthropic/tools/computer.py +0 -0
  41. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/anthropic/tools/edit.py +0 -0
  42. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/anthropic/tools/manager.py +0 -0
  43. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/anthropic/tools/run.py +0 -0
  44. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/anthropic/types.py +0 -0
  45. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/__init__.py +0 -0
  46. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/callbacks.py +0 -0
  47. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/clients/anthropic.py +0 -0
  48. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/clients/base.py +0 -0
  49. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/clients/groq.py +0 -0
  50. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/clients/openai.py +0 -0
  51. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/clients/utils.py +0 -0
  52. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/experiment.py +0 -0
  53. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/image_utils.py +0 -0
  54. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/parser.py +0 -0
  55. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/prompts.py +0 -0
  56. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/tool_manager.py +0 -0
  57. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/tools/__init__.py +0 -0
  58. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/tools/bash.py +0 -0
  59. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/tools/computer.py +0 -0
  60. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/tools/manager.py +0 -0
  61. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/utils.py +0 -0
  62. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/visualization.py +0 -0
  63. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/types/__init__.py +0 -0
  64. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/types/base.py +0 -0
  65. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/types/messages.py +0 -0
  66. {cua_agent-0.1.0 → cua_agent-0.1.1}/agent/types/tools.py +0 -0
  67. {cua_agent-0.1.0 → cua_agent-0.1.1}/tests/test_agent.py +0 -0
{cua_agent-0.1.0 → cua_agent-0.1.1}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: cua-agent
- Version: 0.1.0
+ Version: 0.1.1
  Summary: CUA (Computer Use) Agent for AI-driven computer interaction
  Author-Email: TryCua <gh@trycua.com>
  Requires-Python: <3.13,>=3.10
cua_agent-0.1.1/README.md ADDED
@@ -0,0 +1,126 @@
+ <div align="center">
+ <h1>
+ <div class="image-wrapper" style="display: inline-block;">
+ <picture>
+ <source media="(prefers-color-scheme: dark)" alt="logo" height="150" srcset="../../img/logo_white.png" style="display: block; margin: auto;">
+ <source media="(prefers-color-scheme: light)" alt="logo" height="150" srcset="../../img/logo_black.png" style="display: block; margin: auto;">
+ <img alt="Shows my svg">
+ </picture>
+ </div>
+
+ [![Python](https://img.shields.io/badge/Python-333333?logo=python&logoColor=white&labelColor=333333)](#)
+ [![macOS](https://img.shields.io/badge/macOS-000000?logo=apple&logoColor=F0F0F0)](#)
+ [![Discord](https://img.shields.io/badge/Discord-%235865F2.svg?&logo=discord&logoColor=white)](https://discord.com/invite/mVnXXpdE85)
+ [![PyPI](https://img.shields.io/pypi/v/cua-computer?color=333333)](https://pypi.org/project/cua-computer/)
+ </h1>
+ </div>
+
+ **Agent** is a Computer Use (CUA) framework for running multi-app agentic workflows targeting macOS and Linux sandbox, supporting local (Ollama) and cloud model providers (OpenAI, Anthropic, Groq, DeepSeek, Qwen). The framework integrates with Microsoft's OmniParser for enhanced UI understanding and interaction.
+
+ ### Get started with Agent
+
+ ```python
+ from agent import ComputerAgent, AgenticLoop, APIProvider
+ from computer import Computer
+
+ computer = Computer(verbosity=logging.INFO)
+
+ agent = ComputerAgent(
+ computer=computer,
+ api_key="<your-anthropic-api-key>",
+ loop_type=AgenticLoop.ANTHROPIC, # or AgenticLoop.OMNI
+ ai_provider=APIProvider.ANTHROPIC,
+ model='claude-3-7-sonnet-20250219',
+ save_trajectory=True,
+ trajectory_dir=str(Path("trajectories") / datetime.now().strftime("%Y%m%d_%H%M%S")),
+ only_n_most_recent_images=3,
+ verbosity=logging.INFO,
+ )
+
+ tasks = [
+ """
+ Please help me with the following task:
+ 1. Open Safari browser
+ 2. Go to Wikipedia.org
+ 3. Search for "Claude AI"
+ 4. Summarize the main points you find about Claude AI
+ """
+ ]
+
+ async with agent:
+ for i, task in enumerate(tasks, 1):
+ print(f"\nExecuting task {i}/{len(tasks)}: {task}")
+ async for result in agent.run(task):
+ print(result)
+ print(f"Task {i} completed")
+ ```
+
+ ## Install
+
+ ### cua-agent
+
+ ```bash
+
+ pip install cua-agent[all]
+
+ # or install specific loop providers
+ pip install cua-agent[anthropic]
+ pip install cua-agent[omni]
+
+
+ ```
+
+ ## Features
+
+ ### OmniParser Integration
+ - Enhanced UI understanding with element detection
+ - Automatic bounding box detection for UI elements
+ - Improved accuracy for complex UI interactions
+ - Support for icon and text element recognition
+
+ ### Basic Computer Control
+ - Direct keyboard and mouse control
+ - Window and application management
+ - Screenshot capabilities
+ - Basic UI element detection
+
+ ### Provider Support
+ - OpenAI (GPT-4V) - Recommended for OmniParser integration
+ - Anthropic (Claude) - Strong general performance
+ - Groq - Fast inference with Llama models
+ - DeepSeek - Alternative model provider
+ - Qwen - Alibaba's multimodal model
+
+ ## Run
+
+ Refer to these notebooks for step-by-step guides on how to use the Computer-Use Agent (CUA):
+
+ - [Agent Notebook](../../notebooks/agent_nb.ipynb) - Complete examples and workflows
+
+ ## Components
+
+ The library consists of several components:
+
+ - **Core**
+ - `ComputerAgent`: Unified agent class supporting multiple loop types
+ - `BaseComputerAgent`: Abstract base class for computer agents
+
+ - **Providers**
+ - `Anthropic`: Implementation for Anthropic Claude models
+ - `Omni`: Implementation for multiple providers (OpenAI, Groq, etc.)
+
+ - **Loops**
+ - `AnthropicLoop`: Loop implementation for Anthropic
+ - `OmniLoop`: Generic loop supporting multiple providers
+
+ ## Configuration
+
+ The agent can be configured with various parameters:
+
+ - **loop_type**: The type of loop to use (ANTHROPIC or OMNI)
+ - **provider**: AI provider to use with the loop
+ - **model**: The AI model to use
+ - **save_trajectory**: Whether to save screenshots and logs
+ - **only_n_most_recent_images**: Only keep a specific number of recent images
+
+ See the [Core README](./agent/core/README.md) for more details on the unified agent.
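Note that the quickstart in the new README above still passes `ai_provider=APIProvider.ANTHROPIC`, even though later hunks in this diff remove that argument from `ComputerAgent`, and it relies on `logging`, `Path`, and `datetime` imports it never declares. A minimal, untested sketch of the same flow against the 0.1.1 signature introduced in this release (assumptions: the package-root re-exports shown in agent/__init__.py below, and an Anthropic API key):

```python
# Hedged sketch, not the package's documented example: imports are filled in,
# ai_provider is dropped, and the model is passed as an LLM object per the
# 0.1.1 ComputerAgent signature shown later in this diff.
import asyncio
import logging
from datetime import datetime
from pathlib import Path

from agent import ComputerAgent, AgenticLoop, LLM, LLMProvider
from computer import Computer


async def main() -> None:
    computer = Computer(verbosity=logging.INFO)

    agent = ComputerAgent(
        computer=computer,
        loop_type=AgenticLoop.ANTHROPIC,  # or AgenticLoop.OMNI
        model=LLM(provider=LLMProvider.ANTHROPIC, name="claude-3-7-sonnet-20250219"),
        api_key="<your-anthropic-api-key>",
        save_trajectory=True,
        trajectory_dir=str(Path("trajectories") / datetime.now().strftime("%Y%m%d_%H%M%S")),
        only_n_most_recent_images=3,
        verbosity=logging.INFO,
    )

    task = "Open Safari, go to Wikipedia.org, search for 'Claude AI', and summarize the main points."

    async with agent:
        async for result in agent.run(task):
            print(result)


if __name__ == "__main__":
    asyncio.run(main())
```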
{cua_agent-0.1.0 → cua_agent-0.1.1}/agent/__init__.py
@@ -5,6 +5,6 @@ __version__ = "0.1.0"
  from .core.factory import AgentFactory
  from .core.agent import ComputerAgent
  from .types.base import Provider, AgenticLoop
- from .providers.omni.types import APIProvider
+ from .providers.omni.types import LLMProvider, LLM, Model, LLMModel, APIProvider

- __all__ = ["AgentFactory", "Provider", "ComputerAgent", "AgenticLoop", "APIProvider"]
+ __all__ = ["AgentFactory", "Provider", "ComputerAgent", "AgenticLoop", "LLMProvider", "LLM", "Model", "LLMModel", "APIProvider"]
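Per the expanded `__all__` above, the provider and model types appear to be importable straight from the package root in 0.1.1; for example:

```python
# Names taken from __all__ above; APIProvider, Model, and LLMModel are kept as
# backward-compatible aliases of LLMProvider and LLM (see the new types module below).
from agent import AgentFactory, Provider, ComputerAgent, AgenticLoop, LLMProvider, LLM, Model, LLMModel, APIProvider
```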
{cua_agent-0.1.0 → cua_agent-0.1.1}/agent/core/agent.py
@@ -3,7 +3,7 @@
  import os
  import logging
  import asyncio
- from typing import Any, AsyncGenerator, Dict, List, Optional, TYPE_CHECKING
+ from typing import Any, AsyncGenerator, Dict, List, Optional, TYPE_CHECKING, Union, cast
  from datetime import datetime

  from computer import Computer
@@ -17,23 +17,23 @@ if TYPE_CHECKING:
  from ..providers.omni.loop import OmniLoop
  from ..providers.omni.parser import OmniParser

- # Import the APIProvider enum without importing the whole module
- from ..providers.omni.types import APIProvider
+ # Import the provider types
+ from ..providers.omni.types import LLMProvider, LLM, Model, LLMModel, APIProvider

  logger = logging.getLogger(__name__)

  # Default models for different providers
  DEFAULT_MODELS = {
- APIProvider.OPENAI: "gpt-4o",
- APIProvider.ANTHROPIC: "claude-3-7-sonnet-20250219",
- APIProvider.GROQ: "llama3-70b-8192",
+ LLMProvider.OPENAI: "gpt-4o",
+ LLMProvider.ANTHROPIC: "claude-3-7-sonnet-20250219",
+ LLMProvider.GROQ: "llama3-70b-8192",
  }

  # Map providers to their environment variable names
  ENV_VARS = {
- APIProvider.OPENAI: "OPENAI_API_KEY",
- APIProvider.GROQ: "GROQ_API_KEY",
- APIProvider.ANTHROPIC: "ANTHROPIC_API_KEY",
+ LLMProvider.OPENAI: "OPENAI_API_KEY",
+ LLMProvider.GROQ: "GROQ_API_KEY",
+ LLMProvider.ANTHROPIC: "ANTHROPIC_API_KEY",
  }


@@ -48,9 +48,8 @@ class ComputerAgent(BaseComputerAgent):
  self,
  computer: Computer,
  loop_type: AgenticLoop = AgenticLoop.OMNI,
- ai_provider: APIProvider = APIProvider.OPENAI,
+ model: Optional[Union[LLM, Dict[str, str], str]] = None,
  api_key: Optional[str] = None,
- model: Optional[str] = None,
  save_trajectory: bool = True,
  trajectory_dir: Optional[str] = "trajectories",
  only_n_most_recent_images: Optional[int] = None,
@@ -63,9 +62,12 @@ class ComputerAgent(BaseComputerAgent):
  Args:
  computer: Computer instance to control
  loop_type: The type of loop to use (Anthropic or Omni)
- ai_provider: AI provider to use (required for Cua loop)
+ model: LLM configuration. Can be:
+ - LLM object with provider and name
+ - Dict with 'provider' and 'name' keys
+ - String with model name (defaults to OpenAI provider)
+ - None (defaults based on loop_type)
  api_key: Optional API key (will use environment variable if not provided)
- model: Optional model name (will use provider default if not specified)
  save_trajectory: Whether to save screenshots and logs
  trajectory_dir: Directory to save trajectories (defaults to "trajectories")
  only_n_most_recent_images: Limit history to N most recent images
@@ -88,7 +90,6 @@ class ComputerAgent(BaseComputerAgent):
  )

  self.loop_type = loop_type
- self.provider = ai_provider
  self.save_trajectory = save_trajectory
  self.trajectory_dir = trajectory_dir
  self.only_n_most_recent_images = only_n_most_recent_images
@@ -98,14 +99,19 @@ class ComputerAgent(BaseComputerAgent):
  # Configure logging based on verbosity
  self._configure_logging(verbosity)

+ # Process model configuration
+ self.model_config = self._process_model_config(model, loop_type)
+
  # Get API key from environment if not provided
  if api_key is None:
  env_var = (
- ENV_VARS.get(ai_provider) if loop_type == AgenticLoop.OMNI else "ANTHROPIC_API_KEY"
+ ENV_VARS.get(self.model_config.provider)
+ if loop_type == AgenticLoop.OMNI
+ else "ANTHROPIC_API_KEY"
  )
  if not env_var:
  raise ValueError(
- f"Unsupported provider: {ai_provider}. Please use one of: {list(ENV_VARS.keys())}"
+ f"Unsupported provider: {self.model_config.provider}. Please use one of: {list(ENV_VARS.keys())}"
  )

  api_key = os.environ.get(env_var)
@@ -119,17 +125,51 @@ class ComputerAgent(BaseComputerAgent):
  )
  self.api_key = api_key

- # Set model based on provider if not specified
- if model is None:
- if loop_type == AgenticLoop.OMNI:
- self.model = DEFAULT_MODELS[ai_provider]
- else: # Anthropic loop
- self.model = DEFAULT_MODELS[APIProvider.ANTHROPIC]
- else:
- self.model = model
-
  # Initialize the appropriate loop based on loop_type
  self.loop = self._init_loop()
+
+ def _process_model_config(
+ self, model_input: Optional[Union[LLM, Dict[str, str], str]], loop_type: AgenticLoop
+ ) -> LLM:
+ """Process and normalize model configuration.
+
+ Args:
+ model_input: Input model configuration (LLM, dict, string, or None)
+ loop_type: The loop type being used
+
+ Returns:
+ Normalized LLM instance
+ """
+ # Handle case where model_input is None
+ if model_input is None:
+ # Use Anthropic for Anthropic loop, OpenAI for Omni loop
+ default_provider = (
+ LLMProvider.ANTHROPIC if loop_type == AgenticLoop.ANTHROPIC else LLMProvider.OPENAI
+ )
+ return LLM(provider=default_provider)
+
+ # Handle case where model_input is already a LLM or one of its aliases
+ if isinstance(model_input, (LLM, Model, LLMModel)):
+ return model_input
+
+ # Handle case where model_input is a dict
+ if isinstance(model_input, dict):
+ provider = model_input.get("provider", LLMProvider.OPENAI)
+ if isinstance(provider, str):
+ provider = LLMProvider(provider)
+ return LLM(
+ provider=provider,
+ name=model_input.get("name")
+ )
+
+ # Handle case where model_input is a string (model name)
+ if isinstance(model_input, str):
+ default_provider = (
+ LLMProvider.ANTHROPIC if loop_type == AgenticLoop.ANTHROPIC else LLMProvider.OPENAI
+ )
+ return LLM(provider=default_provider, name=model_input)
+
+ raise ValueError(f"Unsupported model configuration: {model_input}")

  def _configure_logging(self, verbosity: int):
  """Configure logging based on verbosity level."""
@@ -162,9 +202,12 @@ class ComputerAgent(BaseComputerAgent):
  if self.loop_type == AgenticLoop.ANTHROPIC:
  from ..providers.anthropic.loop import AnthropicLoop

+ # Ensure we always have a valid model name
+ model_name = self.model_config.name or DEFAULT_MODELS[LLMProvider.ANTHROPIC]
+
  return AnthropicLoop(
  api_key=self.api_key,
- model=self.model,
+ model=model_name,
  computer=self.computer,
  save_trajectory=self.save_trajectory,
  base_dir=self.trajectory_dir,
@@ -176,10 +219,13 @@ class ComputerAgent(BaseComputerAgent):
  if "parser" not in self._kwargs:
  self._kwargs["parser"] = OmniParser()

+ # Ensure we always have a valid model name
+ model_name = self.model_config.name or DEFAULT_MODELS[self.model_config.provider]
+
  return OmniLoop(
- provider=self.provider,
+ provider=self.model_config.provider,
  api_key=self.api_key,
- model=self.model,
+ model=model_name,
  computer=self.computer,
  save_trajectory=self.save_trajectory,
  base_dir=self.trajectory_dir,
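Taken together, the `_process_model_config` hunk above suggests four equivalent ways for callers to specify the model in 0.1.1. A hedged sketch of those forms (assumes `OPENAI_API_KEY` is set and a `Computer` instance can be created; not taken verbatim from the package docs):

```python
from agent import ComputerAgent, AgenticLoop, LLM, LLMProvider
from computer import Computer

computer = Computer()

# Each of these should be normalized to an LLM(provider=..., name=...) internally.
agent_a = ComputerAgent(computer=computer, model=LLM(provider=LLMProvider.OPENAI, name="gpt-4o"))
agent_b = ComputerAgent(computer=computer, model={"provider": "openai", "name": "gpt-4o"})
agent_c = ComputerAgent(computer=computer, model="gpt-4o")  # bare string: provider inferred from loop_type
agent_d = ComputerAgent(computer=computer, model=None)      # None: OpenAI default for OMNI, Anthropic for ANTHROPIC
```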
{cua_agent-0.1.0 → cua_agent-0.1.1}/agent/core/messages.py
@@ -37,6 +37,17 @@ class BaseMessageManager:
  if self.image_retention_config.min_removal_threshold < 1:
  raise ValueError("min_removal_threshold must be at least 1")

+ # Track provider for message formatting
+ self.provider = "openai" # Default provider
+
+ def set_provider(self, provider: str) -> None:
+ """Set the current provider to format messages for.
+
+ Args:
+ provider: Provider name (e.g., 'openai', 'anthropic')
+ """
+ self.provider = provider.lower()
+
  def prepare_messages(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
  """Prepare messages by applying image retention and caching as configured.

@@ -96,6 +107,10 @@ class BaseMessageManager:
  Args:
  messages: Messages to inject caching into
  """
+ # Only apply cache_control for Anthropic API, not OpenAI
+ if self.provider != "anthropic":
+ return
+
  # Default to caching last 3 turns
  turns_to_cache = 3
  for message in reversed(messages):
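The new guard means Anthropic-style `cache_control` markers are only injected once the manager has been told it is talking to Anthropic. A standalone illustration of that pattern (a toy stand-in, not the package's actual `BaseMessageManager`, which takes additional configuration):

```python
from typing import Any, Dict, List


class ProviderAwareCaching:
    """Toy stand-in for the provider-gated caching added in this release."""

    def __init__(self) -> None:
        self.provider = "openai"  # default, as in the diff

    def set_provider(self, provider: str) -> None:
        self.provider = provider.lower()

    def inject_caching(self, messages: List[Dict[str, Any]]) -> None:
        if self.provider != "anthropic":
            return  # cache_control is Anthropic-specific; skip for OpenAI and others
        for message in messages[-3:]:  # cache the last few turns
            for block in message.get("content", []):
                if isinstance(block, dict):
                    block["cache_control"] = {"type": "ephemeral"}


manager = ProviderAwareCaching()
manager.set_provider("Anthropic")
history = [{"role": "user", "content": [{"type": "text", "text": "hello"}]}]
manager.inject_caching(history)
print(history[0]["content"][0])  # cache_control present only for the Anthropic provider
```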
{cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/loop.py
@@ -219,9 +219,13 @@ class OmniLoop(BaseLoop):
  if self.client is None:
  raise RuntimeError("Failed to initialize client")

+ # Set the provider in message manager based on current provider
+ provider_name = str(self.provider).split(".")[-1].lower() # Extract name from enum
+ self.message_manager.set_provider(provider_name)
+
  # Apply image retention and prepare messages
  # This will limit the number of images based on only_n_most_recent_images
- prepared_messages = self.message_manager.prepare_messages(messages.copy())
+ prepared_messages = self.message_manager.get_formatted_messages(provider_name)

  # Filter out system messages for Anthropic
  if self.provider == APIProvider.ANTHROPIC:
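The provider name handed to `set_provider` is derived from the enum value; since `LLMProvider` is a `StrEnum` its `str()` is already the lowercase value, and the `split(".")` also covers repr-style strings such as `"LLMProvider.OPENAI"`. A quick check under that assumption:

```python
from agent.providers.omni.types import LLMProvider

provider = LLMProvider.OPENAI
provider_name = str(provider).split(".")[-1].lower()
print(provider_name)  # "openai"
```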
{cua_agent-0.1.0 → cua_agent-0.1.1}/agent/providers/omni/messages.py
@@ -103,6 +103,9 @@ class OmniMessageManager(BaseMessageManager):
  Returns:
  List of formatted messages
  """
+ # Set the provider for message formatting
+ self.set_provider(provider)
+
  if provider == "anthropic":
  return self._format_for_anthropic()
  elif provider == "openai":
cua_agent-0.1.1/agent/providers/omni/types.py ADDED
@@ -0,0 +1,53 @@
+ """Type definitions for the Omni provider."""
+
+ from enum import StrEnum
+ from typing import Dict, Optional
+ from dataclasses import dataclass
+
+
+ class LLMProvider(StrEnum):
+ """Supported LLM providers."""
+
+ ANTHROPIC = "anthropic"
+ OPENAI = "openai"
+ GROQ = "groq"
+ QWEN = "qwen"
+
+
+ # For backward compatibility
+ APIProvider = LLMProvider
+
+
+ @dataclass
+ class LLM:
+ """Configuration for LLM model and provider."""
+
+ provider: LLMProvider
+ name: Optional[str] = None
+
+ def __post_init__(self):
+ """Set default model name if not provided."""
+ if self.name is None:
+ self.name = PROVIDER_TO_DEFAULT_MODEL.get(self.provider)
+
+
+ # For backward compatibility
+ LLMModel = LLM
+ Model = LLM
+
+
+ # Default models for each provider
+ PROVIDER_TO_DEFAULT_MODEL: Dict[LLMProvider, str] = {
+ LLMProvider.ANTHROPIC: "claude-3-7-sonnet-20250219",
+ LLMProvider.OPENAI: "gpt-4o",
+ LLMProvider.GROQ: "deepseek-r1-distill-llama-70b",
+ LLMProvider.QWEN: "qwen2.5-vl-72b-instruct",
+ }
+
+ # Environment variable names for each provider
+ PROVIDER_TO_ENV_VAR: Dict[LLMProvider, str] = {
+ LLMProvider.ANTHROPIC: "ANTHROPIC_API_KEY",
+ LLMProvider.OPENAI: "OPENAI_API_KEY",
+ LLMProvider.GROQ: "GROQ_API_KEY",
+ LLMProvider.QWEN: "QWEN_API_KEY",
+ }
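Based on the module above, constructing an `LLM` without a name should fall back to the provider's default model via `__post_init__`, and the old `APIProvider`/`Model`/`LLMModel` names remain as aliases. A small sanity check under those assumptions:

```python
from agent.providers.omni.types import (
    APIProvider,
    LLM,
    LLMModel,
    LLMProvider,
    Model,
    PROVIDER_TO_DEFAULT_MODEL,
)

llm = LLM(provider=LLMProvider.GROQ)  # name omitted
assert llm.name == PROVIDER_TO_DEFAULT_MODEL[LLMProvider.GROQ]  # "deepseek-r1-distill-llama-70b"

# Backward-compatible aliases point at the same objects.
assert APIProvider is LLMProvider
assert Model is LLM and LLMModel is LLM
```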
{cua_agent-0.1.0 → cua_agent-0.1.1}/pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "pdm.backend"

  [project]
  name = "cua-agent"
- version = "0.1.0"
+ version = "0.1.1"
  description = "CUA (Computer Use) Agent for AI-driven computer interaction"
  authors = [
  { name = "TryCua", email = "gh@trycua.com" },
@@ -78,7 +78,7 @@ target-version = [

  [tool.ruff]
  line-length = 100
- target-version = "0.1.0"
+ target-version = "0.1.1"
  select = [
  "E",
  "F",
@@ -92,7 +92,7 @@ docstring-code-format = true

  [tool.mypy]
  strict = true
- python_version = "0.1.0"
+ python_version = "0.1.1"
  ignore_missing_imports = true
  disallow_untyped_defs = true
  check_untyped_defs = true
cua_agent-0.1.0/README.md DELETED
@@ -1,213 +0,0 @@
- <div align="center">
- <h1>
- <div class="image-wrapper" style="display: inline-block;">
- <picture>
- <source media="(prefers-color-scheme: dark)" alt="logo" height="150" srcset="../../img/logo_white.png" style="display: block; margin: auto;">
- <source media="(prefers-color-scheme: light)" alt="logo" height="150" srcset="../../img/logo_black.png" style="display: block; margin: auto;">
- <img alt="Shows my svg">
- </picture>
- </div>
-
- [![Python](https://img.shields.io/badge/Python-333333?logo=python&logoColor=white&labelColor=333333)](#)
- [![macOS](https://img.shields.io/badge/macOS-000000?logo=apple&logoColor=F0F0F0)](#)
- [![Discord](https://img.shields.io/badge/Discord-%235865F2.svg?&logo=discord&logoColor=white)](https://discord.com/invite/mVnXXpdE85)
- [![PyPI](https://img.shields.io/pypi/v/cua-computer?color=333333)](https://pypi.org/project/cua-computer/)
- </h1>
- </div>
-
- **Agent** is a Computer Use (CUA) framework for running multi-app agentic workflows targeting macOS and Linux sandbox, supporting local (Ollama) and cloud model providers (OpenAI, Anthropic, Groq, DeepSeek, Qwen). The framework integrates with Microsoft's OmniParser for enhanced UI understanding and interaction.
-
- ### Get started with Agent
-
- There are two ways to use the agent: with OmniParser for enhanced UI understanding (recommended) or with basic computer control.
-
- #### Option 1: With OmniParser (Recommended)
-
- <div align="center">
- <img src="../../img/agent.png"/>
- </div>
-
- ```python
- from agent.providers.omni import OmniComputerAgent, APIProvider
-
- # Set your API key
- export OPENAI_API_KEY="your-openai-api-key"
-
- # Initialize agent with OmniParser for enhanced UI understanding
- agent = OmniComputerAgent(
- provider=APIProvider.OPENAI,
- model="gpt-4o",
- start_omniparser=True # Automatically starts OmniParser server
- )
-
- task = """
- 1. Search the ai-gradio repo on GitHub.
- 2. Clone it to the desktop.
- 3. Open the repo with the Cursor app.
- 4. Work with Cursor to add a new provider for Cua.
- """
-
- async with agent: # Ensures proper cleanup
- async for result in agent.run(task):
- print(result)
- ```
-
- #### Option 2: Basic Computer Control
-
- ```python
- from agent.computer_agent import ComputerAgent
- from agent.base.types import Provider
-
- # Set your API key (supports any provider)
- export OPENAI_API_KEY="your-openai-api-key" # or other provider keys
-
- # Initialize basic agent
- agent = ComputerAgent(
- provider=Provider.OPENAI, # or ANTHROPIC, GROQ, etc.
- )
-
- task = """
- 1. Open Chrome and navigate to github.com
- 2. Search for 'trycua/cua'
- 3. Star the repository
- """
-
- async with agent:
- async for result in agent.run(task):
- print(result)
- ```
-
- ## Install
-
- ### cua-agent
-
- ```bash
- # Basic installation with Anthropic
- pip install cua-agent[anthropic]
-
- # Install with OmniParser (recommended)
- # Includes all provider dependencies (OpenAI, Anthropic, etc.)
- pip install cua-agent[omni]
-
- # Install with all features and providers
- pip install cua-agent[all]
- ```
-
- ## Features
-
- ### OmniParser Integration
- - Enhanced UI understanding with element detection
- - Automatic bounding box detection for UI elements
- - Improved accuracy for complex UI interactions
- - Support for icon and text element recognition
-
- ### Basic Computer Control
- - Direct keyboard and mouse control
- - Window and application management
- - Screenshot capabilities
- - Basic UI element detection
-
- ### Provider Support
- - OpenAI (GPT-4V) - Recommended for OmniParser integration
- - Anthropic (Claude) - Strong general performance
- - Groq - Fast inference with Llama models
- - DeepSeek - Alternative model provider
- - Qwen - Alibaba's multimodal model
-
- ## Run
-
- Refer to these notebooks for step-by-step guides on how to use the Computer-Use Agent (CUA):
-
- - [Getting Started with OmniParser](../../notebooks/omniparser_nb.ipynb) - Enhanced UI understanding
- - [Basic Computer Control](../../notebooks/basic_agent_nb.ipynb) - Simple computer interactions
- - [Advanced Usage](../../notebooks/agent_nb.ipynb) - Complete examples and workflows
-
- # Computer Agent Library
-
- A Python library for controlling computer interactions with AI agents.
-
- ## Introduction
-
- This library provides a unified interface for AI-powered computer interaction agents, allowing applications to automate UI interactions through various AI providers.
-
- ## Key Features
-
- - **Unified Agent**: Single `ComputerAgent` class with configurable loop types
- - **Multiple AI providers**: Support for OpenAI, Anthropic, Groq, and other providers
- - **Screen analysis**: Intelligent screen parsing and element identification
- - **Tool execution**: Execute tools and commands to interact with the computer
- - **Trajectory saving**: Option to save screenshots and logs for debugging and analysis
-
- ## Installation
-
- To install the library along with its dependencies:
-
- ```bash
- pip install -e .
- ```
-
- ## Usage
-
- Here's a simple example of how to use the ComputerAgent:
-
- ```python
- import asyncio
- from computer import Computer
- from agent.core.agent import ComputerAgent
- from agent.types.base import AgenticLoop
- from agent.providers.omni.types import APIProvider
-
- async def main():
- # Initialize the computer interface
- computer = Computer()
-
- # Create a computer agent
- agent = ComputerAgent(
- computer=computer,
- loop_type=AgenticLoop.OMNI, # Use OMNI loop
- provider=APIProvider.OPENAI, # With OpenAI provider
- model="gpt-4o", # Specify the model
- save_trajectory=True, # Save logs and screenshots
- )
-
- # Use the agent with a context manager
- async with agent:
- # Run a task
- async for result in agent.run("Open Safari and navigate to github.com"):
- # Process the result
- title = result["metadata"].get("title", "Screen Analysis")
- content = result["content"]
- print(f"\n{title}")
- print(content)
-
- if __name__ == "__main__":
- asyncio.run(main())
- ```
-
- ## Components
-
- The library consists of several components:
-
- - **Core**
- - `ComputerAgent`: Unified agent class supporting multiple loop types
- - `BaseComputerAgent`: Abstract base class for computer agents
-
- - **Providers**
- - `Anthropic`: Implementation for Anthropic Claude models
- - `Omni`: Implementation for multiple providers (OpenAI, Groq, etc.)
-
- - **Loops**
- - `AnthropicLoop`: Loop implementation for Anthropic
- - `OmniLoop`: Generic loop supporting multiple providers
-
- ## Configuration
-
- The agent can be configured with various parameters:
-
- - **loop_type**: The type of loop to use (ANTHROPIC or OMNI)
- - **provider**: AI provider to use with the loop
- - **model**: The AI model to use
- - **save_trajectory**: Whether to save screenshots and logs
- - **only_n_most_recent_images**: Only keep a specific number of recent images
-
- See the [Core README](./agent/core/README.md) for more details on the unified agent.
cua_agent-0.1.0/agent/providers/omni/types.py DELETED
@@ -1,30 +0,0 @@
- """Type definitions for the Omni provider."""
-
- from enum import StrEnum
- from typing import Dict
-
-
- class APIProvider(StrEnum):
- """Supported API providers."""
-
- ANTHROPIC = "anthropic"
- OPENAI = "openai"
- GROQ = "groq"
- QWEN = "qwen"
-
-
- # Default models for each provider
- PROVIDER_TO_DEFAULT_MODEL: Dict[APIProvider, str] = {
- APIProvider.ANTHROPIC: "claude-3-7-sonnet-20250219",
- APIProvider.OPENAI: "gpt-4o",
- APIProvider.GROQ: "deepseek-r1-distill-llama-70b",
- APIProvider.QWEN: "qwen2.5-vl-72b-instruct",
- }
-
- # Environment variable names for each provider
- PROVIDER_TO_ENV_VAR: Dict[APIProvider, str] = {
- APIProvider.ANTHROPIC: "ANTHROPIC_API_KEY",
- APIProvider.OPENAI: "OPENAI_API_KEY",
- APIProvider.GROQ: "GROQ_API_KEY",
- APIProvider.QWEN: "QWEN_API_KEY",
- }
The remaining files listed above are unchanged between 0.1.0 and 0.1.1.