cua-agent 0.1.21__tar.gz → 0.1.23__tar.gz
This diff shows the content changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of cua-agent might be problematic.
- {cua_agent-0.1.21 → cua_agent-0.1.23}/PKG-INFO +67 -3
- {cua_agent-0.1.21 → cua_agent-0.1.23}/README.md +47 -2
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/__init__.py +1 -1
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/core/agent.py +9 -3
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/core/factory.py +3 -5
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/core/provider_config.py +5 -1
- cua_agent-0.1.23/agent/core/types.py +103 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/omni/__init__.py +1 -1
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/omni/clients/base.py +8 -17
- cua_agent-0.1.23/agent/providers/omni/clients/oaicompat.py +177 -0
- cua_agent-0.1.23/agent/providers/omni/clients/ollama.py +122 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/omni/clients/openai.py +0 -4
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/omni/loop.py +43 -1
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/omni/tools/manager.py +1 -1
- cua_agent-0.1.23/agent/ui/__init__.py +1 -0
- cua_agent-0.1.23/agent/ui/gradio/__init__.py +21 -0
- cua_agent-0.1.23/agent/ui/gradio/app.py +872 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/pyproject.toml +24 -3
- cua_agent-0.1.21/agent/core/README.md +0 -101
- cua_agent-0.1.21/agent/core/types.py +0 -45
- cua_agent-0.1.21/agent/providers/omni/types.py +0 -44
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/core/__init__.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/core/base.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/core/callbacks.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/core/experiment.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/core/messages.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/core/telemetry.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/core/tools/__init__.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/core/tools/base.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/core/tools/bash.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/core/tools/collection.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/core/tools/computer.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/core/tools/edit.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/core/tools/manager.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/core/tools.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/core/visualization.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/__init__.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/anthropic/__init__.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/anthropic/api/client.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/anthropic/api/logging.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/anthropic/api_handler.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/anthropic/callbacks/__init__.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/anthropic/callbacks/manager.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/anthropic/loop.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/anthropic/prompts.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/anthropic/response_handler.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/anthropic/tools/__init__.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/anthropic/tools/base.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/anthropic/tools/bash.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/anthropic/tools/collection.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/anthropic/tools/computer.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/anthropic/tools/edit.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/anthropic/tools/manager.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/anthropic/tools/run.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/anthropic/types.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/anthropic/utils.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/omni/api_handler.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/omni/clients/anthropic.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/omni/clients/utils.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/omni/image_utils.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/omni/parser.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/omni/prompts.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/omni/tools/__init__.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/omni/tools/base.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/omni/tools/bash.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/omni/tools/computer.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/omni/utils.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/openai/__init__.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/openai/api_handler.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/openai/loop.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/openai/response_handler.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/openai/tools/__init__.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/openai/tools/base.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/openai/tools/computer.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/openai/tools/manager.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/openai/types.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/providers/openai/utils.py +0 -0
- {cua_agent-0.1.21 → cua_agent-0.1.23}/agent/telemetry.py +0 -0
````diff
--- cua_agent-0.1.21/PKG-INFO
+++ cua_agent-0.1.23/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cua-agent
-Version: 0.1.21
+Version: 0.1.23
 Summary: CUA (Computer Use) Agent for AI-driven computer interaction
 Author-Email: TryCua <gh@trycua.com>
 Requires-Python: <3.13,>=3.10
@@ -21,6 +21,9 @@ Requires-Dist: boto3<2.0.0,>=1.35.81; extra == "anthropic"
 Provides-Extra: openai
 Requires-Dist: openai<2.0.0,>=1.14.0; extra == "openai"
 Requires-Dist: httpx<0.29.0,>=0.27.0; extra == "openai"
+Provides-Extra: ui
+Requires-Dist: gradio<6.0.0,>=5.23.3; extra == "ui"
+Requires-Dist: python-dotenv<2.0.0,>=1.0.1; extra == "ui"
 Provides-Extra: som
 Requires-Dist: torch>=2.2.1; extra == "som"
 Requires-Dist: torchvision>=0.17.1; extra == "som"
@@ -33,6 +36,19 @@ Requires-Dist: openai<2.0.0,>=1.14.0; extra == "som"
 Requires-Dist: groq<0.5.0,>=0.4.0; extra == "som"
 Requires-Dist: dashscope<2.0.0,>=1.13.0; extra == "som"
 Requires-Dist: requests<3.0.0,>=2.31.0; extra == "som"
+Provides-Extra: omni
+Requires-Dist: torch>=2.2.1; extra == "omni"
+Requires-Dist: torchvision>=0.17.1; extra == "omni"
+Requires-Dist: ultralytics>=8.0.0; extra == "omni"
+Requires-Dist: transformers>=4.38.2; extra == "omni"
+Requires-Dist: cua-som<0.2.0,>=0.1.0; extra == "omni"
+Requires-Dist: anthropic<0.47.0,>=0.46.0; extra == "omni"
+Requires-Dist: boto3<2.0.0,>=1.35.81; extra == "omni"
+Requires-Dist: openai<2.0.0,>=1.14.0; extra == "omni"
+Requires-Dist: groq<0.5.0,>=0.4.0; extra == "omni"
+Requires-Dist: dashscope<2.0.0,>=1.13.0; extra == "omni"
+Requires-Dist: requests<3.0.0,>=2.31.0; extra == "omni"
+Requires-Dist: ollama<0.5.0,>=0.4.7; extra == "omni"
 Provides-Extra: all
 Requires-Dist: torch>=2.2.1; extra == "all"
 Requires-Dist: torchvision>=0.17.1; extra == "all"
@@ -45,6 +61,9 @@ Requires-Dist: openai<2.0.0,>=1.14.0; extra == "all"
 Requires-Dist: groq<0.5.0,>=0.4.0; extra == "all"
 Requires-Dist: dashscope<2.0.0,>=1.13.0; extra == "all"
 Requires-Dist: requests<3.0.0,>=2.31.0; extra == "all"
+Requires-Dist: ollama<0.5.0,>=0.4.7; extra == "all"
+Requires-Dist: gradio<6.0.0,>=5.23.3; extra == "all"
+Requires-Dist: python-dotenv<2.0.0,>=1.0.1; extra == "all"
 Description-Content-Type: text/markdown
 
 <div align="center">
@@ -80,7 +99,8 @@ pip install "cua-agent[all]"
 # or install specific loop providers
 pip install "cua-agent[openai]" # OpenAI Cua Loop
 pip install "cua-agent[anthropic]" # Anthropic Cua Loop
-pip install "cua-agent[omni]" # Cua Loop based on OmniParser
+pip install "cua-agent[omni]" # Cua Loop based on OmniParser (includes Ollama for local models)
+pip install "cua-agent[ui]" # Gradio UI for the agent
 ```
 
 ## Run
@@ -92,6 +112,12 @@ async with Computer() as macos_computer:
         computer=macos_computer,
         loop=AgentLoop.OPENAI,
         model=LLM(provider=LLMProvider.OPENAI)
+        # or
+        # loop=AgentLoop.ANTHROPIC,
+        # model=LLM(provider=LLMProvider.ANTHROPIC)
+        # or
+        # loop=AgentLoop.OMNI,
+        # model=LLM(provider=LLMProvider.OLLAMA, model="gemma3")
     )
 
     tasks = [
@@ -115,6 +141,44 @@ Refer to these notebooks for step-by-step guides on how to use the Computer-Use
 
 - [Agent Notebook](../../notebooks/agent_nb.ipynb) - Complete examples and workflows
 
+## Using the Gradio UI
+
+The agent includes a Gradio-based user interface for easy interaction. To use it:
+
+```bash
+# Install with Gradio support
+pip install "cua-agent[ui]"
+
+# Create a simple launcher script
+```python
+from agent.ui.gradio.app import create_gradio_ui
+
+app = create_gradio_ui()
+app.launch(share=False)
+```
+
+The Gradio UI provides:
+- Selection of different agent loops (OpenAI, Anthropic, OMNI)
+- Model selection for each provider
+- Configuration of agent parameters
+- Chat interface for interacting with the agent
+
+You can also embed the Gradio UI in your own application:
+
+```python
+# Import directly in your application
+from agent.ui.gradio.app import create_gradio_ui
+
+# Create the UI with advanced features
+demo = create_gradio_ui()
+demo.launch()
+
+# Or for a simpler interface
+from agent.ui.gradio import registry
+demo = registry(name='cua:gpt-4o')
+demo.launch()
+```
+
 ## Agent Loops
 
 The `cua-agent` package provides three agent loops variations, based on different CUA models providers and techniques:
@@ -123,7 +187,7 @@ The `cua-agent` package provides three agent loops variations, based on differen
 |:-----------|:-----------------|:------------|:-------------|
 | `AgentLoop.OPENAI` | • `computer_use_preview` | Use OpenAI Operator CUA model | Not Required |
 | `AgentLoop.ANTHROPIC` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219` | Use Anthropic Computer-Use | Not Required |
-| `AgentLoop.OMNI`
+| `AgentLoop.OMNI` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4`<br>• `phi4`<br>• `phi4-mini`<br>• `gemma3`<br>• `...`<br>• `Any Ollama-compatible model` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser |
 
 ## AgentResponse
 The `AgentResponse` class represents the structured output returned after each agent turn. It contains the agent's response, reasoning, tool usage, and other metadata. The response format aligns with the new [OpenAI Agent SDK specification](https://platform.openai.com/docs/api-reference/responses) for better consistency across different agent loops.
````
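Read together, these README additions let the same `ComputerAgent` constructor target a local model. The sketch below is illustrative and not from the package: it assumes the package-root re-exports shown in `agent/__init__.py` further down and a `run` async-generator task loop that the README's Run section abbreviates. Note that the `LLM` dataclass (see `agent/core/types.py` below) takes the model name via the `name` field, so the commented `model="gemma3"` keyword in the README snippet would need to be `name="gemma3"` to work.

```python
import asyncio

from computer import Computer
from agent import ComputerAgent, AgentLoop, LLM, LLMProvider  # assumed re-exports

async def main():
    async with Computer() as macos_computer:
        agent = ComputerAgent(
            computer=macos_computer,
            loop=AgentLoop.OMNI,
            # Local Ollama model; no API key needed (see provider_config.py below).
            model=LLM(provider=LLMProvider.OLLAMA, name="gemma3"),
        )
        # Hypothetical task loop; the README's Run section elides the body.
        async for result in agent.run("Open Safari and go to trycua.com"):
            print(result)

asyncio.run(main())
```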
````diff
--- cua_agent-0.1.21/README.md
+++ cua_agent-0.1.23/README.md
@@ -31,7 +31,8 @@ pip install "cua-agent[all]"
 # or install specific loop providers
 pip install "cua-agent[openai]" # OpenAI Cua Loop
 pip install "cua-agent[anthropic]" # Anthropic Cua Loop
-pip install "cua-agent[omni]" # Cua Loop based on OmniParser
+pip install "cua-agent[omni]" # Cua Loop based on OmniParser (includes Ollama for local models)
+pip install "cua-agent[ui]" # Gradio UI for the agent
 ```
 
 ## Run
@@ -43,6 +44,12 @@ async with Computer() as macos_computer:
         computer=macos_computer,
         loop=AgentLoop.OPENAI,
         model=LLM(provider=LLMProvider.OPENAI)
+        # or
+        # loop=AgentLoop.ANTHROPIC,
+        # model=LLM(provider=LLMProvider.ANTHROPIC)
+        # or
+        # loop=AgentLoop.OMNI,
+        # model=LLM(provider=LLMProvider.OLLAMA, model="gemma3")
     )
 
     tasks = [
@@ -66,6 +73,44 @@ Refer to these notebooks for step-by-step guides on how to use the Computer-Use
 
 - [Agent Notebook](../../notebooks/agent_nb.ipynb) - Complete examples and workflows
 
+## Using the Gradio UI
+
+The agent includes a Gradio-based user interface for easy interaction. To use it:
+
+```bash
+# Install with Gradio support
+pip install "cua-agent[ui]"
+
+# Create a simple launcher script
+```python
+from agent.ui.gradio.app import create_gradio_ui
+
+app = create_gradio_ui()
+app.launch(share=False)
+```
+
+The Gradio UI provides:
+- Selection of different agent loops (OpenAI, Anthropic, OMNI)
+- Model selection for each provider
+- Configuration of agent parameters
+- Chat interface for interacting with the agent
+
+You can also embed the Gradio UI in your own application:
+
+```python
+# Import directly in your application
+from agent.ui.gradio.app import create_gradio_ui
+
+# Create the UI with advanced features
+demo = create_gradio_ui()
+demo.launch()
+
+# Or for a simpler interface
+from agent.ui.gradio import registry
+demo = registry(name='cua:gpt-4o')
+demo.launch()
+```
+
 ## Agent Loops
 
 The `cua-agent` package provides three agent loops variations, based on different CUA models providers and techniques:
@@ -74,7 +119,7 @@ The `cua-agent` package provides three agent loops variations, based on differen
 |:-----------|:-----------------|:------------|:-------------|
 | `AgentLoop.OPENAI` | • `computer_use_preview` | Use OpenAI Operator CUA model | Not Required |
 | `AgentLoop.ANTHROPIC` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219` | Use Anthropic Computer-Use | Not Required |
-| `AgentLoop.OMNI`
+| `AgentLoop.OMNI` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4`<br>• `phi4`<br>• `phi4-mini`<br>• `gemma3`<br>• `...`<br>• `Any Ollama-compatible model` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser |
 
 ## AgentResponse
 The `AgentResponse` class represents the structured output returned after each agent turn. It contains the agent's response, reasoning, tool usage, and other metadata. The response format aligns with the new [OpenAI Agent SDK specification](https://platform.openai.com/docs/api-reference/responses) for better consistency across different agent loops.
````
```diff
--- cua_agent-0.1.21/agent/__init__.py
+++ cua_agent-0.1.23/agent/__init__.py
@@ -48,7 +48,7 @@ except Exception as e:
     # Other issues with telemetry
     logger.warning(f"Error initializing telemetry: {e}")
 
-from .
+from .core.types import LLMProvider, LLM
 from .core.factory import AgentLoop
 from .core.agent import ComputerAgent
 
```
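Assuming `__init__.py` continues to re-export these names (as the imports above suggest), downstream code is unaffected by the move of the type definitions into `agent.core.types`:

```python
# Public surface stays the same after the refactor (assumed re-exports).
from agent import ComputerAgent, AgentLoop, LLM, LLMProvider
```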
```diff
--- cua_agent-0.1.21/agent/core/agent.py
+++ cua_agent-0.1.23/agent/core/agent.py
@@ -6,8 +6,7 @@ import os
 from typing import AsyncGenerator, Optional
 
 from computer import Computer
-from
-from .. import AgentLoop
+from .types import LLM, AgentLoop
 from .types import AgentResponse
 from .factory import LoopFactory
 from .provider_config import DEFAULT_MODELS, ENV_VARS
@@ -75,6 +74,7 @@ class ComputerAgent:
             # Use the provided LLM object
             self.provider = model.provider
             actual_model_name = model.name or DEFAULT_MODELS.get(self.provider, "")
+            self.provider_base_url = getattr(model, "provider_base_url", None)
 
             # Ensure we have a valid model name
             if not actual_model_name:
@@ -86,7 +86,12 @@ class ComputerAgent:
 
         # Get API key from environment if not provided
         actual_api_key = api_key or os.environ.get(ENV_VARS[self.provider], "")
-        if not actual_api_key:
+        # Ollama and OpenAI-compatible APIs typically don't require an API key
+        if (
+            not actual_api_key
+            and str(self.provider) not in ["ollama", "oaicompat"]
+            and ENV_VARS[self.provider] != "none"
+        ):
             raise ValueError(f"No API key provided for {self.provider}")
 
         # Create the appropriate loop using the factory
@@ -101,6 +106,7 @@ class ComputerAgent:
                 save_trajectory=save_trajectory,
                 trajectory_dir=trajectory_dir,
                 only_n_most_recent_images=only_n_most_recent_images,
+                provider_base_url=self.provider_base_url,
             )
         except ValueError as e:
             logger.error(f"Failed to create loop: {str(e)}")
```
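The new `provider_base_url` plumbing means an `LLM` can carry a custom endpoint from the constructor through `LoopFactory.create_loop` into the loop. A sketch of what that enables, with LM Studio's port taken from the documented examples (`macos_computer` is assumed to be an existing `Computer` instance):

```python
from agent import ComputerAgent, AgentLoop, LLM, LLMProvider

# ComputerAgent reads model.provider_base_url via getattr(...) above and
# forwards it to the loop created by LoopFactory.
agent = ComputerAgent(
    computer=macos_computer,  # an existing Computer instance (assumed)
    loop=AgentLoop.OMNI,
    model=LLM(
        provider=LLMProvider.OAICOMPAT,
        name="Qwen2.5-VL-7B-Instruct",
        provider_base_url="http://localhost:1234/v1",  # e.g. LM Studio
    ),
)
```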
```diff
--- cua_agent-0.1.21/agent/core/factory.py
+++ cua_agent-0.1.23/agent/core/factory.py
@@ -8,10 +8,6 @@ from computer import Computer
 from .types import AgentLoop
 from .base import BaseLoop
 
-# For type checking only
-if TYPE_CHECKING:
-    from ..providers.omni.types import LLMProvider
-
 logger = logging.getLogger(__name__)
 
 
@@ -33,6 +29,7 @@ class LoopFactory:
         trajectory_dir: str = "trajectories",
         only_n_most_recent_images: Optional[int] = None,
         acknowledge_safety_check_callback: Optional[Callable[[str], Awaitable[bool]]] = None,
+        provider_base_url: Optional[str] = None,
     ) -> BaseLoop:
         """Create and return an appropriate loop instance based on type."""
         if loop_type == AgentLoop.ANTHROPIC:
@@ -77,7 +74,7 @@ class LoopFactory:
             try:
                 from ..providers.omni.loop import OmniLoop
                 from ..providers.omni.parser import OmniParser
-                from
+                from .types import LLMProvider
             except ImportError:
                 raise ImportError(
                     "The 'omni' provider is not installed. "
@@ -99,6 +96,7 @@ class LoopFactory:
                 base_dir=trajectory_dir,
                 only_n_most_recent_images=only_n_most_recent_images,
                 parser=OmniParser(),
+                provider_base_url=provider_base_url,
             )
         else:
             raise ValueError(f"Unsupported loop type: {loop_type}")
```
```diff
--- cua_agent-0.1.21/agent/core/provider_config.py
+++ cua_agent-0.1.23/agent/core/provider_config.py
@@ -1,15 +1,19 @@
 """Provider-specific configurations and constants."""
 
-from
+from .types import LLMProvider
 
 # Default models for different providers
 DEFAULT_MODELS = {
     LLMProvider.OPENAI: "gpt-4o",
     LLMProvider.ANTHROPIC: "claude-3-7-sonnet-20250219",
+    LLMProvider.OLLAMA: "gemma3:4b-it-q4_K_M",
+    LLMProvider.OAICOMPAT: "Qwen2.5-VL-7B-Instruct",
 }
 
 # Map providers to their environment variable names
 ENV_VARS = {
     LLMProvider.OPENAI: "OPENAI_API_KEY",
     LLMProvider.ANTHROPIC: "ANTHROPIC_API_KEY",
+    LLMProvider.OLLAMA: "none",
+    LLMProvider.OAICOMPAT: "none",  # OpenAI-compatible API typically doesn't require an API key
 }
```
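Because `ENV_VARS` now maps the two local providers to the sentinel string `"none"`, the guard added in `ComputerAgent.__init__` above never raises for them. A standalone restatement of that guard, for illustration only (`needs_api_key` is not a function in the package):

```python
import os

from agent.core.provider_config import ENV_VARS
from agent.core.types import LLMProvider

def needs_api_key(provider: LLMProvider, api_key: str = "") -> bool:
    """Mirror of the guard in ComputerAgent.__init__ (illustrative only)."""
    actual = api_key or os.environ.get(ENV_VARS[provider], "")
    return (
        not actual
        and str(provider) not in ["ollama", "oaicompat"]
        and ENV_VARS[provider] != "none"
    )

assert not needs_api_key(LLMProvider.OLLAMA)     # sentinel "none", no key needed
assert not needs_api_key(LLMProvider.OAICOMPAT)  # local OpenAI-compatible endpoint
# needs_api_key(LLMProvider.OPENAI) stays True unless OPENAI_API_KEY is set
```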
cua_agent-0.1.23/agent/core/types.py (new file, +103 lines):

```python
"""Core type definitions."""

from typing import Any, Dict, List, Optional, TypedDict, Union
from enum import Enum, StrEnum, auto
from dataclasses import dataclass


class AgentLoop(Enum):
    """Enumeration of available loop types."""

    ANTHROPIC = auto()  # Anthropic implementation
    OMNI = auto()  # OmniLoop implementation
    OPENAI = auto()  # OpenAI implementation
    OLLAMA = auto()  # OLLAMA implementation
    # Add more loop types as needed


class LLMProvider(StrEnum):
    """Supported LLM providers."""

    ANTHROPIC = "anthropic"
    OPENAI = "openai"
    OLLAMA = "ollama"
    OAICOMPAT = "oaicompat"


@dataclass
class LLM:
    """Configuration for LLM model and provider."""

    provider: LLMProvider
    name: Optional[str] = None
    provider_base_url: Optional[str] = None

    def __post_init__(self):
        """Set default model name if not provided."""
        if self.name is None:
            from .provider_config import DEFAULT_MODELS

            self.name = DEFAULT_MODELS.get(self.provider)

        # Set default provider URL if none provided
        if self.provider_base_url is None and self.provider == LLMProvider.OAICOMPAT:
            # Default for vLLM
            self.provider_base_url = "http://localhost:8000/v1"
            # Common alternatives:
            # - LM Studio: "http://localhost:1234/v1"
            # - LocalAI: "http://localhost:8080/v1"
            # - Ollama with OpenAI compatible API: "http://localhost:11434/v1"


# For backward compatibility
LLMModel = LLM
Model = LLM


# Default models for each provider
PROVIDER_TO_DEFAULT_MODEL: Dict[LLMProvider, str] = {
    LLMProvider.ANTHROPIC: "claude-3-7-sonnet-20250219",
    LLMProvider.OPENAI: "gpt-4o",
    LLMProvider.OLLAMA: "gemma3:4b-it-q4_K_M",
    LLMProvider.OAICOMPAT: "Qwen2.5-VL-7B-Instruct",
}

# Environment variable names for each provider
PROVIDER_TO_ENV_VAR: Dict[LLMProvider, str] = {
    LLMProvider.ANTHROPIC: "ANTHROPIC_API_KEY",
    LLMProvider.OPENAI: "OPENAI_API_KEY",
    LLMProvider.OLLAMA: "none",
    LLMProvider.OAICOMPAT: "none",
}


class AgentResponse(TypedDict, total=False):
    """Agent response format."""

    id: str
    object: str
    created_at: int
    status: str
    error: Optional[str]
    incomplete_details: Optional[Any]
    instructions: Optional[Any]
    max_output_tokens: Optional[int]
    model: str
    output: List[Dict[str, Any]]
    parallel_tool_calls: bool
    previous_response_id: Optional[str]
    reasoning: Dict[str, str]
    store: bool
    temperature: float
    text: Dict[str, Dict[str, str]]
    tool_choice: str
    tools: List[Dict[str, Union[str, int]]]
    top_p: float
    truncation: str
    usage: Dict[str, Any]
    user: Optional[str]
    metadata: Dict[str, Any]
    response: Dict[str, List[Dict[str, Any]]]
    # Additional fields for error responses
    role: str
    content: Union[str, List[Dict[str, Any]]]
```
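The `__post_init__` hook fills in both the model name (from `DEFAULT_MODELS`) and, for `OAICOMPAT`, the vLLM-style base URL, so the defaulting below follows directly from the listing. (One caveat: `enum.StrEnum` exists only on Python 3.11+, while the metadata above still allows 3.10.)

```python
from agent.core.types import LLM, LLMProvider

# Name comes from DEFAULT_MODELS, base URL from the vLLM default.
llm = LLM(provider=LLMProvider.OAICOMPAT)
assert llm.name == "Qwen2.5-VL-7B-Instruct"
assert llm.provider_base_url == "http://localhost:8000/v1"

# Ollama gets a default name but no default base URL.
local = LLM(provider=LLMProvider.OLLAMA)
assert local.name == "gemma3:4b-it-q4_K_M"
assert local.provider_base_url is None

# Explicit values always win, e.g. pointing at LM Studio.
lmstudio = LLM(
    provider=LLMProvider.OAICOMPAT,
    name="qwen2.5-vl-7b-instruct",
    provider_base_url="http://localhost:1234/v1",
)
```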
```diff
--- cua_agent-0.1.21/agent/providers/omni/clients/base.py
+++ cua_agent-0.1.23/agent/providers/omni/clients/base.py
@@ -1,43 +1,34 @@
 """Base client implementation for Omni providers."""
 
-import os
 import logging
 from typing import Dict, List, Optional, Any, Tuple
-import aiohttp
-import json
 
 logger = logging.getLogger(__name__)
 
+
 class BaseOmniClient:
     """Base class for provider-specific clients."""
-
-    def __init__(
-        self,
-        api_key: Optional[str] = None,
-        model: Optional[str] = None
-    ):
+
+    def __init__(self, api_key: Optional[str] = None, model: Optional[str] = None):
         """Initialize base client.
-
+
         Args:
             api_key: Optional API key
             model: Optional model name
         """
         self.api_key = api_key
         self.model = model
-
+
     async def run_interleaved(
-        self,
-        messages: List[Dict[str, Any]],
-        system: str,
-        max_tokens: Optional[int] = None
+        self, messages: List[Dict[str, Any]], system: str, max_tokens: Optional[int] = None
     ) -> Dict[str, Any]:
         """Run interleaved chat completion.
-
+
         Args:
             messages: List of message dicts
             system: System prompt
             max_tokens: Optional max tokens override
-
+
         Returns:
             Response dict
         """
```
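After the cleanup, `BaseOmniClient` is a thin interface: `__init__` stores the key and model, and `run_interleaved` is the one method concrete clients override. A minimal hypothetical subclass, just to show the contract (not part of the package):

```python
from typing import Any, Dict, List, Optional

from agent.providers.omni.clients.base import BaseOmniClient

class StaticClient(BaseOmniClient):
    """Hypothetical client that always answers with a canned completion."""

    async def run_interleaved(
        self, messages: List[Dict[str, Any]], system: str, max_tokens: Optional[int] = None
    ) -> Dict[str, Any]:
        # Real clients (e.g. OAICompatClient below) call their provider here
        # and return an OpenAI-style response dict.
        return {"choices": [{"message": {"role": "assistant", "content": "done"}}]}
```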
cua_agent-0.1.23/agent/providers/omni/clients/oaicompat.py (new file, +177 lines):

```python
"""OpenAI-compatible client implementation."""

import os
import logging
from typing import Dict, List, Optional, Any
import aiohttp
import re
from .base import BaseOmniClient

logger = logging.getLogger(__name__)


# OpenAI-compatible client for the OmniLoop
class OAICompatClient(BaseOmniClient):
    """OpenAI-compatible API client implementation.

    This client can be used with any service that implements the OpenAI API protocol, including:
    - vLLM
    - LM Studio
    - LocalAI
    - Ollama (with OpenAI compatibility)
    - Text Generation WebUI
    - Any other service with OpenAI API compatibility
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        model: str = "Qwen2.5-VL-7B-Instruct",
        provider_base_url: Optional[str] = "http://localhost:8000/v1",
        max_tokens: int = 4096,
        temperature: float = 0.0,
    ):
        """Initialize the OpenAI-compatible client.

        Args:
            api_key: Not used for local endpoints, usually set to "EMPTY"
            model: Model name to use
            provider_base_url: API base URL. Typically in the format "http://localhost:PORT/v1"
                Examples:
                - vLLM: "http://localhost:8000/v1"
                - LM Studio: "http://localhost:1234/v1"
                - LocalAI: "http://localhost:8080/v1"
                - Ollama: "http://localhost:11434/v1"
            max_tokens: Maximum tokens to generate
            temperature: Generation temperature
        """
        super().__init__(api_key="EMPTY", model=model)
        self.api_key = "EMPTY"  # Local endpoints typically don't require an API key
        self.model = model
        self.provider_base_url = (
            provider_base_url or "http://localhost:8000/v1"
        )  # Use default if None
        self.max_tokens = max_tokens
        self.temperature = temperature

    def _extract_base64_image(self, text: str) -> Optional[str]:
        """Extract base64 image data from an HTML img tag."""
        pattern = r'data:image/[^;]+;base64,([^"]+)'
        match = re.search(pattern, text)
        return match.group(1) if match else None

    def _get_loggable_messages(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Create a loggable version of messages with image data truncated."""
        loggable_messages = []
        for msg in messages:
            if isinstance(msg.get("content"), list):
                new_content = []
                for content in msg["content"]:
                    if content.get("type") == "image":
                        new_content.append(
                            {"type": "image", "image_url": {"url": "[BASE64_IMAGE_DATA]"}}
                        )
                    else:
                        new_content.append(content)
                loggable_messages.append({"role": msg["role"], "content": new_content})
            else:
                loggable_messages.append(msg)
        return loggable_messages

    async def run_interleaved(
        self, messages: List[Dict[str, Any]], system: str, max_tokens: Optional[int] = None
    ) -> Dict[str, Any]:
        """Run interleaved chat completion.

        Args:
            messages: List of message dicts
            system: System prompt
            max_tokens: Optional max tokens override

        Returns:
            Response dict
        """
        headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}

        final_messages = [{"role": "system", "content": system}]

        # Process messages
        for item in messages:
            if isinstance(item, dict):
                if isinstance(item["content"], list):
                    # Content is already in the correct format
                    final_messages.append(item)
                else:
                    # Single string content, check for image
                    base64_img = self._extract_base64_image(item["content"])
                    if base64_img:
                        message = {
                            "role": item["role"],
                            "content": [
                                {
                                    "type": "image_url",
                                    "image_url": {"url": f"data:image/jpeg;base64,{base64_img}"},
                                }
                            ],
                        }
                    else:
                        message = {
                            "role": item["role"],
                            "content": [{"type": "text", "text": item["content"]}],
                        }
                    final_messages.append(message)
            else:
                # String content, check for image
                base64_img = self._extract_base64_image(item)
                if base64_img:
                    message = {
                        "role": "user",
                        "content": [
                            {
                                "type": "image_url",
                                "image_url": {"url": f"data:image/jpeg;base64,{base64_img}"},
                            }
                        ],
                    }
                else:
                    message = {"role": "user", "content": [{"type": "text", "text": item}]}
                final_messages.append(message)

        payload = {"model": self.model, "messages": final_messages, "temperature": self.temperature}
        payload["max_tokens"] = max_tokens or self.max_tokens

        try:
            async with aiohttp.ClientSession() as session:
                # Use default base URL if none provided
                base_url = self.provider_base_url or "http://localhost:8000/v1"

                # Check if the base URL already includes the chat/completions endpoint
                endpoint_url = base_url
                if not endpoint_url.endswith("/chat/completions"):
                    # If the URL ends with /v1, append /chat/completions
                    if endpoint_url.endswith("/v1"):
                        endpoint_url = f"{endpoint_url}/chat/completions"
                    # If the URL doesn't end with /v1, make sure it has a proper structure
                    elif not endpoint_url.endswith("/"):
                        endpoint_url = f"{endpoint_url}/chat/completions"
                    else:
                        endpoint_url = f"{endpoint_url}chat/completions"

                # Log the endpoint URL for debugging
                logger.debug(f"Using endpoint URL: {endpoint_url}")

                async with session.post(endpoint_url, headers=headers, json=payload) as response:
                    response_json = await response.json()

                    if response.status != 200:
                        error_msg = response_json.get("error", {}).get(
                            "message", str(response_json)
                        )
                        logger.error(f"Error in API call: {error_msg}")
                        raise Exception(f"API error: {error_msg}")

                    return response_json

        except Exception as e:
            logger.error(f"Error in API call: {str(e)}")
            raise
```
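A usage sketch for the new client, assuming a vLLM server is already serving the model locally; since `run_interleaved` returns the server's JSON unchanged, the indexing follows the standard OpenAI chat-completions response shape:

```python
import asyncio

from agent.providers.omni.clients.oaicompat import OAICompatClient

async def main():
    client = OAICompatClient(
        model="Qwen2.5-VL-7B-Instruct",
        provider_base_url="http://localhost:8000/v1",  # vLLM default
    )
    response = await client.run_interleaved(
        messages=[{"role": "user", "content": "Describe the current screen."}],
        system="You are a computer-use agent.",
        max_tokens=512,
    )
    # OpenAI-compatible servers return choices -> message -> content.
    print(response["choices"][0]["message"]["content"])

asyncio.run(main())
```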