cua-agent 0.1.29.tar.gz → 0.1.31.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic.
- {cua_agent-0.1.29 → cua_agent-0.1.31}/PKG-INFO +20 -19
- {cua_agent-0.1.29 → cua_agent-0.1.31}/README.md +17 -18
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/factory.py +19 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/types.py +1 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/omni/clients/oaicompat.py +12 -2
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/openai/tools/computer.py +3 -7
- cua_agent-0.1.31/agent/providers/uitars/__init__.py +1 -0
- cua_agent-0.1.31/agent/providers/uitars/clients/base.py +35 -0
- cua_agent-0.1.31/agent/providers/uitars/clients/oaicompat.py +216 -0
- cua_agent-0.1.31/agent/providers/uitars/loop.py +598 -0
- cua_agent-0.1.31/agent/providers/uitars/prompts.py +63 -0
- cua_agent-0.1.31/agent/providers/uitars/tools/__init__.py +1 -0
- cua_agent-0.1.31/agent/providers/uitars/tools/computer.py +283 -0
- cua_agent-0.1.31/agent/providers/uitars/tools/manager.py +60 -0
- cua_agent-0.1.31/agent/providers/uitars/utils.py +153 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/ui/gradio/app.py +12 -2
- {cua_agent-0.1.29 → cua_agent-0.1.31}/pyproject.toml +6 -3
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/__init__.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/__init__.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/agent.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/base.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/callbacks.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/experiment.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/messages.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/provider_config.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/telemetry.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/tools/__init__.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/tools/base.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/tools/bash.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/tools/collection.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/tools/computer.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/tools/edit.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/tools/manager.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/tools.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/visualization.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/__init__.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/anthropic/__init__.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/anthropic/api/client.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/anthropic/api/logging.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/anthropic/api_handler.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/anthropic/callbacks/__init__.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/anthropic/callbacks/manager.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/anthropic/loop.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/anthropic/prompts.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/anthropic/response_handler.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/anthropic/tools/__init__.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/anthropic/tools/base.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/anthropic/tools/bash.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/anthropic/tools/collection.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/anthropic/tools/computer.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/anthropic/tools/edit.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/anthropic/tools/manager.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/anthropic/tools/run.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/anthropic/types.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/anthropic/utils.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/omni/__init__.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/omni/api_handler.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/omni/clients/anthropic.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/omni/clients/base.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/omni/clients/ollama.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/omni/clients/openai.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/omni/clients/utils.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/omni/image_utils.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/omni/loop.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/omni/parser.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/omni/prompts.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/omni/tools/__init__.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/omni/tools/base.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/omni/tools/bash.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/omni/tools/computer.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/omni/tools/manager.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/omni/utils.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/openai/__init__.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/openai/api_handler.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/openai/loop.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/openai/response_handler.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/openai/tools/__init__.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/openai/tools/base.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/openai/tools/manager.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/openai/types.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/openai/utils.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/telemetry.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/ui/__init__.py +0 -0
- {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/ui/gradio/__init__.py +0 -0
{cua_agent-0.1.29 → cua_agent-0.1.31}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cua-agent
-Version: 0.1.29
+Version: 0.1.31
 Summary: CUA (Computer Use) Agent for AI-driven computer interaction
 Author-Email: TryCua <gh@trycua.com>
 Requires-Python: >=3.10

@@ -21,6 +21,8 @@ Requires-Dist: boto3<2.0.0,>=1.35.81; extra == "anthropic"
 Provides-Extra: openai
 Requires-Dist: openai<2.0.0,>=1.14.0; extra == "openai"
 Requires-Dist: httpx<0.29.0,>=0.27.0; extra == "openai"
+Provides-Extra: uitars
+Requires-Dist: httpx<0.29.0,>=0.27.0; extra == "uitars"
 Provides-Extra: ui
 Requires-Dist: gradio<6.0.0,>=5.23.3; extra == "ui"
 Requires-Dist: python-dotenv<2.0.0,>=1.0.1; extra == "ui"

@@ -99,6 +101,7 @@ pip install "cua-agent[all]"
 # or install specific loop providers
 pip install "cua-agent[openai]" # OpenAI Cua Loop
 pip install "cua-agent[anthropic]" # Anthropic Cua Loop
+pip install "cua-agent[uitars]" # UI-Tars support
 pip install "cua-agent[omni]" # Cua Loop based on OmniParser (includes Ollama for local models)
 pip install "cua-agent[ui]" # Gradio UI for the agent
 ```

@@ -118,6 +121,9 @@ async with Computer() as macos_computer:
         # or
         # loop=AgentLoop.OMNI,
         # model=LLM(provider=LLMProvider.OLLAMA, model="gemma3")
+        # or
+        # loop=AgentLoop.UITARS,
+        # model=LLM(provider=LLMProvider.OAICOMPAT, model="tgi", provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1")
     )
 
     tasks = [

@@ -143,7 +149,13 @@ Refer to these notebooks for step-by-step guides on how to use the Computer-Use
 
 ## Using the Gradio UI
 
-The agent includes a Gradio-based user interface for
+The agent includes a Gradio-based user interface for easier interaction.
+
+<div align="center">
+  <img src="../../img/agent_gradio_ui.png"/>
+</div>
+
+To use it:
 
 ```bash
 # Install with Gradio support

@@ -192,6 +204,10 @@ The Gradio UI provides:
 - Configuration of agent parameters
 - Chat interface for interacting with the agent
 
+### Using UI-TARS
+
+You can use UI-TARS by first following the [deployment guide](https://github.com/bytedance/UI-TARS/blob/main/README_deploy.md). This will give you a provider URL like this: `https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1` which you can use in the gradio UI.
+
 ## Agent Loops
 
 The `cua-agent` package provides three agent loops variations, based on different CUA models providers and techniques:

@@ -200,6 +216,7 @@ The `cua-agent` package provides three agent loops variations, based on differen
 |:-----------|:-----------------|:------------|:-------------|
 | `AgentLoop.OPENAI` | • `computer_use_preview` | Use OpenAI Operator CUA model | Not Required |
 | `AgentLoop.ANTHROPIC` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219` | Use Anthropic Computer-Use | Not Required |
+| `AgentLoop.UITARS` | • `ByteDance-Seed/UI-TARS-1.5-7B` | Uses ByteDance's UI-TARS 1.5 model | Not Required |
 | `AgentLoop.OMNI` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4`<br>• `phi4`<br>• `phi4-mini`<br>• `gemma3`<br>• `...`<br>• `Any Ollama or OpenAI-compatible model` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser |
 
 ## AgentResponse

@@ -241,25 +258,9 @@ async for result in agent.run(task):
     print(output)
 ```
 
-### Gradio UI
-
-You can also interact with the agent using a Gradio interface.
-
-```python
-# Ensure environment variables (e.g., API keys) are loaded
-# You might need a helper function like load_dotenv_files() if using .env
-# from utils import load_dotenv_files
-# load_dotenv_files()
-
-from agent.ui.gradio.app import create_gradio_ui
-
-app = create_gradio_ui()
-app.launch(share=False)
-```
-
 **Note on Settings Persistence:**
 
 * The Gradio UI automatically saves your configuration (Agent Loop, Model Choice, Custom Base URL, Save Trajectory state, Recent Images count) to a file named `.gradio_settings.json` in the project's root directory when you successfully run a task.
 * This allows your preferences to persist between sessions.
 * API keys entered into the custom provider field are **not** saved in this file for security reasons. Manage API keys using environment variables (e.g., `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`) or a `.env` file.
-* It's recommended to add `.gradio_settings.json` to your `.gitignore` file.
+* It's recommended to add `.gradio_settings.json` to your `.gitignore` file.

{cua_agent-0.1.29 → cua_agent-0.1.31}/README.md

@@ -31,6 +31,7 @@ pip install "cua-agent[all]"
 # or install specific loop providers
 pip install "cua-agent[openai]" # OpenAI Cua Loop
 pip install "cua-agent[anthropic]" # Anthropic Cua Loop
+pip install "cua-agent[uitars]" # UI-Tars support
 pip install "cua-agent[omni]" # Cua Loop based on OmniParser (includes Ollama for local models)
 pip install "cua-agent[ui]" # Gradio UI for the agent
 ```

@@ -50,6 +51,9 @@ async with Computer() as macos_computer:
         # or
         # loop=AgentLoop.OMNI,
         # model=LLM(provider=LLMProvider.OLLAMA, model="gemma3")
+        # or
+        # loop=AgentLoop.UITARS,
+        # model=LLM(provider=LLMProvider.OAICOMPAT, model="tgi", provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1")
     )
 
     tasks = [

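For context, a minimal sketch of what a full UI-TARS run could look like, based on the README snippet this hunk modifies. The import paths, the `ComputerAgent` constructor arguments, and the endpoint URL are assumptions drawn from the surrounding README text, not from this diff:

```python
# Hedged sketch: wiring the new UITARS loop to an OpenAI-compatible endpoint.
# Import paths and the ComputerAgent signature are assumed from the README
# context above; the endpoint URL is a placeholder for your own deployment.
import asyncio

from computer import Computer  # assumed: cua-computer package
from agent import ComputerAgent, AgentLoop, LLM, LLMProvider  # assumed import path


async def main() -> None:
    async with Computer() as macos_computer:
        agent = ComputerAgent(
            computer=macos_computer,
            loop=AgentLoop.UITARS,
            model=LLM(
                provider=LLMProvider.OAICOMPAT,
                model="tgi",
                provider_base_url="https://<your-endpoint>.endpoints.huggingface.cloud/v1",
            ),
        )
        # Stream results for a single task, mirroring the README's run loop.
        async for result in agent.run("Open a browser and search for trycua"):
            print(result)


asyncio.run(main())
```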
@@ -75,7 +79,13 @@ Refer to these notebooks for step-by-step guides on how to use the Computer-Use
 
 ## Using the Gradio UI
 
-The agent includes a Gradio-based user interface for
+The agent includes a Gradio-based user interface for easier interaction.
+
+<div align="center">
+  <img src="../../img/agent_gradio_ui.png"/>
+</div>
+
+To use it:
 
 ```bash
 # Install with Gradio support

@@ -124,6 +134,10 @@ The Gradio UI provides:
 - Configuration of agent parameters
 - Chat interface for interacting with the agent
 
+### Using UI-TARS
+
+You can use UI-TARS by first following the [deployment guide](https://github.com/bytedance/UI-TARS/blob/main/README_deploy.md). This will give you a provider URL like this: `https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1` which you can use in the gradio UI.
+
 ## Agent Loops
 
 The `cua-agent` package provides three agent loops variations, based on different CUA models providers and techniques:

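A short sketch of pairing a deployed UI-TARS endpoint with the Gradio UI. The `create_gradio_ui` helper comes from `agent/ui/gradio/app.py`, the same entry point the README section removed later in this diff used; entering the endpoint URL in the UI's custom base URL field is an assumption based on the settings persistence notes below:

```python
# Launch the Gradio UI, then paste your UI-TARS endpoint URL into the
# custom provider base URL field. create_gradio_ui is referenced elsewhere
# in this diff (agent/ui/gradio/app.py); the launch flags are illustrative.
from agent.ui.gradio.app import create_gradio_ui

app = create_gradio_ui()
app.launch(share=False)
```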
@@ -132,6 +146,7 @@ The `cua-agent` package provides three agent loops variations, based on differen
 |:-----------|:-----------------|:------------|:-------------|
 | `AgentLoop.OPENAI` | • `computer_use_preview` | Use OpenAI Operator CUA model | Not Required |
 | `AgentLoop.ANTHROPIC` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219` | Use Anthropic Computer-Use | Not Required |
+| `AgentLoop.UITARS` | • `ByteDance-Seed/UI-TARS-1.5-7B` | Uses ByteDance's UI-TARS 1.5 model | Not Required |
 | `AgentLoop.OMNI` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4`<br>• `phi4`<br>• `phi4-mini`<br>• `gemma3`<br>• `...`<br>• `Any Ollama or OpenAI-compatible model` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser |
 
 ## AgentResponse

@@ -173,25 +188,9 @@ async for result in agent.run(task):
     print(output)
 ```
 
-### Gradio UI
-
-You can also interact with the agent using a Gradio interface.
-
-```python
-# Ensure environment variables (e.g., API keys) are loaded
-# You might need a helper function like load_dotenv_files() if using .env
-# from utils import load_dotenv_files
-# load_dotenv_files()
-
-from agent.ui.gradio.app import create_gradio_ui
-
-app = create_gradio_ui()
-app.launch(share=False)
-```
-
 **Note on Settings Persistence:**
 
 * The Gradio UI automatically saves your configuration (Agent Loop, Model Choice, Custom Base URL, Save Trajectory state, Recent Images count) to a file named `.gradio_settings.json` in the project's root directory when you successfully run a task.
 * This allows your preferences to persist between sessions.
 * API keys entered into the custom provider field are **not** saved in this file for security reasons. Manage API keys using environment variables (e.g., `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`) or a `.env` file.
-* It's recommended to add `.gradio_settings.json` to your `.gitignore` file.
+* It's recommended to add `.gradio_settings.json` to your `.gitignore` file.

{cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/factory.py

@@ -98,5 +98,24 @@ class LoopFactory:
                 parser=OmniParser(),
                 provider_base_url=provider_base_url,
             )
+        elif loop_type == AgentLoop.UITARS:
+            # Lazy import UITARSLoop only when needed
+            try:
+                from ..providers.uitars.loop import UITARSLoop
+            except ImportError:
+                raise ImportError(
+                    "The 'uitars' provider is not installed. "
+                    "Install it with 'pip install cua-agent[all]'"
+                )
+
+            return UITARSLoop(
+                api_key=api_key,
+                model=model_name,
+                computer=computer,
+                save_trajectory=save_trajectory,
+                base_dir=trajectory_dir,
+                only_n_most_recent_images=only_n_most_recent_images,
+                provider_base_url=provider_base_url,
+            )
         else:
             raise ValueError(f"Unsupported loop type: {loop_type}")

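The factory only imports `UITARSLoop` when `AgentLoop.UITARS` is actually requested, so the provider stays optional. A small sketch of probing for the optional dependency up front; per the new `uitars` extra in PKG-INFO its only extra requirement is `httpx`, and the helper name below is hypothetical:

```python
# Hypothetical helper: check whether the "uitars" extra's dependency (httpx,
# per the PKG-INFO change above) is importable before requesting AgentLoop.UITARS.
import importlib.util


def uitars_extra_available() -> bool:
    return importlib.util.find_spec("httpx") is not None


if not uitars_extra_available():
    print('Missing optional deps; run: pip install "cua-agent[uitars]"')
```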
{cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/omni/clients/oaicompat.py

@@ -93,7 +93,14 @@ class OAICompatClient(BaseOmniClient):
         """
         headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}
 
-        final_messages = [
+        final_messages = [
+            {
+                "role": "system",
+                "content": [
+                    { "type": "text", "text": system }
+                ]
+            }
+        ]
 
         # Process messages
         for item in messages:

@@ -117,7 +124,10 @@ class OAICompatClient(BaseOmniClient):
                 else:
                     message = {
                         "role": item["role"],
-                        "content": [{
+                        "content": [{
+                            "type": "text",
+                            "text": item["content"]
+                        }],
                     }
                     final_messages.append(message)
             else:

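Both hunks move the omni OpenAI-compatible client toward OpenAI-style content parts instead of bare strings. A minimal sketch of the message shape the patched client now builds (values are illustrative):

```python
# Illustrative only: the request body shape after this change.
# The system prompt and plain-string user content are wrapped in
# {"type": "text", ...} parts instead of being sent as bare strings.
system = "You are a computer-use assistant."
user_text = "Click the Submit button."

final_messages = [
    {"role": "system", "content": [{"type": "text", "text": system}]},
    {"role": "user", "content": [{"type": "text", "text": user_text}]},
]
```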
{cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/openai/tools/computer.py

@@ -162,8 +162,8 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
             y = kwargs.get("y")
             if x is None or y is None:
                 raise ToolError("x and y coordinates are required for scroll action")
-            scroll_x = kwargs.get("scroll_x", 0) //
-            scroll_y = kwargs.get("scroll_y", 0) //
+            scroll_x = kwargs.get("scroll_x", 0) // 50
+            scroll_y = kwargs.get("scroll_y", 0) // 50
             return await self.handle_scroll(x, y, scroll_x, scroll_y)
         elif type == "screenshot":
             return await self.screenshot()

@@ -240,11 +240,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
 
             if len(mapped_keys) > 1:
                 # For key combinations (like Ctrl+C)
-
-                await self.computer.interface.press_key(k)
-                await asyncio.sleep(0.1)
-                for k in reversed(mapped_keys):
-                    await self.computer.interface.press_key(k)
+                await self.computer.interface.hotkey(*mapped_keys)
             else:
                 # Single key press
                 await self.computer.interface.press_key(mapped_keys[0])

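Two behavior changes land in the OpenAI loop's computer tool: scroll deltas are scaled down by integer division before reaching `handle_scroll`, and multi-key combinations are sent as a single `hotkey` chord instead of sequential key presses. A small sketch of the arithmetic and the call shape (values are illustrative):

```python
# Illustrative values, not taken from a real event stream.

# 1) Scroll deltas from the model are coarsened with // 50.
scroll_x = 120 // 50    # -> 2
scroll_y = -120 // 50   # -> -3 (floor division rounds toward negative infinity)

# 2) Key combinations now go through a single hotkey call, e.g. for Ctrl+C:
mapped_keys = ["ctrl", "c"]
# await self.computer.interface.hotkey(*mapped_keys)  # == hotkey("ctrl", "c")
```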
cua_agent-0.1.31/agent/providers/uitars/__init__.py (new file)

@@ -0,0 +1 @@
+"""UI-TARS Agent provider package."""

cua_agent-0.1.31/agent/providers/uitars/clients/base.py (new file)

@@ -0,0 +1,35 @@
+"""Base client implementation for Omni providers."""
+
+import logging
+from typing import Dict, List, Optional, Any, Tuple
+
+logger = logging.getLogger(__name__)
+
+
+class BaseUITarsClient:
+    """Base class for provider-specific clients."""
+
+    def __init__(self, api_key: Optional[str] = None, model: Optional[str] = None):
+        """Initialize base client.
+
+        Args:
+            api_key: Optional API key
+            model: Optional model name
+        """
+        self.api_key = api_key
+        self.model = model
+
+    async def run_interleaved(
+        self, messages: List[Dict[str, Any]], system: str, max_tokens: Optional[int] = None
+    ) -> Dict[str, Any]:
+        """Run interleaved chat completion.
+
+        Args:
+            messages: List of message dicts
+            system: System prompt
+            max_tokens: Optional max tokens override
+
+        Returns:
+            Response dict
+        """
+        raise NotImplementedError

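`BaseUITarsClient` is effectively an abstract base: concrete clients override `run_interleaved`. A minimal sketch of a custom subclass; the `EchoClient` name and its canned response are made up for illustration and are not part of the package:

```python
# Hypothetical subclass showing the contract run_interleaved is expected to
# fulfil: accept messages plus a system prompt, return an OpenAI-style dict.
from typing import Any, Dict, List, Optional

# Module path taken from this diff (agent/providers/uitars/clients/base.py).
from agent.providers.uitars.clients.base import BaseUITarsClient


class EchoClient(BaseUITarsClient):
    async def run_interleaved(
        self, messages: List[Dict[str, Any]], system: str, max_tokens: Optional[int] = None
    ) -> Dict[str, Any]:
        last = messages[-1] if messages else {"content": ""}
        return {
            "choices": [
                {"message": {"role": "assistant", "content": f"echo: {last.get('content', '')}"}}
            ]
        }
```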
cua_agent-0.1.31/agent/providers/uitars/clients/oaicompat.py (new file)

@@ -0,0 +1,216 @@
+"""OpenAI-compatible client implementation."""
+
+import os
+import logging
+from typing import Dict, List, Optional, Any
+import aiohttp
+import re
+from .base import BaseUITarsClient
+
+logger = logging.getLogger(__name__)
+
+
+# OpenAI-compatible client for the UI_Tars
+class OAICompatClient(BaseUITarsClient):
+    """OpenAI-compatible API client implementation.
+
+    This client can be used with any service that implements the OpenAI API protocol, including:
+    - Huggingface Text Generation Interface endpoints
+    - vLLM
+    - LM Studio
+    - LocalAI
+    - Ollama (with OpenAI compatibility)
+    - Text Generation WebUI
+    - Any other service with OpenAI API compatibility
+    """
+
+    def __init__(
+        self,
+        api_key: Optional[str] = None,
+        model: str = "Qwen2.5-VL-7B-Instruct",
+        provider_base_url: Optional[str] = "http://localhost:8000/v1",
+        max_tokens: int = 4096,
+        temperature: float = 0.0,
+    ):
+        """Initialize the OpenAI-compatible client.
+
+        Args:
+            api_key: Not used for local endpoints, usually set to "EMPTY"
+            model: Model name to use
+            provider_base_url: API base URL. Typically in the format "http://localhost:PORT/v1"
+                Examples:
+                - vLLM: "http://localhost:8000/v1"
+                - LM Studio: "http://localhost:1234/v1"
+                - LocalAI: "http://localhost:8080/v1"
+                - Ollama: "http://localhost:11434/v1"
+            max_tokens: Maximum tokens to generate
+            temperature: Generation temperature
+        """
+        super().__init__(api_key=api_key or "EMPTY", model=model)
+        self.api_key = api_key or "EMPTY"  # Local endpoints typically don't require an API key
+        self.model = model
+        self.provider_base_url = (
+            provider_base_url or "http://localhost:8000/v1"
+        )  # Use default if None
+        self.max_tokens = max_tokens
+        self.temperature = temperature
+
+    def _extract_base64_image(self, text: str) -> Optional[str]:
+        """Extract base64 image data from an HTML img tag."""
+        pattern = r'data:image/[^;]+;base64,([^"]+)'
+        match = re.search(pattern, text)
+        return match.group(1) if match else None
+
+    def _get_loggable_messages(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Create a loggable version of messages with image data truncated."""
+        loggable_messages = []
+        for msg in messages:
+            if isinstance(msg.get("content"), list):
+                new_content = []
+                for content in msg["content"]:
+                    if content.get("type") == "image":
+                        new_content.append(
+                            {"type": "image", "image_url": {"url": "[BASE64_IMAGE_DATA]"}}
+                        )
+                    else:
+                        new_content.append(content)
+                loggable_messages.append({"role": msg["role"], "content": new_content})
+            else:
+                loggable_messages.append(msg)
+        return loggable_messages
+
+    async def run_interleaved(
+        self, messages: List[Dict[str, Any]], system: str, max_tokens: Optional[int] = None
+    ) -> Dict[str, Any]:
+        """Run interleaved chat completion.
+
+        Args:
+            messages: List of message dicts
+            system: System prompt
+            max_tokens: Optional max tokens override
+
+        Returns:
+            Response dict
+        """
+        headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}
+
+        final_messages = [
+            {
+                "role": "system",
+                "content": [
+                    { "type": "text", "text": system }
+                ]
+            }
+        ]
+
+        # Process messages
+        for item in messages:
+            if isinstance(item, dict):
+                if isinstance(item["content"], list):
+                    # Content is already in the correct format
+                    final_messages.append(item)
+                else:
+                    # Single string content, check for image
+                    base64_img = self._extract_base64_image(item["content"])
+                    if base64_img:
+                        message = {
+                            "role": item["role"],
+                            "content": [
+                                {
+                                    "type": "image_url",
+                                    "image_url": {"url": f"data:image/jpeg;base64,{base64_img}"},
+                                }
+                            ],
+                        }
+                    else:
+                        message = {
+                            "role": item["role"],
+                            "content": [{"type": "text", "text": item["content"]}],
+                        }
+                    final_messages.append(message)
+            else:
+                # String content, check for image
+                base64_img = self._extract_base64_image(item)
+                if base64_img:
+                    message = {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "image_url",
+                                "image_url": {"url": f"data:image/jpeg;base64,{base64_img}"},
+                            }
+                        ],
+                    }
+                else:
+                    message = {"role": "user", "content": [{"type": "text", "text": item}]}
+                final_messages.append(message)
+
+        payload = {
+            "model": self.model,
+            "messages": final_messages,
+            "max_tokens": max_tokens or self.max_tokens,
+            "temperature": self.temperature,
+            "top_p": 0.7,
+        }
+
+        try:
+            async with aiohttp.ClientSession() as session:
+                # Use default base URL if none provided
+                base_url = self.provider_base_url or "http://localhost:8000/v1"
+
+                # Check if the base URL already includes the chat/completions endpoint
+
+                endpoint_url = base_url
+                if not endpoint_url.endswith("/chat/completions"):
+                    # If URL is RunPod format, make it OpenAI compatible
+                    if endpoint_url.startswith("https://api.runpod.ai/v2/"):
+                        # Extract RunPod endpoint ID
+                        parts = endpoint_url.split("/")
+                        if len(parts) >= 5:
+                            runpod_id = parts[4]
+                            endpoint_url = f"https://api.runpod.ai/v2/{runpod_id}/openai/v1/chat/completions"
+                    # If the URL ends with /v1, append /chat/completions
+                    elif endpoint_url.endswith("/v1"):
+                        endpoint_url = f"{endpoint_url}/chat/completions"
+                    # If the URL doesn't end with /v1, make sure it has a proper structure
+                    elif not endpoint_url.endswith("/"):
+                        endpoint_url = f"{endpoint_url}/chat/completions"
+                    else:
+                        endpoint_url = f"{endpoint_url}chat/completions"
+
+                # Log the endpoint URL for debugging
+                logger.debug(f"Using endpoint URL: {endpoint_url}")
+
+                async with session.post(endpoint_url, headers=headers, json=payload) as response:
+                    # Log the status and content type
+                    logger.debug(f"Status: {response.status}")
+                    logger.debug(f"Content-Type: {response.headers.get('Content-Type')}")
+
+                    # Get the raw text of the response
+                    response_text = await response.text()
+                    logger.debug(f"Response content: {response_text}")
+
+                    # Try to parse as JSON if the content type is appropriate
+                    if "application/json" in response.headers.get('Content-Type', ''):
+                        response_json = await response.json()
+                    else:
+                        raise Exception(f"Response is not JSON format")
+                        # # Optionally try to parse it anyway
+                        # try:
+                        #     import json
+                        #     response_json = json.loads(response_text)
+                        # except json.JSONDecodeError as e:
+                        #     print(f"Failed to parse response as JSON: {e}")
+
+                    if response.status != 200:
+                        error_msg = response_json.get("error", {}).get(
+                            "message", str(response_json)
+                        )
+                        logger.error(f"Error in API call: {error_msg}")
+                        raise Exception(f"API error: {error_msg}")
+
+                    return response_json
+
+        except Exception as e:
+            logger.error(f"Error in API call: {str(e)}")
+            raise