cua-agent 0.3.2__py3-none-any.whl → 0.4.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- agent/__init__.py +15 -51
- agent/__main__.py +21 -0
- agent/adapters/__init__.py +9 -0
- agent/adapters/huggingfacelocal_adapter.py +216 -0
- agent/agent.py +577 -0
- agent/callbacks/__init__.py +17 -0
- agent/callbacks/base.py +153 -0
- agent/callbacks/budget_manager.py +44 -0
- agent/callbacks/image_retention.py +139 -0
- agent/callbacks/logging.py +247 -0
- agent/callbacks/pii_anonymization.py +259 -0
- agent/callbacks/trajectory_saver.py +305 -0
- agent/cli.py +290 -0
- agent/computer_handler.py +107 -0
- agent/decorators.py +90 -0
- agent/loops/__init__.py +11 -0
- agent/loops/anthropic.py +728 -0
- agent/loops/omniparser.py +339 -0
- agent/loops/openai.py +95 -0
- agent/loops/uitars.py +688 -0
- agent/responses.py +207 -0
- agent/types.py +79 -0
- agent/ui/__init__.py +7 -1
- agent/ui/gradio/__init__.py +6 -19
- agent/ui/gradio/app.py +80 -1299
- agent/ui/gradio/ui_components.py +703 -0
- cua_agent-0.4.0b1.dist-info/METADATA +424 -0
- cua_agent-0.4.0b1.dist-info/RECORD +30 -0
- agent/core/__init__.py +0 -27
- agent/core/agent.py +0 -210
- agent/core/base.py +0 -217
- agent/core/callbacks.py +0 -200
- agent/core/experiment.py +0 -249
- agent/core/factory.py +0 -122
- agent/core/messages.py +0 -332
- agent/core/provider_config.py +0 -21
- agent/core/telemetry.py +0 -142
- agent/core/tools/__init__.py +0 -21
- agent/core/tools/base.py +0 -74
- agent/core/tools/bash.py +0 -52
- agent/core/tools/collection.py +0 -46
- agent/core/tools/computer.py +0 -113
- agent/core/tools/edit.py +0 -67
- agent/core/tools/manager.py +0 -56
- agent/core/tools.py +0 -32
- agent/core/types.py +0 -88
- agent/core/visualization.py +0 -197
- agent/providers/__init__.py +0 -4
- agent/providers/anthropic/__init__.py +0 -6
- agent/providers/anthropic/api/client.py +0 -360
- agent/providers/anthropic/api/logging.py +0 -150
- agent/providers/anthropic/api_handler.py +0 -140
- agent/providers/anthropic/callbacks/__init__.py +0 -5
- agent/providers/anthropic/callbacks/manager.py +0 -65
- agent/providers/anthropic/loop.py +0 -568
- agent/providers/anthropic/prompts.py +0 -23
- agent/providers/anthropic/response_handler.py +0 -226
- agent/providers/anthropic/tools/__init__.py +0 -33
- agent/providers/anthropic/tools/base.py +0 -88
- agent/providers/anthropic/tools/bash.py +0 -66
- agent/providers/anthropic/tools/collection.py +0 -34
- agent/providers/anthropic/tools/computer.py +0 -396
- agent/providers/anthropic/tools/edit.py +0 -326
- agent/providers/anthropic/tools/manager.py +0 -54
- agent/providers/anthropic/tools/run.py +0 -42
- agent/providers/anthropic/types.py +0 -16
- agent/providers/anthropic/utils.py +0 -381
- agent/providers/omni/__init__.py +0 -8
- agent/providers/omni/api_handler.py +0 -42
- agent/providers/omni/clients/anthropic.py +0 -103
- agent/providers/omni/clients/base.py +0 -35
- agent/providers/omni/clients/oaicompat.py +0 -195
- agent/providers/omni/clients/ollama.py +0 -122
- agent/providers/omni/clients/openai.py +0 -155
- agent/providers/omni/clients/utils.py +0 -25
- agent/providers/omni/image_utils.py +0 -34
- agent/providers/omni/loop.py +0 -990
- agent/providers/omni/parser.py +0 -307
- agent/providers/omni/prompts.py +0 -64
- agent/providers/omni/tools/__init__.py +0 -30
- agent/providers/omni/tools/base.py +0 -29
- agent/providers/omni/tools/bash.py +0 -74
- agent/providers/omni/tools/computer.py +0 -179
- agent/providers/omni/tools/manager.py +0 -61
- agent/providers/omni/utils.py +0 -236
- agent/providers/openai/__init__.py +0 -6
- agent/providers/openai/api_handler.py +0 -456
- agent/providers/openai/loop.py +0 -472
- agent/providers/openai/response_handler.py +0 -205
- agent/providers/openai/tools/__init__.py +0 -15
- agent/providers/openai/tools/base.py +0 -79
- agent/providers/openai/tools/computer.py +0 -326
- agent/providers/openai/tools/manager.py +0 -106
- agent/providers/openai/types.py +0 -36
- agent/providers/openai/utils.py +0 -98
- agent/providers/uitars/__init__.py +0 -1
- agent/providers/uitars/clients/base.py +0 -35
- agent/providers/uitars/clients/mlxvlm.py +0 -263
- agent/providers/uitars/clients/oaicompat.py +0 -214
- agent/providers/uitars/loop.py +0 -660
- agent/providers/uitars/prompts.py +0 -63
- agent/providers/uitars/tools/__init__.py +0 -1
- agent/providers/uitars/tools/computer.py +0 -283
- agent/providers/uitars/tools/manager.py +0 -60
- agent/providers/uitars/utils.py +0 -264
- agent/telemetry.py +0 -21
- agent/ui/__main__.py +0 -15
- cua_agent-0.3.2.dist-info/METADATA +0 -295
- cua_agent-0.3.2.dist-info/RECORD +0 -87
- {cua_agent-0.3.2.dist-info → cua_agent-0.4.0b1.dist-info}/WHEEL +0 -0
- {cua_agent-0.3.2.dist-info → cua_agent-0.4.0b1.dist-info}/entry_points.txt +0 -0
agent/ui/gradio/app.py
CHANGED
|
@@ -1,27 +1,18 @@
|
|
|
1
1
|
"""
|
|
2
|
-
Advanced Gradio UI for Computer-Use Agent
|
|
2
|
+
Advanced Gradio UI for Computer-Use Agent (cua-agent)
|
|
3
3
|
|
|
4
|
-
This is a Gradio interface for the Computer-Use Agent
|
|
4
|
+
This is a Gradio interface for the Computer-Use Agent v0.4.x (cua-agent)
|
|
5
5
|
with an advanced UI for model selection and configuration.
|
|
6
6
|
|
|
7
|
-
Supported Agent
|
|
8
|
-
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
-
|
|
12
|
-
• claude-3-5-sonnet-20240620
|
|
13
|
-
• claude-3-7-sonnet-20250219
|
|
14
|
-
|
|
15
|
-
- AgentLoop.OMNI (experimental): Uses OmniParser for element pixel-detection
|
|
16
|
-
• claude-3-5-sonnet-20240620
|
|
17
|
-
• claude-3-7-sonnet-20250219
|
|
18
|
-
• gpt-4.5-preview
|
|
19
|
-
• gpt-4o
|
|
20
|
-
• gpt-4
|
|
7
|
+
Supported Agent Models:
|
|
8
|
+
- OpenAI: openai/computer-use-preview
|
|
9
|
+
- Anthropic: anthropic/claude-3-5-sonnet-20241022, anthropic/claude-3-7-sonnet-20250219
|
|
10
|
+
- UI-TARS: huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
|
|
11
|
+
- Omniparser: omniparser+anthropic/claude-3-5-sonnet-20241022, omniparser+ollama_chat/gemma3
|
|
21
12
|
|
|
22
13
|
Requirements:
|
|
23
|
-
- Mac with Apple Silicon (M1/M2/M3/M4)
|
|
24
|
-
- macOS 14 (Sonoma) or newer
|
|
14
|
+
- Mac with Apple Silicon (M1/M2/M3/M4), Linux, or Windows
|
|
15
|
+
- macOS 14 (Sonoma) or newer / Ubuntu 20.04+
|
|
25
16
|
- Python 3.11+
|
|
26
17
|
- Lume CLI installed (https://github.com/trycua/cua)
|
|
27
18
|
- OpenAI or Anthropic API key
|
|
@@ -39,19 +30,21 @@ from gradio.components.chatbot import MetadataDict
|
|
|
39
30
|
from typing import cast
|
|
40
31
|
|
|
41
32
|
# Import from agent package
|
|
42
|
-
from agent
|
|
43
|
-
from agent.
|
|
33
|
+
from agent import ComputerAgent
|
|
34
|
+
from agent.types import Messages, AgentResponse
|
|
44
35
|
from computer import Computer
|
|
45
36
|
|
|
46
|
-
from agent import ComputerAgent, AgentLoop, LLM, LLMProvider
|
|
47
|
-
|
|
48
37
|
# Global variables
|
|
49
38
|
global_agent = None
|
|
50
39
|
global_computer = None
|
|
51
40
|
SETTINGS_FILE = Path(".gradio_settings.json")
|
|
52
41
|
|
|
53
|
-
# We'll use asyncio.run() instead of a persistent event loop
|
|
54
42
|
|
|
43
|
+
import dotenv
|
|
44
|
+
if dotenv.load_dotenv():
|
|
45
|
+
print(f"DEBUG - Loaded environment variables from {dotenv.find_dotenv()}")
|
|
46
|
+
else:
|
|
47
|
+
print("DEBUG - No .env file found")
|
|
55
48
|
|
|
56
49
|
# --- Settings Load/Save Functions ---
|
|
57
50
|
def load_settings() -> Dict[str, Any]:
|
|
@@ -60,7 +53,6 @@ def load_settings() -> Dict[str, Any]:
|
|
|
60
53
|
try:
|
|
61
54
|
with open(SETTINGS_FILE, "r") as f:
|
|
62
55
|
settings = json.load(f)
|
|
63
|
-
# Basic validation (can be expanded)
|
|
64
56
|
if isinstance(settings, dict):
|
|
65
57
|
print(f"DEBUG - Loaded settings from {SETTINGS_FILE}")
|
|
66
58
|
return settings
|
|
@@ -71,7 +63,6 @@ def load_settings() -> Dict[str, Any]:
|
|
|
71
63
|
|
|
72
64
|
def save_settings(settings: Dict[str, Any]):
|
|
73
65
|
"""Saves settings to the JSON file."""
|
|
74
|
-
# Ensure sensitive keys are not saved
|
|
75
66
|
settings.pop("provider_api_key", None)
|
|
76
67
|
try:
|
|
77
68
|
with open(SETTINGS_FILE, "w") as f:
|
|
@@ -81,44 +72,18 @@ def save_settings(settings: Dict[str, Any]):
|
|
|
81
72
|
print(f"Warning: Could not save settings to {SETTINGS_FILE}: {e}")
|
|
82
73
|
|
|
83
74
|
|
|
84
|
-
# --- End Settings Load/Save ---
|
|
85
|
-
|
|
86
|
-
|
|
87
75
|
# Custom Screenshot Handler for Gradio chat
|
|
88
|
-
class GradioChatScreenshotHandler
|
|
89
|
-
"""Custom handler that adds screenshots to the Gradio chatbot
|
|
76
|
+
class GradioChatScreenshotHandler:
|
|
77
|
+
"""Custom handler that adds screenshots to the Gradio chatbot."""
|
|
90
78
|
|
|
91
79
|
def __init__(self, chatbot_history: List[gr.ChatMessage]):
|
|
92
|
-
"""Initialize with reference to chat history and annotated image component.
|
|
93
|
-
|
|
94
|
-
Args:
|
|
95
|
-
chatbot_history: Reference to the Gradio chatbot history list
|
|
96
|
-
annotated_image: Reference to the annotated image component
|
|
97
|
-
"""
|
|
98
80
|
self.chatbot_history = chatbot_history
|
|
99
81
|
print("GradioChatScreenshotHandler initialized")
|
|
100
82
|
|
|
101
|
-
async def on_screenshot(
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
parsed_screen: Optional[dict] = None,
|
|
106
|
-
) -> None:
|
|
107
|
-
"""Add screenshot to chatbot when a screenshot is taken and update the annotated image.
|
|
108
|
-
|
|
109
|
-
Args:
|
|
110
|
-
screenshot_base64: Base64 encoded screenshot
|
|
111
|
-
action_type: Type of action that triggered the screenshot
|
|
112
|
-
|
|
113
|
-
Returns:
|
|
114
|
-
Original screenshot (does not modify it)
|
|
115
|
-
"""
|
|
116
|
-
# Create a markdown image element for the screenshot
|
|
117
|
-
image_markdown = (
|
|
118
|
-
f""
|
|
119
|
-
)
|
|
120
|
-
|
|
121
|
-
# Simply append the screenshot as a new message
|
|
83
|
+
async def on_screenshot(self, screenshot_base64: str, action_type: str = "") -> None:
|
|
84
|
+
"""Add screenshot to chatbot when a screenshot is taken."""
|
|
85
|
+
image_markdown = f""
|
|
86
|
+
|
|
122
87
|
if self.chatbot_history is not None:
|
|
123
88
|
self.chatbot_history.append(
|
|
124
89
|
gr.ChatMessage(
|
|
@@ -129,198 +94,68 @@ class GradioChatScreenshotHandler(DefaultCallbackHandler):
|
|
|
129
94
|
)
|
|
130
95
|
|
|
131
96
|
|
|
132
|
-
# Detect
|
|
97
|
+
# Detect platform capabilities
|
|
133
98
|
is_mac = platform.system().lower() == "darwin"
|
|
134
|
-
|
|
135
|
-
# Detect if lume is available (host device is macOS)
|
|
136
99
|
is_lume_available = is_mac or (os.environ.get("PYLUME_HOST", "localhost") != "localhost")
|
|
137
100
|
|
|
138
101
|
print("PYLUME_HOST: ", os.environ.get("PYLUME_HOST", "localhost"))
|
|
139
102
|
print("is_mac: ", is_mac)
|
|
140
103
|
print("Lume available: ", is_lume_available)
|
|
141
104
|
|
|
142
|
-
# Map model names to
|
|
105
|
+
# Map model names to agent model strings
|
|
143
106
|
MODEL_MAPPINGS = {
|
|
144
107
|
"openai": {
|
|
145
|
-
|
|
146
|
-
"
|
|
147
|
-
"OpenAI: Computer-Use Preview": "computer-use-preview",
|
|
148
|
-
# Map standard OpenAI model names to CUA-specific model names
|
|
149
|
-
"gpt-4-turbo": "computer-use-preview",
|
|
150
|
-
"gpt-4o": "computer-use-preview",
|
|
151
|
-
"gpt-4": "computer-use-preview",
|
|
152
|
-
"gpt-4.5-preview": "computer-use-preview",
|
|
153
|
-
"gpt-4o-mini": "gpt-4o-mini",
|
|
108
|
+
"default": "openai/computer-use-preview",
|
|
109
|
+
"OpenAI: Computer-Use Preview": "openai/computer-use-preview",
|
|
154
110
|
},
|
|
155
111
|
"anthropic": {
|
|
156
|
-
|
|
157
|
-
"
|
|
158
|
-
|
|
159
|
-
"Anthropic: Claude
|
|
160
|
-
"Anthropic: Claude
|
|
161
|
-
"claude-opus-4-20250514": "claude-opus-4-20250514",
|
|
162
|
-
"claude-sonnet-4-20250514": "claude-sonnet-4-20250514",
|
|
163
|
-
|
|
164
|
-
# Specific Claude models for CUA
|
|
165
|
-
"Anthropic: Claude 3.7 Sonnet (20250219)": "claude-3-5-sonnet-20240620",
|
|
166
|
-
"Anthropic: Claude 3.5 Sonnet (20240620)": "claude-3-7-sonnet-20250219",
|
|
167
|
-
"claude-3-7-sonnet-20250219": "claude-3-7-sonnet-20250219",
|
|
168
|
-
"claude-3-5-sonnet-20240620": "claude-3-5-sonnet-20240620",
|
|
169
|
-
# Map standard model names to CUA-specific model names
|
|
170
|
-
"claude-3-opus": "claude-3-7-sonnet-20250219",
|
|
171
|
-
"claude-3-sonnet": "claude-3-5-sonnet-20240620",
|
|
172
|
-
"claude-3-5-sonnet": "claude-3-5-sonnet-20240620",
|
|
173
|
-
"claude-3-7-sonnet": "claude-3-7-sonnet-20250219",
|
|
112
|
+
"default": "anthropic/claude-3-7-sonnet-20250219",
|
|
113
|
+
"Anthropic: Claude 4 Opus (20250514)": "anthropic/claude-opus-4-20250514",
|
|
114
|
+
"Anthropic: Claude 4 Sonnet (20250514)": "anthropic/claude-sonnet-4-20250514",
|
|
115
|
+
"Anthropic: Claude 3.7 Sonnet (20250219)": "anthropic/claude-3-7-sonnet-20250219",
|
|
116
|
+
"Anthropic: Claude 3.5 Sonnet (20240620)": "anthropic/claude-3-5-sonnet-20240620",
|
|
174
117
|
},
|
|
175
118
|
"omni": {
|
|
176
|
-
|
|
177
|
-
"
|
|
178
|
-
"
|
|
179
|
-
"
|
|
180
|
-
"
|
|
181
|
-
"gpt-4.5-preview": "gpt-4.5-preview",
|
|
182
|
-
"claude-3-5-sonnet-20240620": "claude-3-5-sonnet-20240620",
|
|
183
|
-
"claude-3-7-sonnet-20250219": "claude-3-7-sonnet-20250219",
|
|
119
|
+
"default": "omniparser+openai/gpt-4o",
|
|
120
|
+
"OMNI: OpenAI GPT-4o": "omniparser+openai/gpt-4o",
|
|
121
|
+
"OMNI: OpenAI GPT-4o mini": "omniparser+openai/gpt-4o-mini",
|
|
122
|
+
"OMNI: Claude 3.7 Sonnet (20250219)": "omniparser+anthropic/claude-3-7-sonnet-20250219",
|
|
123
|
+
"OMNI: Claude 3.5 Sonnet (20240620)": "omniparser+anthropic/claude-3-5-sonnet-20240620",
|
|
184
124
|
},
|
|
185
125
|
"uitars": {
|
|
186
|
-
|
|
187
|
-
"
|
|
188
|
-
"mlx-community/UI-TARS-1.5-7B-4bit": "mlx-community/UI-TARS-1.5-7B-4bit",
|
|
189
|
-
"mlx-community/UI-TARS-1.5-7B-6bit": "mlx-community/UI-TARS-1.5-7B-6bit"
|
|
190
|
-
},
|
|
191
|
-
"ollama": {
|
|
192
|
-
# For Ollama models, we keep the original name
|
|
193
|
-
"default": "llama3", # A common default model
|
|
194
|
-
# Don't map other models - we'll use the original name
|
|
195
|
-
},
|
|
196
|
-
"oaicompat": {
|
|
197
|
-
# Default for OpenAI-compatible providers like VLLM
|
|
198
|
-
"default": "Qwen2.5-VL-7B-Instruct",
|
|
126
|
+
"default": "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B" if is_mac else "ui-tars",
|
|
127
|
+
"huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B": "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
|
|
199
128
|
},
|
|
200
129
|
}
|
|
201
130
|
|
|
202
131
|
|
|
203
|
-
def
|
|
204
|
-
"""
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
"OMNI": AgentLoop.OMNI,
|
|
219
|
-
"OMNI-OLLAMA": AgentLoop.OMNI, # Special case for Ollama models with OMNI parser
|
|
220
|
-
"UITARS": AgentLoop.UITARS, # UI-TARS implementation
|
|
221
|
-
}
|
|
222
|
-
agent_loop = loop_provider_map.get(loop_provider, AgentLoop.OPENAI)
|
|
223
|
-
|
|
224
|
-
# Set up the provider and model based on the loop and model_name
|
|
225
|
-
if agent_loop == AgentLoop.OPENAI:
|
|
226
|
-
provider = LLMProvider.OPENAI
|
|
227
|
-
model_name_to_use = MODEL_MAPPINGS["openai"].get(
|
|
228
|
-
model_name, MODEL_MAPPINGS["openai"]["default"]
|
|
229
|
-
)
|
|
230
|
-
elif agent_loop == AgentLoop.ANTHROPIC:
|
|
231
|
-
provider = LLMProvider.ANTHROPIC
|
|
232
|
-
model_name_to_use = MODEL_MAPPINGS["anthropic"].get(
|
|
233
|
-
model_name, MODEL_MAPPINGS["anthropic"]["default"]
|
|
234
|
-
)
|
|
235
|
-
elif agent_loop == AgentLoop.OMNI:
|
|
236
|
-
# Determine provider and clean model name based on the full string from UI
|
|
237
|
-
cleaned_model_name = model_name # Default to using the name as-is (for custom)
|
|
238
|
-
|
|
239
|
-
if model_name == "Custom model (OpenAI compatible API)":
|
|
240
|
-
# Actual model name comes from custom_model_value via model_to_use.
|
|
241
|
-
# Assume OAICOMPAT for custom models unless overridden by URL/key later?
|
|
242
|
-
# get_provider_and_model determines the *initial* provider/model.
|
|
243
|
-
# The custom URL/key in process_response ultimately dictates the OAICOMPAT setup.
|
|
244
|
-
provider = LLMProvider.OAICOMPAT
|
|
245
|
-
# We set cleaned_model_name below outside the checks based on model_to_use
|
|
246
|
-
cleaned_model_name = "" # Placeholder, will be set by custom value later
|
|
247
|
-
elif model_name.startswith("OMNI: Ollama "):
|
|
248
|
-
provider = LLMProvider.OLLAMA
|
|
249
|
-
# Extract the part after "OMNI: Ollama "
|
|
250
|
-
cleaned_model_name = model_name.split("OMNI: Ollama ", 1)[1]
|
|
251
|
-
elif model_name.startswith("OMNI: Claude "):
|
|
252
|
-
provider = LLMProvider.ANTHROPIC
|
|
253
|
-
|
|
254
|
-
model_name = model_name.replace("OMNI: ", "Anthropic: ")
|
|
255
|
-
cleaned_model_name = MODEL_MAPPINGS["anthropic"].get(
|
|
256
|
-
model_name, MODEL_MAPPINGS["anthropic"]["default"]
|
|
257
|
-
)
|
|
258
|
-
elif model_name.startswith("OMNI: OpenAI "):
|
|
259
|
-
provider = LLMProvider.OPENAI
|
|
260
|
-
# Extract the model part, e.g., "GPT-4o mini"
|
|
261
|
-
model_key_part = model_name.replace("OMNI: OpenAI ", "")
|
|
262
|
-
# Normalize the extracted part: "gpt4omini"
|
|
263
|
-
model_key_part_norm = model_key_part.lower().replace("-", "").replace(" ", "")
|
|
264
|
-
|
|
265
|
-
cleaned_model_name = MODEL_MAPPINGS["omni"]["default"] # Default if not found
|
|
266
|
-
# Find the canonical name in the main OMNI map for OpenAI models
|
|
267
|
-
for key_omni, val_omni in MODEL_MAPPINGS["omni"].items():
|
|
268
|
-
# Normalize the omni map key: "gpt-4o-mini" -> "gpt4omini"
|
|
269
|
-
key_omni_norm = key_omni.lower().replace("-", "").replace(" ", "")
|
|
270
|
-
# Check if the normalized omni key matches the normalized extracted part
|
|
271
|
-
if key_omni_norm == model_key_part_norm:
|
|
272
|
-
cleaned_model_name = (
|
|
273
|
-
val_omni # Use the value from the OMNI map (e.g., gpt-4o-mini)
|
|
274
|
-
)
|
|
275
|
-
break
|
|
276
|
-
# Note: No fallback needed here as we explicitly check against omni keys
|
|
277
|
-
|
|
278
|
-
else: # Handles unexpected formats or the raw custom name if "Custom model (OpenAI compatible API)" selected
|
|
279
|
-
# Should only happen if user selected "Custom model (OpenAI compatible API)"
|
|
280
|
-
# Or if a model name format isn't caught above
|
|
281
|
-
provider = LLMProvider.OAICOMPAT
|
|
282
|
-
cleaned_model_name = (
|
|
283
|
-
model_name.strip() if model_name.strip() else MODEL_MAPPINGS["oaicompat"]["default"]
|
|
284
|
-
)
|
|
285
|
-
|
|
286
|
-
# Assign the determined model name
|
|
287
|
-
model_name_to_use = cleaned_model_name
|
|
288
|
-
# agent_loop remains AgentLoop.OMNI
|
|
289
|
-
elif agent_loop == AgentLoop.UITARS:
|
|
290
|
-
# For UITARS, use MLXVLM for mlx-community models, OAICOMPAT for custom
|
|
291
|
-
if model_name == "Custom model (OpenAI compatible API)":
|
|
292
|
-
provider = LLMProvider.OAICOMPAT
|
|
293
|
-
model_name_to_use = "tgi"
|
|
294
|
-
else:
|
|
295
|
-
provider = LLMProvider.MLXVLM
|
|
296
|
-
# Get the model name from the mappings or use as-is if not found
|
|
297
|
-
model_name_to_use = MODEL_MAPPINGS["uitars"].get(
|
|
298
|
-
model_name, model_name if model_name else MODEL_MAPPINGS["uitars"]["default"]
|
|
299
|
-
)
|
|
300
|
-
else:
|
|
301
|
-
# Default to OpenAI if unrecognized loop
|
|
302
|
-
provider = LLMProvider.OPENAI
|
|
303
|
-
model_name_to_use = MODEL_MAPPINGS["openai"]["default"]
|
|
304
|
-
agent_loop = AgentLoop.OPENAI
|
|
305
|
-
|
|
306
|
-
print(f"Mapping {model_name} and {loop_provider} to {provider}, {model_name_to_use}, {agent_loop}")
|
|
307
|
-
|
|
308
|
-
return provider, model_name_to_use, agent_loop
|
|
132
|
+
def get_model_string(model_name: str, loop_provider: str) -> str:
|
|
133
|
+
"""Determine the agent model string based on the input."""
|
|
134
|
+
if model_name == "Custom model (OpenAI compatible API)":
|
|
135
|
+
return "custom_oaicompat"
|
|
136
|
+
elif model_name == "Custom model (ollama)":
|
|
137
|
+
return "custom_ollama"
|
|
138
|
+
elif loop_provider == "OMNI-OLLAMA" or model_name.startswith("OMNI: Ollama "):
|
|
139
|
+
if model_name.startswith("OMNI: Ollama "):
|
|
140
|
+
ollama_model = model_name.split("OMNI: Ollama ", 1)[1]
|
|
141
|
+
return f"omniparser+ollama_chat/{ollama_model}"
|
|
142
|
+
return "omniparser+ollama_chat/llama3"
|
|
143
|
+
|
|
144
|
+
# Map based on loop provider
|
|
145
|
+
mapping = MODEL_MAPPINGS.get(loop_provider.lower(), MODEL_MAPPINGS["openai"])
|
|
146
|
+
return mapping.get(model_name, mapping["default"])
|
|
309
147
|
|
|
310
148
|
|
|
311
149
|
def get_ollama_models() -> List[str]:
|
|
312
150
|
"""Get available models from Ollama if installed."""
|
|
313
151
|
try:
|
|
314
152
|
import subprocess
|
|
315
|
-
|
|
316
153
|
result = subprocess.run(["ollama", "list"], capture_output=True, text=True)
|
|
317
154
|
if result.returncode == 0:
|
|
318
155
|
lines = result.stdout.strip().split("\n")
|
|
319
|
-
if len(lines) < 2:
|
|
156
|
+
if len(lines) < 2:
|
|
320
157
|
return []
|
|
321
|
-
|
|
322
158
|
models = []
|
|
323
|
-
# Skip header line
|
|
324
159
|
for line in lines[1:]:
|
|
325
160
|
parts = line.split()
|
|
326
161
|
if parts:
|
|
@@ -342,7 +177,6 @@ def create_computer_instance(
|
|
|
342
177
|
) -> Computer:
|
|
343
178
|
"""Create or get the global Computer instance."""
|
|
344
179
|
global global_computer
|
|
345
|
-
|
|
346
180
|
if global_computer is None:
|
|
347
181
|
global_computer = Computer(
|
|
348
182
|
verbosity=verbosity,
|
|
@@ -351,29 +185,25 @@ def create_computer_instance(
|
|
|
351
185
|
name=name if name else "",
|
|
352
186
|
api_key=api_key
|
|
353
187
|
)
|
|
354
|
-
|
|
355
188
|
return global_computer
|
|
356
189
|
|
|
357
190
|
|
|
358
191
|
def create_agent(
|
|
359
|
-
|
|
360
|
-
agent_loop: AgentLoop,
|
|
361
|
-
model_name: str,
|
|
362
|
-
api_key: Optional[str] = None,
|
|
192
|
+
model_string: str,
|
|
363
193
|
save_trajectory: bool = True,
|
|
364
194
|
only_n_most_recent_images: int = 3,
|
|
365
195
|
verbosity: int = logging.INFO,
|
|
366
|
-
|
|
367
|
-
provider_base_url: Optional[str] = None,
|
|
196
|
+
custom_model_name: Optional[str] = None,
|
|
368
197
|
computer_os: str = "macos",
|
|
369
198
|
computer_provider: str = "lume",
|
|
370
199
|
computer_name: Optional[str] = None,
|
|
371
200
|
computer_api_key: Optional[str] = None,
|
|
201
|
+
max_trajectory_budget: Optional[float] = None,
|
|
372
202
|
) -> ComputerAgent:
|
|
373
203
|
"""Create or update the global agent with the specified parameters."""
|
|
374
204
|
global global_agent
|
|
375
205
|
|
|
376
|
-
# Create the computer
|
|
206
|
+
# Create the computer
|
|
377
207
|
computer = create_computer_instance(
|
|
378
208
|
verbosity=verbosity,
|
|
379
209
|
os_type=computer_os,
|
|
@@ -382,1085 +212,36 @@ def create_agent(
|
|
|
382
212
|
api_key=computer_api_key
|
|
383
213
|
)
|
|
384
214
|
|
|
385
|
-
#
|
|
386
|
-
if
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
# Special handling for OAICOMPAT - use OAICOMPAT provider with custom base URL
|
|
398
|
-
print(f"DEBUG - Creating OAICOMPAT agent with model: {model_name}, URL: {custom_base_url}")
|
|
399
|
-
llm = LLM(
|
|
400
|
-
provider=LLMProvider.OAICOMPAT, # Set to OAICOMPAT instead of using original provider
|
|
401
|
-
name=model_name,
|
|
402
|
-
provider_base_url=custom_base_url,
|
|
403
|
-
)
|
|
404
|
-
print(f"DEBUG - LLM provider is now: {llm.provider}, base URL: {llm.provider_base_url}")
|
|
405
|
-
# Note: Don't pass use_oaicompat to the agent, as it doesn't accept this parameter
|
|
406
|
-
elif provider == LLMProvider.OAICOMPAT:
|
|
407
|
-
# This path is unlikely to be taken with our current approach
|
|
408
|
-
llm = LLM(provider=provider, name=model_name, provider_base_url=custom_base_url)
|
|
409
|
-
else:
|
|
410
|
-
# For other providers, just use standard parameters
|
|
411
|
-
llm = LLM(provider=provider, name=model_name)
|
|
412
|
-
|
|
413
|
-
# Create or update the agent
|
|
414
|
-
global_agent = ComputerAgent(
|
|
415
|
-
computer=computer,
|
|
416
|
-
loop=agent_loop,
|
|
417
|
-
model=llm,
|
|
418
|
-
api_key=api_key,
|
|
419
|
-
save_trajectory=save_trajectory,
|
|
420
|
-
only_n_most_recent_images=only_n_most_recent_images,
|
|
421
|
-
verbosity=verbosity,
|
|
422
|
-
)
|
|
423
|
-
|
|
424
|
-
return global_agent
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
def create_gradio_ui(
|
|
428
|
-
provider_name: str = "openai",
|
|
429
|
-
model_name: str = "gpt-4o",
|
|
430
|
-
) -> gr.Blocks:
|
|
431
|
-
"""Create a Gradio UI for the Computer-Use Agent.
|
|
432
|
-
|
|
433
|
-
Args:
|
|
434
|
-
provider_name: The provider to use (e.g., "openai", "anthropic")
|
|
435
|
-
model_name: The model to use (e.g., "gpt-4o", "claude-3-7-sonnet")
|
|
436
|
-
|
|
437
|
-
Returns:
|
|
438
|
-
A Gradio Blocks application
|
|
439
|
-
"""
|
|
440
|
-
# --- Load Settings ---
|
|
441
|
-
saved_settings = load_settings()
|
|
442
|
-
# --- End Load Settings ---
|
|
443
|
-
|
|
444
|
-
# Check for API keys
|
|
445
|
-
openai_api_key = os.environ.get("OPENAI_API_KEY", "")
|
|
446
|
-
anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY", "")
|
|
447
|
-
cua_api_key = os.environ.get("CUA_API_KEY", "")
|
|
448
|
-
|
|
449
|
-
# Always show models regardless of API key availability
|
|
450
|
-
openai_models = ["OpenAI: Computer-Use Preview"]
|
|
451
|
-
anthropic_models = [
|
|
452
|
-
"Anthropic: Claude 4 Opus (20250514)",
|
|
453
|
-
"Anthropic: Claude 4 Sonnet (20250514)",
|
|
454
|
-
|
|
455
|
-
"Anthropic: Claude 3.7 Sonnet (20250219)",
|
|
456
|
-
"Anthropic: Claude 3.5 Sonnet (20240620)",
|
|
457
|
-
]
|
|
458
|
-
omni_models = [
|
|
459
|
-
"OMNI: OpenAI GPT-4o",
|
|
460
|
-
"OMNI: OpenAI GPT-4o mini",
|
|
461
|
-
"OMNI: OpenAI GPT-4.5-preview",
|
|
462
|
-
"OMNI: Claude 4 Opus (20250514)",
|
|
463
|
-
"OMNI: Claude 4 Sonnet (20250514)",
|
|
464
|
-
"OMNI: Claude 3.7 Sonnet (20250219)",
|
|
465
|
-
"OMNI: Claude 3.5 Sonnet (20240620)"
|
|
466
|
-
]
|
|
467
|
-
|
|
468
|
-
# Check if API keys are available
|
|
469
|
-
has_openai_key = bool(openai_api_key)
|
|
470
|
-
has_anthropic_key = bool(anthropic_api_key)
|
|
471
|
-
has_cua_key = bool(cua_api_key)
|
|
472
|
-
|
|
473
|
-
print("has_openai_key", has_openai_key)
|
|
474
|
-
print("has_anthropic_key", has_anthropic_key)
|
|
475
|
-
print("has_cua_key", has_cua_key)
|
|
476
|
-
|
|
477
|
-
# Get Ollama models for OMNI
|
|
478
|
-
ollama_models = get_ollama_models()
|
|
479
|
-
if ollama_models:
|
|
480
|
-
omni_models += ollama_models
|
|
481
|
-
|
|
482
|
-
# Detect if current device is MacOS
|
|
483
|
-
is_mac = platform.system().lower() == "darwin"
|
|
484
|
-
|
|
485
|
-
# Format model choices
|
|
486
|
-
provider_to_models = {
|
|
487
|
-
"OPENAI": openai_models,
|
|
488
|
-
"ANTHROPIC": anthropic_models,
|
|
489
|
-
"OMNI": omni_models + ["Custom model (OpenAI compatible API)", "Custom model (ollama)"], # Add custom model options
|
|
490
|
-
"UITARS": ([
|
|
491
|
-
"mlx-community/UI-TARS-1.5-7B-4bit",
|
|
492
|
-
"mlx-community/UI-TARS-1.5-7B-6bit",
|
|
493
|
-
] if is_mac else []) + ["Custom model (OpenAI compatible API)"], # UI-TARS options with MLX models
|
|
215
|
+
# Handle custom models
|
|
216
|
+
if model_string == "custom_oaicompat" and custom_model_name:
|
|
217
|
+
model_string = custom_model_name
|
|
218
|
+
elif model_string == "custom_ollama" and custom_model_name:
|
|
219
|
+
model_string = f"omniparser+ollama_chat/{custom_model_name}"
|
|
220
|
+
|
|
221
|
+
# Create agent kwargs
|
|
222
|
+
agent_kwargs = {
|
|
223
|
+
"model": model_string,
|
|
224
|
+
"tools": [computer],
|
|
225
|
+
"only_n_most_recent_images": only_n_most_recent_images,
|
|
226
|
+
"verbosity": verbosity,
|
|
494
227
|
}
|
|
495
|
-
|
|
496
|
-
# --- Apply Saved Settings (override defaults if available) ---
|
|
497
|
-
initial_loop = saved_settings.get("agent_loop", "OMNI")
|
|
498
|
-
# Ensure the saved model is actually available in the choices for the loaded loop
|
|
499
|
-
available_models_for_loop = provider_to_models.get(initial_loop, [])
|
|
500
|
-
saved_model_choice = saved_settings.get("model_choice")
|
|
501
|
-
if saved_model_choice and saved_model_choice in available_models_for_loop:
|
|
502
|
-
initial_model = saved_model_choice
|
|
503
|
-
else:
|
|
504
|
-
# If saved model isn't valid for the loop, reset to default for that loop
|
|
505
|
-
if initial_loop == "OPENAI":
|
|
506
|
-
initial_model = (
|
|
507
|
-
"OpenAI: Computer-Use Preview" if openai_models else "No models available"
|
|
508
|
-
)
|
|
509
|
-
elif initial_loop == "ANTHROPIC":
|
|
510
|
-
initial_model = anthropic_models[0] if anthropic_models else "No models available"
|
|
511
|
-
else: # OMNI
|
|
512
|
-
initial_model = omni_models[0] if omni_models else "Custom model (OpenAI compatible API)"
|
|
513
|
-
if "Custom model (OpenAI compatible API)" in available_models_for_loop:
|
|
514
|
-
initial_model = (
|
|
515
|
-
"Custom model (OpenAI compatible API)" # Default to custom if available and no other default fits
|
|
516
|
-
)
|
|
517
|
-
|
|
518
|
-
initial_custom_model = saved_settings.get("custom_model", "Qwen2.5-VL-7B-Instruct")
|
|
519
|
-
initial_provider_base_url = saved_settings.get("provider_base_url", "http://localhost:1234/v1")
|
|
520
|
-
initial_save_trajectory = saved_settings.get("save_trajectory", True)
|
|
521
|
-
initial_recent_images = saved_settings.get("recent_images", 3)
|
|
522
|
-
# --- End Apply Saved Settings ---
|
|
523
|
-
|
|
524
|
-
# Example prompts
|
|
525
|
-
example_messages = [
|
|
526
|
-
"Create a Python virtual environment, install pandas and matplotlib, then plot stock data",
|
|
527
|
-
"Open a PDF in Preview, add annotations, and save it as a compressed version",
|
|
528
|
-
"Open Safari, search for 'macOS automation tools', and save the first three results as bookmarks",
|
|
529
|
-
"Configure SSH keys and set up a connection to a remote server",
|
|
530
|
-
]
|
|
531
228
|
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
agent_loop_choice: The agent loop type (e.g., UITARS, OPENAI, ANTHROPIC, OMNI)
|
|
538
|
-
provider: The provider type (e.g., OPENAI, ANTHROPIC, OLLAMA, OAICOMPAT, MLXVLM)
|
|
539
|
-
model_name: The model name
|
|
540
|
-
tasks: List of tasks to execute
|
|
541
|
-
provider_url: The provider base URL for OAICOMPAT providers
|
|
542
|
-
recent_images: Number of recent images to keep in context
|
|
543
|
-
save_trajectory: Whether to save the agent trajectory
|
|
544
|
-
computer_os: Operating system type for the computer
|
|
545
|
-
computer_provider: Provider type for the computer
|
|
546
|
-
container_name: Optional VM name
|
|
547
|
-
cua_cloud_api_key: Optional CUA Cloud API key
|
|
548
|
-
|
|
549
|
-
Returns:
|
|
550
|
-
Formatted Python code as a string
|
|
551
|
-
"""
|
|
552
|
-
# Format the tasks as a Python list
|
|
553
|
-
tasks_str = ""
|
|
554
|
-
for task in tasks:
|
|
555
|
-
if task and task.strip():
|
|
556
|
-
tasks_str += f' "{task}",\n'
|
|
557
|
-
|
|
558
|
-
# Create the Python code template with computer configuration
|
|
559
|
-
computer_args = []
|
|
560
|
-
if computer_os != "macos":
|
|
561
|
-
computer_args.append(f'os_type="{computer_os}"')
|
|
562
|
-
if computer_provider != "lume":
|
|
563
|
-
computer_args.append(f'provider_type="{computer_provider}"')
|
|
564
|
-
if container_name:
|
|
565
|
-
computer_args.append(f'name="{container_name}"')
|
|
566
|
-
if cua_cloud_api_key:
|
|
567
|
-
computer_args.append(f'api_key="{cua_cloud_api_key}"')
|
|
568
|
-
|
|
569
|
-
computer_args_str = ", ".join(computer_args)
|
|
570
|
-
if computer_args_str:
|
|
571
|
-
computer_args_str = f"({computer_args_str})"
|
|
572
|
-
else:
|
|
573
|
-
computer_args_str = "()"
|
|
574
|
-
|
|
575
|
-
code = f'''import asyncio
|
|
576
|
-
from computer import Computer
|
|
577
|
-
from agent import ComputerAgent, LLM, AgentLoop, LLMProvider
|
|
578
|
-
|
|
579
|
-
async def main():
|
|
580
|
-
async with Computer{computer_args_str} as macos_computer:
|
|
581
|
-
agent = ComputerAgent(
|
|
582
|
-
computer=macos_computer,
|
|
583
|
-
loop=AgentLoop.{agent_loop_choice},
|
|
584
|
-
only_n_most_recent_images={recent_images},
|
|
585
|
-
save_trajectory={save_trajectory},'''
|
|
586
|
-
|
|
587
|
-
# Add the model configuration based on provider and agent loop
|
|
588
|
-
if agent_loop_choice == "OPENAI":
|
|
589
|
-
# For OPENAI loop, always use OPENAI provider with computer-use-preview
|
|
590
|
-
code += f'''
|
|
591
|
-
model=LLM(
|
|
592
|
-
provider=LLMProvider.OPENAI,
|
|
593
|
-
name="computer-use-preview"
|
|
594
|
-
)'''
|
|
595
|
-
elif agent_loop_choice == "ANTHROPIC":
|
|
596
|
-
# For ANTHROPIC loop, always use ANTHROPIC provider
|
|
597
|
-
code += f'''
|
|
598
|
-
model=LLM(
|
|
599
|
-
provider=LLMProvider.ANTHROPIC,
|
|
600
|
-
name="{model_name}"
|
|
601
|
-
)'''
|
|
602
|
-
elif agent_loop_choice == "UITARS":
|
|
603
|
-
# For UITARS, use MLXVLM for mlx-community models, OAICOMPAT for others
|
|
604
|
-
if provider == LLMProvider.MLXVLM:
|
|
605
|
-
code += f'''
|
|
606
|
-
model=LLM(
|
|
607
|
-
provider=LLMProvider.MLXVLM,
|
|
608
|
-
name="{model_name}"
|
|
609
|
-
)'''
|
|
610
|
-
else: # OAICOMPAT
|
|
611
|
-
code += f'''
|
|
612
|
-
model=LLM(
|
|
613
|
-
provider=LLMProvider.OAICOMPAT,
|
|
614
|
-
name="{model_name}",
|
|
615
|
-
provider_base_url="{provider_url}"
|
|
616
|
-
)'''
|
|
617
|
-
elif agent_loop_choice == "OMNI":
|
|
618
|
-
# For OMNI, provider can be OPENAI, ANTHROPIC, OLLAMA, or OAICOMPAT
|
|
619
|
-
if provider == LLMProvider.OAICOMPAT:
|
|
620
|
-
code += f'''
|
|
621
|
-
model=LLM(
|
|
622
|
-
provider=LLMProvider.OAICOMPAT,
|
|
623
|
-
name="{model_name}",
|
|
624
|
-
provider_base_url="{provider_url}"
|
|
625
|
-
)'''
|
|
626
|
-
else: # OPENAI, ANTHROPIC, OLLAMA
|
|
627
|
-
code += f'''
|
|
628
|
-
model=LLM(
|
|
629
|
-
provider=LLMProvider.{provider.name},
|
|
630
|
-
name="{model_name}"
|
|
631
|
-
)'''
|
|
632
|
-
else:
|
|
633
|
-
# Default case - just use the provided provider and model
|
|
634
|
-
code += f'''
|
|
635
|
-
model=LLM(
|
|
636
|
-
provider=LLMProvider.{provider.name},
|
|
637
|
-
name="{model_name}"
|
|
638
|
-
)'''
|
|
639
|
-
|
|
640
|
-
code += """
|
|
641
|
-
)
|
|
642
|
-
"""
|
|
643
|
-
|
|
644
|
-
# Add tasks section if there are tasks
|
|
645
|
-
if tasks_str:
|
|
646
|
-
code += f'''
|
|
647
|
-
# Prompts for the computer-use agent
|
|
648
|
-
tasks = [
|
|
649
|
-
{tasks_str.rstrip()}
|
|
650
|
-
]
|
|
651
|
-
|
|
652
|
-
for task in tasks:
|
|
653
|
-
print(f"Executing task: {{task}}")
|
|
654
|
-
async for result in agent.run(task):
|
|
655
|
-
print(result)'''
|
|
656
|
-
else:
|
|
657
|
-
# If no tasks, just add a placeholder for a single task
|
|
658
|
-
code += f'''
|
|
659
|
-
# Execute a single task
|
|
660
|
-
task = "Search for information about CUA on GitHub"
|
|
661
|
-
print(f"Executing task: {{task}}")
|
|
662
|
-
async for result in agent.run(task):
|
|
663
|
-
print(result)'''
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
# Add the main block
|
|
668
|
-
code += '''
|
|
669
|
-
|
|
670
|
-
if __name__ == "__main__":
|
|
671
|
-
asyncio.run(main())'''
|
|
672
|
-
|
|
673
|
-
return code
|
|
674
|
-
|
|
675
|
-
# Create the Gradio interface with advanced UI
|
|
676
|
-
with gr.Blocks(title="Computer-Use Agent") as demo:
|
|
677
|
-
with gr.Row():
|
|
678
|
-
# Left column for settings
|
|
679
|
-
with gr.Column(scale=1):
|
|
680
|
-
# Logo with theme-aware styling
|
|
681
|
-
gr.HTML(
|
|
682
|
-
"""
|
|
683
|
-
<style>
|
|
684
|
-
.light-logo, .dark-logo {
|
|
685
|
-
display: block;
|
|
686
|
-
margin: 0 auto;
|
|
687
|
-
width: 80px;
|
|
688
|
-
}
|
|
689
|
-
/* Hide dark logo in light mode */
|
|
690
|
-
.dark-logo {
|
|
691
|
-
display: none;
|
|
692
|
-
}
|
|
693
|
-
/* In dark mode, hide light logo and show dark logo */
|
|
694
|
-
.dark .light-logo {
|
|
695
|
-
display: none;
|
|
696
|
-
}
|
|
697
|
-
.dark .dark-logo {
|
|
698
|
-
display: block;
|
|
699
|
-
}
|
|
700
|
-
</style>
|
|
701
|
-
<div style="display: flex; justify-content: center; margin-bottom: 0.5em">
|
|
702
|
-
<img class="light-logo" alt="CUA Logo"
|
|
703
|
-
src="https://github.com/trycua/cua/blob/main/img/logo_black.png?raw=true" />
|
|
704
|
-
<img class="dark-logo" alt="CUA Logo"
|
|
705
|
-
src="https://github.com/trycua/cua/blob/main/img/logo_white.png?raw=true" />
|
|
706
|
-
</div>
|
|
707
|
-
"""
|
|
708
|
-
)
|
|
709
|
-
|
|
710
|
-
# Add accordion for Python code
|
|
711
|
-
with gr.Accordion("Python Code", open=False):
|
|
712
|
-
code_display = gr.Code(
|
|
713
|
-
language="python",
|
|
714
|
-
value=generate_python_code(
|
|
715
|
-
initial_loop,
|
|
716
|
-
LLMProvider.OPENAI,
|
|
717
|
-
"gpt-4o",
|
|
718
|
-
[],
|
|
719
|
-
"https://openrouter.ai/api/v1",
|
|
720
|
-
3, # recent_images default
|
|
721
|
-
True, # save_trajectory default
|
|
722
|
-
"macos",
|
|
723
|
-
"lume",
|
|
724
|
-
"",
|
|
725
|
-
""
|
|
726
|
-
),
|
|
727
|
-
interactive=False,
|
|
728
|
-
)
|
|
729
|
-
|
|
730
|
-
with gr.Accordion("Computer Configuration", open=True):
|
|
731
|
-
# Computer configuration options
|
|
732
|
-
computer_os = gr.Radio(
|
|
733
|
-
choices=["macos", "linux", "windows"],
|
|
734
|
-
label="Operating System",
|
|
735
|
-
value="macos",
|
|
736
|
-
info="Select the operating system for the computer",
|
|
737
|
-
)
|
|
738
|
-
|
|
739
|
-
is_windows = platform.system().lower() == "windows"
|
|
740
|
-
is_mac = platform.system().lower() == "darwin"
|
|
741
|
-
|
|
742
|
-
providers = ["cloud"]
|
|
743
|
-
if is_lume_available:
|
|
744
|
-
providers += ["lume"]
|
|
745
|
-
if is_windows:
|
|
746
|
-
providers += ["winsandbox"]
|
|
747
|
-
|
|
748
|
-
computer_provider = gr.Radio(
|
|
749
|
-
choices=providers,
|
|
750
|
-
label="Provider",
|
|
751
|
-
value="lume" if is_mac else "cloud",
|
|
752
|
-
info="Select the computer provider",
|
|
753
|
-
)
|
|
754
|
-
|
|
755
|
-
container_name = gr.Textbox(
|
|
756
|
-
label="Container Name",
|
|
757
|
-
placeholder="Enter container name (optional)",
|
|
758
|
-
value="",
|
|
759
|
-
info="Optional name for the container",
|
|
760
|
-
)
|
|
761
|
-
|
|
762
|
-
cua_cloud_api_key = gr.Textbox(
|
|
763
|
-
label="CUA Cloud API Key",
|
|
764
|
-
placeholder="Enter your CUA Cloud API key",
|
|
765
|
-
value="",
|
|
766
|
-
type="password",
|
|
767
|
-
info="Required for cloud provider",
|
|
768
|
-
visible=(not has_cua_key)
|
|
769
|
-
)
|
|
770
|
-
|
|
771
|
-
with gr.Accordion("Agent Configuration", open=True):
|
|
772
|
-
# Configuration options
|
|
773
|
-
agent_loop = gr.Dropdown(
|
|
774
|
-
choices=["OPENAI", "ANTHROPIC", "OMNI", "UITARS"],
|
|
775
|
-
label="Agent Loop",
|
|
776
|
-
value=initial_loop,
|
|
777
|
-
info="Select the agent loop provider",
|
|
778
|
-
)
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
# Create separate model selection dropdowns for each provider type
|
|
782
|
-
# This avoids the Gradio bug with updating choices
|
|
783
|
-
with gr.Group() as model_selection_group:
|
|
784
|
-
# OpenAI models dropdown
|
|
785
|
-
openai_model_choice = gr.Dropdown(
|
|
786
|
-
choices=openai_models,
|
|
787
|
-
label="OpenAI Model",
|
|
788
|
-
value=openai_models[0] if openai_models else "No models available",
|
|
789
|
-
info="Select OpenAI model",
|
|
790
|
-
interactive=True,
|
|
791
|
-
visible=(initial_loop == "OPENAI")
|
|
792
|
-
)
|
|
793
|
-
|
|
794
|
-
# Anthropic models dropdown
|
|
795
|
-
anthropic_model_choice = gr.Dropdown(
|
|
796
|
-
choices=anthropic_models,
|
|
797
|
-
label="Anthropic Model",
|
|
798
|
-
value=anthropic_models[0] if anthropic_models else "No models available",
|
|
799
|
-
info="Select Anthropic model",
|
|
800
|
-
interactive=True,
|
|
801
|
-
visible=(initial_loop == "ANTHROPIC")
|
|
802
|
-
)
|
|
803
|
-
|
|
804
|
-
# OMNI models dropdown
|
|
805
|
-
omni_model_choice = gr.Dropdown(
|
|
806
|
-
choices=omni_models + ["Custom model (OpenAI compatible API)", "Custom model (ollama)"],
|
|
807
|
-
label="OMNI Model",
|
|
808
|
-
value=omni_models[0] if omni_models else "Custom model (OpenAI compatible API)",
|
|
809
|
-
info="Select OMNI model or choose a custom model option",
|
|
810
|
-
interactive=True,
|
|
811
|
-
visible=(initial_loop == "OMNI")
|
|
812
|
-
)
|
|
813
|
-
|
|
814
|
-
# UITARS models dropdown
|
|
815
|
-
uitars_model_choice = gr.Dropdown(
|
|
816
|
-
choices=provider_to_models.get("UITARS", ["No models available"]),
|
|
817
|
-
label="UITARS Model",
|
|
818
|
-
value=provider_to_models.get("UITARS", ["No models available"])[0] if provider_to_models.get("UITARS") else "No models available",
|
|
819
|
-
info="Select UITARS model",
|
|
820
|
-
interactive=True,
|
|
821
|
-
visible=(initial_loop == "UITARS")
|
|
822
|
-
)
|
|
823
|
-
|
|
824
|
-
# Hidden field to store the selected model (for compatibility with existing code)
|
|
825
|
-
model_choice = gr.Textbox(visible=False)
|
|
826
|
-
|
|
827
|
-
# Add API key inputs for OpenAI and Anthropic
|
|
828
|
-
with gr.Group(visible=not has_openai_key and (initial_loop == "OPENAI" or initial_loop == "OMNI")) as openai_key_group:
|
|
829
|
-
openai_api_key_input = gr.Textbox(
|
|
830
|
-
label="OpenAI API Key",
|
|
831
|
-
placeholder="Enter your OpenAI API key",
|
|
832
|
-
value="",
|
|
833
|
-
interactive=True,
|
|
834
|
-
type="password",
|
|
835
|
-
info="Required for OpenAI models"
|
|
836
|
-
)
|
|
837
|
-
|
|
838
|
-
with gr.Group(visible=not has_anthropic_key and (initial_loop == "ANTHROPIC" or initial_loop == "OMNI")) as anthropic_key_group:
|
|
839
|
-
anthropic_api_key_input = gr.Textbox(
|
|
840
|
-
label="Anthropic API Key",
|
|
841
|
-
placeholder="Enter your Anthropic API key",
|
|
842
|
-
value="",
|
|
843
|
-
interactive=True,
|
|
844
|
-
type="password",
|
|
845
|
-
info="Required for Anthropic models"
|
|
846
|
-
)
|
|
847
|
-
|
|
848
|
-
# Function to set OpenAI API key environment variable
|
|
849
|
-
def set_openai_api_key(key):
|
|
850
|
-
if key and key.strip():
|
|
851
|
-
os.environ["OPENAI_API_KEY"] = key.strip()
|
|
852
|
-
print(f"DEBUG - Set OpenAI API key environment variable")
|
|
853
|
-
return key
|
|
854
|
-
|
|
855
|
-
# Function to set Anthropic API key environment variable
|
|
856
|
-
def set_anthropic_api_key(key):
|
|
857
|
-
if key and key.strip():
|
|
858
|
-
os.environ["ANTHROPIC_API_KEY"] = key.strip()
|
|
859
|
-
print(f"DEBUG - Set Anthropic API key environment variable")
|
|
860
|
-
return key
|
|
861
|
-
|
|
862
|
-
# Add change event handlers for API key inputs
|
|
863
|
-
openai_api_key_input.change(
|
|
864
|
-
fn=set_openai_api_key,
|
|
865
|
-
inputs=[openai_api_key_input],
|
|
866
|
-
outputs=[openai_api_key_input],
|
|
867
|
-
queue=False
|
|
868
|
-
)
|
|
869
|
-
|
|
870
|
-
anthropic_api_key_input.change(
|
|
871
|
-
fn=set_anthropic_api_key,
|
|
872
|
-
inputs=[anthropic_api_key_input],
|
|
873
|
-
outputs=[anthropic_api_key_input],
|
|
874
|
-
queue=False
|
|
875
|
-
)
|
|
876
|
-
|
|
877
|
-
# Combined function to update UI based on selections
|
|
878
|
-
def update_ui(loop=None, openai_model=None, anthropic_model=None, omni_model=None, uitars_model=None):
|
|
879
|
-
# Default values if not provided
|
|
880
|
-
loop = loop or agent_loop.value
|
|
881
|
-
|
|
882
|
-
# Determine which model value to use for custom model checks
|
|
883
|
-
model_value = None
|
|
884
|
-
if loop == "OPENAI" and openai_model:
|
|
885
|
-
model_value = openai_model
|
|
886
|
-
elif loop == "ANTHROPIC" and anthropic_model:
|
|
887
|
-
model_value = anthropic_model
|
|
888
|
-
elif loop == "OMNI" and omni_model:
|
|
889
|
-
model_value = omni_model
|
|
890
|
-
elif loop == "UITARS" and uitars_model:
|
|
891
|
-
model_value = uitars_model
|
|
892
|
-
|
|
893
|
-
# Show/hide appropriate model dropdown based on loop selection
|
|
894
|
-
openai_visible = (loop == "OPENAI")
|
|
895
|
-
anthropic_visible = (loop == "ANTHROPIC")
|
|
896
|
-
omni_visible = (loop == "OMNI")
|
|
897
|
-
uitars_visible = (loop == "UITARS")
|
|
898
|
-
|
|
899
|
-
# Show/hide API key inputs based on loop selection
|
|
900
|
-
show_openai_key = not has_openai_key and (loop == "OPENAI" or (loop == "OMNI" and model_value and "OpenAI" in model_value and "Custom" not in model_value))
|
|
901
|
-
show_anthropic_key = not has_anthropic_key and (loop == "ANTHROPIC" or (loop == "OMNI" and model_value and "Claude" in model_value and "Custom" not in model_value))
|
|
902
|
-
|
|
903
|
-
# Determine custom model visibility
|
|
904
|
-
is_custom_openai_api = model_value == "Custom model (OpenAI compatible API)"
|
|
905
|
-
is_custom_ollama = model_value == "Custom model (ollama)"
|
|
906
|
-
is_any_custom = is_custom_openai_api or is_custom_ollama
|
|
907
|
-
|
|
908
|
-
# Update the hidden model_choice field based on the visible dropdown
|
|
909
|
-
model_choice_value = model_value if model_value else ""
|
|
910
|
-
|
|
911
|
-
# Return all UI updates
|
|
912
|
-
return [
|
|
913
|
-
# Model dropdowns visibility
|
|
914
|
-
gr.update(visible=openai_visible),
|
|
915
|
-
gr.update(visible=anthropic_visible),
|
|
916
|
-
gr.update(visible=omni_visible),
|
|
917
|
-
gr.update(visible=uitars_visible),
|
|
918
|
-
# API key inputs visibility
|
|
919
|
-
gr.update(visible=show_openai_key),
|
|
920
|
-
gr.update(visible=show_anthropic_key),
|
|
921
|
-
# Custom model fields visibility
|
|
922
|
-
gr.update(visible=is_any_custom), # Custom model name always visible for any custom option
|
|
923
|
-
gr.update(visible=is_custom_openai_api), # Provider base URL only for OpenAI compatible API
|
|
924
|
-
gr.update(visible=is_custom_openai_api), # Provider API key only for OpenAI compatible API
|
|
925
|
-
# Update the hidden model_choice field
|
|
926
|
-
gr.update(value=model_choice_value)
|
|
927
|
-
]
|
|
928
|
-
|
|
929
|
-
# Add custom model textbox (visible for both custom model options)
|
|
930
|
-
custom_model = gr.Textbox(
|
|
931
|
-
label="Custom Model Name",
|
|
932
|
-
placeholder="Enter custom model name (e.g., Qwen2.5-VL-7B-Instruct or llama3)",
|
|
933
|
-
value=initial_custom_model,
|
|
934
|
-
visible=(initial_model == "Custom model (OpenAI compatible API)" or initial_model == "Custom model (ollama)"),
|
|
935
|
-
interactive=True,
|
|
936
|
-
)
|
|
937
|
-
|
|
938
|
-
# Add custom provider base URL textbox (only visible for OpenAI compatible API)
|
|
939
|
-
provider_base_url = gr.Textbox(
|
|
940
|
-
label="Provider Base URL",
|
|
941
|
-
placeholder="Enter provider base URL (e.g., http://localhost:1234/v1)",
|
|
942
|
-
value=initial_provider_base_url,
|
|
943
|
-
visible=(initial_model == "Custom model (OpenAI compatible API)"),
|
|
944
|
-
interactive=True,
|
|
945
|
-
)
|
|
946
|
-
|
|
947
|
-
# Add custom API key textbox (only visible for OpenAI compatible API)
|
|
948
|
-
provider_api_key = gr.Textbox(
|
|
949
|
-
label="Provider API Key",
|
|
950
|
-
placeholder="Enter provider API key (if required)",
|
|
951
|
-
value="",
|
|
952
|
-
visible=(initial_model == "Custom model (OpenAI compatible API)"),
|
|
953
|
-
interactive=True,
|
|
954
|
-
type="password",
|
|
955
|
-
)
|
|
956
|
-
|
|
957
|
-
# Connect agent_loop changes to update all UI elements
|
|
958
|
-
agent_loop.change(
|
|
959
|
-
fn=update_ui,
|
|
960
|
-
inputs=[agent_loop, openai_model_choice, anthropic_model_choice, omni_model_choice, uitars_model_choice],
|
|
961
|
-
outputs=[
|
|
962
|
-
openai_model_choice, anthropic_model_choice, omni_model_choice, uitars_model_choice,
|
|
963
|
-
openai_key_group, anthropic_key_group,
|
|
964
|
-
custom_model, provider_base_url, provider_api_key,
|
|
965
|
-
model_choice # Add model_choice to outputs
|
|
966
|
-
],
|
|
967
|
-
queue=False # Process immediately without queueing
|
|
968
|
-
)
|
|
969
|
-
|
|
970
|
-
# Connect each model dropdown to update UI
|
|
971
|
-
omni_model_choice.change(
|
|
972
|
-
fn=update_ui,
|
|
973
|
-
inputs=[agent_loop, openai_model_choice, anthropic_model_choice, omni_model_choice, uitars_model_choice],
|
|
974
|
-
outputs=[
|
|
975
|
-
openai_model_choice, anthropic_model_choice, omni_model_choice, uitars_model_choice,
|
|
976
|
-
openai_key_group, anthropic_key_group,
|
|
977
|
-
custom_model, provider_base_url, provider_api_key,
|
|
978
|
-
model_choice # Add model_choice to outputs
|
|
979
|
-
],
|
|
980
|
-
queue=False
|
|
981
|
-
)
|
|
982
|
-
|
|
983
|
-
uitars_model_choice.change(
|
|
984
|
-
fn=update_ui,
|
|
985
|
-
inputs=[agent_loop, openai_model_choice, anthropic_model_choice, omni_model_choice, uitars_model_choice],
|
|
986
|
-
outputs=[
|
|
987
|
-
openai_model_choice, anthropic_model_choice, omni_model_choice, uitars_model_choice,
|
|
988
|
-
openai_key_group, anthropic_key_group,
|
|
989
|
-
custom_model, provider_base_url, provider_api_key,
|
|
990
|
-
model_choice # Add model_choice to outputs
|
|
991
|
-
],
|
|
992
|
-
queue=False
|
|
993
|
-
)
|
|
994
|
-
|
|
995
|
-
openai_model_choice.change(
|
|
996
|
-
fn=update_ui,
|
|
997
|
-
inputs=[agent_loop, openai_model_choice, anthropic_model_choice, omni_model_choice, uitars_model_choice],
|
|
998
|
-
outputs=[
|
|
999
|
-
openai_model_choice, anthropic_model_choice, omni_model_choice, uitars_model_choice,
|
|
1000
|
-
openai_key_group, anthropic_key_group,
|
|
1001
|
-
custom_model, provider_base_url, provider_api_key,
|
|
1002
|
-
model_choice # Add model_choice to outputs
|
|
1003
|
-
],
|
|
1004
|
-
queue=False
|
|
1005
|
-
)
|
|
1006
|
-
|
|
1007
|
-
anthropic_model_choice.change(
|
|
1008
|
-
fn=update_ui,
|
|
1009
|
-
inputs=[agent_loop, openai_model_choice, anthropic_model_choice, omni_model_choice, uitars_model_choice],
|
|
1010
|
-
outputs=[
|
|
1011
|
-
openai_model_choice, anthropic_model_choice, omni_model_choice, uitars_model_choice,
|
|
1012
|
-
openai_key_group, anthropic_key_group,
|
|
1013
|
-
custom_model, provider_base_url, provider_api_key,
|
|
1014
|
-
model_choice # Add model_choice to outputs
|
|
1015
|
-
],
|
|
1016
|
-
queue=False
|
|
1017
|
-
)
|
|
1018
|
-
|
|
1019
|
-
save_trajectory = gr.Checkbox(
|
|
1020
|
-
label="Save Trajectory",
|
|
1021
|
-
value=initial_save_trajectory,
|
|
1022
|
-
info="Save the agent's trajectory for debugging",
|
|
1023
|
-
interactive=True,
|
|
1024
|
-
)
|
|
1025
|
-
|
|
1026
|
-
recent_images = gr.Slider(
|
|
1027
|
-
label="Recent Images",
|
|
1028
|
-
minimum=1,
|
|
1029
|
-
maximum=10,
|
|
1030
|
-
value=initial_recent_images,
|
|
1031
|
-
step=1,
|
|
1032
|
-
info="Number of recent images to keep in context",
|
|
1033
|
-
interactive=True,
|
|
1034
|
-
)
|
|
1035
|
-
|
|
1036
|
-
|
|
1037
|
-
# Right column for chat interface
|
|
1038
|
-
with gr.Column(scale=2):
|
|
1039
|
-
# Add instruction text before the chat interface
|
|
1040
|
-
gr.Markdown(
|
|
1041
|
-
"Ask me to perform tasks in a virtual macOS environment.<br>Built with <a href='https://github.com/trycua/cua' target='_blank'>github.com/trycua/cua</a>."
|
|
1042
|
-
)
|
|
1043
|
-
|
|
1044
|
-
chatbot_history = gr.Chatbot(type="messages")
|
|
1045
|
-
msg = gr.Textbox(
|
|
1046
|
-
placeholder="Ask me to perform tasks in a virtual macOS environment"
|
|
1047
|
-
)
|
|
1048
|
-
clear = gr.Button("Clear")
|
|
1049
|
-
|
|
1050
|
-
# Add cancel button
|
|
1051
|
-
cancel_button = gr.Button("Cancel", variant="stop")
|
|
1052
|
-
|
|
1053
|
-
# Add examples
|
|
1054
|
-
example_group = gr.Examples(examples=example_messages, inputs=msg)
|
|
1055
|
-
|
|
1056
|
-
# Function to handle chat submission
|
|
1057
|
-
def chat_submit(message, history):
|
|
1058
|
-
# Add user message to history
|
|
1059
|
-
history.append(gr.ChatMessage(role="user", content=message))
|
|
1060
|
-
return "", history
|
|
1061
|
-
|
|
1062
|
-
# Function to cancel the running agent
|
|
1063
|
-
async def cancel_agent_task(history):
|
|
1064
|
-
global global_agent
|
|
1065
|
-
if global_agent and hasattr(global_agent, '_loop'):
|
|
1066
|
-
print("DEBUG - Cancelling agent task")
|
|
1067
|
-
# Cancel the agent loop
|
|
1068
|
-
if hasattr(global_agent._loop, 'cancel') and callable(global_agent._loop.cancel):
|
|
1069
|
-
await global_agent._loop.cancel()
|
|
1070
|
-
history.append(gr.ChatMessage(role="assistant", content="Task cancelled by user", metadata={"title": "❌ Cancelled"}))
|
|
1071
|
-
else:
|
|
1072
|
-
history.append(gr.ChatMessage(role="assistant", content="Could not cancel task: cancel method not found", metadata={"title": "⚠️ Warning"}))
|
|
1073
|
-
else:
|
|
1074
|
-
history.append(gr.ChatMessage(role="assistant", content="No active agent task to cancel", metadata={"title": "ℹ️ Info"}))
|
|
1075
|
-
return history
|
|
1076
|
-
|
|
1077
|
-
# Function to process agent response after user input
|
|
1078
|
-
async def process_response(
|
|
1079
|
-
history,
|
|
1080
|
-
openai_model_value,
|
|
1081
|
-
anthropic_model_value,
|
|
1082
|
-
omni_model_value,
|
|
1083
|
-
uitars_model_value,
|
|
1084
|
-
custom_model_value,
|
|
1085
|
-
agent_loop_choice,
|
|
1086
|
-
save_traj,
|
|
@@ old lines 1087-1466 → new lines 229-247 @@
-            recent_imgs,
-            custom_url_value=None,
-            custom_api_key=None,
-            openai_key_input=None,
-            anthropic_key_input=None,
-            computer_os="macos",
-            computer_provider="lume",
-            container_name="",
-            cua_cloud_api_key="",
-        ):
-            if not history:
-                yield history
-                return
-
-            # Get the last user message
-            last_user_message = history[-1]["content"]
-
-            # Get the appropriate model value based on the agent loop
-            if agent_loop_choice == "OPENAI":
-                model_choice_value = openai_model_value
-            elif agent_loop_choice == "ANTHROPIC":
-                model_choice_value = anthropic_model_value
-            elif agent_loop_choice == "OMNI":
-                model_choice_value = omni_model_value
-            elif agent_loop_choice == "UITARS":
-                model_choice_value = uitars_model_value
-            else:
-                model_choice_value = "No models available"
-
-            # Determine if this is a custom model selection and which type
-            is_custom_openai_api = model_choice_value == "Custom model (OpenAI compatible API)"
-            is_custom_ollama = model_choice_value == "Custom model (ollama)"
-            is_custom_model_selected = is_custom_openai_api or is_custom_ollama
-
-            # Determine the model name string to analyze: custom or from dropdown
-            if is_custom_model_selected:
-                model_string_to_analyze = custom_model_value
-            else:
-                model_string_to_analyze = model_choice_value  # Use the full UI string initially
-
-            try:
-                # Special case for UITARS - use MLXVLM provider or OAICOMPAT for custom
-                if agent_loop_choice == "UITARS":
-                    if is_custom_openai_api:
-                        provider = LLMProvider.OAICOMPAT
-                        cleaned_model_name_from_func = custom_model_value
-                        agent_loop_type = AgentLoop.UITARS
-                        print(f"Using OAICOMPAT provider for custom UITARS model: {custom_model_value}")
-                    else:
-                        provider = LLMProvider.MLXVLM
-                        cleaned_model_name_from_func = model_string_to_analyze
-                        agent_loop_type = AgentLoop.UITARS
-                        print(f"Using MLXVLM provider for UITARS model: {model_string_to_analyze}")
-                # Special case for Ollama custom model
-                elif is_custom_ollama and agent_loop_choice == "OMNI":
-                    provider = LLMProvider.OLLAMA
-                    cleaned_model_name_from_func = custom_model_value
-                    agent_loop_type = AgentLoop.OMNI
-                    print(f"Using Ollama provider for custom model: {custom_model_value}")
-                else:
-                    # Get the provider, *cleaned* model name, and agent loop type
-                    provider, cleaned_model_name_from_func, agent_loop_type = (
-                        get_provider_and_model(model_string_to_analyze, agent_loop_choice)
-                    )
-
-                print(f"provider={provider} cleaned_model_name_from_func={cleaned_model_name_from_func} agent_loop_type={agent_loop_type} agent_loop_choice={agent_loop_choice}")
-
-                # Determine the final model name to send to the agent
-                # If custom selected, use the custom text box value, otherwise use the cleaned name
-                final_model_name_to_send = (
-                    custom_model_value
-                    if is_custom_model_selected
-                    else cleaned_model_name_from_func
-                )
-
-                # Determine if OAICOMPAT should be used (for OpenAI compatible API custom model)
-                is_oaicompat = is_custom_openai_api
-
-                # Get API key based on provider determined by get_provider_and_model
-                if is_oaicompat and custom_api_key:
-                    # Use custom API key if provided for OpenAI compatible API custom model
-                    api_key = custom_api_key
-                    print(
-                        f"DEBUG - Using custom API key for OpenAI compatible API model: {final_model_name_to_send}"
-                    )
-                elif provider == LLMProvider.OLLAMA:
-                    # No API key needed for Ollama
-                    api_key = ""
-                    print(f"DEBUG - No API key needed for Ollama model: {final_model_name_to_send}")
-                elif provider == LLMProvider.OPENAI:
-                    # Use OpenAI key from input if provided, otherwise use environment variable
-                    api_key = openai_key_input if openai_key_input else (openai_api_key or os.environ.get("OPENAI_API_KEY", ""))
-                    if openai_key_input:
-                        # Set the environment variable for the OpenAI API key
-                        os.environ["OPENAI_API_KEY"] = openai_key_input
-                        print(f"DEBUG - Using provided OpenAI API key from UI and set as environment variable")
-                elif provider == LLMProvider.ANTHROPIC:
-                    # Use Anthropic key from input if provided, otherwise use environment variable
-                    api_key = anthropic_key_input if anthropic_key_input else (anthropic_api_key or os.environ.get("ANTHROPIC_API_KEY", ""))
-                    if anthropic_key_input:
-                        # Set the environment variable for the Anthropic API key
-                        os.environ["ANTHROPIC_API_KEY"] = anthropic_key_input
-                        print(f"DEBUG - Using provided Anthropic API key from UI and set as environment variable")
-                else:
-                    # For Ollama or default OAICOMPAT (without custom key), no key needed/expected
-                    api_key = ""
-
-                cua_cloud_api_key = cua_cloud_api_key or os.environ.get("CUA_API_KEY", "")
-
-                # --- Save Settings Before Running Agent ---
-                current_settings = {
-                    "agent_loop": agent_loop_choice,
-                    "model_choice": model_choice_value,
-                    "custom_model": custom_model_value,
-                    "provider_base_url": custom_url_value,
-                    "save_trajectory": save_traj,
-                    "recent_images": recent_imgs,
-                    "computer_os": computer_os,
-                    "computer_provider": computer_provider,
-                    "container_name": container_name,
-                    "cua_cloud_api_key": cua_cloud_api_key,
-                }
-                save_settings(current_settings)
-                # --- End Save Settings ---
-
-                # Create or update the agent
-                create_agent(
-                    # Provider determined by special cases and get_provider_and_model
-                    provider=provider,
-                    agent_loop=agent_loop_type,
-                    # Pass the FINAL determined model name (cleaned or custom)
-                    model_name=final_model_name_to_send,
-                    api_key=api_key,
-                    save_trajectory=save_traj,
-                    only_n_most_recent_images=recent_imgs,
-                    use_oaicompat=is_oaicompat,  # Set flag if custom model was selected
-                    # Pass custom URL only if custom model was selected
-                    provider_base_url=custom_url_value if is_oaicompat else None,
-                    computer_os=computer_os,
-                    computer_provider=computer_provider,
-                    computer_name=container_name,
-                    computer_api_key=cua_cloud_api_key,
-                    verbosity=logging.DEBUG,  # Added verbosity here
-                )
-
-                if global_agent is None:
-                    # Add initial empty assistant message
-                    history.append(
-                        gr.ChatMessage(
-                            role="assistant",
-                            content="Failed to create agent. Check API keys and configuration.",
-                        )
-                    )
-                    yield history
-                    return
-
-                # Add the screenshot handler to the agent's loop if available
-                if global_agent and hasattr(global_agent, "_loop"):
-                    print("DEBUG - Adding screenshot handler to agent loop")
-
-                    # Create the screenshot handler with references to UI components
-                    screenshot_handler = GradioChatScreenshotHandler(history)
-
-                    # Add the handler to the callback manager if it exists AND is not None
-                    if (
-                        hasattr(global_agent._loop, "callback_manager")
-                        and global_agent._loop.callback_manager is not None
-                    ):
-                        global_agent._loop.callback_manager.add_handler(screenshot_handler)
-                        print(
-                            f"DEBUG - Screenshot handler added to callback manager with history: {id(history)}"
-                        )
-                    else:
-                        # Optional: Log a warning if the callback manager is missing/None for a specific loop
-                        print(
-                            f"WARNING - Callback manager not found or is None for loop type: {type(global_agent._loop)}. Screenshot handler not added."
-                        )
-
-                # Stream responses from the agent
-                async for result in global_agent.run(last_user_message):
-                    print(f"DEBUG - Agent response ------- START")
-                    from pprint import pprint
-                    pprint(result)
-                    print(f"DEBUG - Agent response ------- END")
-
-                    def generate_gradio_messages():
-                        if result.get("content"):
-                            yield gr.ChatMessage(
-                                role="assistant",
-                                content=result.get("content", ""),
-                                metadata=cast(MetadataDict, result.get("metadata", {}))
-                            )
-                        else:
-                            outputs = result.get("output", [])
-                            for output in outputs:
-                                if output.get("type") == "message":
-                                    content = output.get("content", [])
-                                    for content_part in content:
-                                        if content_part.get("text"):
-                                            yield gr.ChatMessage(
-                                                role=output.get("role", "assistant"),
-                                                content=content_part.get("text", ""),
-                                                metadata=content_part.get("metadata", {})
-                                            )
-                                elif output.get("type") == "reasoning":
-                                    # if it's openAI, we only have access to a summary of the reasoning
-                                    summary_content = output.get("summary", [])
-                                    if summary_content:
-                                        for summary_part in summary_content:
-                                            if summary_part.get("type") == "summary_text":
-                                                yield gr.ChatMessage(
-                                                    role="assistant",
-                                                    content=summary_part.get("text", "")
-                                                )
-                                    else:
-                                        summary_content = output.get("text", "")
-                                        if summary_content:
-                                            yield gr.ChatMessage(
-                                                role="assistant",
-                                                content=summary_content,
-                                            )
-                                elif output.get("type") == "computer_call":
-                                    action = output.get("action", {})
-                                    action_type = action.get("type", "")
-                                    if action_type:
-                                        action_title = f"🛠️ Performing {action_type}"
-                                        if action.get("x") and action.get("y"):
-                                            action_title += f" at ({action['x']}, {action['y']})"
-                                        yield gr.ChatMessage(
-                                            role="assistant",
-                                            content=f"```json\n{json.dumps(action)}\n```",
-                                            metadata={"title": action_title}
-                                        )
-
-                    for message in generate_gradio_messages():
-                        history.append(message)
-                        yield history
-
-            except Exception as e:
-                import traceback
-
-                traceback.print_exc()
-                # Update with error message
-                history.append(gr.ChatMessage(role="assistant", content=f"Error: {str(e)}"))
-                yield history
-
-        # Connect the submit button to the process_response function
-        submit_event = msg.submit(
-            fn=chat_submit,
-            inputs=[msg, chatbot_history],
-            outputs=[msg, chatbot_history],
-            queue=False,
-        ).then(
-            fn=process_response,
-            inputs=[
-                chatbot_history,
-                openai_model_choice,
-                anthropic_model_choice,
-                omni_model_choice,
-                uitars_model_choice,
-                custom_model,
-                agent_loop,
-                save_trajectory,
-                recent_images,
-                provider_base_url,
-                provider_api_key,
-                openai_api_key_input,
-                anthropic_api_key_input,
-                computer_os,
-                computer_provider,
-                container_name,
-                cua_cloud_api_key,
-            ],
-            outputs=[chatbot_history],
-            queue=True,
-        )
-
-        # Clear button functionality
-        clear.click(lambda: None, None, chatbot_history, queue=False)
-
-        # Connect cancel button to cancel function
-        cancel_button.click(
-            cancel_agent_task,
-            [chatbot_history],
-            [chatbot_history],
-            queue=False  # Process immediately without queueing
-        )
-
-
-        # Function to update the code display based on configuration and chat history
-        def update_code_display(agent_loop, model_choice_val, custom_model_val, chat_history, provider_base_url, recent_images_val, save_trajectory_val, computer_os, computer_provider, container_name, cua_cloud_api_key):
-            # Extract messages from chat history
-            messages = []
-            if chat_history:
-                for msg in chat_history:
-                    if isinstance(msg, dict) and msg.get("role") == "user":
-                        messages.append(msg.get("content", ""))
-
-            # Determine provider and model based on current selection
-            provider, model_name, _ = get_provider_and_model(
-                model_choice_val or custom_model_val or "gpt-4o",
-                agent_loop
-            )
-
-            return generate_python_code(
-                agent_loop,
-                provider,
-                model_name,
-                messages,
-                provider_base_url,
-                recent_images_val,
-                save_trajectory_val,
-                computer_os,
-                computer_provider,
-                container_name,
-                cua_cloud_api_key
-            )
-
-        # Update code display when configuration changes
-        agent_loop.change(
-            update_code_display,
-            inputs=[agent_loop, model_choice, custom_model, chatbot_history, provider_base_url, recent_images, save_trajectory, computer_os, computer_provider, container_name, cua_cloud_api_key],
-            outputs=[code_display]
-        )
-        model_choice.change(
-            update_code_display,
-            inputs=[agent_loop, model_choice, custom_model, chatbot_history, provider_base_url, recent_images, save_trajectory, computer_os, computer_provider, container_name, cua_cloud_api_key],
-            outputs=[code_display]
-        )
-        custom_model.change(
-            update_code_display,
-            inputs=[agent_loop, model_choice, custom_model, chatbot_history, provider_base_url, recent_images, save_trajectory, computer_os, computer_provider, container_name, cua_cloud_api_key],
-            outputs=[code_display]
-        )
-        chatbot_history.change(
-            update_code_display,
-            inputs=[agent_loop, model_choice, custom_model, chatbot_history, provider_base_url, recent_images, save_trajectory, computer_os, computer_provider, container_name, cua_cloud_api_key],
-            outputs=[code_display]
-        )
-        recent_images.change(
-            update_code_display,
-            inputs=[agent_loop, model_choice, custom_model, chatbot_history, provider_base_url, recent_images, save_trajectory, computer_os, computer_provider, container_name, cua_cloud_api_key],
-            outputs=[code_display]
-        )
-        save_trajectory.change(
-            update_code_display,
-            inputs=[agent_loop, model_choice, custom_model, chatbot_history, provider_base_url, recent_images, save_trajectory, computer_os, computer_provider, container_name, cua_cloud_api_key],
-            outputs=[code_display]
-        )
-        computer_os.change(
-            update_code_display,
-            inputs=[agent_loop, model_choice, custom_model, chatbot_history, provider_base_url, recent_images, save_trajectory, computer_os, computer_provider, container_name, cua_cloud_api_key],
-            outputs=[code_display]
-        )
-        computer_provider.change(
-            update_code_display,
-            inputs=[agent_loop, model_choice, custom_model, chatbot_history, provider_base_url, recent_images, save_trajectory, computer_os, computer_provider, container_name, cua_cloud_api_key],
-            outputs=[code_display]
-        )
-        container_name.change(
-            update_code_display,
-            inputs=[agent_loop, model_choice, custom_model, chatbot_history, provider_base_url, recent_images, save_trajectory, computer_os, computer_provider, container_name, cua_cloud_api_key],
-            outputs=[code_display]
-        )
-        cua_cloud_api_key.change(
-            update_code_display,
-            inputs=[agent_loop, model_choice, custom_model, chatbot_history, provider_base_url, recent_images, save_trajectory, computer_os, computer_provider, container_name, cua_cloud_api_key],
-            outputs=[code_display]
-        )
+    if save_trajectory:
+        agent_kwargs["trajectory_dir"] = "trajectories"
+
+    if max_trajectory_budget:
+        agent_kwargs["max_trajectory_budget"] = {"max_budget": max_trajectory_budget, "raise_error": True}
 
-
+    global_agent = ComputerAgent(**agent_kwargs)
+    return global_agent
 
 
 def test_cua():
     """Standalone function to launch the Gradio app."""
+    from agent.ui.gradio.ui_components import create_gradio_ui
+    print(f"Starting Gradio app for CUA Agent...")
     demo = create_gradio_ui()
-    demo.launch(share=False, inbrowser=True)
+    demo.launch(share=False, inbrowser=True)
 
 
 if __name__ == "__main__":
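
The added hunk replaces the old `create_agent(provider=..., agent_loop=..., model_name=...)` wiring with a keyword-dict construction of `ComputerAgent`. Below is a minimal sketch of that pattern; only `agent_kwargs["trajectory_dir"]`, `agent_kwargs["max_trajectory_budget"]`, and `ComputerAgent(**agent_kwargs)` are taken from this diff. The import path, the `model` keyword, and the example model string are illustrative assumptions, not part of the diff.

```python
# Hedged sketch of the 0.4.0-style agent construction shown in the hunk above.
# Assumptions (not taken from this diff): ComputerAgent is importable from the
# top-level `agent` package and accepts a `model` keyword; the model string is
# only a placeholder.
from agent import ComputerAgent  # assumed export


def build_agent(save_trajectory: bool = True, max_trajectory_budget: float = 5.0):
    agent_kwargs = {
        "model": "anthropic/claude-3-5-sonnet-20241022",  # placeholder model id (assumption)
    }

    if save_trajectory:
        # Persist run artifacts under ./trajectories, mirroring the added lines
        agent_kwargs["trajectory_dir"] = "trajectories"

    if max_trajectory_budget:
        # Cap per-trajectory spend; the {"max_budget": ..., "raise_error": True}
        # shape comes directly from the diff
        agent_kwargs["max_trajectory_budget"] = {
            "max_budget": max_trajectory_budget,
            "raise_error": True,
        }

    return ComputerAgent(**agent_kwargs)


if __name__ == "__main__":
    agent = build_agent()
    print(type(agent).__name__)
```

Whether these kwargs are sufficient on their own is not visible in this hunk; the unchanged lines above it presumably populate the rest of `agent_kwargs`.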