cua-agent 0.4.22__py3-none-any.whl → 0.7.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- agent/__init__.py +4 -10
- agent/__main__.py +2 -1
- agent/adapters/__init__.py +4 -0
- agent/adapters/azure_ml_adapter.py +283 -0
- agent/adapters/cua_adapter.py +161 -0
- agent/adapters/huggingfacelocal_adapter.py +67 -125
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +110 -99
- agent/adapters/models/__init__.py +41 -0
- agent/adapters/models/generic.py +78 -0
- agent/adapters/models/internvl.py +290 -0
- agent/adapters/models/opencua.py +115 -0
- agent/adapters/models/qwen2_5_vl.py +78 -0
- agent/agent.py +337 -185
- agent/callbacks/__init__.py +9 -4
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +54 -98
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +35 -33
- agent/callbacks/otel.py +291 -0
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/prompt_instructions.py +47 -0
- agent/callbacks/telemetry.py +99 -61
- agent/callbacks/trajectory_saver.py +95 -69
- agent/cli.py +269 -119
- agent/computers/__init__.py +14 -9
- agent/computers/base.py +32 -19
- agent/computers/cua.py +52 -25
- agent/computers/custom.py +78 -71
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +359 -235
- agent/integrations/hud/__init__.py +38 -99
- agent/integrations/hud/agent.py +369 -0
- agent/integrations/hud/proxy.py +166 -52
- agent/loops/__init__.py +44 -14
- agent/loops/anthropic.py +579 -492
- agent/loops/base.py +19 -15
- agent/loops/composed_grounded.py +136 -150
- agent/loops/fara/__init__.py +8 -0
- agent/loops/fara/config.py +506 -0
- agent/loops/fara/helpers.py +357 -0
- agent/loops/fara/schema.py +143 -0
- agent/loops/gelato.py +183 -0
- agent/loops/gemini.py +935 -0
- agent/loops/generic_vlm.py +601 -0
- agent/loops/glm45v.py +140 -135
- agent/loops/gta1.py +48 -51
- agent/loops/holo.py +218 -0
- agent/loops/internvl.py +180 -0
- agent/loops/moondream3.py +493 -0
- agent/loops/omniparser.py +326 -226
- agent/loops/openai.py +50 -51
- agent/loops/opencua.py +134 -0
- agent/loops/uiins.py +175 -0
- agent/loops/uitars.py +247 -206
- agent/loops/uitars2.py +951 -0
- agent/playground/__init__.py +5 -0
- agent/playground/server.py +301 -0
- agent/proxy/examples.py +61 -57
- agent/proxy/handlers.py +46 -39
- agent/responses.py +447 -347
- agent/tools/__init__.py +24 -0
- agent/tools/base.py +253 -0
- agent/tools/browser_tool.py +423 -0
- agent/types.py +11 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +25 -22
- agent/ui/gradio/ui_components.py +314 -167
- cua_agent-0.7.16.dist-info/METADATA +85 -0
- cua_agent-0.7.16.dist-info/RECORD +79 -0
- {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
- cua_agent-0.4.22.dist-info/METADATA +0 -436
- cua_agent-0.4.22.dist-info/RECORD +0 -51
- {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/ui/gradio/ui_components.py
CHANGED
|
@@ -2,19 +2,25 @@
|
|
|
2
2
|
UI Components for the Gradio interface
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
-
import os
|
|
6
5
|
import asyncio
|
|
7
|
-
import logging
|
|
8
6
|
import json
|
|
7
|
+
import logging
|
|
8
|
+
import os
|
|
9
9
|
import platform
|
|
10
10
|
from pathlib import Path
|
|
11
|
-
from typing import Dict, List, Optional,
|
|
11
|
+
from typing import Any, Dict, List, Optional, cast
|
|
12
|
+
|
|
12
13
|
import gradio as gr
|
|
13
14
|
from gradio.components.chatbot import MetadataDict
|
|
14
15
|
|
|
15
16
|
from .app import (
|
|
16
|
-
|
|
17
|
-
|
|
17
|
+
create_agent,
|
|
18
|
+
get_model_string,
|
|
19
|
+
get_ollama_models,
|
|
20
|
+
global_agent,
|
|
21
|
+
global_computer,
|
|
22
|
+
load_settings,
|
|
23
|
+
save_settings,
|
|
18
24
|
)
|
|
19
25
|
|
|
20
26
|
# Global messages array to maintain conversation history
|
|
@@ -23,30 +29,28 @@ global_messages = []
|
|
|
23
29
|
|
|
24
30
|
def create_gradio_ui() -> gr.Blocks:
|
|
25
31
|
"""Create a Gradio UI for the Computer-Use Agent."""
|
|
26
|
-
|
|
32
|
+
|
|
27
33
|
# Load settings
|
|
28
34
|
saved_settings = load_settings()
|
|
29
|
-
|
|
35
|
+
|
|
30
36
|
# Check for API keys
|
|
31
37
|
openai_api_key = os.environ.get("OPENAI_API_KEY", "")
|
|
32
38
|
anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY", "")
|
|
33
39
|
cua_api_key = os.environ.get("CUA_API_KEY", "")
|
|
34
|
-
|
|
40
|
+
|
|
35
41
|
# Model choices
|
|
36
42
|
openai_models = ["OpenAI: Computer-Use Preview"]
|
|
37
43
|
anthropic_models = [
|
|
38
44
|
"Anthropic: Claude 4 Opus (20250514)",
|
|
39
45
|
"Anthropic: Claude 4 Sonnet (20250514)",
|
|
40
46
|
"Anthropic: Claude 3.7 Sonnet (20250219)",
|
|
41
|
-
"Anthropic: Claude 3.5 Sonnet (20241022)",
|
|
42
47
|
]
|
|
43
48
|
omni_models = [
|
|
44
49
|
"OMNI: OpenAI GPT-4o",
|
|
45
50
|
"OMNI: OpenAI GPT-4o mini",
|
|
46
|
-
"OMNI: Claude 3.7 Sonnet (20250219)",
|
|
47
|
-
"OMNI: Claude 3.5 Sonnet (20241022)"
|
|
51
|
+
"OMNI: Claude 3.7 Sonnet (20250219)",
|
|
48
52
|
]
|
|
49
|
-
|
|
53
|
+
|
|
50
54
|
# Check if API keys are available
|
|
51
55
|
has_openai_key = bool(openai_api_key)
|
|
52
56
|
has_anthropic_key = bool(anthropic_api_key)
|
|
@@ -59,15 +63,20 @@ def create_gradio_ui() -> gr.Blocks:
|
|
|
59
63
|
|
|
60
64
|
# Detect platform
|
|
61
65
|
is_mac = platform.system().lower() == "darwin"
|
|
62
|
-
|
|
66
|
+
|
|
63
67
|
# Format model choices
|
|
64
68
|
provider_to_models = {
|
|
65
69
|
"OPENAI": openai_models,
|
|
66
70
|
"ANTHROPIC": anthropic_models,
|
|
67
71
|
"OMNI": omni_models + ["Custom model (OpenAI compatible API)", "Custom model (ollama)"],
|
|
68
|
-
"UITARS": (
|
|
69
|
-
|
|
70
|
-
|
|
72
|
+
"UITARS": (
|
|
73
|
+
[
|
|
74
|
+
"huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
|
|
75
|
+
]
|
|
76
|
+
if is_mac
|
|
77
|
+
else []
|
|
78
|
+
)
|
|
79
|
+
+ ["Custom model (OpenAI compatible API)"],
|
|
71
80
|
}
|
|
72
81
|
|
|
73
82
|
# Apply saved settings
|
|
@@ -82,7 +91,9 @@ def create_gradio_ui() -> gr.Blocks:
|
|
|
82
91
|
elif initial_loop == "ANTHROPIC":
|
|
83
92
|
initial_model = anthropic_models[0] if anthropic_models else "No models available"
|
|
84
93
|
else: # OMNI
|
|
85
|
-
initial_model =
|
|
94
|
+
initial_model = (
|
|
95
|
+
omni_models[0] if omni_models else "Custom model (OpenAI compatible API)"
|
|
96
|
+
)
|
|
86
97
|
|
|
87
98
|
initial_custom_model = saved_settings.get("custom_model", "Qwen2.5-VL-7B-Instruct")
|
|
88
99
|
initial_provider_base_url = saved_settings.get("provider_base_url", "http://localhost:1234/v1")
|
|
@@ -96,16 +107,27 @@ def create_gradio_ui() -> gr.Blocks:
|
|
|
96
107
|
"Open Safari, search for 'macOS automation tools', and save the first three results as bookmarks",
|
|
97
108
|
"Configure SSH keys and set up a connection to a remote server",
|
|
98
109
|
]
|
|
99
|
-
|
|
100
|
-
def generate_python_code(
|
|
110
|
+
|
|
111
|
+
def generate_python_code(
|
|
112
|
+
agent_loop_choice,
|
|
113
|
+
model_name,
|
|
114
|
+
tasks,
|
|
115
|
+
recent_images=3,
|
|
116
|
+
save_trajectory=True,
|
|
117
|
+
computer_os="linux",
|
|
118
|
+
computer_provider="cloud",
|
|
119
|
+
container_name="",
|
|
120
|
+
cua_cloud_api_key="",
|
|
121
|
+
max_budget=None,
|
|
122
|
+
):
|
|
101
123
|
"""Generate Python code for the current configuration and tasks."""
|
|
102
124
|
tasks_str = ""
|
|
103
125
|
for task in tasks:
|
|
104
126
|
if task and task.strip():
|
|
105
127
|
tasks_str += f' "{task}",\n'
|
|
106
|
-
|
|
128
|
+
|
|
107
129
|
model_string = get_model_string(model_name, agent_loop_choice)
|
|
108
|
-
|
|
130
|
+
|
|
109
131
|
computer_args = []
|
|
110
132
|
if computer_os != "macos":
|
|
111
133
|
computer_args.append(f'os_type="{computer_os}"')
|
|
@@ -115,14 +137,14 @@ def create_gradio_ui() -> gr.Blocks:
|
|
|
115
137
|
computer_args.append(f'name="{container_name}"')
|
|
116
138
|
if cua_cloud_api_key:
|
|
117
139
|
computer_args.append(f'api_key="{cua_cloud_api_key}"')
|
|
118
|
-
|
|
140
|
+
|
|
119
141
|
computer_args_str = ", ".join(computer_args)
|
|
120
142
|
if computer_args_str:
|
|
121
143
|
computer_args_str = f"({computer_args_str})"
|
|
122
144
|
else:
|
|
123
145
|
computer_args_str = "()"
|
|
124
|
-
|
|
125
|
-
code = f
|
|
146
|
+
|
|
147
|
+
code = f"""import asyncio
|
|
126
148
|
from computer import Computer
|
|
127
149
|
from agent import ComputerAgent
|
|
128
150
|
|
|
@@ -131,22 +153,22 @@ async def main():
|
|
|
131
153
|
agent = ComputerAgent(
|
|
132
154
|
model="{model_string}",
|
|
133
155
|
tools=[computer],
|
|
134
|
-
only_n_most_recent_images={recent_images},
|
|
135
|
-
|
|
156
|
+
only_n_most_recent_images={recent_images},"""
|
|
157
|
+
|
|
136
158
|
if save_trajectory:
|
|
137
|
-
code +=
|
|
138
|
-
trajectory_dir="trajectories",
|
|
139
|
-
|
|
159
|
+
code += """
|
|
160
|
+
trajectory_dir="trajectories","""
|
|
161
|
+
|
|
140
162
|
if max_budget:
|
|
141
|
-
code += f
|
|
142
|
-
max_trajectory_budget={{"max_budget": {max_budget}, "raise_error": True}},
|
|
143
|
-
|
|
144
|
-
code +=
|
|
163
|
+
code += f"""
|
|
164
|
+
max_trajectory_budget={{"max_budget": {max_budget}, "raise_error": True}},"""
|
|
165
|
+
|
|
166
|
+
code += """
|
|
145
167
|
)
|
|
146
|
-
|
|
147
|
-
|
|
168
|
+
"""
|
|
169
|
+
|
|
148
170
|
if tasks_str:
|
|
149
|
-
code += f
|
|
171
|
+
code += f"""
|
|
150
172
|
# Prompts for the computer-use agent
|
|
151
173
|
tasks = [
|
|
152
174
|
{tasks_str.rstrip()}
|
|
@@ -158,23 +180,23 @@ async def main():
|
|
|
158
180
|
async for result in agent.run(messages):
|
|
159
181
|
for item in result["output"]:
|
|
160
182
|
if item["type"] == "message":
|
|
161
|
-
print(item["content"][0]["text"])
|
|
183
|
+
print(item["content"][0]["text"])"""
|
|
162
184
|
else:
|
|
163
|
-
code +=
|
|
185
|
+
code += """
|
|
164
186
|
# Execute a single task
|
|
165
|
-
task = "Search for information about
|
|
166
|
-
print(f"Executing task: {
|
|
167
|
-
messages = [{
|
|
187
|
+
task = "Search for information about Cua on GitHub"
|
|
188
|
+
print(f"Executing task: {task}")
|
|
189
|
+
messages = [{"role": "user", "content": task}]
|
|
168
190
|
async for result in agent.run(messages):
|
|
169
191
|
for item in result["output"]:
|
|
170
192
|
if item["type"] == "message":
|
|
171
|
-
print(item["content"][0]["text"])
|
|
193
|
+
print(item["content"][0]["text"])"""
|
|
172
194
|
|
|
173
|
-
code +=
|
|
195
|
+
code += """
|
|
174
196
|
|
|
175
197
|
if __name__ == "__main__":
|
|
176
|
-
asyncio.run(main())
|
|
177
|
-
|
|
198
|
+
asyncio.run(main())"""
|
|
199
|
+
|
|
178
200
|
return code
|
|
179
201
|
|
|
180
202
|
# Create the Gradio interface
|
|
@@ -186,7 +208,7 @@ if __name__ == "__main__":
|
|
|
186
208
|
gr.HTML(
|
|
187
209
|
"""
|
|
188
210
|
<div style="display: flex; justify-content: center; margin-bottom: 0.5em">
|
|
189
|
-
<img alt="
|
|
211
|
+
<img alt="Cua Logo" style="width: 80px;"
|
|
190
212
|
src="https://github.com/trycua/cua/blob/main/img/logo_white.png?raw=true" />
|
|
191
213
|
</div>
|
|
192
214
|
"""
|
|
@@ -199,11 +221,11 @@ if __name__ == "__main__":
|
|
|
199
221
|
value=generate_python_code(initial_loop, "gpt-4o", []),
|
|
200
222
|
interactive=False,
|
|
201
223
|
)
|
|
202
|
-
|
|
224
|
+
|
|
203
225
|
with gr.Accordion("Computer Configuration", open=True):
|
|
204
226
|
is_windows = platform.system().lower() == "windows"
|
|
205
227
|
is_mac = platform.system().lower() == "darwin"
|
|
206
|
-
|
|
228
|
+
|
|
207
229
|
providers = ["cloud", "localhost", "docker"]
|
|
208
230
|
if is_mac:
|
|
209
231
|
providers += ["lume"]
|
|
@@ -227,30 +249,30 @@ if __name__ == "__main__":
|
|
|
227
249
|
value=computer_choices[0],
|
|
228
250
|
info="Select the operating system for the computer",
|
|
229
251
|
)
|
|
230
|
-
|
|
252
|
+
|
|
231
253
|
computer_provider = gr.Radio(
|
|
232
254
|
choices=providers,
|
|
233
255
|
label="Provider",
|
|
234
256
|
value="lume" if is_mac else "cloud",
|
|
235
257
|
info="Select the computer provider",
|
|
236
258
|
)
|
|
237
|
-
|
|
259
|
+
|
|
238
260
|
container_name = gr.Textbox(
|
|
239
261
|
label="Container Name",
|
|
240
262
|
placeholder="Enter container name (optional)",
|
|
241
263
|
value=os.environ.get("CUA_CONTAINER_NAME", ""),
|
|
242
264
|
info="Optional name for the container",
|
|
243
265
|
)
|
|
244
|
-
|
|
266
|
+
|
|
245
267
|
cua_cloud_api_key = gr.Textbox(
|
|
246
|
-
label="
|
|
247
|
-
placeholder="Enter your
|
|
268
|
+
label="Cua Cloud API Key",
|
|
269
|
+
placeholder="Enter your Cua Cloud API key",
|
|
248
270
|
value=os.environ.get("CUA_API_KEY", ""),
|
|
249
271
|
type="password",
|
|
250
272
|
info="Required for cloud provider",
|
|
251
|
-
visible=(not has_cua_key)
|
|
273
|
+
visible=(not has_cua_key),
|
|
252
274
|
)
|
|
253
|
-
|
|
275
|
+
|
|
254
276
|
with gr.Accordion("Agent Configuration", open=True):
|
|
255
277
|
agent_loop = gr.Dropdown(
|
|
256
278
|
choices=["OPENAI", "ANTHROPIC", "OMNI", "UITARS"],
|
|
@@ -267,90 +289,113 @@ if __name__ == "__main__":
|
|
|
267
289
|
value=openai_models[0] if openai_models else "No models available",
|
|
268
290
|
info="Select OpenAI model",
|
|
269
291
|
interactive=True,
|
|
270
|
-
visible=(initial_loop == "OPENAI")
|
|
292
|
+
visible=(initial_loop == "OPENAI"),
|
|
271
293
|
)
|
|
272
|
-
|
|
294
|
+
|
|
273
295
|
anthropic_model_choice = gr.Dropdown(
|
|
274
296
|
choices=anthropic_models,
|
|
275
297
|
label="Anthropic Model",
|
|
276
|
-
value=
|
|
298
|
+
value=(
|
|
299
|
+
anthropic_models[0] if anthropic_models else "No models available"
|
|
300
|
+
),
|
|
277
301
|
info="Select Anthropic model",
|
|
278
302
|
interactive=True,
|
|
279
|
-
visible=(initial_loop == "ANTHROPIC")
|
|
303
|
+
visible=(initial_loop == "ANTHROPIC"),
|
|
280
304
|
)
|
|
281
|
-
|
|
305
|
+
|
|
282
306
|
omni_model_choice = gr.Dropdown(
|
|
283
|
-
choices=omni_models
|
|
307
|
+
choices=omni_models
|
|
308
|
+
+ ["Custom model (OpenAI compatible API)", "Custom model (ollama)"],
|
|
284
309
|
label="OMNI Model",
|
|
285
|
-
value=
|
|
310
|
+
value=(
|
|
311
|
+
omni_models[0]
|
|
312
|
+
if omni_models
|
|
313
|
+
else "Custom model (OpenAI compatible API)"
|
|
314
|
+
),
|
|
286
315
|
info="Select OMNI model or choose a custom model option",
|
|
287
316
|
interactive=True,
|
|
288
|
-
visible=(initial_loop == "OMNI")
|
|
317
|
+
visible=(initial_loop == "OMNI"),
|
|
289
318
|
)
|
|
290
|
-
|
|
319
|
+
|
|
291
320
|
uitars_model_choice = gr.Dropdown(
|
|
292
321
|
choices=provider_to_models.get("UITARS", ["No models available"]),
|
|
293
322
|
label="UITARS Model",
|
|
294
|
-
value=
|
|
323
|
+
value=(
|
|
324
|
+
provider_to_models.get("UITARS", ["No models available"])[0]
|
|
325
|
+
if provider_to_models.get("UITARS")
|
|
326
|
+
else "No models available"
|
|
327
|
+
),
|
|
295
328
|
info="Select UITARS model",
|
|
296
329
|
interactive=True,
|
|
297
|
-
visible=(initial_loop == "UITARS")
|
|
330
|
+
visible=(initial_loop == "UITARS"),
|
|
298
331
|
)
|
|
299
|
-
|
|
332
|
+
|
|
300
333
|
model_choice = gr.Textbox(visible=False)
|
|
301
334
|
|
|
302
335
|
# API key inputs
|
|
303
|
-
with gr.Group(
|
|
336
|
+
with gr.Group(
|
|
337
|
+
visible=not has_openai_key
|
|
338
|
+
and (initial_loop == "OPENAI" or initial_loop == "OMNI")
|
|
339
|
+
) as openai_key_group:
|
|
304
340
|
openai_api_key_input = gr.Textbox(
|
|
305
341
|
label="OpenAI API Key",
|
|
306
342
|
placeholder="Enter your OpenAI API key",
|
|
307
343
|
value=os.environ.get("OPENAI_API_KEY", ""),
|
|
308
344
|
interactive=True,
|
|
309
345
|
type="password",
|
|
310
|
-
info="Required for OpenAI models"
|
|
346
|
+
info="Required for OpenAI models",
|
|
311
347
|
)
|
|
312
|
-
|
|
313
|
-
with gr.Group(
|
|
348
|
+
|
|
349
|
+
with gr.Group(
|
|
350
|
+
visible=not has_anthropic_key
|
|
351
|
+
and (initial_loop == "ANTHROPIC" or initial_loop == "OMNI")
|
|
352
|
+
) as anthropic_key_group:
|
|
314
353
|
anthropic_api_key_input = gr.Textbox(
|
|
315
354
|
label="Anthropic API Key",
|
|
316
355
|
placeholder="Enter your Anthropic API key",
|
|
317
356
|
value=os.environ.get("ANTHROPIC_API_KEY", ""),
|
|
318
357
|
interactive=True,
|
|
319
358
|
type="password",
|
|
320
|
-
info="Required for Anthropic models"
|
|
359
|
+
info="Required for Anthropic models",
|
|
321
360
|
)
|
|
322
|
-
|
|
361
|
+
|
|
323
362
|
# API key handlers
|
|
324
363
|
def set_openai_api_key(key):
|
|
325
364
|
if key and key.strip():
|
|
326
365
|
os.environ["OPENAI_API_KEY"] = key.strip()
|
|
327
|
-
print(
|
|
366
|
+
print("DEBUG - Set OpenAI API key environment variable")
|
|
328
367
|
return key
|
|
329
|
-
|
|
368
|
+
|
|
330
369
|
def set_anthropic_api_key(key):
|
|
331
370
|
if key and key.strip():
|
|
332
371
|
os.environ["ANTHROPIC_API_KEY"] = key.strip()
|
|
333
|
-
print(
|
|
372
|
+
print("DEBUG - Set Anthropic API key environment variable")
|
|
334
373
|
return key
|
|
335
|
-
|
|
374
|
+
|
|
336
375
|
openai_api_key_input.change(
|
|
337
376
|
fn=set_openai_api_key,
|
|
338
377
|
inputs=[openai_api_key_input],
|
|
339
378
|
outputs=[openai_api_key_input],
|
|
340
|
-
queue=False
|
|
379
|
+
queue=False,
|
|
341
380
|
)
|
|
342
|
-
|
|
381
|
+
|
|
343
382
|
anthropic_api_key_input.change(
|
|
344
383
|
fn=set_anthropic_api_key,
|
|
345
384
|
inputs=[anthropic_api_key_input],
|
|
346
385
|
outputs=[anthropic_api_key_input],
|
|
347
|
-
queue=False
|
|
386
|
+
queue=False,
|
|
348
387
|
)
|
|
349
388
|
|
|
350
389
|
# UI update function
|
|
351
|
-
def update_ui(
|
|
390
|
+
def update_ui(
|
|
391
|
+
loop=None,
|
|
392
|
+
openai_model=None,
|
|
393
|
+
anthropic_model=None,
|
|
394
|
+
omni_model=None,
|
|
395
|
+
uitars_model=None,
|
|
396
|
+
):
|
|
352
397
|
loop = loop or agent_loop.value
|
|
353
|
-
|
|
398
|
+
|
|
354
399
|
model_value = None
|
|
355
400
|
if loop == "OPENAI" and openai_model:
|
|
356
401
|
model_value = openai_model
|
|
@@ -360,21 +405,37 @@ if __name__ == "__main__":
|
|
|
360
405
|
model_value = omni_model
|
|
361
406
|
elif loop == "UITARS" and uitars_model:
|
|
362
407
|
model_value = uitars_model
|
|
363
|
-
|
|
364
|
-
openai_visible =
|
|
365
|
-
anthropic_visible =
|
|
366
|
-
omni_visible =
|
|
367
|
-
uitars_visible =
|
|
368
|
-
|
|
369
|
-
show_openai_key = not has_openai_key and (
|
|
370
|
-
|
|
371
|
-
|
|
408
|
+
|
|
409
|
+
openai_visible = loop == "OPENAI"
|
|
410
|
+
anthropic_visible = loop == "ANTHROPIC"
|
|
411
|
+
omni_visible = loop == "OMNI"
|
|
412
|
+
uitars_visible = loop == "UITARS"
|
|
413
|
+
|
|
414
|
+
show_openai_key = not has_openai_key and (
|
|
415
|
+
loop == "OPENAI"
|
|
416
|
+
or (
|
|
417
|
+
loop == "OMNI"
|
|
418
|
+
and model_value
|
|
419
|
+
and "OpenAI" in model_value
|
|
420
|
+
and "Custom" not in model_value
|
|
421
|
+
)
|
|
422
|
+
)
|
|
423
|
+
show_anthropic_key = not has_anthropic_key and (
|
|
424
|
+
loop == "ANTHROPIC"
|
|
425
|
+
or (
|
|
426
|
+
loop == "OMNI"
|
|
427
|
+
and model_value
|
|
428
|
+
and "Claude" in model_value
|
|
429
|
+
and "Custom" not in model_value
|
|
430
|
+
)
|
|
431
|
+
)
|
|
432
|
+
|
|
372
433
|
is_custom_openai_api = model_value == "Custom model (OpenAI compatible API)"
|
|
373
434
|
is_custom_ollama = model_value == "Custom model (ollama)"
|
|
374
435
|
is_any_custom = is_custom_openai_api or is_custom_ollama
|
|
375
|
-
|
|
436
|
+
|
|
376
437
|
model_choice_value = model_value if model_value else ""
|
|
377
|
-
|
|
438
|
+
|
|
378
439
|
return [
|
|
379
440
|
gr.update(visible=openai_visible),
|
|
380
441
|
gr.update(visible=anthropic_visible),
|
|
@@ -385,15 +446,18 @@ if __name__ == "__main__":
|
|
|
385
446
|
gr.update(visible=is_any_custom),
|
|
386
447
|
gr.update(visible=is_custom_openai_api),
|
|
387
448
|
gr.update(visible=is_custom_openai_api),
|
|
388
|
-
gr.update(value=model_choice_value)
|
|
449
|
+
gr.update(value=model_choice_value),
|
|
389
450
|
]
|
|
390
|
-
|
|
451
|
+
|
|
391
452
|
# Custom model inputs
|
|
392
453
|
custom_model = gr.Textbox(
|
|
393
454
|
label="Custom Model Name",
|
|
394
455
|
placeholder="Enter custom model name (e.g., Qwen2.5-VL-7B-Instruct or llama3)",
|
|
395
456
|
value=initial_custom_model,
|
|
396
|
-
visible=(
|
|
457
|
+
visible=(
|
|
458
|
+
initial_model == "Custom model (OpenAI compatible API)"
|
|
459
|
+
or initial_model == "Custom model (ollama)"
|
|
460
|
+
),
|
|
397
461
|
interactive=True,
|
|
398
462
|
)
|
|
399
463
|
|
|
@@ -413,36 +477,56 @@ if __name__ == "__main__":
|
|
|
413
477
|
interactive=True,
|
|
414
478
|
type="password",
|
|
415
479
|
)
|
|
416
|
-
|
|
480
|
+
|
|
417
481
|
# Provider visibility update function
|
|
418
482
|
def update_provider_visibility(provider):
|
|
419
483
|
"""Update visibility of container name and API key based on selected provider."""
|
|
420
484
|
is_localhost = provider == "localhost"
|
|
421
485
|
return [
|
|
422
486
|
gr.update(visible=not is_localhost), # container_name
|
|
423
|
-
gr.update(
|
|
487
|
+
gr.update(
|
|
488
|
+
visible=not is_localhost and not has_cua_key
|
|
489
|
+
), # cua_cloud_api_key
|
|
424
490
|
]
|
|
425
|
-
|
|
491
|
+
|
|
426
492
|
# Connect provider change event
|
|
427
493
|
computer_provider.change(
|
|
428
494
|
fn=update_provider_visibility,
|
|
429
495
|
inputs=[computer_provider],
|
|
430
496
|
outputs=[container_name, cua_cloud_api_key],
|
|
431
|
-
queue=False
|
|
497
|
+
queue=False,
|
|
432
498
|
)
|
|
433
|
-
|
|
499
|
+
|
|
434
500
|
# Connect UI update events
|
|
435
|
-
for dropdown in [
|
|
501
|
+
for dropdown in [
|
|
502
|
+
agent_loop,
|
|
503
|
+
omni_model_choice,
|
|
504
|
+
uitars_model_choice,
|
|
505
|
+
openai_model_choice,
|
|
506
|
+
anthropic_model_choice,
|
|
507
|
+
]:
|
|
436
508
|
dropdown.change(
|
|
437
509
|
fn=update_ui,
|
|
438
|
-
inputs=[
|
|
510
|
+
inputs=[
|
|
511
|
+
agent_loop,
|
|
512
|
+
openai_model_choice,
|
|
513
|
+
anthropic_model_choice,
|
|
514
|
+
omni_model_choice,
|
|
515
|
+
uitars_model_choice,
|
|
516
|
+
],
|
|
439
517
|
outputs=[
|
|
440
|
-
openai_model_choice,
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
518
|
+
openai_model_choice,
|
|
519
|
+
anthropic_model_choice,
|
|
520
|
+
omni_model_choice,
|
|
521
|
+
uitars_model_choice,
|
|
522
|
+
openai_key_group,
|
|
523
|
+
anthropic_key_group,
|
|
524
|
+
custom_model,
|
|
525
|
+
provider_base_url,
|
|
526
|
+
provider_api_key,
|
|
527
|
+
model_choice,
|
|
444
528
|
],
|
|
445
|
-
queue=False
|
|
529
|
+
queue=False,
|
|
446
530
|
)
|
|
447
531
|
|
|
448
532
|
save_trajectory = gr.Checkbox(
|
|
@@ -461,7 +545,7 @@ if __name__ == "__main__":
|
|
|
461
545
|
info="Number of recent images to keep in context",
|
|
462
546
|
interactive=True,
|
|
463
547
|
)
|
|
464
|
-
|
|
548
|
+
|
|
465
549
|
max_budget = gr.Number(
|
|
466
550
|
label="Max Budget ($)",
|
|
467
551
|
value=lambda: None,
|
|
@@ -478,10 +562,8 @@ if __name__ == "__main__":
|
|
|
478
562
|
"Ask me to perform tasks in a virtual environment.<br>Built with <a href='https://github.com/trycua/cua' target='_blank'>github.com/trycua/cua</a>."
|
|
479
563
|
)
|
|
480
564
|
|
|
481
|
-
chatbot_history = gr.Chatbot(
|
|
482
|
-
msg = gr.Textbox(
|
|
483
|
-
placeholder="Ask me to perform tasks in a virtual environment"
|
|
484
|
-
)
|
|
565
|
+
chatbot_history = gr.Chatbot()
|
|
566
|
+
msg = gr.Textbox(placeholder="Ask me to perform tasks in a virtual environment")
|
|
485
567
|
clear = gr.Button("Clear")
|
|
486
568
|
cancel_button = gr.Button("Cancel", variant="stop")
|
|
487
569
|
|
|
@@ -498,11 +580,23 @@ if __name__ == "__main__":
|
|
|
498
580
|
global global_agent
|
|
499
581
|
if global_agent:
|
|
500
582
|
print("DEBUG - Cancelling agent task")
|
|
501
|
-
history.append(
|
|
583
|
+
history.append(
|
|
584
|
+
gr.ChatMessage(
|
|
585
|
+
role="assistant",
|
|
586
|
+
content="Task cancelled by user",
|
|
587
|
+
metadata={"title": "❌ Cancelled"},
|
|
588
|
+
)
|
|
589
|
+
)
|
|
502
590
|
else:
|
|
503
|
-
history.append(
|
|
591
|
+
history.append(
|
|
592
|
+
gr.ChatMessage(
|
|
593
|
+
role="assistant",
|
|
594
|
+
content="No active agent task to cancel",
|
|
595
|
+
metadata={"title": "ℹ️ Info"},
|
|
596
|
+
)
|
|
597
|
+
)
|
|
504
598
|
return history
|
|
505
|
-
|
|
599
|
+
|
|
506
600
|
# Process response function
|
|
507
601
|
async def process_response(
|
|
508
602
|
history,
|
|
@@ -542,10 +636,13 @@ if __name__ == "__main__":
|
|
|
542
636
|
model_choice_value = uitars_model_value
|
|
543
637
|
else:
|
|
544
638
|
model_choice_value = "No models available"
|
|
545
|
-
|
|
639
|
+
|
|
546
640
|
# Determine if this is a custom model selection
|
|
547
|
-
is_custom_model_selected = model_choice_value in [
|
|
548
|
-
|
|
641
|
+
is_custom_model_selected = model_choice_value in [
|
|
642
|
+
"Custom model (OpenAI compatible API)",
|
|
643
|
+
"Custom model (ollama)",
|
|
644
|
+
]
|
|
645
|
+
|
|
549
646
|
# Determine the model name string to analyze
|
|
550
647
|
if is_custom_model_selected:
|
|
551
648
|
model_string_to_analyze = custom_model_value
|
|
@@ -583,13 +680,19 @@ if __name__ == "__main__":
|
|
|
583
680
|
model_string=model_string,
|
|
584
681
|
save_trajectory=save_traj,
|
|
585
682
|
only_n_most_recent_images=recent_imgs,
|
|
586
|
-
custom_model_name=
|
|
683
|
+
custom_model_name=(
|
|
684
|
+
custom_model_value if is_custom_model_selected else None
|
|
685
|
+
),
|
|
587
686
|
computer_os=computer_os,
|
|
588
687
|
computer_provider=computer_provider,
|
|
589
688
|
computer_name=container_name,
|
|
590
689
|
computer_api_key=cua_cloud_api_key,
|
|
591
690
|
verbosity=logging.DEBUG,
|
|
592
|
-
max_trajectory_budget=
|
|
691
|
+
max_trajectory_budget=(
|
|
692
|
+
max_budget_value
|
|
693
|
+
if max_budget_value and max_budget_value > 0
|
|
694
|
+
else None
|
|
695
|
+
),
|
|
593
696
|
)
|
|
594
697
|
|
|
595
698
|
if global_agent is None:
|
|
@@ -605,7 +708,7 @@ if __name__ == "__main__":
|
|
|
605
708
|
# Add user message to global history
|
|
606
709
|
global global_messages
|
|
607
710
|
global_messages.append({"role": "user", "content": last_user_message})
|
|
608
|
-
|
|
711
|
+
|
|
609
712
|
# Stream responses from the agent
|
|
610
713
|
async for result in global_agent.run(global_messages):
|
|
611
714
|
global_messages += result.get("output", [])
|
|
@@ -613,18 +716,20 @@ if __name__ == "__main__":
|
|
|
613
716
|
# from pprint import pprint
|
|
614
717
|
# pprint(result)
|
|
615
718
|
# print(f"DEBUG - Agent response ------- END")
|
|
616
|
-
|
|
719
|
+
|
|
617
720
|
# Process the result output
|
|
618
721
|
for item in result.get("output", []):
|
|
619
722
|
if item.get("type") == "message":
|
|
620
723
|
content = item.get("content", [])
|
|
621
724
|
for content_part in content:
|
|
622
725
|
if content_part.get("text"):
|
|
623
|
-
history.append(
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
726
|
+
history.append(
|
|
727
|
+
gr.ChatMessage(
|
|
728
|
+
role=item.get("role", "assistant"),
|
|
729
|
+
content=content_part.get("text", ""),
|
|
730
|
+
metadata=content_part.get("metadata", {}),
|
|
731
|
+
)
|
|
732
|
+
)
|
|
628
733
|
elif item.get("type") == "computer_call":
|
|
629
734
|
action = item.get("action", {})
|
|
630
735
|
action_type = action.get("type", "")
|
|
@@ -632,43 +737,52 @@ if __name__ == "__main__":
|
|
|
632
737
|
action_title = f"🛠️ Performing {action_type}"
|
|
633
738
|
if action.get("x") and action.get("y"):
|
|
634
739
|
action_title += f" at ({action['x']}, {action['y']})"
|
|
635
|
-
history.append(
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
740
|
+
history.append(
|
|
741
|
+
gr.ChatMessage(
|
|
742
|
+
role="assistant",
|
|
743
|
+
content=f"```json\n{json.dumps(action)}\n```",
|
|
744
|
+
metadata={"title": action_title},
|
|
745
|
+
)
|
|
746
|
+
)
|
|
640
747
|
elif item.get("type") == "function_call":
|
|
641
748
|
function_name = item.get("name", "")
|
|
642
749
|
arguments = item.get("arguments", "{}")
|
|
643
|
-
history.append(
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
750
|
+
history.append(
|
|
751
|
+
gr.ChatMessage(
|
|
752
|
+
role="assistant",
|
|
753
|
+
content=f"🔧 Calling function: {function_name}\n```json\n{arguments}\n```",
|
|
754
|
+
metadata={"title": f"Function Call: {function_name}"},
|
|
755
|
+
)
|
|
756
|
+
)
|
|
648
757
|
elif item.get("type") == "function_call_output":
|
|
649
758
|
output = item.get("output", "")
|
|
650
|
-
history.append(
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
759
|
+
history.append(
|
|
760
|
+
gr.ChatMessage(
|
|
761
|
+
role="assistant",
|
|
762
|
+
content=f"📤 Function output:\n```\n{output}\n```",
|
|
763
|
+
metadata={"title": "Function Output"},
|
|
764
|
+
)
|
|
765
|
+
)
|
|
655
766
|
elif item.get("type") == "computer_call_output":
|
|
656
767
|
output = item.get("output", {}).get("image_url", "")
|
|
657
768
|
image_markdown = f""
|
|
658
|
-
history.append(
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
769
|
+
history.append(
|
|
770
|
+
gr.ChatMessage(
|
|
771
|
+
role="assistant",
|
|
772
|
+
content=image_markdown,
|
|
773
|
+
metadata={"title": "🖥️ Computer Output"},
|
|
774
|
+
)
|
|
775
|
+
)
|
|
776
|
+
|
|
664
777
|
yield history
|
|
665
|
-
|
|
778
|
+
|
|
666
779
|
except Exception as e:
|
|
667
780
|
import traceback
|
|
781
|
+
|
|
668
782
|
traceback.print_exc()
|
|
669
783
|
history.append(gr.ChatMessage(role="assistant", content=f"Error: {str(e)}"))
|
|
670
784
|
yield history
|
|
671
|
-
|
|
785
|
+
|
|
672
786
|
# Connect the submit button
|
|
673
787
|
submit_event = msg.submit(
|
|
674
788
|
fn=chat_submit,
|
|
@@ -706,44 +820,77 @@ if __name__ == "__main__":
|
|
|
706
820
|
global global_messages
|
|
707
821
|
global_messages.clear()
|
|
708
822
|
return None
|
|
709
|
-
|
|
823
|
+
|
|
710
824
|
clear.click(clear_chat, None, chatbot_history, queue=False)
|
|
711
|
-
|
|
825
|
+
|
|
712
826
|
# Connect cancel button
|
|
713
827
|
cancel_button.click(
|
|
714
|
-
cancel_agent_task,
|
|
715
|
-
[chatbot_history],
|
|
716
|
-
[chatbot_history],
|
|
717
|
-
queue=False
|
|
828
|
+
cancel_agent_task, [chatbot_history], [chatbot_history], queue=False
|
|
718
829
|
)
|
|
719
830
|
|
|
720
831
|
# Code display update function
|
|
721
|
-
def update_code_display(
|
|
832
|
+
def update_code_display(
|
|
833
|
+
agent_loop,
|
|
834
|
+
model_choice_val,
|
|
835
|
+
custom_model_val,
|
|
836
|
+
chat_history,
|
|
837
|
+
recent_images_val,
|
|
838
|
+
save_trajectory_val,
|
|
839
|
+
computer_os,
|
|
840
|
+
computer_provider,
|
|
841
|
+
container_name,
|
|
842
|
+
cua_cloud_api_key,
|
|
843
|
+
max_budget_val,
|
|
844
|
+
):
|
|
722
845
|
messages = []
|
|
723
846
|
if chat_history:
|
|
724
847
|
for msg in chat_history:
|
|
725
848
|
if isinstance(msg, dict) and msg.get("role") == "user":
|
|
726
849
|
messages.append(msg.get("content", ""))
|
|
727
|
-
|
|
850
|
+
|
|
728
851
|
return generate_python_code(
|
|
729
|
-
agent_loop,
|
|
730
|
-
model_choice_val or custom_model_val or "gpt-4o",
|
|
731
|
-
messages,
|
|
852
|
+
agent_loop,
|
|
853
|
+
model_choice_val or custom_model_val or "gpt-4o",
|
|
854
|
+
messages,
|
|
732
855
|
recent_images_val,
|
|
733
856
|
save_trajectory_val,
|
|
734
857
|
computer_os,
|
|
735
858
|
computer_provider,
|
|
736
859
|
container_name,
|
|
737
860
|
cua_cloud_api_key,
|
|
738
|
-
max_budget_val
|
|
861
|
+
max_budget_val,
|
|
739
862
|
)
|
|
740
|
-
|
|
863
|
+
|
|
741
864
|
# Update code display when configuration changes
|
|
742
|
-
for component in [
|
|
865
|
+
for component in [
|
|
866
|
+
agent_loop,
|
|
867
|
+
model_choice,
|
|
868
|
+
custom_model,
|
|
869
|
+
chatbot_history,
|
|
870
|
+
recent_images,
|
|
871
|
+
save_trajectory,
|
|
872
|
+
computer_os,
|
|
873
|
+
computer_provider,
|
|
874
|
+
container_name,
|
|
875
|
+
cua_cloud_api_key,
|
|
876
|
+
max_budget,
|
|
877
|
+
]:
|
|
743
878
|
component.change(
|
|
744
879
|
update_code_display,
|
|
745
|
-
inputs=[
|
|
746
|
-
|
|
880
|
+
inputs=[
|
|
881
|
+
agent_loop,
|
|
882
|
+
model_choice,
|
|
883
|
+
custom_model,
|
|
884
|
+
chatbot_history,
|
|
885
|
+
recent_images,
|
|
886
|
+
save_trajectory,
|
|
887
|
+
computer_os,
|
|
888
|
+
computer_provider,
|
|
889
|
+
container_name,
|
|
890
|
+
cua_cloud_api_key,
|
|
891
|
+
max_budget,
|
|
892
|
+
],
|
|
893
|
+
outputs=[code_display],
|
|
747
894
|
)
|
|
748
895
|
|
|
749
896
|
return demo
|