cua-agent 0.4.34__py3-none-any.whl → 0.4.36__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- agent/__init__.py +4 -10
- agent/__main__.py +2 -1
- agent/adapters/huggingfacelocal_adapter.py +54 -61
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +110 -99
- agent/adapters/models/__init__.py +14 -6
- agent/adapters/models/generic.py +7 -4
- agent/adapters/models/internvl.py +66 -30
- agent/adapters/models/opencua.py +23 -8
- agent/adapters/models/qwen2_5_vl.py +7 -4
- agent/agent.py +184 -158
- agent/callbacks/__init__.py +4 -4
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +18 -13
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +3 -1
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/telemetry.py +67 -61
- agent/callbacks/trajectory_saver.py +90 -70
- agent/cli.py +115 -110
- agent/computers/__init__.py +13 -8
- agent/computers/base.py +32 -19
- agent/computers/cua.py +33 -25
- agent/computers/custom.py +78 -71
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +235 -185
- agent/integrations/hud/__init__.py +15 -21
- agent/integrations/hud/agent.py +101 -83
- agent/integrations/hud/proxy.py +90 -57
- agent/loops/__init__.py +25 -21
- agent/loops/anthropic.py +537 -483
- agent/loops/base.py +13 -14
- agent/loops/composed_grounded.py +135 -149
- agent/loops/gemini.py +31 -12
- agent/loops/glm45v.py +135 -133
- agent/loops/gta1.py +47 -50
- agent/loops/holo.py +4 -2
- agent/loops/internvl.py +6 -11
- agent/loops/moondream3.py +36 -12
- agent/loops/omniparser.py +215 -210
- agent/loops/openai.py +49 -50
- agent/loops/opencua.py +29 -41
- agent/loops/qwen.py +510 -0
- agent/loops/uitars.py +237 -202
- agent/proxy/examples.py +54 -50
- agent/proxy/handlers.py +27 -34
- agent/responses.py +330 -330
- agent/types.py +11 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +23 -18
- agent/ui/gradio/ui_components.py +310 -161
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/METADATA +18 -10
- cua_agent-0.4.36.dist-info/RECORD +64 -0
- cua_agent-0.4.34.dist-info/RECORD +0 -63
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/WHEEL +0 -0
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/entry_points.txt +0 -0
agent/ui/gradio/ui_components.py
CHANGED
|
@@ -2,19 +2,25 @@
|
|
|
2
2
|
UI Components for the Gradio interface
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
-
import os
|
|
6
5
|
import asyncio
|
|
7
|
-
import logging
|
|
8
6
|
import json
|
|
7
|
+
import logging
|
|
8
|
+
import os
|
|
9
9
|
import platform
|
|
10
10
|
from pathlib import Path
|
|
11
|
-
from typing import Dict, List, Optional,
|
|
11
|
+
from typing import Any, Dict, List, Optional, cast
|
|
12
|
+
|
|
12
13
|
import gradio as gr
|
|
13
14
|
from gradio.components.chatbot import MetadataDict
|
|
14
15
|
|
|
15
16
|
from .app import (
|
|
16
|
-
|
|
17
|
-
|
|
17
|
+
create_agent,
|
|
18
|
+
get_model_string,
|
|
19
|
+
get_ollama_models,
|
|
20
|
+
global_agent,
|
|
21
|
+
global_computer,
|
|
22
|
+
load_settings,
|
|
23
|
+
save_settings,
|
|
18
24
|
)
|
|
19
25
|
|
|
20
26
|
# Global messages array to maintain conversation history
|
|
@@ -23,15 +29,15 @@ global_messages = []
|
|
|
23
29
|
|
|
24
30
|
def create_gradio_ui() -> gr.Blocks:
|
|
25
31
|
"""Create a Gradio UI for the Computer-Use Agent."""
|
|
26
|
-
|
|
32
|
+
|
|
27
33
|
# Load settings
|
|
28
34
|
saved_settings = load_settings()
|
|
29
|
-
|
|
35
|
+
|
|
30
36
|
# Check for API keys
|
|
31
37
|
openai_api_key = os.environ.get("OPENAI_API_KEY", "")
|
|
32
38
|
anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY", "")
|
|
33
39
|
cua_api_key = os.environ.get("CUA_API_KEY", "")
|
|
34
|
-
|
|
40
|
+
|
|
35
41
|
# Model choices
|
|
36
42
|
openai_models = ["OpenAI: Computer-Use Preview"]
|
|
37
43
|
anthropic_models = [
|
|
@@ -43,10 +49,10 @@ def create_gradio_ui() -> gr.Blocks:
|
|
|
43
49
|
omni_models = [
|
|
44
50
|
"OMNI: OpenAI GPT-4o",
|
|
45
51
|
"OMNI: OpenAI GPT-4o mini",
|
|
46
|
-
"OMNI: Claude 3.7 Sonnet (20250219)",
|
|
47
|
-
"OMNI: Claude 3.5 Sonnet (20241022)"
|
|
52
|
+
"OMNI: Claude 3.7 Sonnet (20250219)",
|
|
53
|
+
"OMNI: Claude 3.5 Sonnet (20241022)",
|
|
48
54
|
]
|
|
49
|
-
|
|
55
|
+
|
|
50
56
|
# Check if API keys are available
|
|
51
57
|
has_openai_key = bool(openai_api_key)
|
|
52
58
|
has_anthropic_key = bool(anthropic_api_key)
|
|
@@ -59,15 +65,20 @@ def create_gradio_ui() -> gr.Blocks:
|
|
|
59
65
|
|
|
60
66
|
# Detect platform
|
|
61
67
|
is_mac = platform.system().lower() == "darwin"
|
|
62
|
-
|
|
68
|
+
|
|
63
69
|
# Format model choices
|
|
64
70
|
provider_to_models = {
|
|
65
71
|
"OPENAI": openai_models,
|
|
66
72
|
"ANTHROPIC": anthropic_models,
|
|
67
73
|
"OMNI": omni_models + ["Custom model (OpenAI compatible API)", "Custom model (ollama)"],
|
|
68
|
-
"UITARS": (
|
|
69
|
-
|
|
70
|
-
|
|
74
|
+
"UITARS": (
|
|
75
|
+
[
|
|
76
|
+
"huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
|
|
77
|
+
]
|
|
78
|
+
if is_mac
|
|
79
|
+
else []
|
|
80
|
+
)
|
|
81
|
+
+ ["Custom model (OpenAI compatible API)"],
|
|
71
82
|
}
|
|
72
83
|
|
|
73
84
|
# Apply saved settings
|
|
@@ -82,7 +93,9 @@ def create_gradio_ui() -> gr.Blocks:
|
|
|
82
93
|
elif initial_loop == "ANTHROPIC":
|
|
83
94
|
initial_model = anthropic_models[0] if anthropic_models else "No models available"
|
|
84
95
|
else: # OMNI
|
|
85
|
-
initial_model =
|
|
96
|
+
initial_model = (
|
|
97
|
+
omni_models[0] if omni_models else "Custom model (OpenAI compatible API)"
|
|
98
|
+
)
|
|
86
99
|
|
|
87
100
|
initial_custom_model = saved_settings.get("custom_model", "Qwen2.5-VL-7B-Instruct")
|
|
88
101
|
initial_provider_base_url = saved_settings.get("provider_base_url", "http://localhost:1234/v1")
|
|
@@ -96,16 +109,27 @@ def create_gradio_ui() -> gr.Blocks:
|
|
|
96
109
|
"Open Safari, search for 'macOS automation tools', and save the first three results as bookmarks",
|
|
97
110
|
"Configure SSH keys and set up a connection to a remote server",
|
|
98
111
|
]
|
|
99
|
-
|
|
100
|
-
def generate_python_code(
|
|
112
|
+
|
|
113
|
+
def generate_python_code(
|
|
114
|
+
agent_loop_choice,
|
|
115
|
+
model_name,
|
|
116
|
+
tasks,
|
|
117
|
+
recent_images=3,
|
|
118
|
+
save_trajectory=True,
|
|
119
|
+
computer_os="linux",
|
|
120
|
+
computer_provider="cloud",
|
|
121
|
+
container_name="",
|
|
122
|
+
cua_cloud_api_key="",
|
|
123
|
+
max_budget=None,
|
|
124
|
+
):
|
|
101
125
|
"""Generate Python code for the current configuration and tasks."""
|
|
102
126
|
tasks_str = ""
|
|
103
127
|
for task in tasks:
|
|
104
128
|
if task and task.strip():
|
|
105
129
|
tasks_str += f' "{task}",\n'
|
|
106
|
-
|
|
130
|
+
|
|
107
131
|
model_string = get_model_string(model_name, agent_loop_choice)
|
|
108
|
-
|
|
132
|
+
|
|
109
133
|
computer_args = []
|
|
110
134
|
if computer_os != "macos":
|
|
111
135
|
computer_args.append(f'os_type="{computer_os}"')
|
|
@@ -115,14 +139,14 @@ def create_gradio_ui() -> gr.Blocks:
|
|
|
115
139
|
computer_args.append(f'name="{container_name}"')
|
|
116
140
|
if cua_cloud_api_key:
|
|
117
141
|
computer_args.append(f'api_key="{cua_cloud_api_key}"')
|
|
118
|
-
|
|
142
|
+
|
|
119
143
|
computer_args_str = ", ".join(computer_args)
|
|
120
144
|
if computer_args_str:
|
|
121
145
|
computer_args_str = f"({computer_args_str})"
|
|
122
146
|
else:
|
|
123
147
|
computer_args_str = "()"
|
|
124
|
-
|
|
125
|
-
code = f
|
|
148
|
+
|
|
149
|
+
code = f"""import asyncio
|
|
126
150
|
from computer import Computer
|
|
127
151
|
from agent import ComputerAgent
|
|
128
152
|
|
|
@@ -131,22 +155,22 @@ async def main():
|
|
|
131
155
|
agent = ComputerAgent(
|
|
132
156
|
model="{model_string}",
|
|
133
157
|
tools=[computer],
|
|
134
|
-
only_n_most_recent_images={recent_images},
|
|
135
|
-
|
|
158
|
+
only_n_most_recent_images={recent_images},"""
|
|
159
|
+
|
|
136
160
|
if save_trajectory:
|
|
137
|
-
code +=
|
|
138
|
-
trajectory_dir="trajectories",
|
|
139
|
-
|
|
161
|
+
code += """
|
|
162
|
+
trajectory_dir="trajectories","""
|
|
163
|
+
|
|
140
164
|
if max_budget:
|
|
141
|
-
code += f
|
|
142
|
-
max_trajectory_budget={{"max_budget": {max_budget}, "raise_error": True}},
|
|
143
|
-
|
|
144
|
-
code +=
|
|
165
|
+
code += f"""
|
|
166
|
+
max_trajectory_budget={{"max_budget": {max_budget}, "raise_error": True}},"""
|
|
167
|
+
|
|
168
|
+
code += """
|
|
145
169
|
)
|
|
146
|
-
|
|
147
|
-
|
|
170
|
+
"""
|
|
171
|
+
|
|
148
172
|
if tasks_str:
|
|
149
|
-
code += f
|
|
173
|
+
code += f"""
|
|
150
174
|
# Prompts for the computer-use agent
|
|
151
175
|
tasks = [
|
|
152
176
|
{tasks_str.rstrip()}
|
|
@@ -158,23 +182,23 @@ async def main():
|
|
|
158
182
|
async for result in agent.run(messages):
|
|
159
183
|
for item in result["output"]:
|
|
160
184
|
if item["type"] == "message":
|
|
161
|
-
print(item["content"][0]["text"])
|
|
185
|
+
print(item["content"][0]["text"])"""
|
|
162
186
|
else:
|
|
163
|
-
code +=
|
|
187
|
+
code += """
|
|
164
188
|
# Execute a single task
|
|
165
189
|
task = "Search for information about CUA on GitHub"
|
|
166
|
-
print(f"Executing task: {
|
|
167
|
-
messages = [{
|
|
190
|
+
print(f"Executing task: {task}")
|
|
191
|
+
messages = [{"role": "user", "content": task}]
|
|
168
192
|
async for result in agent.run(messages):
|
|
169
193
|
for item in result["output"]:
|
|
170
194
|
if item["type"] == "message":
|
|
171
|
-
print(item["content"][0]["text"])
|
|
195
|
+
print(item["content"][0]["text"])"""
|
|
172
196
|
|
|
173
|
-
code +=
|
|
197
|
+
code += """
|
|
174
198
|
|
|
175
199
|
if __name__ == "__main__":
|
|
176
|
-
asyncio.run(main())
|
|
177
|
-
|
|
200
|
+
asyncio.run(main())"""
|
|
201
|
+
|
|
178
202
|
return code
|
|
179
203
|
|
|
180
204
|
# Create the Gradio interface
|
|
@@ -199,11 +223,11 @@ if __name__ == "__main__":
|
|
|
199
223
|
value=generate_python_code(initial_loop, "gpt-4o", []),
|
|
200
224
|
interactive=False,
|
|
201
225
|
)
|
|
202
|
-
|
|
226
|
+
|
|
203
227
|
with gr.Accordion("Computer Configuration", open=True):
|
|
204
228
|
is_windows = platform.system().lower() == "windows"
|
|
205
229
|
is_mac = platform.system().lower() == "darwin"
|
|
206
|
-
|
|
230
|
+
|
|
207
231
|
providers = ["cloud", "localhost", "docker"]
|
|
208
232
|
if is_mac:
|
|
209
233
|
providers += ["lume"]
|
|
@@ -227,30 +251,30 @@ if __name__ == "__main__":
|
|
|
227
251
|
value=computer_choices[0],
|
|
228
252
|
info="Select the operating system for the computer",
|
|
229
253
|
)
|
|
230
|
-
|
|
254
|
+
|
|
231
255
|
computer_provider = gr.Radio(
|
|
232
256
|
choices=providers,
|
|
233
257
|
label="Provider",
|
|
234
258
|
value="lume" if is_mac else "cloud",
|
|
235
259
|
info="Select the computer provider",
|
|
236
260
|
)
|
|
237
|
-
|
|
261
|
+
|
|
238
262
|
container_name = gr.Textbox(
|
|
239
263
|
label="Container Name",
|
|
240
264
|
placeholder="Enter container name (optional)",
|
|
241
265
|
value=os.environ.get("CUA_CONTAINER_NAME", ""),
|
|
242
266
|
info="Optional name for the container",
|
|
243
267
|
)
|
|
244
|
-
|
|
268
|
+
|
|
245
269
|
cua_cloud_api_key = gr.Textbox(
|
|
246
270
|
label="CUA Cloud API Key",
|
|
247
271
|
placeholder="Enter your CUA Cloud API key",
|
|
248
272
|
value=os.environ.get("CUA_API_KEY", ""),
|
|
249
273
|
type="password",
|
|
250
274
|
info="Required for cloud provider",
|
|
251
|
-
visible=(not has_cua_key)
|
|
275
|
+
visible=(not has_cua_key),
|
|
252
276
|
)
|
|
253
|
-
|
|
277
|
+
|
|
254
278
|
with gr.Accordion("Agent Configuration", open=True):
|
|
255
279
|
agent_loop = gr.Dropdown(
|
|
256
280
|
choices=["OPENAI", "ANTHROPIC", "OMNI", "UITARS"],
|
|
@@ -267,90 +291,113 @@ if __name__ == "__main__":
|
|
|
267
291
|
value=openai_models[0] if openai_models else "No models available",
|
|
268
292
|
info="Select OpenAI model",
|
|
269
293
|
interactive=True,
|
|
270
|
-
visible=(initial_loop == "OPENAI")
|
|
294
|
+
visible=(initial_loop == "OPENAI"),
|
|
271
295
|
)
|
|
272
|
-
|
|
296
|
+
|
|
273
297
|
anthropic_model_choice = gr.Dropdown(
|
|
274
298
|
choices=anthropic_models,
|
|
275
299
|
label="Anthropic Model",
|
|
276
|
-
value=
|
|
300
|
+
value=(
|
|
301
|
+
anthropic_models[0] if anthropic_models else "No models available"
|
|
302
|
+
),
|
|
277
303
|
info="Select Anthropic model",
|
|
278
304
|
interactive=True,
|
|
279
|
-
visible=(initial_loop == "ANTHROPIC")
|
|
305
|
+
visible=(initial_loop == "ANTHROPIC"),
|
|
280
306
|
)
|
|
281
|
-
|
|
307
|
+
|
|
282
308
|
omni_model_choice = gr.Dropdown(
|
|
283
|
-
choices=omni_models
|
|
309
|
+
choices=omni_models
|
|
310
|
+
+ ["Custom model (OpenAI compatible API)", "Custom model (ollama)"],
|
|
284
311
|
label="OMNI Model",
|
|
285
|
-
value=
|
|
312
|
+
value=(
|
|
313
|
+
omni_models[0]
|
|
314
|
+
if omni_models
|
|
315
|
+
else "Custom model (OpenAI compatible API)"
|
|
316
|
+
),
|
|
286
317
|
info="Select OMNI model or choose a custom model option",
|
|
287
318
|
interactive=True,
|
|
288
|
-
visible=(initial_loop == "OMNI")
|
|
319
|
+
visible=(initial_loop == "OMNI"),
|
|
289
320
|
)
|
|
290
|
-
|
|
321
|
+
|
|
291
322
|
uitars_model_choice = gr.Dropdown(
|
|
292
323
|
choices=provider_to_models.get("UITARS", ["No models available"]),
|
|
293
324
|
label="UITARS Model",
|
|
294
|
-
value=
|
|
325
|
+
value=(
|
|
326
|
+
provider_to_models.get("UITARS", ["No models available"])[0]
|
|
327
|
+
if provider_to_models.get("UITARS")
|
|
328
|
+
else "No models available"
|
|
329
|
+
),
|
|
295
330
|
info="Select UITARS model",
|
|
296
331
|
interactive=True,
|
|
297
|
-
visible=(initial_loop == "UITARS")
|
|
332
|
+
visible=(initial_loop == "UITARS"),
|
|
298
333
|
)
|
|
299
|
-
|
|
334
|
+
|
|
300
335
|
model_choice = gr.Textbox(visible=False)
|
|
301
336
|
|
|
302
337
|
# API key inputs
|
|
303
|
-
with gr.Group(
|
|
338
|
+
with gr.Group(
|
|
339
|
+
visible=not has_openai_key
|
|
340
|
+
and (initial_loop == "OPENAI" or initial_loop == "OMNI")
|
|
341
|
+
) as openai_key_group:
|
|
304
342
|
openai_api_key_input = gr.Textbox(
|
|
305
343
|
label="OpenAI API Key",
|
|
306
344
|
placeholder="Enter your OpenAI API key",
|
|
307
345
|
value=os.environ.get("OPENAI_API_KEY", ""),
|
|
308
346
|
interactive=True,
|
|
309
347
|
type="password",
|
|
310
|
-
info="Required for OpenAI models"
|
|
348
|
+
info="Required for OpenAI models",
|
|
311
349
|
)
|
|
312
|
-
|
|
313
|
-
with gr.Group(
|
|
350
|
+
|
|
351
|
+
with gr.Group(
|
|
352
|
+
visible=not has_anthropic_key
|
|
353
|
+
and (initial_loop == "ANTHROPIC" or initial_loop == "OMNI")
|
|
354
|
+
) as anthropic_key_group:
|
|
314
355
|
anthropic_api_key_input = gr.Textbox(
|
|
315
356
|
label="Anthropic API Key",
|
|
316
357
|
placeholder="Enter your Anthropic API key",
|
|
317
358
|
value=os.environ.get("ANTHROPIC_API_KEY", ""),
|
|
318
359
|
interactive=True,
|
|
319
360
|
type="password",
|
|
320
|
-
info="Required for Anthropic models"
|
|
361
|
+
info="Required for Anthropic models",
|
|
321
362
|
)
|
|
322
|
-
|
|
363
|
+
|
|
323
364
|
# API key handlers
|
|
324
365
|
def set_openai_api_key(key):
|
|
325
366
|
if key and key.strip():
|
|
326
367
|
os.environ["OPENAI_API_KEY"] = key.strip()
|
|
327
|
-
print(
|
|
368
|
+
print("DEBUG - Set OpenAI API key environment variable")
|
|
328
369
|
return key
|
|
329
|
-
|
|
370
|
+
|
|
330
371
|
def set_anthropic_api_key(key):
|
|
331
372
|
if key and key.strip():
|
|
332
373
|
os.environ["ANTHROPIC_API_KEY"] = key.strip()
|
|
333
|
-
print(
|
|
374
|
+
print("DEBUG - Set Anthropic API key environment variable")
|
|
334
375
|
return key
|
|
335
|
-
|
|
376
|
+
|
|
336
377
|
openai_api_key_input.change(
|
|
337
378
|
fn=set_openai_api_key,
|
|
338
379
|
inputs=[openai_api_key_input],
|
|
339
380
|
outputs=[openai_api_key_input],
|
|
340
|
-
queue=False
|
|
381
|
+
queue=False,
|
|
341
382
|
)
|
|
342
|
-
|
|
383
|
+
|
|
343
384
|
anthropic_api_key_input.change(
|
|
344
385
|
fn=set_anthropic_api_key,
|
|
345
386
|
inputs=[anthropic_api_key_input],
|
|
346
387
|
outputs=[anthropic_api_key_input],
|
|
347
|
-
queue=False
|
|
388
|
+
queue=False,
|
|
348
389
|
)
|
|
349
390
|
|
|
350
391
|
# UI update function
|
|
351
|
-
def update_ui(
|
|
392
|
+
def update_ui(
|
|
393
|
+
loop=None,
|
|
394
|
+
openai_model=None,
|
|
395
|
+
anthropic_model=None,
|
|
396
|
+
omni_model=None,
|
|
397
|
+
uitars_model=None,
|
|
398
|
+
):
|
|
352
399
|
loop = loop or agent_loop.value
|
|
353
|
-
|
|
400
|
+
|
|
354
401
|
model_value = None
|
|
355
402
|
if loop == "OPENAI" and openai_model:
|
|
356
403
|
model_value = openai_model
|
|
@@ -360,21 +407,37 @@ if __name__ == "__main__":
|
|
|
360
407
|
model_value = omni_model
|
|
361
408
|
elif loop == "UITARS" and uitars_model:
|
|
362
409
|
model_value = uitars_model
|
|
363
|
-
|
|
364
|
-
openai_visible =
|
|
365
|
-
anthropic_visible =
|
|
366
|
-
omni_visible =
|
|
367
|
-
uitars_visible =
|
|
368
|
-
|
|
369
|
-
show_openai_key = not has_openai_key and (
|
|
370
|
-
|
|
371
|
-
|
|
410
|
+
|
|
411
|
+
openai_visible = loop == "OPENAI"
|
|
412
|
+
anthropic_visible = loop == "ANTHROPIC"
|
|
413
|
+
omni_visible = loop == "OMNI"
|
|
414
|
+
uitars_visible = loop == "UITARS"
|
|
415
|
+
|
|
416
|
+
show_openai_key = not has_openai_key and (
|
|
417
|
+
loop == "OPENAI"
|
|
418
|
+
or (
|
|
419
|
+
loop == "OMNI"
|
|
420
|
+
and model_value
|
|
421
|
+
and "OpenAI" in model_value
|
|
422
|
+
and "Custom" not in model_value
|
|
423
|
+
)
|
|
424
|
+
)
|
|
425
|
+
show_anthropic_key = not has_anthropic_key and (
|
|
426
|
+
loop == "ANTHROPIC"
|
|
427
|
+
or (
|
|
428
|
+
loop == "OMNI"
|
|
429
|
+
and model_value
|
|
430
|
+
and "Claude" in model_value
|
|
431
|
+
and "Custom" not in model_value
|
|
432
|
+
)
|
|
433
|
+
)
|
|
434
|
+
|
|
372
435
|
is_custom_openai_api = model_value == "Custom model (OpenAI compatible API)"
|
|
373
436
|
is_custom_ollama = model_value == "Custom model (ollama)"
|
|
374
437
|
is_any_custom = is_custom_openai_api or is_custom_ollama
|
|
375
|
-
|
|
438
|
+
|
|
376
439
|
model_choice_value = model_value if model_value else ""
|
|
377
|
-
|
|
440
|
+
|
|
378
441
|
return [
|
|
379
442
|
gr.update(visible=openai_visible),
|
|
380
443
|
gr.update(visible=anthropic_visible),
|
|
@@ -385,15 +448,18 @@ if __name__ == "__main__":
|
|
|
385
448
|
gr.update(visible=is_any_custom),
|
|
386
449
|
gr.update(visible=is_custom_openai_api),
|
|
387
450
|
gr.update(visible=is_custom_openai_api),
|
|
388
|
-
gr.update(value=model_choice_value)
|
|
451
|
+
gr.update(value=model_choice_value),
|
|
389
452
|
]
|
|
390
|
-
|
|
453
|
+
|
|
391
454
|
# Custom model inputs
|
|
392
455
|
custom_model = gr.Textbox(
|
|
393
456
|
label="Custom Model Name",
|
|
394
457
|
placeholder="Enter custom model name (e.g., Qwen2.5-VL-7B-Instruct or llama3)",
|
|
395
458
|
value=initial_custom_model,
|
|
396
|
-
visible=(
|
|
459
|
+
visible=(
|
|
460
|
+
initial_model == "Custom model (OpenAI compatible API)"
|
|
461
|
+
or initial_model == "Custom model (ollama)"
|
|
462
|
+
),
|
|
397
463
|
interactive=True,
|
|
398
464
|
)
|
|
399
465
|
|
|
@@ -413,36 +479,56 @@ if __name__ == "__main__":
|
|
|
413
479
|
interactive=True,
|
|
414
480
|
type="password",
|
|
415
481
|
)
|
|
416
|
-
|
|
482
|
+
|
|
417
483
|
# Provider visibility update function
|
|
418
484
|
def update_provider_visibility(provider):
|
|
419
485
|
"""Update visibility of container name and API key based on selected provider."""
|
|
420
486
|
is_localhost = provider == "localhost"
|
|
421
487
|
return [
|
|
422
488
|
gr.update(visible=not is_localhost), # container_name
|
|
423
|
-
gr.update(
|
|
489
|
+
gr.update(
|
|
490
|
+
visible=not is_localhost and not has_cua_key
|
|
491
|
+
), # cua_cloud_api_key
|
|
424
492
|
]
|
|
425
|
-
|
|
493
|
+
|
|
426
494
|
# Connect provider change event
|
|
427
495
|
computer_provider.change(
|
|
428
496
|
fn=update_provider_visibility,
|
|
429
497
|
inputs=[computer_provider],
|
|
430
498
|
outputs=[container_name, cua_cloud_api_key],
|
|
431
|
-
queue=False
|
|
499
|
+
queue=False,
|
|
432
500
|
)
|
|
433
|
-
|
|
501
|
+
|
|
434
502
|
# Connect UI update events
|
|
435
|
-
for dropdown in [
|
|
503
|
+
for dropdown in [
|
|
504
|
+
agent_loop,
|
|
505
|
+
omni_model_choice,
|
|
506
|
+
uitars_model_choice,
|
|
507
|
+
openai_model_choice,
|
|
508
|
+
anthropic_model_choice,
|
|
509
|
+
]:
|
|
436
510
|
dropdown.change(
|
|
437
511
|
fn=update_ui,
|
|
438
|
-
inputs=[
|
|
512
|
+
inputs=[
|
|
513
|
+
agent_loop,
|
|
514
|
+
openai_model_choice,
|
|
515
|
+
anthropic_model_choice,
|
|
516
|
+
omni_model_choice,
|
|
517
|
+
uitars_model_choice,
|
|
518
|
+
],
|
|
439
519
|
outputs=[
|
|
440
|
-
openai_model_choice,
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
520
|
+
openai_model_choice,
|
|
521
|
+
anthropic_model_choice,
|
|
522
|
+
omni_model_choice,
|
|
523
|
+
uitars_model_choice,
|
|
524
|
+
openai_key_group,
|
|
525
|
+
anthropic_key_group,
|
|
526
|
+
custom_model,
|
|
527
|
+
provider_base_url,
|
|
528
|
+
provider_api_key,
|
|
529
|
+
model_choice,
|
|
444
530
|
],
|
|
445
|
-
queue=False
|
|
531
|
+
queue=False,
|
|
446
532
|
)
|
|
447
533
|
|
|
448
534
|
save_trajectory = gr.Checkbox(
|
|
@@ -461,7 +547,7 @@ if __name__ == "__main__":
|
|
|
461
547
|
info="Number of recent images to keep in context",
|
|
462
548
|
interactive=True,
|
|
463
549
|
)
|
|
464
|
-
|
|
550
|
+
|
|
465
551
|
max_budget = gr.Number(
|
|
466
552
|
label="Max Budget ($)",
|
|
467
553
|
value=lambda: None,
|
|
@@ -479,9 +565,7 @@ if __name__ == "__main__":
|
|
|
479
565
|
)
|
|
480
566
|
|
|
481
567
|
chatbot_history = gr.Chatbot(type="messages")
|
|
482
|
-
msg = gr.Textbox(
|
|
483
|
-
placeholder="Ask me to perform tasks in a virtual environment"
|
|
484
|
-
)
|
|
568
|
+
msg = gr.Textbox(placeholder="Ask me to perform tasks in a virtual environment")
|
|
485
569
|
clear = gr.Button("Clear")
|
|
486
570
|
cancel_button = gr.Button("Cancel", variant="stop")
|
|
487
571
|
|
|
@@ -498,11 +582,23 @@ if __name__ == "__main__":
|
|
|
498
582
|
global global_agent
|
|
499
583
|
if global_agent:
|
|
500
584
|
print("DEBUG - Cancelling agent task")
|
|
501
|
-
history.append(
|
|
585
|
+
history.append(
|
|
586
|
+
gr.ChatMessage(
|
|
587
|
+
role="assistant",
|
|
588
|
+
content="Task cancelled by user",
|
|
589
|
+
metadata={"title": "❌ Cancelled"},
|
|
590
|
+
)
|
|
591
|
+
)
|
|
502
592
|
else:
|
|
503
|
-
history.append(
|
|
593
|
+
history.append(
|
|
594
|
+
gr.ChatMessage(
|
|
595
|
+
role="assistant",
|
|
596
|
+
content="No active agent task to cancel",
|
|
597
|
+
metadata={"title": "ℹ️ Info"},
|
|
598
|
+
)
|
|
599
|
+
)
|
|
504
600
|
return history
|
|
505
|
-
|
|
601
|
+
|
|
506
602
|
# Process response function
|
|
507
603
|
async def process_response(
|
|
508
604
|
history,
|
|
@@ -542,10 +638,13 @@ if __name__ == "__main__":
|
|
|
542
638
|
model_choice_value = uitars_model_value
|
|
543
639
|
else:
|
|
544
640
|
model_choice_value = "No models available"
|
|
545
|
-
|
|
641
|
+
|
|
546
642
|
# Determine if this is a custom model selection
|
|
547
|
-
is_custom_model_selected = model_choice_value in [
|
|
548
|
-
|
|
643
|
+
is_custom_model_selected = model_choice_value in [
|
|
644
|
+
"Custom model (OpenAI compatible API)",
|
|
645
|
+
"Custom model (ollama)",
|
|
646
|
+
]
|
|
647
|
+
|
|
549
648
|
# Determine the model name string to analyze
|
|
550
649
|
if is_custom_model_selected:
|
|
551
650
|
model_string_to_analyze = custom_model_value
|
|
@@ -583,13 +682,19 @@ if __name__ == "__main__":
|
|
|
583
682
|
model_string=model_string,
|
|
584
683
|
save_trajectory=save_traj,
|
|
585
684
|
only_n_most_recent_images=recent_imgs,
|
|
586
|
-
custom_model_name=
|
|
685
|
+
custom_model_name=(
|
|
686
|
+
custom_model_value if is_custom_model_selected else None
|
|
687
|
+
),
|
|
587
688
|
computer_os=computer_os,
|
|
588
689
|
computer_provider=computer_provider,
|
|
589
690
|
computer_name=container_name,
|
|
590
691
|
computer_api_key=cua_cloud_api_key,
|
|
591
692
|
verbosity=logging.DEBUG,
|
|
592
|
-
max_trajectory_budget=
|
|
693
|
+
max_trajectory_budget=(
|
|
694
|
+
max_budget_value
|
|
695
|
+
if max_budget_value and max_budget_value > 0
|
|
696
|
+
else None
|
|
697
|
+
),
|
|
593
698
|
)
|
|
594
699
|
|
|
595
700
|
if global_agent is None:
|
|
@@ -605,7 +710,7 @@ if __name__ == "__main__":
|
|
|
605
710
|
# Add user message to global history
|
|
606
711
|
global global_messages
|
|
607
712
|
global_messages.append({"role": "user", "content": last_user_message})
|
|
608
|
-
|
|
713
|
+
|
|
609
714
|
# Stream responses from the agent
|
|
610
715
|
async for result in global_agent.run(global_messages):
|
|
611
716
|
global_messages += result.get("output", [])
|
|
@@ -613,18 +718,20 @@ if __name__ == "__main__":
|
|
|
613
718
|
# from pprint import pprint
|
|
614
719
|
# pprint(result)
|
|
615
720
|
# print(f"DEBUG - Agent response ------- END")
|
|
616
|
-
|
|
721
|
+
|
|
617
722
|
# Process the result output
|
|
618
723
|
for item in result.get("output", []):
|
|
619
724
|
if item.get("type") == "message":
|
|
620
725
|
content = item.get("content", [])
|
|
621
726
|
for content_part in content:
|
|
622
727
|
if content_part.get("text"):
|
|
623
|
-
history.append(
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
728
|
+
history.append(
|
|
729
|
+
gr.ChatMessage(
|
|
730
|
+
role=item.get("role", "assistant"),
|
|
731
|
+
content=content_part.get("text", ""),
|
|
732
|
+
metadata=content_part.get("metadata", {}),
|
|
733
|
+
)
|
|
734
|
+
)
|
|
628
735
|
elif item.get("type") == "computer_call":
|
|
629
736
|
action = item.get("action", {})
|
|
630
737
|
action_type = action.get("type", "")
|
|
@@ -632,43 +739,52 @@ if __name__ == "__main__":
|
|
|
632
739
|
action_title = f"🛠️ Performing {action_type}"
|
|
633
740
|
if action.get("x") and action.get("y"):
|
|
634
741
|
action_title += f" at ({action['x']}, {action['y']})"
|
|
635
|
-
history.append(
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
742
|
+
history.append(
|
|
743
|
+
gr.ChatMessage(
|
|
744
|
+
role="assistant",
|
|
745
|
+
content=f"```json\n{json.dumps(action)}\n```",
|
|
746
|
+
metadata={"title": action_title},
|
|
747
|
+
)
|
|
748
|
+
)
|
|
640
749
|
elif item.get("type") == "function_call":
|
|
641
750
|
function_name = item.get("name", "")
|
|
642
751
|
arguments = item.get("arguments", "{}")
|
|
643
|
-
history.append(
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
752
|
+
history.append(
|
|
753
|
+
gr.ChatMessage(
|
|
754
|
+
role="assistant",
|
|
755
|
+
content=f"🔧 Calling function: {function_name}\n```json\n{arguments}\n```",
|
|
756
|
+
metadata={"title": f"Function Call: {function_name}"},
|
|
757
|
+
)
|
|
758
|
+
)
|
|
648
759
|
elif item.get("type") == "function_call_output":
|
|
649
760
|
output = item.get("output", "")
|
|
650
|
-
history.append(
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
761
|
+
history.append(
|
|
762
|
+
gr.ChatMessage(
|
|
763
|
+
role="assistant",
|
|
764
|
+
content=f"📤 Function output:\n```\n{output}\n```",
|
|
765
|
+
metadata={"title": "Function Output"},
|
|
766
|
+
)
|
|
767
|
+
)
|
|
655
768
|
elif item.get("type") == "computer_call_output":
|
|
656
769
|
output = item.get("output", {}).get("image_url", "")
|
|
657
770
|
image_markdown = f""
|
|
658
|
-
history.append(
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
771
|
+
history.append(
|
|
772
|
+
gr.ChatMessage(
|
|
773
|
+
role="assistant",
|
|
774
|
+
content=image_markdown,
|
|
775
|
+
metadata={"title": "🖥️ Computer Output"},
|
|
776
|
+
)
|
|
777
|
+
)
|
|
778
|
+
|
|
664
779
|
yield history
|
|
665
|
-
|
|
780
|
+
|
|
666
781
|
except Exception as e:
|
|
667
782
|
import traceback
|
|
783
|
+
|
|
668
784
|
traceback.print_exc()
|
|
669
785
|
history.append(gr.ChatMessage(role="assistant", content=f"Error: {str(e)}"))
|
|
670
786
|
yield history
|
|
671
|
-
|
|
787
|
+
|
|
672
788
|
# Connect the submit button
|
|
673
789
|
submit_event = msg.submit(
|
|
674
790
|
fn=chat_submit,
|
|
@@ -706,44 +822,77 @@ if __name__ == "__main__":
|
|
|
706
822
|
global global_messages
|
|
707
823
|
global_messages.clear()
|
|
708
824
|
return None
|
|
709
|
-
|
|
825
|
+
|
|
710
826
|
clear.click(clear_chat, None, chatbot_history, queue=False)
|
|
711
|
-
|
|
827
|
+
|
|
712
828
|
# Connect cancel button
|
|
713
829
|
cancel_button.click(
|
|
714
|
-
cancel_agent_task,
|
|
715
|
-
[chatbot_history],
|
|
716
|
-
[chatbot_history],
|
|
717
|
-
queue=False
|
|
830
|
+
cancel_agent_task, [chatbot_history], [chatbot_history], queue=False
|
|
718
831
|
)
|
|
719
832
|
|
|
720
833
|
# Code display update function
|
|
721
|
-
def update_code_display(
|
|
834
|
+
def update_code_display(
|
|
835
|
+
agent_loop,
|
|
836
|
+
model_choice_val,
|
|
837
|
+
custom_model_val,
|
|
838
|
+
chat_history,
|
|
839
|
+
recent_images_val,
|
|
840
|
+
save_trajectory_val,
|
|
841
|
+
computer_os,
|
|
842
|
+
computer_provider,
|
|
843
|
+
container_name,
|
|
844
|
+
cua_cloud_api_key,
|
|
845
|
+
max_budget_val,
|
|
846
|
+
):
|
|
722
847
|
messages = []
|
|
723
848
|
if chat_history:
|
|
724
849
|
for msg in chat_history:
|
|
725
850
|
if isinstance(msg, dict) and msg.get("role") == "user":
|
|
726
851
|
messages.append(msg.get("content", ""))
|
|
727
|
-
|
|
852
|
+
|
|
728
853
|
return generate_python_code(
|
|
729
|
-
agent_loop,
|
|
730
|
-
model_choice_val or custom_model_val or "gpt-4o",
|
|
731
|
-
messages,
|
|
854
|
+
agent_loop,
|
|
855
|
+
model_choice_val or custom_model_val or "gpt-4o",
|
|
856
|
+
messages,
|
|
732
857
|
recent_images_val,
|
|
733
858
|
save_trajectory_val,
|
|
734
859
|
computer_os,
|
|
735
860
|
computer_provider,
|
|
736
861
|
container_name,
|
|
737
862
|
cua_cloud_api_key,
|
|
738
|
-
max_budget_val
|
|
863
|
+
max_budget_val,
|
|
739
864
|
)
|
|
740
|
-
|
|
865
|
+
|
|
741
866
|
# Update code display when configuration changes
|
|
742
|
-
for component in [
|
|
867
|
+
for component in [
|
|
868
|
+
agent_loop,
|
|
869
|
+
model_choice,
|
|
870
|
+
custom_model,
|
|
871
|
+
chatbot_history,
|
|
872
|
+
recent_images,
|
|
873
|
+
save_trajectory,
|
|
874
|
+
computer_os,
|
|
875
|
+
computer_provider,
|
|
876
|
+
container_name,
|
|
877
|
+
cua_cloud_api_key,
|
|
878
|
+
max_budget,
|
|
879
|
+
]:
|
|
743
880
|
component.change(
|
|
744
881
|
update_code_display,
|
|
745
|
-
inputs=[
|
|
746
|
-
|
|
882
|
+
inputs=[
|
|
883
|
+
agent_loop,
|
|
884
|
+
model_choice,
|
|
885
|
+
custom_model,
|
|
886
|
+
chatbot_history,
|
|
887
|
+
recent_images,
|
|
888
|
+
save_trajectory,
|
|
889
|
+
computer_os,
|
|
890
|
+
computer_provider,
|
|
891
|
+
container_name,
|
|
892
|
+
cua_cloud_api_key,
|
|
893
|
+
max_budget,
|
|
894
|
+
],
|
|
895
|
+
outputs=[code_display],
|
|
747
896
|
)
|
|
748
897
|
|
|
749
898
|
return demo
|