cua-agent 0.4.22__py3-none-any.whl → 0.7.16__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (79)
  1. agent/__init__.py +4 -10
  2. agent/__main__.py +2 -1
  3. agent/adapters/__init__.py +4 -0
  4. agent/adapters/azure_ml_adapter.py +283 -0
  5. agent/adapters/cua_adapter.py +161 -0
  6. agent/adapters/huggingfacelocal_adapter.py +67 -125
  7. agent/adapters/human_adapter.py +116 -114
  8. agent/adapters/mlxvlm_adapter.py +110 -99
  9. agent/adapters/models/__init__.py +41 -0
  10. agent/adapters/models/generic.py +78 -0
  11. agent/adapters/models/internvl.py +290 -0
  12. agent/adapters/models/opencua.py +115 -0
  13. agent/adapters/models/qwen2_5_vl.py +78 -0
  14. agent/agent.py +337 -185
  15. agent/callbacks/__init__.py +9 -4
  16. agent/callbacks/base.py +45 -31
  17. agent/callbacks/budget_manager.py +22 -10
  18. agent/callbacks/image_retention.py +54 -98
  19. agent/callbacks/logging.py +55 -42
  20. agent/callbacks/operator_validator.py +35 -33
  21. agent/callbacks/otel.py +291 -0
  22. agent/callbacks/pii_anonymization.py +19 -16
  23. agent/callbacks/prompt_instructions.py +47 -0
  24. agent/callbacks/telemetry.py +99 -61
  25. agent/callbacks/trajectory_saver.py +95 -69
  26. agent/cli.py +269 -119
  27. agent/computers/__init__.py +14 -9
  28. agent/computers/base.py +32 -19
  29. agent/computers/cua.py +52 -25
  30. agent/computers/custom.py +78 -71
  31. agent/decorators.py +23 -14
  32. agent/human_tool/__init__.py +2 -7
  33. agent/human_tool/__main__.py +6 -2
  34. agent/human_tool/server.py +48 -37
  35. agent/human_tool/ui.py +359 -235
  36. agent/integrations/hud/__init__.py +38 -99
  37. agent/integrations/hud/agent.py +369 -0
  38. agent/integrations/hud/proxy.py +166 -52
  39. agent/loops/__init__.py +44 -14
  40. agent/loops/anthropic.py +579 -492
  41. agent/loops/base.py +19 -15
  42. agent/loops/composed_grounded.py +136 -150
  43. agent/loops/fara/__init__.py +8 -0
  44. agent/loops/fara/config.py +506 -0
  45. agent/loops/fara/helpers.py +357 -0
  46. agent/loops/fara/schema.py +143 -0
  47. agent/loops/gelato.py +183 -0
  48. agent/loops/gemini.py +935 -0
  49. agent/loops/generic_vlm.py +601 -0
  50. agent/loops/glm45v.py +140 -135
  51. agent/loops/gta1.py +48 -51
  52. agent/loops/holo.py +218 -0
  53. agent/loops/internvl.py +180 -0
  54. agent/loops/moondream3.py +493 -0
  55. agent/loops/omniparser.py +326 -226
  56. agent/loops/openai.py +50 -51
  57. agent/loops/opencua.py +134 -0
  58. agent/loops/uiins.py +175 -0
  59. agent/loops/uitars.py +247 -206
  60. agent/loops/uitars2.py +951 -0
  61. agent/playground/__init__.py +5 -0
  62. agent/playground/server.py +301 -0
  63. agent/proxy/examples.py +61 -57
  64. agent/proxy/handlers.py +46 -39
  65. agent/responses.py +447 -347
  66. agent/tools/__init__.py +24 -0
  67. agent/tools/base.py +253 -0
  68. agent/tools/browser_tool.py +423 -0
  69. agent/types.py +11 -5
  70. agent/ui/__init__.py +1 -1
  71. agent/ui/__main__.py +1 -1
  72. agent/ui/gradio/app.py +25 -22
  73. agent/ui/gradio/ui_components.py +314 -167
  74. cua_agent-0.7.16.dist-info/METADATA +85 -0
  75. cua_agent-0.7.16.dist-info/RECORD +79 -0
  76. {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
  77. cua_agent-0.4.22.dist-info/METADATA +0 -436
  78. cua_agent-0.4.22.dist-info/RECORD +0 -51
  79. {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
@@ -2,19 +2,25 @@
2
2
  UI Components for the Gradio interface
3
3
  """
4
4
 
5
- import os
6
5
  import asyncio
7
- import logging
8
6
  import json
7
+ import logging
8
+ import os
9
9
  import platform
10
10
  from pathlib import Path
11
- from typing import Dict, List, Optional, Any, cast
11
+ from typing import Any, Dict, List, Optional, cast
12
+
12
13
  import gradio as gr
13
14
  from gradio.components.chatbot import MetadataDict
14
15
 
15
16
  from .app import (
16
- load_settings, save_settings, create_agent, get_model_string,
17
- get_ollama_models, global_agent, global_computer
17
+ create_agent,
18
+ get_model_string,
19
+ get_ollama_models,
20
+ global_agent,
21
+ global_computer,
22
+ load_settings,
23
+ save_settings,
18
24
  )
19
25
 
20
26
  # Global messages array to maintain conversation history
@@ -23,30 +29,28 @@ global_messages = []
23
29
 
24
30
  def create_gradio_ui() -> gr.Blocks:
25
31
  """Create a Gradio UI for the Computer-Use Agent."""
26
-
32
+
27
33
  # Load settings
28
34
  saved_settings = load_settings()
29
-
35
+
30
36
  # Check for API keys
31
37
  openai_api_key = os.environ.get("OPENAI_API_KEY", "")
32
38
  anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY", "")
33
39
  cua_api_key = os.environ.get("CUA_API_KEY", "")
34
-
40
+
35
41
  # Model choices
36
42
  openai_models = ["OpenAI: Computer-Use Preview"]
37
43
  anthropic_models = [
38
44
  "Anthropic: Claude 4 Opus (20250514)",
39
45
  "Anthropic: Claude 4 Sonnet (20250514)",
40
46
  "Anthropic: Claude 3.7 Sonnet (20250219)",
41
- "Anthropic: Claude 3.5 Sonnet (20241022)",
42
47
  ]
43
48
  omni_models = [
44
49
  "OMNI: OpenAI GPT-4o",
45
50
  "OMNI: OpenAI GPT-4o mini",
46
- "OMNI: Claude 3.7 Sonnet (20250219)",
47
- "OMNI: Claude 3.5 Sonnet (20241022)"
51
+ "OMNI: Claude 3.7 Sonnet (20250219)",
48
52
  ]
49
-
53
+
50
54
  # Check if API keys are available
51
55
  has_openai_key = bool(openai_api_key)
52
56
  has_anthropic_key = bool(anthropic_api_key)
@@ -59,15 +63,20 @@ def create_gradio_ui() -> gr.Blocks:
59
63
 
60
64
  # Detect platform
61
65
  is_mac = platform.system().lower() == "darwin"
62
-
66
+
63
67
  # Format model choices
64
68
  provider_to_models = {
65
69
  "OPENAI": openai_models,
66
70
  "ANTHROPIC": anthropic_models,
67
71
  "OMNI": omni_models + ["Custom model (OpenAI compatible API)", "Custom model (ollama)"],
68
- "UITARS": ([
69
- "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
70
- ] if is_mac else []) + ["Custom model (OpenAI compatible API)"],
72
+ "UITARS": (
73
+ [
74
+ "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
75
+ ]
76
+ if is_mac
77
+ else []
78
+ )
79
+ + ["Custom model (OpenAI compatible API)"],
71
80
  }
72
81
 
73
82
  # Apply saved settings
@@ -82,7 +91,9 @@ def create_gradio_ui() -> gr.Blocks:
82
91
  elif initial_loop == "ANTHROPIC":
83
92
  initial_model = anthropic_models[0] if anthropic_models else "No models available"
84
93
  else: # OMNI
85
- initial_model = omni_models[0] if omni_models else "Custom model (OpenAI compatible API)"
94
+ initial_model = (
95
+ omni_models[0] if omni_models else "Custom model (OpenAI compatible API)"
96
+ )
86
97
 
87
98
  initial_custom_model = saved_settings.get("custom_model", "Qwen2.5-VL-7B-Instruct")
88
99
  initial_provider_base_url = saved_settings.get("provider_base_url", "http://localhost:1234/v1")
@@ -96,16 +107,27 @@ def create_gradio_ui() -> gr.Blocks:
96
107
  "Open Safari, search for 'macOS automation tools', and save the first three results as bookmarks",
97
108
  "Configure SSH keys and set up a connection to a remote server",
98
109
  ]
99
-
100
- def generate_python_code(agent_loop_choice, model_name, tasks, recent_images=3, save_trajectory=True, computer_os="linux", computer_provider="cloud", container_name="", cua_cloud_api_key="", max_budget=None):
110
+
111
+ def generate_python_code(
112
+ agent_loop_choice,
113
+ model_name,
114
+ tasks,
115
+ recent_images=3,
116
+ save_trajectory=True,
117
+ computer_os="linux",
118
+ computer_provider="cloud",
119
+ container_name="",
120
+ cua_cloud_api_key="",
121
+ max_budget=None,
122
+ ):
101
123
  """Generate Python code for the current configuration and tasks."""
102
124
  tasks_str = ""
103
125
  for task in tasks:
104
126
  if task and task.strip():
105
127
  tasks_str += f' "{task}",\n'
106
-
128
+
107
129
  model_string = get_model_string(model_name, agent_loop_choice)
108
-
130
+
109
131
  computer_args = []
110
132
  if computer_os != "macos":
111
133
  computer_args.append(f'os_type="{computer_os}"')
@@ -115,14 +137,14 @@ def create_gradio_ui() -> gr.Blocks:
115
137
  computer_args.append(f'name="{container_name}"')
116
138
  if cua_cloud_api_key:
117
139
  computer_args.append(f'api_key="{cua_cloud_api_key}"')
118
-
140
+
119
141
  computer_args_str = ", ".join(computer_args)
120
142
  if computer_args_str:
121
143
  computer_args_str = f"({computer_args_str})"
122
144
  else:
123
145
  computer_args_str = "()"
124
-
125
- code = f'''import asyncio
146
+
147
+ code = f"""import asyncio
126
148
  from computer import Computer
127
149
  from agent import ComputerAgent
128
150
 
@@ -131,22 +153,22 @@ async def main():
131
153
  agent = ComputerAgent(
132
154
  model="{model_string}",
133
155
  tools=[computer],
134
- only_n_most_recent_images={recent_images},'''
135
-
156
+ only_n_most_recent_images={recent_images},"""
157
+
136
158
  if save_trajectory:
137
- code += '''
138
- trajectory_dir="trajectories",'''
139
-
159
+ code += """
160
+ trajectory_dir="trajectories","""
161
+
140
162
  if max_budget:
141
- code += f'''
142
- max_trajectory_budget={{"max_budget": {max_budget}, "raise_error": True}},'''
143
-
144
- code += '''
163
+ code += f"""
164
+ max_trajectory_budget={{"max_budget": {max_budget}, "raise_error": True}},"""
165
+
166
+ code += """
145
167
  )
146
- '''
147
-
168
+ """
169
+
148
170
  if tasks_str:
149
- code += f'''
171
+ code += f"""
150
172
  # Prompts for the computer-use agent
151
173
  tasks = [
152
174
  {tasks_str.rstrip()}
@@ -158,23 +180,23 @@ async def main():
158
180
  async for result in agent.run(messages):
159
181
  for item in result["output"]:
160
182
  if item["type"] == "message":
161
- print(item["content"][0]["text"])'''
183
+ print(item["content"][0]["text"])"""
162
184
  else:
163
- code += f'''
185
+ code += """
164
186
  # Execute a single task
165
- task = "Search for information about CUA on GitHub"
166
- print(f"Executing task: {{task}}")
167
- messages = [{{"role": "user", "content": task}}]
187
+ task = "Search for information about Cua on GitHub"
188
+ print(f"Executing task: {task}")
189
+ messages = [{"role": "user", "content": task}]
168
190
  async for result in agent.run(messages):
169
191
  for item in result["output"]:
170
192
  if item["type"] == "message":
171
- print(item["content"][0]["text"])'''
193
+ print(item["content"][0]["text"])"""
172
194
 
173
- code += '''
195
+ code += """
174
196
 
175
197
  if __name__ == "__main__":
176
- asyncio.run(main())'''
177
-
198
+ asyncio.run(main())"""
199
+
178
200
  return code
179
201
 
180
202
  # Create the Gradio interface
@@ -186,7 +208,7 @@ if __name__ == "__main__":
186
208
  gr.HTML(
187
209
  """
188
210
  <div style="display: flex; justify-content: center; margin-bottom: 0.5em">
189
- <img alt="CUA Logo" style="width: 80px;"
211
+ <img alt="Cua Logo" style="width: 80px;"
190
212
  src="https://github.com/trycua/cua/blob/main/img/logo_white.png?raw=true" />
191
213
  </div>
192
214
  """
@@ -199,11 +221,11 @@ if __name__ == "__main__":
199
221
  value=generate_python_code(initial_loop, "gpt-4o", []),
200
222
  interactive=False,
201
223
  )
202
-
224
+
203
225
  with gr.Accordion("Computer Configuration", open=True):
204
226
  is_windows = platform.system().lower() == "windows"
205
227
  is_mac = platform.system().lower() == "darwin"
206
-
228
+
207
229
  providers = ["cloud", "localhost", "docker"]
208
230
  if is_mac:
209
231
  providers += ["lume"]
@@ -227,30 +249,30 @@ if __name__ == "__main__":
227
249
  value=computer_choices[0],
228
250
  info="Select the operating system for the computer",
229
251
  )
230
-
252
+
231
253
  computer_provider = gr.Radio(
232
254
  choices=providers,
233
255
  label="Provider",
234
256
  value="lume" if is_mac else "cloud",
235
257
  info="Select the computer provider",
236
258
  )
237
-
259
+
238
260
  container_name = gr.Textbox(
239
261
  label="Container Name",
240
262
  placeholder="Enter container name (optional)",
241
263
  value=os.environ.get("CUA_CONTAINER_NAME", ""),
242
264
  info="Optional name for the container",
243
265
  )
244
-
266
+
245
267
  cua_cloud_api_key = gr.Textbox(
246
- label="CUA Cloud API Key",
247
- placeholder="Enter your CUA Cloud API key",
268
+ label="Cua Cloud API Key",
269
+ placeholder="Enter your Cua Cloud API key",
248
270
  value=os.environ.get("CUA_API_KEY", ""),
249
271
  type="password",
250
272
  info="Required for cloud provider",
251
- visible=(not has_cua_key)
273
+ visible=(not has_cua_key),
252
274
  )
253
-
275
+
254
276
  with gr.Accordion("Agent Configuration", open=True):
255
277
  agent_loop = gr.Dropdown(
256
278
  choices=["OPENAI", "ANTHROPIC", "OMNI", "UITARS"],
@@ -267,90 +289,113 @@ if __name__ == "__main__":
267
289
  value=openai_models[0] if openai_models else "No models available",
268
290
  info="Select OpenAI model",
269
291
  interactive=True,
270
- visible=(initial_loop == "OPENAI")
292
+ visible=(initial_loop == "OPENAI"),
271
293
  )
272
-
294
+
273
295
  anthropic_model_choice = gr.Dropdown(
274
296
  choices=anthropic_models,
275
297
  label="Anthropic Model",
276
- value=anthropic_models[0] if anthropic_models else "No models available",
298
+ value=(
299
+ anthropic_models[0] if anthropic_models else "No models available"
300
+ ),
277
301
  info="Select Anthropic model",
278
302
  interactive=True,
279
- visible=(initial_loop == "ANTHROPIC")
303
+ visible=(initial_loop == "ANTHROPIC"),
280
304
  )
281
-
305
+
282
306
  omni_model_choice = gr.Dropdown(
283
- choices=omni_models + ["Custom model (OpenAI compatible API)", "Custom model (ollama)"],
307
+ choices=omni_models
308
+ + ["Custom model (OpenAI compatible API)", "Custom model (ollama)"],
284
309
  label="OMNI Model",
285
- value=omni_models[0] if omni_models else "Custom model (OpenAI compatible API)",
310
+ value=(
311
+ omni_models[0]
312
+ if omni_models
313
+ else "Custom model (OpenAI compatible API)"
314
+ ),
286
315
  info="Select OMNI model or choose a custom model option",
287
316
  interactive=True,
288
- visible=(initial_loop == "OMNI")
317
+ visible=(initial_loop == "OMNI"),
289
318
  )
290
-
319
+
291
320
  uitars_model_choice = gr.Dropdown(
292
321
  choices=provider_to_models.get("UITARS", ["No models available"]),
293
322
  label="UITARS Model",
294
- value=provider_to_models.get("UITARS", ["No models available"])[0] if provider_to_models.get("UITARS") else "No models available",
323
+ value=(
324
+ provider_to_models.get("UITARS", ["No models available"])[0]
325
+ if provider_to_models.get("UITARS")
326
+ else "No models available"
327
+ ),
295
328
  info="Select UITARS model",
296
329
  interactive=True,
297
- visible=(initial_loop == "UITARS")
330
+ visible=(initial_loop == "UITARS"),
298
331
  )
299
-
332
+
300
333
  model_choice = gr.Textbox(visible=False)
301
334
 
302
335
  # API key inputs
303
- with gr.Group(visible=not has_openai_key and (initial_loop == "OPENAI" or initial_loop == "OMNI")) as openai_key_group:
336
+ with gr.Group(
337
+ visible=not has_openai_key
338
+ and (initial_loop == "OPENAI" or initial_loop == "OMNI")
339
+ ) as openai_key_group:
304
340
  openai_api_key_input = gr.Textbox(
305
341
  label="OpenAI API Key",
306
342
  placeholder="Enter your OpenAI API key",
307
343
  value=os.environ.get("OPENAI_API_KEY", ""),
308
344
  interactive=True,
309
345
  type="password",
310
- info="Required for OpenAI models"
346
+ info="Required for OpenAI models",
311
347
  )
312
-
313
- with gr.Group(visible=not has_anthropic_key and (initial_loop == "ANTHROPIC" or initial_loop == "OMNI")) as anthropic_key_group:
348
+
349
+ with gr.Group(
350
+ visible=not has_anthropic_key
351
+ and (initial_loop == "ANTHROPIC" or initial_loop == "OMNI")
352
+ ) as anthropic_key_group:
314
353
  anthropic_api_key_input = gr.Textbox(
315
354
  label="Anthropic API Key",
316
355
  placeholder="Enter your Anthropic API key",
317
356
  value=os.environ.get("ANTHROPIC_API_KEY", ""),
318
357
  interactive=True,
319
358
  type="password",
320
- info="Required for Anthropic models"
359
+ info="Required for Anthropic models",
321
360
  )
322
-
361
+
323
362
  # API key handlers
324
363
  def set_openai_api_key(key):
325
364
  if key and key.strip():
326
365
  os.environ["OPENAI_API_KEY"] = key.strip()
327
- print(f"DEBUG - Set OpenAI API key environment variable")
366
+ print("DEBUG - Set OpenAI API key environment variable")
328
367
  return key
329
-
368
+
330
369
  def set_anthropic_api_key(key):
331
370
  if key and key.strip():
332
371
  os.environ["ANTHROPIC_API_KEY"] = key.strip()
333
- print(f"DEBUG - Set Anthropic API key environment variable")
372
+ print("DEBUG - Set Anthropic API key environment variable")
334
373
  return key
335
-
374
+
336
375
  openai_api_key_input.change(
337
376
  fn=set_openai_api_key,
338
377
  inputs=[openai_api_key_input],
339
378
  outputs=[openai_api_key_input],
340
- queue=False
379
+ queue=False,
341
380
  )
342
-
381
+
343
382
  anthropic_api_key_input.change(
344
383
  fn=set_anthropic_api_key,
345
384
  inputs=[anthropic_api_key_input],
346
385
  outputs=[anthropic_api_key_input],
347
- queue=False
386
+ queue=False,
348
387
  )
349
388
 
350
389
  # UI update function
351
- def update_ui(loop=None, openai_model=None, anthropic_model=None, omni_model=None, uitars_model=None):
390
+ def update_ui(
391
+ loop=None,
392
+ openai_model=None,
393
+ anthropic_model=None,
394
+ omni_model=None,
395
+ uitars_model=None,
396
+ ):
352
397
  loop = loop or agent_loop.value
353
-
398
+
354
399
  model_value = None
355
400
  if loop == "OPENAI" and openai_model:
356
401
  model_value = openai_model
@@ -360,21 +405,37 @@ if __name__ == "__main__":
360
405
  model_value = omni_model
361
406
  elif loop == "UITARS" and uitars_model:
362
407
  model_value = uitars_model
363
-
364
- openai_visible = (loop == "OPENAI")
365
- anthropic_visible = (loop == "ANTHROPIC")
366
- omni_visible = (loop == "OMNI")
367
- uitars_visible = (loop == "UITARS")
368
-
369
- show_openai_key = not has_openai_key and (loop == "OPENAI" or (loop == "OMNI" and model_value and "OpenAI" in model_value and "Custom" not in model_value))
370
- show_anthropic_key = not has_anthropic_key and (loop == "ANTHROPIC" or (loop == "OMNI" and model_value and "Claude" in model_value and "Custom" not in model_value))
371
-
408
+
409
+ openai_visible = loop == "OPENAI"
410
+ anthropic_visible = loop == "ANTHROPIC"
411
+ omni_visible = loop == "OMNI"
412
+ uitars_visible = loop == "UITARS"
413
+
414
+ show_openai_key = not has_openai_key and (
415
+ loop == "OPENAI"
416
+ or (
417
+ loop == "OMNI"
418
+ and model_value
419
+ and "OpenAI" in model_value
420
+ and "Custom" not in model_value
421
+ )
422
+ )
423
+ show_anthropic_key = not has_anthropic_key and (
424
+ loop == "ANTHROPIC"
425
+ or (
426
+ loop == "OMNI"
427
+ and model_value
428
+ and "Claude" in model_value
429
+ and "Custom" not in model_value
430
+ )
431
+ )
432
+
372
433
  is_custom_openai_api = model_value == "Custom model (OpenAI compatible API)"
373
434
  is_custom_ollama = model_value == "Custom model (ollama)"
374
435
  is_any_custom = is_custom_openai_api or is_custom_ollama
375
-
436
+
376
437
  model_choice_value = model_value if model_value else ""
377
-
438
+
378
439
  return [
379
440
  gr.update(visible=openai_visible),
380
441
  gr.update(visible=anthropic_visible),
@@ -385,15 +446,18 @@ if __name__ == "__main__":
385
446
  gr.update(visible=is_any_custom),
386
447
  gr.update(visible=is_custom_openai_api),
387
448
  gr.update(visible=is_custom_openai_api),
388
- gr.update(value=model_choice_value)
449
+ gr.update(value=model_choice_value),
389
450
  ]
390
-
451
+
391
452
  # Custom model inputs
392
453
  custom_model = gr.Textbox(
393
454
  label="Custom Model Name",
394
455
  placeholder="Enter custom model name (e.g., Qwen2.5-VL-7B-Instruct or llama3)",
395
456
  value=initial_custom_model,
396
- visible=(initial_model == "Custom model (OpenAI compatible API)" or initial_model == "Custom model (ollama)"),
457
+ visible=(
458
+ initial_model == "Custom model (OpenAI compatible API)"
459
+ or initial_model == "Custom model (ollama)"
460
+ ),
397
461
  interactive=True,
398
462
  )
399
463
 
@@ -413,36 +477,56 @@ if __name__ == "__main__":
413
477
  interactive=True,
414
478
  type="password",
415
479
  )
416
-
480
+
417
481
  # Provider visibility update function
418
482
  def update_provider_visibility(provider):
419
483
  """Update visibility of container name and API key based on selected provider."""
420
484
  is_localhost = provider == "localhost"
421
485
  return [
422
486
  gr.update(visible=not is_localhost), # container_name
423
- gr.update(visible=not is_localhost and not has_cua_key) # cua_cloud_api_key
487
+ gr.update(
488
+ visible=not is_localhost and not has_cua_key
489
+ ), # cua_cloud_api_key
424
490
  ]
425
-
491
+
426
492
  # Connect provider change event
427
493
  computer_provider.change(
428
494
  fn=update_provider_visibility,
429
495
  inputs=[computer_provider],
430
496
  outputs=[container_name, cua_cloud_api_key],
431
- queue=False
497
+ queue=False,
432
498
  )
433
-
499
+
434
500
  # Connect UI update events
435
- for dropdown in [agent_loop, omni_model_choice, uitars_model_choice, openai_model_choice, anthropic_model_choice]:
501
+ for dropdown in [
502
+ agent_loop,
503
+ omni_model_choice,
504
+ uitars_model_choice,
505
+ openai_model_choice,
506
+ anthropic_model_choice,
507
+ ]:
436
508
  dropdown.change(
437
509
  fn=update_ui,
438
- inputs=[agent_loop, openai_model_choice, anthropic_model_choice, omni_model_choice, uitars_model_choice],
510
+ inputs=[
511
+ agent_loop,
512
+ openai_model_choice,
513
+ anthropic_model_choice,
514
+ omni_model_choice,
515
+ uitars_model_choice,
516
+ ],
439
517
  outputs=[
440
- openai_model_choice, anthropic_model_choice, omni_model_choice, uitars_model_choice,
441
- openai_key_group, anthropic_key_group,
442
- custom_model, provider_base_url, provider_api_key,
443
- model_choice
518
+ openai_model_choice,
519
+ anthropic_model_choice,
520
+ omni_model_choice,
521
+ uitars_model_choice,
522
+ openai_key_group,
523
+ anthropic_key_group,
524
+ custom_model,
525
+ provider_base_url,
526
+ provider_api_key,
527
+ model_choice,
444
528
  ],
445
- queue=False
529
+ queue=False,
446
530
  )
447
531
 
448
532
  save_trajectory = gr.Checkbox(
@@ -461,7 +545,7 @@ if __name__ == "__main__":
461
545
  info="Number of recent images to keep in context",
462
546
  interactive=True,
463
547
  )
464
-
548
+
465
549
  max_budget = gr.Number(
466
550
  label="Max Budget ($)",
467
551
  value=lambda: None,
@@ -478,10 +562,8 @@ if __name__ == "__main__":
478
562
  "Ask me to perform tasks in a virtual environment.<br>Built with <a href='https://github.com/trycua/cua' target='_blank'>github.com/trycua/cua</a>."
479
563
  )
480
564
 
481
- chatbot_history = gr.Chatbot(type="messages")
482
- msg = gr.Textbox(
483
- placeholder="Ask me to perform tasks in a virtual environment"
484
- )
565
+ chatbot_history = gr.Chatbot()
566
+ msg = gr.Textbox(placeholder="Ask me to perform tasks in a virtual environment")
485
567
  clear = gr.Button("Clear")
486
568
  cancel_button = gr.Button("Cancel", variant="stop")
487
569
 
@@ -498,11 +580,23 @@ if __name__ == "__main__":
498
580
  global global_agent
499
581
  if global_agent:
500
582
  print("DEBUG - Cancelling agent task")
501
- history.append(gr.ChatMessage(role="assistant", content="Task cancelled by user", metadata={"title": "❌ Cancelled"}))
583
+ history.append(
584
+ gr.ChatMessage(
585
+ role="assistant",
586
+ content="Task cancelled by user",
587
+ metadata={"title": "❌ Cancelled"},
588
+ )
589
+ )
502
590
  else:
503
- history.append(gr.ChatMessage(role="assistant", content="No active agent task to cancel", metadata={"title": "ℹ️ Info"}))
591
+ history.append(
592
+ gr.ChatMessage(
593
+ role="assistant",
594
+ content="No active agent task to cancel",
595
+ metadata={"title": "ℹ️ Info"},
596
+ )
597
+ )
504
598
  return history
505
-
599
+
506
600
  # Process response function
507
601
  async def process_response(
508
602
  history,
@@ -542,10 +636,13 @@ if __name__ == "__main__":
542
636
  model_choice_value = uitars_model_value
543
637
  else:
544
638
  model_choice_value = "No models available"
545
-
639
+
546
640
  # Determine if this is a custom model selection
547
- is_custom_model_selected = model_choice_value in ["Custom model (OpenAI compatible API)", "Custom model (ollama)"]
548
-
641
+ is_custom_model_selected = model_choice_value in [
642
+ "Custom model (OpenAI compatible API)",
643
+ "Custom model (ollama)",
644
+ ]
645
+
549
646
  # Determine the model name string to analyze
550
647
  if is_custom_model_selected:
551
648
  model_string_to_analyze = custom_model_value
@@ -583,13 +680,19 @@ if __name__ == "__main__":
583
680
  model_string=model_string,
584
681
  save_trajectory=save_traj,
585
682
  only_n_most_recent_images=recent_imgs,
586
- custom_model_name=custom_model_value if is_custom_model_selected else None,
683
+ custom_model_name=(
684
+ custom_model_value if is_custom_model_selected else None
685
+ ),
587
686
  computer_os=computer_os,
588
687
  computer_provider=computer_provider,
589
688
  computer_name=container_name,
590
689
  computer_api_key=cua_cloud_api_key,
591
690
  verbosity=logging.DEBUG,
592
- max_trajectory_budget=max_budget_value if max_budget_value and max_budget_value > 0 else None,
691
+ max_trajectory_budget=(
692
+ max_budget_value
693
+ if max_budget_value and max_budget_value > 0
694
+ else None
695
+ ),
593
696
  )
594
697
 
595
698
  if global_agent is None:
@@ -605,7 +708,7 @@ if __name__ == "__main__":
605
708
  # Add user message to global history
606
709
  global global_messages
607
710
  global_messages.append({"role": "user", "content": last_user_message})
608
-
711
+
609
712
  # Stream responses from the agent
610
713
  async for result in global_agent.run(global_messages):
611
714
  global_messages += result.get("output", [])
@@ -613,18 +716,20 @@ if __name__ == "__main__":
613
716
  # from pprint import pprint
614
717
  # pprint(result)
615
718
  # print(f"DEBUG - Agent response ------- END")
616
-
719
+
617
720
  # Process the result output
618
721
  for item in result.get("output", []):
619
722
  if item.get("type") == "message":
620
723
  content = item.get("content", [])
621
724
  for content_part in content:
622
725
  if content_part.get("text"):
623
- history.append(gr.ChatMessage(
624
- role=item.get("role", "assistant"),
625
- content=content_part.get("text", ""),
626
- metadata=content_part.get("metadata", {})
627
- ))
726
+ history.append(
727
+ gr.ChatMessage(
728
+ role=item.get("role", "assistant"),
729
+ content=content_part.get("text", ""),
730
+ metadata=content_part.get("metadata", {}),
731
+ )
732
+ )
628
733
  elif item.get("type") == "computer_call":
629
734
  action = item.get("action", {})
630
735
  action_type = action.get("type", "")
@@ -632,43 +737,52 @@ if __name__ == "__main__":
632
737
  action_title = f"🛠️ Performing {action_type}"
633
738
  if action.get("x") and action.get("y"):
634
739
  action_title += f" at ({action['x']}, {action['y']})"
635
- history.append(gr.ChatMessage(
636
- role="assistant",
637
- content=f"```json\n{json.dumps(action)}\n```",
638
- metadata={"title": action_title}
639
- ))
740
+ history.append(
741
+ gr.ChatMessage(
742
+ role="assistant",
743
+ content=f"```json\n{json.dumps(action)}\n```",
744
+ metadata={"title": action_title},
745
+ )
746
+ )
640
747
  elif item.get("type") == "function_call":
641
748
  function_name = item.get("name", "")
642
749
  arguments = item.get("arguments", "{}")
643
- history.append(gr.ChatMessage(
644
- role="assistant",
645
- content=f"🔧 Calling function: {function_name}\n```json\n{arguments}\n```",
646
- metadata={"title": f"Function Call: {function_name}"}
647
- ))
750
+ history.append(
751
+ gr.ChatMessage(
752
+ role="assistant",
753
+ content=f"🔧 Calling function: {function_name}\n```json\n{arguments}\n```",
754
+ metadata={"title": f"Function Call: {function_name}"},
755
+ )
756
+ )
648
757
  elif item.get("type") == "function_call_output":
649
758
  output = item.get("output", "")
650
- history.append(gr.ChatMessage(
651
- role="assistant",
652
- content=f"📤 Function output:\n```\n{output}\n```",
653
- metadata={"title": "Function Output"}
654
- ))
759
+ history.append(
760
+ gr.ChatMessage(
761
+ role="assistant",
762
+ content=f"📤 Function output:\n```\n{output}\n```",
763
+ metadata={"title": "Function Output"},
764
+ )
765
+ )
655
766
  elif item.get("type") == "computer_call_output":
656
767
  output = item.get("output", {}).get("image_url", "")
657
768
  image_markdown = f"![Computer output]({output})"
658
- history.append(gr.ChatMessage(
659
- role="assistant",
660
- content=image_markdown,
661
- metadata={"title": "🖥️ Computer Output"}
662
- ))
663
-
769
+ history.append(
770
+ gr.ChatMessage(
771
+ role="assistant",
772
+ content=image_markdown,
773
+ metadata={"title": "🖥️ Computer Output"},
774
+ )
775
+ )
776
+
664
777
  yield history
665
-
778
+
666
779
  except Exception as e:
667
780
  import traceback
781
+
668
782
  traceback.print_exc()
669
783
  history.append(gr.ChatMessage(role="assistant", content=f"Error: {str(e)}"))
670
784
  yield history
671
-
785
+
672
786
  # Connect the submit button
673
787
  submit_event = msg.submit(
674
788
  fn=chat_submit,
@@ -706,44 +820,77 @@ if __name__ == "__main__":
706
820
  global global_messages
707
821
  global_messages.clear()
708
822
  return None
709
-
823
+
710
824
  clear.click(clear_chat, None, chatbot_history, queue=False)
711
-
825
+
712
826
  # Connect cancel button
713
827
  cancel_button.click(
714
- cancel_agent_task,
715
- [chatbot_history],
716
- [chatbot_history],
717
- queue=False
828
+ cancel_agent_task, [chatbot_history], [chatbot_history], queue=False
718
829
  )
719
830
 
720
831
  # Code display update function
721
- def update_code_display(agent_loop, model_choice_val, custom_model_val, chat_history, recent_images_val, save_trajectory_val, computer_os, computer_provider, container_name, cua_cloud_api_key, max_budget_val):
832
+ def update_code_display(
833
+ agent_loop,
834
+ model_choice_val,
835
+ custom_model_val,
836
+ chat_history,
837
+ recent_images_val,
838
+ save_trajectory_val,
839
+ computer_os,
840
+ computer_provider,
841
+ container_name,
842
+ cua_cloud_api_key,
843
+ max_budget_val,
844
+ ):
722
845
  messages = []
723
846
  if chat_history:
724
847
  for msg in chat_history:
725
848
  if isinstance(msg, dict) and msg.get("role") == "user":
726
849
  messages.append(msg.get("content", ""))
727
-
850
+
728
851
  return generate_python_code(
729
- agent_loop,
730
- model_choice_val or custom_model_val or "gpt-4o",
731
- messages,
852
+ agent_loop,
853
+ model_choice_val or custom_model_val or "gpt-4o",
854
+ messages,
732
855
  recent_images_val,
733
856
  save_trajectory_val,
734
857
  computer_os,
735
858
  computer_provider,
736
859
  container_name,
737
860
  cua_cloud_api_key,
738
- max_budget_val
861
+ max_budget_val,
739
862
  )
740
-
863
+
741
864
  # Update code display when configuration changes
742
- for component in [agent_loop, model_choice, custom_model, chatbot_history, recent_images, save_trajectory, computer_os, computer_provider, container_name, cua_cloud_api_key, max_budget]:
865
+ for component in [
866
+ agent_loop,
867
+ model_choice,
868
+ custom_model,
869
+ chatbot_history,
870
+ recent_images,
871
+ save_trajectory,
872
+ computer_os,
873
+ computer_provider,
874
+ container_name,
875
+ cua_cloud_api_key,
876
+ max_budget,
877
+ ]:
743
878
  component.change(
744
879
  update_code_display,
745
- inputs=[agent_loop, model_choice, custom_model, chatbot_history, recent_images, save_trajectory, computer_os, computer_provider, container_name, cua_cloud_api_key, max_budget],
746
- outputs=[code_display]
880
+ inputs=[
881
+ agent_loop,
882
+ model_choice,
883
+ custom_model,
884
+ chatbot_history,
885
+ recent_images,
886
+ save_trajectory,
887
+ computer_os,
888
+ computer_provider,
889
+ container_name,
890
+ cua_cloud_api_key,
891
+ max_budget,
892
+ ],
893
+ outputs=[code_display],
747
894
  )
748
895
 
749
896
  return demo