cua-agent 0.4.33 (py3-none-any.whl) → 0.4.35 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (61)
  1. agent/__init__.py +4 -10
  2. agent/__main__.py +2 -1
  3. agent/adapters/huggingfacelocal_adapter.py +54 -61
  4. agent/adapters/human_adapter.py +116 -114
  5. agent/adapters/mlxvlm_adapter.py +110 -99
  6. agent/adapters/models/__init__.py +14 -6
  7. agent/adapters/models/generic.py +7 -4
  8. agent/adapters/models/internvl.py +66 -30
  9. agent/adapters/models/opencua.py +23 -8
  10. agent/adapters/models/qwen2_5_vl.py +7 -4
  11. agent/agent.py +184 -158
  12. agent/callbacks/__init__.py +4 -4
  13. agent/callbacks/base.py +45 -31
  14. agent/callbacks/budget_manager.py +22 -10
  15. agent/callbacks/image_retention.py +18 -13
  16. agent/callbacks/logging.py +55 -42
  17. agent/callbacks/operator_validator.py +3 -1
  18. agent/callbacks/pii_anonymization.py +19 -16
  19. agent/callbacks/telemetry.py +67 -61
  20. agent/callbacks/trajectory_saver.py +90 -70
  21. agent/cli.py +115 -110
  22. agent/computers/__init__.py +13 -8
  23. agent/computers/base.py +26 -17
  24. agent/computers/cua.py +27 -23
  25. agent/computers/custom.py +72 -69
  26. agent/decorators.py +23 -14
  27. agent/human_tool/__init__.py +2 -7
  28. agent/human_tool/__main__.py +6 -2
  29. agent/human_tool/server.py +48 -37
  30. agent/human_tool/ui.py +235 -185
  31. agent/integrations/hud/__init__.py +15 -21
  32. agent/integrations/hud/agent.py +101 -83
  33. agent/integrations/hud/proxy.py +90 -57
  34. agent/loops/__init__.py +25 -21
  35. agent/loops/anthropic.py +537 -483
  36. agent/loops/base.py +13 -14
  37. agent/loops/composed_grounded.py +135 -149
  38. agent/loops/gemini.py +31 -12
  39. agent/loops/glm45v.py +135 -133
  40. agent/loops/gta1.py +47 -50
  41. agent/loops/holo.py +4 -2
  42. agent/loops/internvl.py +6 -11
  43. agent/loops/moondream3.py +49 -20
  44. agent/loops/omniparser.py +212 -209
  45. agent/loops/openai.py +49 -50
  46. agent/loops/opencua.py +29 -41
  47. agent/loops/qwen.py +475 -0
  48. agent/loops/uitars.py +237 -202
  49. agent/proxy/examples.py +54 -50
  50. agent/proxy/handlers.py +27 -34
  51. agent/responses.py +330 -330
  52. agent/types.py +11 -5
  53. agent/ui/__init__.py +1 -1
  54. agent/ui/__main__.py +1 -1
  55. agent/ui/gradio/app.py +23 -18
  56. agent/ui/gradio/ui_components.py +310 -161
  57. {cua_agent-0.4.33.dist-info → cua_agent-0.4.35.dist-info}/METADATA +22 -10
  58. cua_agent-0.4.35.dist-info/RECORD +64 -0
  59. cua_agent-0.4.33.dist-info/RECORD +0 -63
  60. {cua_agent-0.4.33.dist-info → cua_agent-0.4.35.dist-info}/WHEEL +0 -0
  61. {cua_agent-0.4.33.dist-info → cua_agent-0.4.35.dist-info}/entry_points.txt +0 -0
@@ -2,19 +2,25 @@
2
2
  UI Components for the Gradio interface
3
3
  """
4
4
 
5
- import os
6
5
  import asyncio
7
- import logging
8
6
  import json
7
+ import logging
8
+ import os
9
9
  import platform
10
10
  from pathlib import Path
11
- from typing import Dict, List, Optional, Any, cast
11
+ from typing import Any, Dict, List, Optional, cast
12
+
12
13
  import gradio as gr
13
14
  from gradio.components.chatbot import MetadataDict
14
15
 
15
16
  from .app import (
16
- load_settings, save_settings, create_agent, get_model_string,
17
- get_ollama_models, global_agent, global_computer
17
+ create_agent,
18
+ get_model_string,
19
+ get_ollama_models,
20
+ global_agent,
21
+ global_computer,
22
+ load_settings,
23
+ save_settings,
18
24
  )
19
25
 
20
26
  # Global messages array to maintain conversation history
@@ -23,15 +29,15 @@ global_messages = []
23
29
 
24
30
  def create_gradio_ui() -> gr.Blocks:
25
31
  """Create a Gradio UI for the Computer-Use Agent."""
26
-
32
+
27
33
  # Load settings
28
34
  saved_settings = load_settings()
29
-
35
+
30
36
  # Check for API keys
31
37
  openai_api_key = os.environ.get("OPENAI_API_KEY", "")
32
38
  anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY", "")
33
39
  cua_api_key = os.environ.get("CUA_API_KEY", "")
34
-
40
+
35
41
  # Model choices
36
42
  openai_models = ["OpenAI: Computer-Use Preview"]
37
43
  anthropic_models = [
@@ -43,10 +49,10 @@ def create_gradio_ui() -> gr.Blocks:
43
49
  omni_models = [
44
50
  "OMNI: OpenAI GPT-4o",
45
51
  "OMNI: OpenAI GPT-4o mini",
46
- "OMNI: Claude 3.7 Sonnet (20250219)",
47
- "OMNI: Claude 3.5 Sonnet (20241022)"
52
+ "OMNI: Claude 3.7 Sonnet (20250219)",
53
+ "OMNI: Claude 3.5 Sonnet (20241022)",
48
54
  ]
49
-
55
+
50
56
  # Check if API keys are available
51
57
  has_openai_key = bool(openai_api_key)
52
58
  has_anthropic_key = bool(anthropic_api_key)
@@ -59,15 +65,20 @@ def create_gradio_ui() -> gr.Blocks:
59
65
 
60
66
  # Detect platform
61
67
  is_mac = platform.system().lower() == "darwin"
62
-
68
+
63
69
  # Format model choices
64
70
  provider_to_models = {
65
71
  "OPENAI": openai_models,
66
72
  "ANTHROPIC": anthropic_models,
67
73
  "OMNI": omni_models + ["Custom model (OpenAI compatible API)", "Custom model (ollama)"],
68
- "UITARS": ([
69
- "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
70
- ] if is_mac else []) + ["Custom model (OpenAI compatible API)"],
74
+ "UITARS": (
75
+ [
76
+ "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
77
+ ]
78
+ if is_mac
79
+ else []
80
+ )
81
+ + ["Custom model (OpenAI compatible API)"],
71
82
  }
72
83
 
73
84
  # Apply saved settings
@@ -82,7 +93,9 @@ def create_gradio_ui() -> gr.Blocks:
82
93
  elif initial_loop == "ANTHROPIC":
83
94
  initial_model = anthropic_models[0] if anthropic_models else "No models available"
84
95
  else: # OMNI
85
- initial_model = omni_models[0] if omni_models else "Custom model (OpenAI compatible API)"
96
+ initial_model = (
97
+ omni_models[0] if omni_models else "Custom model (OpenAI compatible API)"
98
+ )
86
99
 
87
100
  initial_custom_model = saved_settings.get("custom_model", "Qwen2.5-VL-7B-Instruct")
88
101
  initial_provider_base_url = saved_settings.get("provider_base_url", "http://localhost:1234/v1")
@@ -96,16 +109,27 @@ def create_gradio_ui() -> gr.Blocks:
96
109
  "Open Safari, search for 'macOS automation tools', and save the first three results as bookmarks",
97
110
  "Configure SSH keys and set up a connection to a remote server",
98
111
  ]
99
-
100
- def generate_python_code(agent_loop_choice, model_name, tasks, recent_images=3, save_trajectory=True, computer_os="linux", computer_provider="cloud", container_name="", cua_cloud_api_key="", max_budget=None):
112
+
113
+ def generate_python_code(
114
+ agent_loop_choice,
115
+ model_name,
116
+ tasks,
117
+ recent_images=3,
118
+ save_trajectory=True,
119
+ computer_os="linux",
120
+ computer_provider="cloud",
121
+ container_name="",
122
+ cua_cloud_api_key="",
123
+ max_budget=None,
124
+ ):
101
125
  """Generate Python code for the current configuration and tasks."""
102
126
  tasks_str = ""
103
127
  for task in tasks:
104
128
  if task and task.strip():
105
129
  tasks_str += f' "{task}",\n'
106
-
130
+
107
131
  model_string = get_model_string(model_name, agent_loop_choice)
108
-
132
+
109
133
  computer_args = []
110
134
  if computer_os != "macos":
111
135
  computer_args.append(f'os_type="{computer_os}"')
@@ -115,14 +139,14 @@ def create_gradio_ui() -> gr.Blocks:
115
139
  computer_args.append(f'name="{container_name}"')
116
140
  if cua_cloud_api_key:
117
141
  computer_args.append(f'api_key="{cua_cloud_api_key}"')
118
-
142
+
119
143
  computer_args_str = ", ".join(computer_args)
120
144
  if computer_args_str:
121
145
  computer_args_str = f"({computer_args_str})"
122
146
  else:
123
147
  computer_args_str = "()"
124
-
125
- code = f'''import asyncio
148
+
149
+ code = f"""import asyncio
126
150
  from computer import Computer
127
151
  from agent import ComputerAgent
128
152
 
@@ -131,22 +155,22 @@ async def main():
131
155
  agent = ComputerAgent(
132
156
  model="{model_string}",
133
157
  tools=[computer],
134
- only_n_most_recent_images={recent_images},'''
135
-
158
+ only_n_most_recent_images={recent_images},"""
159
+
136
160
  if save_trajectory:
137
- code += '''
138
- trajectory_dir="trajectories",'''
139
-
161
+ code += """
162
+ trajectory_dir="trajectories","""
163
+
140
164
  if max_budget:
141
- code += f'''
142
- max_trajectory_budget={{"max_budget": {max_budget}, "raise_error": True}},'''
143
-
144
- code += '''
165
+ code += f"""
166
+ max_trajectory_budget={{"max_budget": {max_budget}, "raise_error": True}},"""
167
+
168
+ code += """
145
169
  )
146
- '''
147
-
170
+ """
171
+
148
172
  if tasks_str:
149
- code += f'''
173
+ code += f"""
150
174
  # Prompts for the computer-use agent
151
175
  tasks = [
152
176
  {tasks_str.rstrip()}
@@ -158,23 +182,23 @@ async def main():
158
182
  async for result in agent.run(messages):
159
183
  for item in result["output"]:
160
184
  if item["type"] == "message":
161
- print(item["content"][0]["text"])'''
185
+ print(item["content"][0]["text"])"""
162
186
  else:
163
- code += f'''
187
+ code += """
164
188
  # Execute a single task
165
189
  task = "Search for information about CUA on GitHub"
166
- print(f"Executing task: {{task}}")
167
- messages = [{{"role": "user", "content": task}}]
190
+ print(f"Executing task: {task}")
191
+ messages = [{"role": "user", "content": task}]
168
192
  async for result in agent.run(messages):
169
193
  for item in result["output"]:
170
194
  if item["type"] == "message":
171
- print(item["content"][0]["text"])'''
195
+ print(item["content"][0]["text"])"""
172
196
 
173
- code += '''
197
+ code += """
174
198
 
175
199
  if __name__ == "__main__":
176
- asyncio.run(main())'''
177
-
200
+ asyncio.run(main())"""
201
+
178
202
  return code
179
203
 
180
204
  # Create the Gradio interface
@@ -199,11 +223,11 @@ if __name__ == "__main__":
199
223
  value=generate_python_code(initial_loop, "gpt-4o", []),
200
224
  interactive=False,
201
225
  )
202
-
226
+
203
227
  with gr.Accordion("Computer Configuration", open=True):
204
228
  is_windows = platform.system().lower() == "windows"
205
229
  is_mac = platform.system().lower() == "darwin"
206
-
230
+
207
231
  providers = ["cloud", "localhost", "docker"]
208
232
  if is_mac:
209
233
  providers += ["lume"]
@@ -227,30 +251,30 @@ if __name__ == "__main__":
227
251
  value=computer_choices[0],
228
252
  info="Select the operating system for the computer",
229
253
  )
230
-
254
+
231
255
  computer_provider = gr.Radio(
232
256
  choices=providers,
233
257
  label="Provider",
234
258
  value="lume" if is_mac else "cloud",
235
259
  info="Select the computer provider",
236
260
  )
237
-
261
+
238
262
  container_name = gr.Textbox(
239
263
  label="Container Name",
240
264
  placeholder="Enter container name (optional)",
241
265
  value=os.environ.get("CUA_CONTAINER_NAME", ""),
242
266
  info="Optional name for the container",
243
267
  )
244
-
268
+
245
269
  cua_cloud_api_key = gr.Textbox(
246
270
  label="CUA Cloud API Key",
247
271
  placeholder="Enter your CUA Cloud API key",
248
272
  value=os.environ.get("CUA_API_KEY", ""),
249
273
  type="password",
250
274
  info="Required for cloud provider",
251
- visible=(not has_cua_key)
275
+ visible=(not has_cua_key),
252
276
  )
253
-
277
+
254
278
  with gr.Accordion("Agent Configuration", open=True):
255
279
  agent_loop = gr.Dropdown(
256
280
  choices=["OPENAI", "ANTHROPIC", "OMNI", "UITARS"],
@@ -267,90 +291,113 @@ if __name__ == "__main__":
267
291
  value=openai_models[0] if openai_models else "No models available",
268
292
  info="Select OpenAI model",
269
293
  interactive=True,
270
- visible=(initial_loop == "OPENAI")
294
+ visible=(initial_loop == "OPENAI"),
271
295
  )
272
-
296
+
273
297
  anthropic_model_choice = gr.Dropdown(
274
298
  choices=anthropic_models,
275
299
  label="Anthropic Model",
276
- value=anthropic_models[0] if anthropic_models else "No models available",
300
+ value=(
301
+ anthropic_models[0] if anthropic_models else "No models available"
302
+ ),
277
303
  info="Select Anthropic model",
278
304
  interactive=True,
279
- visible=(initial_loop == "ANTHROPIC")
305
+ visible=(initial_loop == "ANTHROPIC"),
280
306
  )
281
-
307
+
282
308
  omni_model_choice = gr.Dropdown(
283
- choices=omni_models + ["Custom model (OpenAI compatible API)", "Custom model (ollama)"],
309
+ choices=omni_models
310
+ + ["Custom model (OpenAI compatible API)", "Custom model (ollama)"],
284
311
  label="OMNI Model",
285
- value=omni_models[0] if omni_models else "Custom model (OpenAI compatible API)",
312
+ value=(
313
+ omni_models[0]
314
+ if omni_models
315
+ else "Custom model (OpenAI compatible API)"
316
+ ),
286
317
  info="Select OMNI model or choose a custom model option",
287
318
  interactive=True,
288
- visible=(initial_loop == "OMNI")
319
+ visible=(initial_loop == "OMNI"),
289
320
  )
290
-
321
+
291
322
  uitars_model_choice = gr.Dropdown(
292
323
  choices=provider_to_models.get("UITARS", ["No models available"]),
293
324
  label="UITARS Model",
294
- value=provider_to_models.get("UITARS", ["No models available"])[0] if provider_to_models.get("UITARS") else "No models available",
325
+ value=(
326
+ provider_to_models.get("UITARS", ["No models available"])[0]
327
+ if provider_to_models.get("UITARS")
328
+ else "No models available"
329
+ ),
295
330
  info="Select UITARS model",
296
331
  interactive=True,
297
- visible=(initial_loop == "UITARS")
332
+ visible=(initial_loop == "UITARS"),
298
333
  )
299
-
334
+
300
335
  model_choice = gr.Textbox(visible=False)
301
336
 
302
337
  # API key inputs
303
- with gr.Group(visible=not has_openai_key and (initial_loop == "OPENAI" or initial_loop == "OMNI")) as openai_key_group:
338
+ with gr.Group(
339
+ visible=not has_openai_key
340
+ and (initial_loop == "OPENAI" or initial_loop == "OMNI")
341
+ ) as openai_key_group:
304
342
  openai_api_key_input = gr.Textbox(
305
343
  label="OpenAI API Key",
306
344
  placeholder="Enter your OpenAI API key",
307
345
  value=os.environ.get("OPENAI_API_KEY", ""),
308
346
  interactive=True,
309
347
  type="password",
310
- info="Required for OpenAI models"
348
+ info="Required for OpenAI models",
311
349
  )
312
-
313
- with gr.Group(visible=not has_anthropic_key and (initial_loop == "ANTHROPIC" or initial_loop == "OMNI")) as anthropic_key_group:
350
+
351
+ with gr.Group(
352
+ visible=not has_anthropic_key
353
+ and (initial_loop == "ANTHROPIC" or initial_loop == "OMNI")
354
+ ) as anthropic_key_group:
314
355
  anthropic_api_key_input = gr.Textbox(
315
356
  label="Anthropic API Key",
316
357
  placeholder="Enter your Anthropic API key",
317
358
  value=os.environ.get("ANTHROPIC_API_KEY", ""),
318
359
  interactive=True,
319
360
  type="password",
320
- info="Required for Anthropic models"
361
+ info="Required for Anthropic models",
321
362
  )
322
-
363
+
323
364
  # API key handlers
324
365
  def set_openai_api_key(key):
325
366
  if key and key.strip():
326
367
  os.environ["OPENAI_API_KEY"] = key.strip()
327
- print(f"DEBUG - Set OpenAI API key environment variable")
368
+ print("DEBUG - Set OpenAI API key environment variable")
328
369
  return key
329
-
370
+
330
371
  def set_anthropic_api_key(key):
331
372
  if key and key.strip():
332
373
  os.environ["ANTHROPIC_API_KEY"] = key.strip()
333
- print(f"DEBUG - Set Anthropic API key environment variable")
374
+ print("DEBUG - Set Anthropic API key environment variable")
334
375
  return key
335
-
376
+
336
377
  openai_api_key_input.change(
337
378
  fn=set_openai_api_key,
338
379
  inputs=[openai_api_key_input],
339
380
  outputs=[openai_api_key_input],
340
- queue=False
381
+ queue=False,
341
382
  )
342
-
383
+
343
384
  anthropic_api_key_input.change(
344
385
  fn=set_anthropic_api_key,
345
386
  inputs=[anthropic_api_key_input],
346
387
  outputs=[anthropic_api_key_input],
347
- queue=False
388
+ queue=False,
348
389
  )
349
390
 
350
391
  # UI update function
351
- def update_ui(loop=None, openai_model=None, anthropic_model=None, omni_model=None, uitars_model=None):
392
+ def update_ui(
393
+ loop=None,
394
+ openai_model=None,
395
+ anthropic_model=None,
396
+ omni_model=None,
397
+ uitars_model=None,
398
+ ):
352
399
  loop = loop or agent_loop.value
353
-
400
+
354
401
  model_value = None
355
402
  if loop == "OPENAI" and openai_model:
356
403
  model_value = openai_model
@@ -360,21 +407,37 @@ if __name__ == "__main__":
360
407
  model_value = omni_model
361
408
  elif loop == "UITARS" and uitars_model:
362
409
  model_value = uitars_model
363
-
364
- openai_visible = (loop == "OPENAI")
365
- anthropic_visible = (loop == "ANTHROPIC")
366
- omni_visible = (loop == "OMNI")
367
- uitars_visible = (loop == "UITARS")
368
-
369
- show_openai_key = not has_openai_key and (loop == "OPENAI" or (loop == "OMNI" and model_value and "OpenAI" in model_value and "Custom" not in model_value))
370
- show_anthropic_key = not has_anthropic_key and (loop == "ANTHROPIC" or (loop == "OMNI" and model_value and "Claude" in model_value and "Custom" not in model_value))
371
-
410
+
411
+ openai_visible = loop == "OPENAI"
412
+ anthropic_visible = loop == "ANTHROPIC"
413
+ omni_visible = loop == "OMNI"
414
+ uitars_visible = loop == "UITARS"
415
+
416
+ show_openai_key = not has_openai_key and (
417
+ loop == "OPENAI"
418
+ or (
419
+ loop == "OMNI"
420
+ and model_value
421
+ and "OpenAI" in model_value
422
+ and "Custom" not in model_value
423
+ )
424
+ )
425
+ show_anthropic_key = not has_anthropic_key and (
426
+ loop == "ANTHROPIC"
427
+ or (
428
+ loop == "OMNI"
429
+ and model_value
430
+ and "Claude" in model_value
431
+ and "Custom" not in model_value
432
+ )
433
+ )
434
+
372
435
  is_custom_openai_api = model_value == "Custom model (OpenAI compatible API)"
373
436
  is_custom_ollama = model_value == "Custom model (ollama)"
374
437
  is_any_custom = is_custom_openai_api or is_custom_ollama
375
-
438
+
376
439
  model_choice_value = model_value if model_value else ""
377
-
440
+
378
441
  return [
379
442
  gr.update(visible=openai_visible),
380
443
  gr.update(visible=anthropic_visible),
@@ -385,15 +448,18 @@ if __name__ == "__main__":
385
448
  gr.update(visible=is_any_custom),
386
449
  gr.update(visible=is_custom_openai_api),
387
450
  gr.update(visible=is_custom_openai_api),
388
- gr.update(value=model_choice_value)
451
+ gr.update(value=model_choice_value),
389
452
  ]
390
-
453
+
391
454
  # Custom model inputs
392
455
  custom_model = gr.Textbox(
393
456
  label="Custom Model Name",
394
457
  placeholder="Enter custom model name (e.g., Qwen2.5-VL-7B-Instruct or llama3)",
395
458
  value=initial_custom_model,
396
- visible=(initial_model == "Custom model (OpenAI compatible API)" or initial_model == "Custom model (ollama)"),
459
+ visible=(
460
+ initial_model == "Custom model (OpenAI compatible API)"
461
+ or initial_model == "Custom model (ollama)"
462
+ ),
397
463
  interactive=True,
398
464
  )
399
465
 
@@ -413,36 +479,56 @@ if __name__ == "__main__":
413
479
  interactive=True,
414
480
  type="password",
415
481
  )
416
-
482
+
417
483
  # Provider visibility update function
418
484
  def update_provider_visibility(provider):
419
485
  """Update visibility of container name and API key based on selected provider."""
420
486
  is_localhost = provider == "localhost"
421
487
  return [
422
488
  gr.update(visible=not is_localhost), # container_name
423
- gr.update(visible=not is_localhost and not has_cua_key) # cua_cloud_api_key
489
+ gr.update(
490
+ visible=not is_localhost and not has_cua_key
491
+ ), # cua_cloud_api_key
424
492
  ]
425
-
493
+
426
494
  # Connect provider change event
427
495
  computer_provider.change(
428
496
  fn=update_provider_visibility,
429
497
  inputs=[computer_provider],
430
498
  outputs=[container_name, cua_cloud_api_key],
431
- queue=False
499
+ queue=False,
432
500
  )
433
-
501
+
434
502
  # Connect UI update events
435
- for dropdown in [agent_loop, omni_model_choice, uitars_model_choice, openai_model_choice, anthropic_model_choice]:
503
+ for dropdown in [
504
+ agent_loop,
505
+ omni_model_choice,
506
+ uitars_model_choice,
507
+ openai_model_choice,
508
+ anthropic_model_choice,
509
+ ]:
436
510
  dropdown.change(
437
511
  fn=update_ui,
438
- inputs=[agent_loop, openai_model_choice, anthropic_model_choice, omni_model_choice, uitars_model_choice],
512
+ inputs=[
513
+ agent_loop,
514
+ openai_model_choice,
515
+ anthropic_model_choice,
516
+ omni_model_choice,
517
+ uitars_model_choice,
518
+ ],
439
519
  outputs=[
440
- openai_model_choice, anthropic_model_choice, omni_model_choice, uitars_model_choice,
441
- openai_key_group, anthropic_key_group,
442
- custom_model, provider_base_url, provider_api_key,
443
- model_choice
520
+ openai_model_choice,
521
+ anthropic_model_choice,
522
+ omni_model_choice,
523
+ uitars_model_choice,
524
+ openai_key_group,
525
+ anthropic_key_group,
526
+ custom_model,
527
+ provider_base_url,
528
+ provider_api_key,
529
+ model_choice,
444
530
  ],
445
- queue=False
531
+ queue=False,
446
532
  )
447
533
 
448
534
  save_trajectory = gr.Checkbox(
@@ -461,7 +547,7 @@ if __name__ == "__main__":
461
547
  info="Number of recent images to keep in context",
462
548
  interactive=True,
463
549
  )
464
-
550
+
465
551
  max_budget = gr.Number(
466
552
  label="Max Budget ($)",
467
553
  value=lambda: None,
@@ -479,9 +565,7 @@ if __name__ == "__main__":
479
565
  )
480
566
 
481
567
  chatbot_history = gr.Chatbot(type="messages")
482
- msg = gr.Textbox(
483
- placeholder="Ask me to perform tasks in a virtual environment"
484
- )
568
+ msg = gr.Textbox(placeholder="Ask me to perform tasks in a virtual environment")
485
569
  clear = gr.Button("Clear")
486
570
  cancel_button = gr.Button("Cancel", variant="stop")
487
571
 
@@ -498,11 +582,23 @@ if __name__ == "__main__":
498
582
  global global_agent
499
583
  if global_agent:
500
584
  print("DEBUG - Cancelling agent task")
501
- history.append(gr.ChatMessage(role="assistant", content="Task cancelled by user", metadata={"title": "❌ Cancelled"}))
585
+ history.append(
586
+ gr.ChatMessage(
587
+ role="assistant",
588
+ content="Task cancelled by user",
589
+ metadata={"title": "❌ Cancelled"},
590
+ )
591
+ )
502
592
  else:
503
- history.append(gr.ChatMessage(role="assistant", content="No active agent task to cancel", metadata={"title": "ℹ️ Info"}))
593
+ history.append(
594
+ gr.ChatMessage(
595
+ role="assistant",
596
+ content="No active agent task to cancel",
597
+ metadata={"title": "ℹ️ Info"},
598
+ )
599
+ )
504
600
  return history
505
-
601
+
506
602
  # Process response function
507
603
  async def process_response(
508
604
  history,
@@ -542,10 +638,13 @@ if __name__ == "__main__":
542
638
  model_choice_value = uitars_model_value
543
639
  else:
544
640
  model_choice_value = "No models available"
545
-
641
+
546
642
  # Determine if this is a custom model selection
547
- is_custom_model_selected = model_choice_value in ["Custom model (OpenAI compatible API)", "Custom model (ollama)"]
548
-
643
+ is_custom_model_selected = model_choice_value in [
644
+ "Custom model (OpenAI compatible API)",
645
+ "Custom model (ollama)",
646
+ ]
647
+
549
648
  # Determine the model name string to analyze
550
649
  if is_custom_model_selected:
551
650
  model_string_to_analyze = custom_model_value
@@ -583,13 +682,19 @@ if __name__ == "__main__":
583
682
  model_string=model_string,
584
683
  save_trajectory=save_traj,
585
684
  only_n_most_recent_images=recent_imgs,
586
- custom_model_name=custom_model_value if is_custom_model_selected else None,
685
+ custom_model_name=(
686
+ custom_model_value if is_custom_model_selected else None
687
+ ),
587
688
  computer_os=computer_os,
588
689
  computer_provider=computer_provider,
589
690
  computer_name=container_name,
590
691
  computer_api_key=cua_cloud_api_key,
591
692
  verbosity=logging.DEBUG,
592
- max_trajectory_budget=max_budget_value if max_budget_value and max_budget_value > 0 else None,
693
+ max_trajectory_budget=(
694
+ max_budget_value
695
+ if max_budget_value and max_budget_value > 0
696
+ else None
697
+ ),
593
698
  )
594
699
 
595
700
  if global_agent is None:
@@ -605,7 +710,7 @@ if __name__ == "__main__":
605
710
  # Add user message to global history
606
711
  global global_messages
607
712
  global_messages.append({"role": "user", "content": last_user_message})
608
-
713
+
609
714
  # Stream responses from the agent
610
715
  async for result in global_agent.run(global_messages):
611
716
  global_messages += result.get("output", [])
@@ -613,18 +718,20 @@ if __name__ == "__main__":
613
718
  # from pprint import pprint
614
719
  # pprint(result)
615
720
  # print(f"DEBUG - Agent response ------- END")
616
-
721
+
617
722
  # Process the result output
618
723
  for item in result.get("output", []):
619
724
  if item.get("type") == "message":
620
725
  content = item.get("content", [])
621
726
  for content_part in content:
622
727
  if content_part.get("text"):
623
- history.append(gr.ChatMessage(
624
- role=item.get("role", "assistant"),
625
- content=content_part.get("text", ""),
626
- metadata=content_part.get("metadata", {})
627
- ))
728
+ history.append(
729
+ gr.ChatMessage(
730
+ role=item.get("role", "assistant"),
731
+ content=content_part.get("text", ""),
732
+ metadata=content_part.get("metadata", {}),
733
+ )
734
+ )
628
735
  elif item.get("type") == "computer_call":
629
736
  action = item.get("action", {})
630
737
  action_type = action.get("type", "")
@@ -632,43 +739,52 @@ if __name__ == "__main__":
632
739
  action_title = f"🛠️ Performing {action_type}"
633
740
  if action.get("x") and action.get("y"):
634
741
  action_title += f" at ({action['x']}, {action['y']})"
635
- history.append(gr.ChatMessage(
636
- role="assistant",
637
- content=f"```json\n{json.dumps(action)}\n```",
638
- metadata={"title": action_title}
639
- ))
742
+ history.append(
743
+ gr.ChatMessage(
744
+ role="assistant",
745
+ content=f"```json\n{json.dumps(action)}\n```",
746
+ metadata={"title": action_title},
747
+ )
748
+ )
640
749
  elif item.get("type") == "function_call":
641
750
  function_name = item.get("name", "")
642
751
  arguments = item.get("arguments", "{}")
643
- history.append(gr.ChatMessage(
644
- role="assistant",
645
- content=f"🔧 Calling function: {function_name}\n```json\n{arguments}\n```",
646
- metadata={"title": f"Function Call: {function_name}"}
647
- ))
752
+ history.append(
753
+ gr.ChatMessage(
754
+ role="assistant",
755
+ content=f"🔧 Calling function: {function_name}\n```json\n{arguments}\n```",
756
+ metadata={"title": f"Function Call: {function_name}"},
757
+ )
758
+ )
648
759
  elif item.get("type") == "function_call_output":
649
760
  output = item.get("output", "")
650
- history.append(gr.ChatMessage(
651
- role="assistant",
652
- content=f"📤 Function output:\n```\n{output}\n```",
653
- metadata={"title": "Function Output"}
654
- ))
761
+ history.append(
762
+ gr.ChatMessage(
763
+ role="assistant",
764
+ content=f"📤 Function output:\n```\n{output}\n```",
765
+ metadata={"title": "Function Output"},
766
+ )
767
+ )
655
768
  elif item.get("type") == "computer_call_output":
656
769
  output = item.get("output", {}).get("image_url", "")
657
770
  image_markdown = f"![Computer output]({output})"
658
- history.append(gr.ChatMessage(
659
- role="assistant",
660
- content=image_markdown,
661
- metadata={"title": "🖥️ Computer Output"}
662
- ))
663
-
771
+ history.append(
772
+ gr.ChatMessage(
773
+ role="assistant",
774
+ content=image_markdown,
775
+ metadata={"title": "🖥️ Computer Output"},
776
+ )
777
+ )
778
+
664
779
  yield history
665
-
780
+
666
781
  except Exception as e:
667
782
  import traceback
783
+
668
784
  traceback.print_exc()
669
785
  history.append(gr.ChatMessage(role="assistant", content=f"Error: {str(e)}"))
670
786
  yield history
671
-
787
+
672
788
  # Connect the submit button
673
789
  submit_event = msg.submit(
674
790
  fn=chat_submit,
@@ -706,44 +822,77 @@ if __name__ == "__main__":
706
822
  global global_messages
707
823
  global_messages.clear()
708
824
  return None
709
-
825
+
710
826
  clear.click(clear_chat, None, chatbot_history, queue=False)
711
-
827
+
712
828
  # Connect cancel button
713
829
  cancel_button.click(
714
- cancel_agent_task,
715
- [chatbot_history],
716
- [chatbot_history],
717
- queue=False
830
+ cancel_agent_task, [chatbot_history], [chatbot_history], queue=False
718
831
  )
719
832
 
720
833
  # Code display update function
721
- def update_code_display(agent_loop, model_choice_val, custom_model_val, chat_history, recent_images_val, save_trajectory_val, computer_os, computer_provider, container_name, cua_cloud_api_key, max_budget_val):
834
+ def update_code_display(
835
+ agent_loop,
836
+ model_choice_val,
837
+ custom_model_val,
838
+ chat_history,
839
+ recent_images_val,
840
+ save_trajectory_val,
841
+ computer_os,
842
+ computer_provider,
843
+ container_name,
844
+ cua_cloud_api_key,
845
+ max_budget_val,
846
+ ):
722
847
  messages = []
723
848
  if chat_history:
724
849
  for msg in chat_history:
725
850
  if isinstance(msg, dict) and msg.get("role") == "user":
726
851
  messages.append(msg.get("content", ""))
727
-
852
+
728
853
  return generate_python_code(
729
- agent_loop,
730
- model_choice_val or custom_model_val or "gpt-4o",
731
- messages,
854
+ agent_loop,
855
+ model_choice_val or custom_model_val or "gpt-4o",
856
+ messages,
732
857
  recent_images_val,
733
858
  save_trajectory_val,
734
859
  computer_os,
735
860
  computer_provider,
736
861
  container_name,
737
862
  cua_cloud_api_key,
738
- max_budget_val
863
+ max_budget_val,
739
864
  )
740
-
865
+
741
866
  # Update code display when configuration changes
742
- for component in [agent_loop, model_choice, custom_model, chatbot_history, recent_images, save_trajectory, computer_os, computer_provider, container_name, cua_cloud_api_key, max_budget]:
867
+ for component in [
868
+ agent_loop,
869
+ model_choice,
870
+ custom_model,
871
+ chatbot_history,
872
+ recent_images,
873
+ save_trajectory,
874
+ computer_os,
875
+ computer_provider,
876
+ container_name,
877
+ cua_cloud_api_key,
878
+ max_budget,
879
+ ]:
743
880
  component.change(
744
881
  update_code_display,
745
- inputs=[agent_loop, model_choice, custom_model, chatbot_history, recent_images, save_trajectory, computer_os, computer_provider, container_name, cua_cloud_api_key, max_budget],
746
- outputs=[code_display]
882
+ inputs=[
883
+ agent_loop,
884
+ model_choice,
885
+ custom_model,
886
+ chatbot_history,
887
+ recent_images,
888
+ save_trajectory,
889
+ computer_os,
890
+ computer_provider,
891
+ container_name,
892
+ cua_cloud_api_key,
893
+ max_budget,
894
+ ],
895
+ outputs=[code_display],
747
896
  )
748
897
 
749
898
  return demo