cua-agent 0.3.2__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic; see the registry's advisory page for more details.

Files changed (112)
  1. agent/__init__.py +21 -12
  2. agent/__main__.py +21 -0
  3. agent/adapters/__init__.py +9 -0
  4. agent/adapters/huggingfacelocal_adapter.py +229 -0
  5. agent/agent.py +594 -0
  6. agent/callbacks/__init__.py +19 -0
  7. agent/callbacks/base.py +153 -0
  8. agent/callbacks/budget_manager.py +44 -0
  9. agent/callbacks/image_retention.py +139 -0
  10. agent/callbacks/logging.py +247 -0
  11. agent/callbacks/pii_anonymization.py +259 -0
  12. agent/callbacks/telemetry.py +210 -0
  13. agent/callbacks/trajectory_saver.py +305 -0
  14. agent/cli.py +297 -0
  15. agent/computer_handler.py +107 -0
  16. agent/decorators.py +90 -0
  17. agent/loops/__init__.py +11 -0
  18. agent/loops/anthropic.py +728 -0
  19. agent/loops/omniparser.py +339 -0
  20. agent/loops/openai.py +95 -0
  21. agent/loops/uitars.py +688 -0
  22. agent/responses.py +207 -0
  23. agent/telemetry.py +135 -14
  24. agent/types.py +79 -0
  25. agent/ui/__init__.py +7 -1
  26. agent/ui/__main__.py +2 -13
  27. agent/ui/gradio/__init__.py +6 -19
  28. agent/ui/gradio/app.py +94 -1313
  29. agent/ui/gradio/ui_components.py +721 -0
  30. cua_agent-0.4.0.dist-info/METADATA +424 -0
  31. cua_agent-0.4.0.dist-info/RECORD +33 -0
  32. agent/core/__init__.py +0 -27
  33. agent/core/agent.py +0 -210
  34. agent/core/base.py +0 -217
  35. agent/core/callbacks.py +0 -200
  36. agent/core/experiment.py +0 -249
  37. agent/core/factory.py +0 -122
  38. agent/core/messages.py +0 -332
  39. agent/core/provider_config.py +0 -21
  40. agent/core/telemetry.py +0 -142
  41. agent/core/tools/__init__.py +0 -21
  42. agent/core/tools/base.py +0 -74
  43. agent/core/tools/bash.py +0 -52
  44. agent/core/tools/collection.py +0 -46
  45. agent/core/tools/computer.py +0 -113
  46. agent/core/tools/edit.py +0 -67
  47. agent/core/tools/manager.py +0 -56
  48. agent/core/tools.py +0 -32
  49. agent/core/types.py +0 -88
  50. agent/core/visualization.py +0 -197
  51. agent/providers/__init__.py +0 -4
  52. agent/providers/anthropic/__init__.py +0 -6
  53. agent/providers/anthropic/api/client.py +0 -360
  54. agent/providers/anthropic/api/logging.py +0 -150
  55. agent/providers/anthropic/api_handler.py +0 -140
  56. agent/providers/anthropic/callbacks/__init__.py +0 -5
  57. agent/providers/anthropic/callbacks/manager.py +0 -65
  58. agent/providers/anthropic/loop.py +0 -568
  59. agent/providers/anthropic/prompts.py +0 -23
  60. agent/providers/anthropic/response_handler.py +0 -226
  61. agent/providers/anthropic/tools/__init__.py +0 -33
  62. agent/providers/anthropic/tools/base.py +0 -88
  63. agent/providers/anthropic/tools/bash.py +0 -66
  64. agent/providers/anthropic/tools/collection.py +0 -34
  65. agent/providers/anthropic/tools/computer.py +0 -396
  66. agent/providers/anthropic/tools/edit.py +0 -326
  67. agent/providers/anthropic/tools/manager.py +0 -54
  68. agent/providers/anthropic/tools/run.py +0 -42
  69. agent/providers/anthropic/types.py +0 -16
  70. agent/providers/anthropic/utils.py +0 -381
  71. agent/providers/omni/__init__.py +0 -8
  72. agent/providers/omni/api_handler.py +0 -42
  73. agent/providers/omni/clients/anthropic.py +0 -103
  74. agent/providers/omni/clients/base.py +0 -35
  75. agent/providers/omni/clients/oaicompat.py +0 -195
  76. agent/providers/omni/clients/ollama.py +0 -122
  77. agent/providers/omni/clients/openai.py +0 -155
  78. agent/providers/omni/clients/utils.py +0 -25
  79. agent/providers/omni/image_utils.py +0 -34
  80. agent/providers/omni/loop.py +0 -990
  81. agent/providers/omni/parser.py +0 -307
  82. agent/providers/omni/prompts.py +0 -64
  83. agent/providers/omni/tools/__init__.py +0 -30
  84. agent/providers/omni/tools/base.py +0 -29
  85. agent/providers/omni/tools/bash.py +0 -74
  86. agent/providers/omni/tools/computer.py +0 -179
  87. agent/providers/omni/tools/manager.py +0 -61
  88. agent/providers/omni/utils.py +0 -236
  89. agent/providers/openai/__init__.py +0 -6
  90. agent/providers/openai/api_handler.py +0 -456
  91. agent/providers/openai/loop.py +0 -472
  92. agent/providers/openai/response_handler.py +0 -205
  93. agent/providers/openai/tools/__init__.py +0 -15
  94. agent/providers/openai/tools/base.py +0 -79
  95. agent/providers/openai/tools/computer.py +0 -326
  96. agent/providers/openai/tools/manager.py +0 -106
  97. agent/providers/openai/types.py +0 -36
  98. agent/providers/openai/utils.py +0 -98
  99. agent/providers/uitars/__init__.py +0 -1
  100. agent/providers/uitars/clients/base.py +0 -35
  101. agent/providers/uitars/clients/mlxvlm.py +0 -263
  102. agent/providers/uitars/clients/oaicompat.py +0 -214
  103. agent/providers/uitars/loop.py +0 -660
  104. agent/providers/uitars/prompts.py +0 -63
  105. agent/providers/uitars/tools/__init__.py +0 -1
  106. agent/providers/uitars/tools/computer.py +0 -283
  107. agent/providers/uitars/tools/manager.py +0 -60
  108. agent/providers/uitars/utils.py +0 -264
  109. cua_agent-0.3.2.dist-info/METADATA +0 -295
  110. cua_agent-0.3.2.dist-info/RECORD +0 -87
  111. {cua_agent-0.3.2.dist-info → cua_agent-0.4.0.dist-info}/WHEEL +0 -0
  112. {cua_agent-0.3.2.dist-info → cua_agent-0.4.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,721 @@
1
+ """
2
+ UI Components for the Gradio interface
3
+ """
4
+
5
+ import os
6
+ import asyncio
7
+ import logging
8
+ import json
9
+ import platform
10
+ from pathlib import Path
11
+ from typing import Dict, List, Optional, Any, cast
12
+ import gradio as gr
13
+ from gradio.components.chatbot import MetadataDict
14
+
15
+ from .app import (
16
+ load_settings, save_settings, create_agent, get_model_string,
17
+ get_ollama_models, global_agent, global_computer
18
+ )
19
+
20
+ # Global messages array to maintain conversation history
21
+ global_messages = []
22
+
23
+
24
+ def create_gradio_ui() -> gr.Blocks:
25
+ """Create a Gradio UI for the Computer-Use Agent."""
26
+
27
+ # Load settings
28
+ saved_settings = load_settings()
29
+
30
+ # Check for API keys
31
+ openai_api_key = os.environ.get("OPENAI_API_KEY", "")
32
+ anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY", "")
33
+ cua_api_key = os.environ.get("CUA_API_KEY", "")
34
+
35
+ # Model choices
36
+ openai_models = ["OpenAI: Computer-Use Preview"]
37
+ anthropic_models = [
38
+ "Anthropic: Claude 4 Opus (20250514)",
39
+ "Anthropic: Claude 4 Sonnet (20250514)",
40
+ "Anthropic: Claude 3.7 Sonnet (20250219)",
41
+ "Anthropic: Claude 3.5 Sonnet (20240620)",
42
+ ]
43
+ omni_models = [
44
+ "OMNI: OpenAI GPT-4o",
45
+ "OMNI: OpenAI GPT-4o mini",
46
+ "OMNI: Claude 3.7 Sonnet (20250219)",
47
+ "OMNI: Claude 3.5 Sonnet (20240620)"
48
+ ]
49
+
50
+ # Check if API keys are available
51
+ has_openai_key = bool(openai_api_key)
52
+ has_anthropic_key = bool(anthropic_api_key)
53
+ has_cua_key = bool(cua_api_key)
54
+
55
+ # Get Ollama models for OMNI
56
+ ollama_models = get_ollama_models()
57
+ if ollama_models:
58
+ omni_models += ollama_models
59
+
60
+ # Detect platform
61
+ is_mac = platform.system().lower() == "darwin"
62
+
63
+ # Format model choices
64
+ provider_to_models = {
65
+ "OPENAI": openai_models,
66
+ "ANTHROPIC": anthropic_models,
67
+ "OMNI": omni_models + ["Custom model (OpenAI compatible API)", "Custom model (ollama)"],
68
+ "UITARS": ([
69
+ "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
70
+ ] if is_mac else []) + ["Custom model (OpenAI compatible API)"],
71
+ }
72
+
73
+ # Apply saved settings
74
+ initial_loop = saved_settings.get("agent_loop", "OMNI")
75
+ available_models_for_loop = provider_to_models.get(initial_loop, [])
76
+ saved_model_choice = saved_settings.get("model_choice")
77
+ if saved_model_choice and saved_model_choice in available_models_for_loop:
78
+ initial_model = saved_model_choice
79
+ else:
80
+ if initial_loop == "OPENAI":
81
+ initial_model = openai_models[0] if openai_models else "No models available"
82
+ elif initial_loop == "ANTHROPIC":
83
+ initial_model = anthropic_models[0] if anthropic_models else "No models available"
84
+ else: # OMNI
85
+ initial_model = omni_models[0] if omni_models else "Custom model (OpenAI compatible API)"
86
+
87
+ initial_custom_model = saved_settings.get("custom_model", "Qwen2.5-VL-7B-Instruct")
88
+ initial_provider_base_url = saved_settings.get("provider_base_url", "http://localhost:1234/v1")
89
+ initial_save_trajectory = saved_settings.get("save_trajectory", True)
90
+ initial_recent_images = saved_settings.get("recent_images", 3)
91
+
92
+ # Example prompts
93
+ example_messages = [
94
+ "Create a Python virtual environment, install pandas and matplotlib, then plot stock data",
95
+ "Open a PDF in Preview, add annotations, and save it as a compressed version",
96
+ "Open Safari, search for 'macOS automation tools', and save the first three results as bookmarks",
97
+ "Configure SSH keys and set up a connection to a remote server",
98
+ ]
99
+
100
def generate_python_code(agent_loop_choice, model_name, tasks, recent_images=3, save_trajectory=True, computer_os="linux", computer_provider="cloud", container_name="", cua_cloud_api_key="", max_budget=None):
    """Generate a runnable Python snippet that mirrors the current UI configuration.

    Args:
        agent_loop_choice: Agent loop provider name (e.g. "OPENAI", "OMNI").
        model_name: Display name of the selected model.
        tasks: List of task prompt strings to embed in the snippet.
        recent_images: Value emitted for only_n_most_recent_images.
        save_trajectory: When True, include a trajectory_dir argument.
        computer_os: OS for the Computer(); "macos" (the default) is omitted.
        computer_provider: Provider for the Computer(); "lume" is omitted.
        container_name: Optional container name kwarg.
        cua_cloud_api_key: Optional API key kwarg.
        max_budget: Optional trajectory budget; included only when truthy.

    Returns:
        A string of Python source code.
    """
    # Build the task list body. json.dumps escapes quotes/backslashes so a
    # task containing '"' can no longer break the generated source (bug fix:
    # the previous f'"{task}",' interpolation emitted invalid Python for
    # prompts that contained a double quote).
    tasks_str = ""
    for task in tasks:
        if task and task.strip():
            tasks_str += f"        {json.dumps(task)},\n"

    model_string = get_model_string(model_name, agent_loop_choice)

    # Only emit Computer() kwargs that differ from that constructor's defaults.
    computer_args = []
    if computer_os != "macos":
        computer_args.append(f'os_type="{computer_os}"')
    if computer_provider != "lume":
        computer_args.append(f'provider_type="{computer_provider}"')
    if container_name:
        computer_args.append(f'name="{container_name}"')
    if cua_cloud_api_key:
        computer_args.append(f'api_key="{cua_cloud_api_key}"')

    computer_args_str = ", ".join(computer_args)
    computer_args_str = f"({computer_args_str})" if computer_args_str else "()"

    code = f'''import asyncio
from computer import Computer
from agent import ComputerAgent

async def main():
    async with Computer{computer_args_str} as computer:
        agent = ComputerAgent(
            model="{model_string}",
            tools=[computer],
            only_n_most_recent_images={recent_images},'''

    if save_trajectory:
        code += '''
            trajectory_dir="trajectories",'''

    if max_budget:
        code += f'''
            max_trajectory_budget={{"max_budget": {max_budget}, "raise_error": True}},'''

    code += '''
        )
'''

    if tasks_str:
        code += f'''
        # Prompts for the computer-use agent
        tasks = [
{tasks_str.rstrip()}
        ]

        for task in tasks:
            print(f"Executing task: {{task}}")
            messages = [{{"role": "user", "content": task}}]
            async for result in agent.run(messages):
                for item in result["output"]:
                    if item["type"] == "message":
                        print(item["content"][0]["text"])'''
    else:
        code += '''
        # Execute a single task
        task = "Search for information about CUA on GitHub"
        print(f"Executing task: {task}")
        messages = [{"role": "user", "content": task}]
        async for result in agent.run(messages):
            for item in result["output"]:
                if item["type"] == "message":
                    print(item["content"][0]["text"])'''

    code += '''

if __name__ == "__main__":
    asyncio.run(main())'''

    return code
179
+
180
+ # Create the Gradio interface
181
+ with gr.Blocks(title="Computer-Use Agent") as demo:
182
+ with gr.Row():
183
+ # Left column for settings
184
+ with gr.Column(scale=1):
185
+ # Logo
186
+ gr.HTML(
187
+ """
188
+ <div style="display: flex; justify-content: center; margin-bottom: 0.5em">
189
+ <img alt="CUA Logo" style="width: 80px;"
190
+ src="https://github.com/trycua/cua/blob/main/img/logo_black.png?raw=true" />
191
+ </div>
192
+ """
193
+ )
194
+
195
+ # Python code accordion
196
+ with gr.Accordion("Python Code", open=False):
197
+ code_display = gr.Code(
198
+ language="python",
199
+ value=generate_python_code(initial_loop, "gpt-4o", []),
200
+ interactive=False,
201
+ )
202
+
203
+ with gr.Accordion("Computer Configuration", open=True):
204
+ computer_os = gr.Radio(
205
+ choices=["macos", "linux", "windows"],
206
+ label="Operating System",
207
+ value="macos",
208
+ info="Select the operating system for the computer",
209
+ )
210
+
211
+ is_windows = platform.system().lower() == "windows"
212
+ is_mac = platform.system().lower() == "darwin"
213
+
214
+ providers = ["cloud"]
215
+ if is_mac:
216
+ providers += ["lume"]
217
+ if is_windows:
218
+ providers += ["winsandbox"]
219
+
220
+ computer_provider = gr.Radio(
221
+ choices=providers,
222
+ label="Provider",
223
+ value="lume" if is_mac else "cloud",
224
+ info="Select the computer provider",
225
+ )
226
+
227
+ container_name = gr.Textbox(
228
+ label="Container Name",
229
+ placeholder="Enter container name (optional)",
230
+ value=os.environ.get("CUA_CONTAINER_NAME", ""),
231
+ info="Optional name for the container",
232
+ )
233
+
234
+ cua_cloud_api_key = gr.Textbox(
235
+ label="CUA Cloud API Key",
236
+ placeholder="Enter your CUA Cloud API key",
237
+ value=os.environ.get("CUA_API_KEY", ""),
238
+ type="password",
239
+ info="Required for cloud provider",
240
+ visible=(not has_cua_key)
241
+ )
242
+
243
+ with gr.Accordion("Agent Configuration", open=True):
244
+ agent_loop = gr.Dropdown(
245
+ choices=["OPENAI", "ANTHROPIC", "OMNI", "UITARS"],
246
+ label="Agent Loop",
247
+ value=initial_loop,
248
+ info="Select the agent loop provider",
249
+ )
250
+
251
+ # Model selection dropdowns
252
+ with gr.Group() as model_selection_group:
253
+ openai_model_choice = gr.Dropdown(
254
+ choices=openai_models,
255
+ label="OpenAI Model",
256
+ value=openai_models[0] if openai_models else "No models available",
257
+ info="Select OpenAI model",
258
+ interactive=True,
259
+ visible=(initial_loop == "OPENAI")
260
+ )
261
+
262
+ anthropic_model_choice = gr.Dropdown(
263
+ choices=anthropic_models,
264
+ label="Anthropic Model",
265
+ value=anthropic_models[0] if anthropic_models else "No models available",
266
+ info="Select Anthropic model",
267
+ interactive=True,
268
+ visible=(initial_loop == "ANTHROPIC")
269
+ )
270
+
271
+ omni_model_choice = gr.Dropdown(
272
+ choices=omni_models + ["Custom model (OpenAI compatible API)", "Custom model (ollama)"],
273
+ label="OMNI Model",
274
+ value=omni_models[0] if omni_models else "Custom model (OpenAI compatible API)",
275
+ info="Select OMNI model or choose a custom model option",
276
+ interactive=True,
277
+ visible=(initial_loop == "OMNI")
278
+ )
279
+
280
+ uitars_model_choice = gr.Dropdown(
281
+ choices=provider_to_models.get("UITARS", ["No models available"]),
282
+ label="UITARS Model",
283
+ value=provider_to_models.get("UITARS", ["No models available"])[0] if provider_to_models.get("UITARS") else "No models available",
284
+ info="Select UITARS model",
285
+ interactive=True,
286
+ visible=(initial_loop == "UITARS")
287
+ )
288
+
289
+ model_choice = gr.Textbox(visible=False)
290
+
291
+ # API key inputs
292
+ with gr.Group(visible=not has_openai_key and (initial_loop == "OPENAI" or initial_loop == "OMNI")) as openai_key_group:
293
+ openai_api_key_input = gr.Textbox(
294
+ label="OpenAI API Key",
295
+ placeholder="Enter your OpenAI API key",
296
+ value=os.environ.get("OPENAI_API_KEY", ""),
297
+ interactive=True,
298
+ type="password",
299
+ info="Required for OpenAI models"
300
+ )
301
+
302
+ with gr.Group(visible=not has_anthropic_key and (initial_loop == "ANTHROPIC" or initial_loop == "OMNI")) as anthropic_key_group:
303
+ anthropic_api_key_input = gr.Textbox(
304
+ label="Anthropic API Key",
305
+ placeholder="Enter your Anthropic API key",
306
+ value=os.environ.get("ANTHROPIC_API_KEY", ""),
307
+ interactive=True,
308
+ type="password",
309
+ info="Required for Anthropic models"
310
+ )
311
+
312
+ # API key handlers
313
def set_openai_api_key(key):
    """Persist a non-empty OpenAI key into the process environment.

    Returns the key unchanged so Gradio can echo it back into the textbox.
    """
    stripped = key.strip() if key else ""
    if stripped:
        os.environ["OPENAI_API_KEY"] = stripped
        print(f"DEBUG - Set OpenAI API key environment variable")
    return key
318
+
319
def set_anthropic_api_key(key):
    """Persist a non-empty Anthropic key into the process environment.

    Returns the key unchanged so Gradio can echo it back into the textbox.
    """
    stripped = key.strip() if key else ""
    if stripped:
        os.environ["ANTHROPIC_API_KEY"] = stripped
        print(f"DEBUG - Set Anthropic API key environment variable")
    return key
324
+
325
+ openai_api_key_input.change(
326
+ fn=set_openai_api_key,
327
+ inputs=[openai_api_key_input],
328
+ outputs=[openai_api_key_input],
329
+ queue=False
330
+ )
331
+
332
+ anthropic_api_key_input.change(
333
+ fn=set_anthropic_api_key,
334
+ inputs=[anthropic_api_key_input],
335
+ outputs=[anthropic_api_key_input],
336
+ queue=False
337
+ )
338
+
339
+ # UI update function
340
def update_ui(loop=None, openai_model=None, anthropic_model=None, omni_model=None, uitars_model=None):
    """Recompute component visibility after a loop/model dropdown change.

    Returns a list of gr.update objects in the exact order expected by the
    dropdown.change() outputs wiring: the four model dropdowns, the two API
    key groups, the three custom-model inputs, then the hidden model textbox.
    """
    loop = loop or agent_loop.value

    # Pick the model value belonging to the active loop (falsy -> None).
    loop_to_model = {
        "OPENAI": openai_model,
        "ANTHROPIC": anthropic_model,
        "OMNI": omni_model,
        "UITARS": uitars_model,
    }
    model_value = loop_to_model.get(loop) or None

    # API key inputs only appear when the key is absent from the environment
    # and the active selection actually needs that vendor.
    show_openai_key = not has_openai_key and (
        loop == "OPENAI"
        or (loop == "OMNI" and model_value and "OpenAI" in model_value and "Custom" not in model_value)
    )
    show_anthropic_key = not has_anthropic_key and (
        loop == "ANTHROPIC"
        or (loop == "OMNI" and model_value and "Claude" in model_value and "Custom" not in model_value)
    )

    is_custom_openai_api = model_value == "Custom model (OpenAI compatible API)"
    is_custom_ollama = model_value == "Custom model (ollama)"
    is_any_custom = is_custom_openai_api or is_custom_ollama

    visibility_flags = [
        loop == "OPENAI",       # OpenAI model dropdown
        loop == "ANTHROPIC",    # Anthropic model dropdown
        loop == "OMNI",         # OMNI model dropdown
        loop == "UITARS",       # UITARS model dropdown
        show_openai_key,        # OpenAI API key group
        show_anthropic_key,     # Anthropic API key group
        is_any_custom,          # custom model name box
        is_custom_openai_api,   # provider base URL box
        is_custom_openai_api,   # provider API key box
    ]
    updates = [gr.update(visible=flag) for flag in visibility_flags]
    updates.append(gr.update(value=model_value if model_value else ""))
    return updates
379
+
380
+ # Custom model inputs
381
+ custom_model = gr.Textbox(
382
+ label="Custom Model Name",
383
+ placeholder="Enter custom model name (e.g., Qwen2.5-VL-7B-Instruct or llama3)",
384
+ value=initial_custom_model,
385
+ visible=(initial_model == "Custom model (OpenAI compatible API)" or initial_model == "Custom model (ollama)"),
386
+ interactive=True,
387
+ )
388
+
389
+ provider_base_url = gr.Textbox(
390
+ label="Provider Base URL",
391
+ placeholder="Enter provider base URL (e.g., http://localhost:1234/v1)",
392
+ value=initial_provider_base_url,
393
+ visible=(initial_model == "Custom model (OpenAI compatible API)"),
394
+ interactive=True,
395
+ )
396
+
397
+ provider_api_key = gr.Textbox(
398
+ label="Provider API Key",
399
+ placeholder="Enter provider API key (if required)",
400
+ value="",
401
+ visible=(initial_model == "Custom model (OpenAI compatible API)"),
402
+ interactive=True,
403
+ type="password",
404
+ )
405
+
406
+ # Connect UI update events
407
+ for dropdown in [agent_loop, omni_model_choice, uitars_model_choice, openai_model_choice, anthropic_model_choice]:
408
+ dropdown.change(
409
+ fn=update_ui,
410
+ inputs=[agent_loop, openai_model_choice, anthropic_model_choice, omni_model_choice, uitars_model_choice],
411
+ outputs=[
412
+ openai_model_choice, anthropic_model_choice, omni_model_choice, uitars_model_choice,
413
+ openai_key_group, anthropic_key_group,
414
+ custom_model, provider_base_url, provider_api_key,
415
+ model_choice
416
+ ],
417
+ queue=False
418
+ )
419
+
420
+ save_trajectory = gr.Checkbox(
421
+ label="Save Trajectory",
422
+ value=initial_save_trajectory,
423
+ info="Save the agent's trajectory for debugging",
424
+ interactive=True,
425
+ )
426
+
427
+ recent_images = gr.Slider(
428
+ label="Recent Images",
429
+ minimum=1,
430
+ maximum=10,
431
+ value=initial_recent_images,
432
+ step=1,
433
+ info="Number of recent images to keep in context",
434
+ interactive=True,
435
+ )
436
+
437
+ max_budget = gr.Number(
438
+ label="Max Budget ($)",
439
+ value=lambda: None,
440
+ minimum=-1,
441
+ maximum=100.0,
442
+ step=0.1,
443
+ info="Optional budget limit for trajectory (0 = no limit)",
444
+ interactive=True,
445
+ )
446
+
447
+ # Right column for chat interface
448
+ with gr.Column(scale=2):
449
+ gr.Markdown(
450
+ "Ask me to perform tasks in a virtual environment.<br>Built with <a href='https://github.com/trycua/cua' target='_blank'>github.com/trycua/cua</a>."
451
+ )
452
+
453
+ chatbot_history = gr.Chatbot(type="messages")
454
+ msg = gr.Textbox(
455
+ placeholder="Ask me to perform tasks in a virtual environment"
456
+ )
457
+ clear = gr.Button("Clear")
458
+ cancel_button = gr.Button("Cancel", variant="stop")
459
+
460
+ # Add examples
461
+ example_group = gr.Examples(examples=example_messages, inputs=msg)
462
+
463
+ # Chat submission function
464
def chat_submit(message, history):
    """Push the submitted user message into the chat history.

    Returns ("", history) so the input textbox is cleared in the same update.
    """
    history.append(gr.ChatMessage(role="user", content=message))
    return "", history
467
+
468
+ # Cancel function
469
async def cancel_agent_task(history):
    """Append a cancel/info notice to the chat when the Cancel button is hit.

    NOTE(review): this only annotates the chat — it does not actually stop a
    running agent loop; confirm whether a real cancellation hook is needed.
    """
    global global_agent
    if global_agent:
        print("DEBUG - Cancelling agent task")
        notice = gr.ChatMessage(
            role="assistant",
            content="Task cancelled by user",
            metadata={"title": "❌ Cancelled"},
        )
    else:
        notice = gr.ChatMessage(
            role="assistant",
            content="No active agent task to cancel",
            metadata={"title": "ℹ️ Info"},
        )
    history.append(notice)
    return history
477
+
478
+ # Process response function
479
async def process_response(
    history,
    openai_model_value,
    anthropic_model_value,
    omni_model_value,
    uitars_model_value,
    custom_model_value,
    agent_loop_choice,
    save_traj,
    recent_imgs,
    custom_url_value=None,
    custom_api_key=None,
    openai_key_input=None,
    anthropic_key_input=None,
    computer_os="linux",
    computer_provider="cloud",
    container_name="",
    cua_cloud_api_key="",
    max_budget_value=None,
):
    """Run the agent for the latest user message and stream chat updates.

    Args:
        history: Gradio chat history; the last entry is the user's message.
        *_model_value: Per-loop model dropdown selections.
        custom_model_value: User-typed model name for custom selections.
        agent_loop_choice: Active loop ("OPENAI"/"ANTHROPIC"/"OMNI"/"UITARS").
        save_traj / recent_imgs: Agent trajectory/image-retention options.
        custom_url_value / custom_api_key: Custom provider endpoint settings.
        openai_key_input / anthropic_key_input: Keys typed into the UI.
        computer_os / computer_provider / container_name / cua_cloud_api_key:
            Computer configuration forwarded to create_agent.
        max_budget_value: Optional positive budget cap; ignored otherwise.

    Yields:
        The updated history after each batch of agent output.
    """
    # Bug fix: declare BOTH module globals up front. Previously global_agent
    # was assigned without a `global` statement, making it a function local,
    # so the module-level binding checked by cancel_agent_task never saw the
    # newly created agent.
    global global_agent, global_messages

    if not history:
        yield history
        return

    # Get the last user message
    last_user_message = history[-1]["content"]

    # Resolve the model selection for the active agent loop.
    loop_models = {
        "OPENAI": openai_model_value,
        "ANTHROPIC": anthropic_model_value,
        "OMNI": omni_model_value,
        "UITARS": uitars_model_value,
    }
    model_choice_value = loop_models.get(agent_loop_choice, "No models available")

    # Custom-model selections use the user-typed name instead of the label.
    is_custom_model_selected = model_choice_value in (
        "Custom model (OpenAI compatible API)",
        "Custom model (ollama)",
    )
    model_string_to_analyze = custom_model_value if is_custom_model_selected else model_choice_value

    try:
        model_string = get_model_string(model_string_to_analyze, agent_loop_choice)

        # Export any keys typed into the UI so downstream clients find them.
        if openai_key_input:
            os.environ["OPENAI_API_KEY"] = openai_key_input
        if anthropic_key_input:
            os.environ["ANTHROPIC_API_KEY"] = anthropic_key_input
        if cua_cloud_api_key:
            os.environ["CUA_API_KEY"] = cua_cloud_api_key

        # Persist the current UI configuration to the settings file.
        current_settings = {
            "agent_loop": agent_loop_choice,
            "model_choice": model_choice_value,
            "custom_model": custom_model_value,
            "provider_base_url": custom_url_value,
            "save_trajectory": save_traj,
            "recent_images": recent_imgs,
            "computer_os": computer_os,
            "computer_provider": computer_provider,
            "container_name": container_name,
        }
        save_settings(current_settings)

        # Create (and publish, via the global) the agent for this run.
        global_agent = create_agent(
            model_string=model_string,
            save_trajectory=save_traj,
            only_n_most_recent_images=recent_imgs,
            custom_model_name=custom_model_value if is_custom_model_selected else None,
            computer_os=computer_os,
            computer_provider=computer_provider,
            computer_name=container_name,
            computer_api_key=cua_cloud_api_key,
            verbosity=logging.DEBUG,
            max_trajectory_budget=max_budget_value if max_budget_value and max_budget_value > 0 else None,
        )

        if global_agent is None:
            history.append(
                gr.ChatMessage(
                    role="assistant",
                    content="Failed to create agent. Check API keys and configuration.",
                )
            )
            yield history
            return

        # Record the user turn in the cross-call conversation history.
        global_messages.append({"role": "user", "content": last_user_message})

        # Stream responses from the agent, rendering each output item.
        async for result in global_agent.run(global_messages):
            global_messages += result.get("output", [])

            for item in result.get("output", []):
                item_type = item.get("type")
                if item_type == "message":
                    for content_part in item.get("content", []):
                        if content_part.get("text"):
                            history.append(gr.ChatMessage(
                                role=item.get("role", "assistant"),
                                content=content_part.get("text", ""),
                                metadata=content_part.get("metadata", {}),
                            ))
                elif item_type == "computer_call":
                    action = item.get("action", {})
                    action_type = action.get("type", "")
                    if action_type:
                        action_title = f"🛠️ Performing {action_type}"
                        # Only point coordinates get echoed in the title.
                        if action.get("x") and action.get("y"):
                            action_title += f" at ({action['x']}, {action['y']})"
                        history.append(gr.ChatMessage(
                            role="assistant",
                            content=f"```json\n{json.dumps(action)}\n```",
                            metadata={"title": action_title},
                        ))
                elif item_type == "function_call":
                    function_name = item.get("name", "")
                    arguments = item.get("arguments", "{}")
                    history.append(gr.ChatMessage(
                        role="assistant",
                        content=f"🔧 Calling function: {function_name}\n```json\n{arguments}\n```",
                        metadata={"title": f"Function Call: {function_name}"},
                    ))
                elif item_type == "function_call_output":
                    output = item.get("output", "")
                    history.append(gr.ChatMessage(
                        role="assistant",
                        content=f"📤 Function output:\n```\n{output}\n```",
                        metadata={"title": "Function Output"},
                    ))
                elif item_type == "computer_call_output":
                    # Screenshots arrive as data/image URLs; show inline.
                    output = item.get("output", {}).get("image_url", "")
                    history.append(gr.ChatMessage(
                        role="assistant",
                        content=f"![Computer output]({output})",
                        metadata={"title": "🖥️ Computer Output"},
                    ))

            yield history

    except Exception as e:
        import traceback
        traceback.print_exc()
        history.append(gr.ChatMessage(role="assistant", content=f"Error: {str(e)}"))
        yield history
643
+
644
+ # Connect the submit button
645
+ submit_event = msg.submit(
646
+ fn=chat_submit,
647
+ inputs=[msg, chatbot_history],
648
+ outputs=[msg, chatbot_history],
649
+ queue=False,
650
+ ).then(
651
+ fn=process_response,
652
+ inputs=[
653
+ chatbot_history,
654
+ openai_model_choice,
655
+ anthropic_model_choice,
656
+ omni_model_choice,
657
+ uitars_model_choice,
658
+ custom_model,
659
+ agent_loop,
660
+ save_trajectory,
661
+ recent_images,
662
+ provider_base_url,
663
+ provider_api_key,
664
+ openai_api_key_input,
665
+ anthropic_api_key_input,
666
+ computer_os,
667
+ computer_provider,
668
+ container_name,
669
+ cua_cloud_api_key,
670
+ max_budget,
671
+ ],
672
+ outputs=[chatbot_history],
673
+ queue=True,
674
+ )
675
+
676
+ # Clear button functionality
677
def clear_chat():
    """Reset the cross-call message log and blank the visible chat panel."""
    global global_messages
    # Empty in place so every reference to the shared list sees the reset.
    del global_messages[:]
    return None
681
+
682
+ clear.click(clear_chat, None, chatbot_history, queue=False)
683
+
684
+ # Connect cancel button
685
+ cancel_button.click(
686
+ cancel_agent_task,
687
+ [chatbot_history],
688
+ [chatbot_history],
689
+ queue=False
690
+ )
691
+
692
+ # Code display update function
693
def update_code_display(agent_loop, model_choice_val, custom_model_val, chat_history, recent_images_val, save_trajectory_val, computer_os, computer_provider, container_name, cua_cloud_api_key, max_budget_val):
    """Regenerate the snippet shown in the 'Python Code' accordion."""
    # Collect the user-authored prompts from the chat history, if any.
    user_prompts = [
        entry.get("content", "")
        for entry in (chat_history or [])
        if isinstance(entry, dict) and entry.get("role") == "user"
    ]
    fallback_model = model_choice_val or custom_model_val or "gpt-4o"
    return generate_python_code(
        agent_loop,
        fallback_model,
        user_prompts,
        recent_images_val,
        save_trajectory_val,
        computer_os,
        computer_provider,
        container_name,
        cua_cloud_api_key,
        max_budget_val,
    )
712
+
713
+ # Update code display when configuration changes
714
+ for component in [agent_loop, model_choice, custom_model, chatbot_history, recent_images, save_trajectory, computer_os, computer_provider, container_name, cua_cloud_api_key, max_budget]:
715
+ component.change(
716
+ update_code_display,
717
+ inputs=[agent_loop, model_choice, custom_model, chatbot_history, recent_images, save_trajectory, computer_os, computer_provider, container_name, cua_cloud_api_key, max_budget],
718
+ outputs=[code_display]
719
+ )
720
+
721
+ return demo