cua-agent 0.3.1__py3-none-any.whl → 0.4.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (111) hide show
  1. agent/__init__.py +15 -51
  2. agent/__main__.py +21 -0
  3. agent/adapters/__init__.py +9 -0
  4. agent/adapters/huggingfacelocal_adapter.py +216 -0
  5. agent/agent.py +577 -0
  6. agent/callbacks/__init__.py +17 -0
  7. agent/callbacks/base.py +153 -0
  8. agent/callbacks/budget_manager.py +44 -0
  9. agent/callbacks/image_retention.py +139 -0
  10. agent/callbacks/logging.py +247 -0
  11. agent/callbacks/pii_anonymization.py +259 -0
  12. agent/callbacks/trajectory_saver.py +305 -0
  13. agent/cli.py +290 -0
  14. agent/computer_handler.py +107 -0
  15. agent/decorators.py +90 -0
  16. agent/loops/__init__.py +11 -0
  17. agent/loops/anthropic.py +728 -0
  18. agent/loops/omniparser.py +339 -0
  19. agent/loops/openai.py +95 -0
  20. agent/loops/uitars.py +688 -0
  21. agent/responses.py +207 -0
  22. agent/types.py +79 -0
  23. agent/ui/__init__.py +7 -1
  24. agent/ui/gradio/__init__.py +6 -19
  25. agent/ui/gradio/app.py +80 -1299
  26. agent/ui/gradio/ui_components.py +703 -0
  27. cua_agent-0.4.0b1.dist-info/METADATA +424 -0
  28. cua_agent-0.4.0b1.dist-info/RECORD +30 -0
  29. {cua_agent-0.3.1.dist-info → cua_agent-0.4.0b1.dist-info}/WHEEL +1 -1
  30. agent/core/__init__.py +0 -27
  31. agent/core/agent.py +0 -210
  32. agent/core/base.py +0 -217
  33. agent/core/callbacks.py +0 -200
  34. agent/core/experiment.py +0 -249
  35. agent/core/factory.py +0 -122
  36. agent/core/messages.py +0 -332
  37. agent/core/provider_config.py +0 -21
  38. agent/core/telemetry.py +0 -142
  39. agent/core/tools/__init__.py +0 -21
  40. agent/core/tools/base.py +0 -74
  41. agent/core/tools/bash.py +0 -52
  42. agent/core/tools/collection.py +0 -46
  43. agent/core/tools/computer.py +0 -113
  44. agent/core/tools/edit.py +0 -67
  45. agent/core/tools/manager.py +0 -56
  46. agent/core/tools.py +0 -32
  47. agent/core/types.py +0 -88
  48. agent/core/visualization.py +0 -197
  49. agent/providers/__init__.py +0 -4
  50. agent/providers/anthropic/__init__.py +0 -6
  51. agent/providers/anthropic/api/client.py +0 -360
  52. agent/providers/anthropic/api/logging.py +0 -150
  53. agent/providers/anthropic/api_handler.py +0 -140
  54. agent/providers/anthropic/callbacks/__init__.py +0 -5
  55. agent/providers/anthropic/callbacks/manager.py +0 -65
  56. agent/providers/anthropic/loop.py +0 -568
  57. agent/providers/anthropic/prompts.py +0 -23
  58. agent/providers/anthropic/response_handler.py +0 -226
  59. agent/providers/anthropic/tools/__init__.py +0 -33
  60. agent/providers/anthropic/tools/base.py +0 -88
  61. agent/providers/anthropic/tools/bash.py +0 -66
  62. agent/providers/anthropic/tools/collection.py +0 -34
  63. agent/providers/anthropic/tools/computer.py +0 -396
  64. agent/providers/anthropic/tools/edit.py +0 -326
  65. agent/providers/anthropic/tools/manager.py +0 -54
  66. agent/providers/anthropic/tools/run.py +0 -42
  67. agent/providers/anthropic/types.py +0 -16
  68. agent/providers/anthropic/utils.py +0 -367
  69. agent/providers/omni/__init__.py +0 -8
  70. agent/providers/omni/api_handler.py +0 -42
  71. agent/providers/omni/clients/anthropic.py +0 -103
  72. agent/providers/omni/clients/base.py +0 -35
  73. agent/providers/omni/clients/oaicompat.py +0 -195
  74. agent/providers/omni/clients/ollama.py +0 -122
  75. agent/providers/omni/clients/openai.py +0 -155
  76. agent/providers/omni/clients/utils.py +0 -25
  77. agent/providers/omni/image_utils.py +0 -34
  78. agent/providers/omni/loop.py +0 -990
  79. agent/providers/omni/parser.py +0 -307
  80. agent/providers/omni/prompts.py +0 -64
  81. agent/providers/omni/tools/__init__.py +0 -30
  82. agent/providers/omni/tools/base.py +0 -29
  83. agent/providers/omni/tools/bash.py +0 -74
  84. agent/providers/omni/tools/computer.py +0 -179
  85. agent/providers/omni/tools/manager.py +0 -61
  86. agent/providers/omni/utils.py +0 -236
  87. agent/providers/openai/__init__.py +0 -6
  88. agent/providers/openai/api_handler.py +0 -456
  89. agent/providers/openai/loop.py +0 -472
  90. agent/providers/openai/response_handler.py +0 -205
  91. agent/providers/openai/tools/__init__.py +0 -15
  92. agent/providers/openai/tools/base.py +0 -79
  93. agent/providers/openai/tools/computer.py +0 -326
  94. agent/providers/openai/tools/manager.py +0 -106
  95. agent/providers/openai/types.py +0 -36
  96. agent/providers/openai/utils.py +0 -98
  97. agent/providers/uitars/__init__.py +0 -1
  98. agent/providers/uitars/clients/base.py +0 -35
  99. agent/providers/uitars/clients/mlxvlm.py +0 -263
  100. agent/providers/uitars/clients/oaicompat.py +0 -214
  101. agent/providers/uitars/loop.py +0 -660
  102. agent/providers/uitars/prompts.py +0 -63
  103. agent/providers/uitars/tools/__init__.py +0 -1
  104. agent/providers/uitars/tools/computer.py +0 -283
  105. agent/providers/uitars/tools/manager.py +0 -60
  106. agent/providers/uitars/utils.py +0 -264
  107. agent/telemetry.py +0 -21
  108. agent/ui/__main__.py +0 -15
  109. cua_agent-0.3.1.dist-info/METADATA +0 -295
  110. cua_agent-0.3.1.dist-info/RECORD +0 -87
  111. {cua_agent-0.3.1.dist-info → cua_agent-0.4.0b1.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,703 @@
1
+ """
2
+ UI Components for the Gradio interface
3
+ """
4
+
5
+ import os
6
+ import asyncio
7
+ import logging
8
+ import json
9
+ import platform
10
+ from pathlib import Path
11
+ from typing import Dict, List, Optional, Any, cast
12
+ import gradio as gr
13
+ from gradio.components.chatbot import MetadataDict
14
+
15
+ from .app import (
16
+ load_settings, save_settings, create_agent, get_model_string,
17
+ get_ollama_models, GradioChatScreenshotHandler, global_agent, global_computer
18
+ )
19
+
20
+
21
def create_gradio_ui() -> gr.Blocks:
    """Build and return the Gradio Blocks UI for the Computer-Use Agent.

    Layout: a left column with configuration (computer OS/provider, agent
    loop, model selection, API keys, trajectory options, and a generated
    Python-code preview) and a right column with the chat interface that
    drives the agent. UI settings are persisted between sessions via
    ``load_settings()`` / ``save_settings()`` from ``.app``.

    Returns:
        gr.Blocks: the assembled (but not yet launched) Gradio app.
    """
    # Previously saved UI settings (agent loop, model choice, etc.).
    saved_settings = load_settings()

    # API keys from the environment. Presence of a key decides whether the
    # corresponding key-input widget is shown at all.
    openai_api_key = os.environ.get("OPENAI_API_KEY", "")
    anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY", "")
    cua_api_key = os.environ.get("CUA_API_KEY", "")

    # Static model choices per agent-loop provider.
    openai_models = ["OpenAI: Computer-Use Preview"]
    anthropic_models = [
        "Anthropic: Claude 4 Opus (20250514)",
        "Anthropic: Claude 4 Sonnet (20250514)",
        "Anthropic: Claude 3.7 Sonnet (20250219)",
        "Anthropic: Claude 3.5 Sonnet (20240620)",
    ]
    omni_models = [
        "OMNI: OpenAI GPT-4o",
        "OMNI: OpenAI GPT-4o mini",
        "OMNI: Claude 3.7 Sonnet (20250219)",
        "OMNI: Claude 3.5 Sonnet (20240620)"
    ]

    # Flags used to decide visibility of the API-key input groups below.
    has_openai_key = bool(openai_api_key)
    has_anthropic_key = bool(anthropic_api_key)
    has_cua_key = bool(cua_api_key)

    # Locally available Ollama models are appended to the OMNI choices.
    ollama_models = get_ollama_models()
    if ollama_models:
        omni_models += ollama_models

    # Platform detection; UITARS's huggingface-local model is macOS-only here.
    is_mac = platform.system().lower() == "darwin"

    # Map of agent-loop name -> selectable model labels for that loop.
    provider_to_models = {
        "OPENAI": openai_models,
        "ANTHROPIC": anthropic_models,
        "OMNI": omni_models + ["Custom model (OpenAI compatible API)", "Custom model (ollama)"],
        "UITARS": ([
            "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
        ] if is_mac else []) + ["Custom model (OpenAI compatible API)"],
    }

    # Restore the saved loop/model selection, falling back to the first
    # available model for the loop when the saved choice is stale/invalid.
    initial_loop = saved_settings.get("agent_loop", "OMNI")
    available_models_for_loop = provider_to_models.get(initial_loop, [])
    saved_model_choice = saved_settings.get("model_choice")
    if saved_model_choice and saved_model_choice in available_models_for_loop:
        initial_model = saved_model_choice
    else:
        if initial_loop == "OPENAI":
            initial_model = openai_models[0] if openai_models else "No models available"
        elif initial_loop == "ANTHROPIC":
            initial_model = anthropic_models[0] if anthropic_models else "No models available"
        else:  # OMNI (and any other loop falls through here)
            initial_model = omni_models[0] if omni_models else "Custom model (OpenAI compatible API)"

    # Remaining persisted defaults.
    initial_custom_model = saved_settings.get("custom_model", "Qwen2.5-VL-7B-Instruct")
    initial_provider_base_url = saved_settings.get("provider_base_url", "http://localhost:1234/v1")
    initial_save_trajectory = saved_settings.get("save_trajectory", True)
    initial_recent_images = saved_settings.get("recent_images", 3)

    # Example prompts shown under the chat input.
    example_messages = [
        "Create a Python virtual environment, install pandas and matplotlib, then plot stock data",
        "Open a PDF in Preview, add annotations, and save it as a compressed version",
        "Open Safari, search for 'macOS automation tools', and save the first three results as bookmarks",
        "Configure SSH keys and set up a connection to a remote server",
    ]

    def generate_python_code(agent_loop_choice, model_name, tasks, recent_images=3, save_trajectory=True, computer_os="linux", computer_provider="cloud", container_name="", cua_cloud_api_key="", max_budget=None):
        """Generate Python code for the current configuration and tasks.

        Returns a runnable script (as a string) mirroring the UI settings:
        a Computer context, a ComputerAgent, and either the user's chat
        prompts or a single placeholder task.
        """
        # Render each non-empty task as a quoted list element.
        # NOTE(review): tasks are interpolated without escaping — a task
        # containing a double quote or backslash produces broken code.
        tasks_str = ""
        for task in tasks:
            if task and task.strip():
                tasks_str += f'            "{task}",\n'

        model_string = get_model_string(model_name, agent_loop_choice)

        # Only emit Computer(...) kwargs that differ from the defaults.
        computer_args = []
        if computer_os != "macos":
            computer_args.append(f'os_type="{computer_os}"')
        if computer_provider != "lume":
            computer_args.append(f'provider_type="{computer_provider}"')
        if container_name:
            computer_args.append(f'name="{container_name}"')
        if cua_cloud_api_key:
            computer_args.append(f'api_key="{cua_cloud_api_key}"')

        computer_args_str = ", ".join(computer_args)
        if computer_args_str:
            computer_args_str = f"({computer_args_str})"
        else:
            computer_args_str = "()"

        # Script skeleton; the ComputerAgent(...) call is left open so the
        # optional kwargs below can be appended before it is closed.
        code = f'''import asyncio
from computer import Computer
from agent import ComputerAgent

async def main():
    async with Computer{computer_args_str} as computer:
        agent = ComputerAgent(
            model="{model_string}",
            tools=[computer],
            only_n_most_recent_images={recent_images},'''

        if save_trajectory:
            code += '''
            trajectory_dir="trajectories",'''

        if max_budget:
            code += f'''
            max_trajectory_budget={{"max_budget": {max_budget}, "raise_error": True}},'''

        # Close the ComputerAgent(...) constructor.
        code += '''
        )
'''

        if tasks_str:
            # Loop over the prompts collected from the chat history.
            code += f'''
        # Prompts for the computer-use agent
        tasks = [
{tasks_str.rstrip()}
        ]

        for task in tasks:
            print(f"Executing task: {{task}}")
            messages = [{{"role": "user", "content": task}}]
            async for result in agent.run(messages):
                for item in result["output"]:
                    if item["type"] == "message":
                        print(item["content"][0]["text"])'''
        else:
            # No chat history yet: emit a single placeholder task.
            code += f'''
        # Execute a single task
        task = "Search for information about CUA on GitHub"
        print(f"Executing task: {{task}}")
        messages = [{{"role": "user", "content": task}}]
        async for result in agent.run(messages):
            for item in result["output"]:
                if item["type"] == "message":
                    print(item["content"][0]["text"])'''

        code += '''

if __name__ == "__main__":
    asyncio.run(main())'''

        return code

    # Create the Gradio interface.
    with gr.Blocks(title="Computer-Use Agent") as demo:
        with gr.Row():
            # Left column: configuration.
            with gr.Column(scale=1):
                # Logo
                gr.HTML(
                    """
                    <div style="display: flex; justify-content: center; margin-bottom: 0.5em">
                        <img alt="CUA Logo" style="width: 80px;"
                             src="https://github.com/trycua/cua/blob/main/img/logo_black.png?raw=true" />
                    </div>
                    """
                )

                # Read-only preview of the equivalent Python script.
                with gr.Accordion("Python Code", open=False):
                    code_display = gr.Code(
                        language="python",
                        value=generate_python_code(initial_loop, "gpt-4o", []),
                        interactive=False,
                    )

                with gr.Accordion("Computer Configuration", open=True):
                    computer_os = gr.Radio(
                        choices=["macos", "linux", "windows"],
                        label="Operating System",
                        value="macos",
                        info="Select the operating system for the computer",
                    )

                    # Provider availability depends on the host platform.
                    is_windows = platform.system().lower() == "windows"
                    is_mac = platform.system().lower() == "darwin"

                    providers = ["cloud"]
                    if is_mac:
                        providers += ["lume"]
                    if is_windows:
                        providers += ["winsandbox"]

                    computer_provider = gr.Radio(
                        choices=providers,
                        label="Provider",
                        value="lume" if is_mac else "cloud",
                        info="Select the computer provider",
                    )

                    container_name = gr.Textbox(
                        label="Container Name",
                        placeholder="Enter container name (optional)",
                        value=os.environ.get("CUA_CONTAINER_NAME", ""),
                        info="Optional name for the container",
                    )

                    # Hidden when CUA_API_KEY is already set in the environment.
                    cua_cloud_api_key = gr.Textbox(
                        label="CUA Cloud API Key",
                        placeholder="Enter your CUA Cloud API key",
                        value=os.environ.get("CUA_API_KEY", ""),
                        type="password",
                        info="Required for cloud provider",
                        visible=(not has_cua_key)
                    )

                with gr.Accordion("Agent Configuration", open=True):
                    agent_loop = gr.Dropdown(
                        choices=["OPENAI", "ANTHROPIC", "OMNI", "UITARS"],
                        label="Agent Loop",
                        value=initial_loop,
                        info="Select the agent loop provider",
                    )

                    # One model dropdown per loop; only the dropdown matching
                    # the selected loop is visible at any time (see update_ui).
                    with gr.Group() as model_selection_group:
                        openai_model_choice = gr.Dropdown(
                            choices=openai_models,
                            label="OpenAI Model",
                            value=openai_models[0] if openai_models else "No models available",
                            info="Select OpenAI model",
                            interactive=True,
                            visible=(initial_loop == "OPENAI")
                        )

                        anthropic_model_choice = gr.Dropdown(
                            choices=anthropic_models,
                            label="Anthropic Model",
                            value=anthropic_models[0] if anthropic_models else "No models available",
                            info="Select Anthropic model",
                            interactive=True,
                            visible=(initial_loop == "ANTHROPIC")
                        )

                        omni_model_choice = gr.Dropdown(
                            choices=omni_models + ["Custom model (OpenAI compatible API)", "Custom model (ollama)"],
                            label="OMNI Model",
                            value=omni_models[0] if omni_models else "Custom model (OpenAI compatible API)",
                            info="Select OMNI model or choose a custom model option",
                            interactive=True,
                            visible=(initial_loop == "OMNI")
                        )

                        uitars_model_choice = gr.Dropdown(
                            choices=provider_to_models.get("UITARS", ["No models available"]),
                            label="UITARS Model",
                            value=provider_to_models.get("UITARS", ["No models available"])[0] if provider_to_models.get("UITARS") else "No models available",
                            info="Select UITARS model",
                            interactive=True,
                            visible=(initial_loop == "UITARS")
                        )

                    # Hidden textbox mirroring the currently selected model;
                    # used as an input by update_code_display below.
                    model_choice = gr.Textbox(visible=False)

                    # API key inputs — only shown when the key is absent from
                    # the environment and relevant to the selected loop.
                    with gr.Group(visible=not has_openai_key and (initial_loop == "OPENAI" or initial_loop == "OMNI")) as openai_key_group:
                        openai_api_key_input = gr.Textbox(
                            label="OpenAI API Key",
                            placeholder="Enter your OpenAI API key",
                            value=os.environ.get("OPENAI_API_KEY", ""),
                            interactive=True,
                            type="password",
                            info="Required for OpenAI models"
                        )

                    with gr.Group(visible=not has_anthropic_key and (initial_loop == "ANTHROPIC" or initial_loop == "OMNI")) as anthropic_key_group:
                        anthropic_api_key_input = gr.Textbox(
                            label="Anthropic API Key",
                            placeholder="Enter your Anthropic API key",
                            value=os.environ.get("ANTHROPIC_API_KEY", ""),
                            interactive=True,
                            type="password",
                            info="Required for Anthropic models"
                        )

                    # API key handlers: push typed keys into the process
                    # environment so downstream clients pick them up.
                    def set_openai_api_key(key):
                        if key and key.strip():
                            os.environ["OPENAI_API_KEY"] = key.strip()
                            print(f"DEBUG - Set OpenAI API key environment variable")
                        return key

                    def set_anthropic_api_key(key):
                        if key and key.strip():
                            os.environ["ANTHROPIC_API_KEY"] = key.strip()
                            print(f"DEBUG - Set Anthropic API key environment variable")
                        return key

                    openai_api_key_input.change(
                        fn=set_openai_api_key,
                        inputs=[openai_api_key_input],
                        outputs=[openai_api_key_input],
                        queue=False
                    )

                    anthropic_api_key_input.change(
                        fn=set_anthropic_api_key,
                        inputs=[anthropic_api_key_input],
                        outputs=[anthropic_api_key_input],
                        queue=False
                    )

                    # Recompute widget visibility after any loop/model change.
                    # Returns updates positionally matching the `outputs` list
                    # wired up in the dropdown.change() loop below.
                    def update_ui(loop=None, openai_model=None, anthropic_model=None, omni_model=None, uitars_model=None):
                        # NOTE(review): `.value` on a Gradio component is its
                        # initial value, not live session state — confirm this
                        # fallback behaves as intended.
                        loop = loop or agent_loop.value

                        # Pick the model belonging to the active loop.
                        model_value = None
                        if loop == "OPENAI" and openai_model:
                            model_value = openai_model
                        elif loop == "ANTHROPIC" and anthropic_model:
                            model_value = anthropic_model
                        elif loop == "OMNI" and omni_model:
                            model_value = omni_model
                        elif loop == "UITARS" and uitars_model:
                            model_value = uitars_model

                        # Exactly one model dropdown is visible.
                        openai_visible = (loop == "OPENAI")
                        anthropic_visible = (loop == "ANTHROPIC")
                        omni_visible = (loop == "OMNI")
                        uitars_visible = (loop == "UITARS")

                        # Key inputs appear only when the key is missing and
                        # the chosen (non-custom) model needs that vendor.
                        show_openai_key = not has_openai_key and (loop == "OPENAI" or (loop == "OMNI" and model_value and "OpenAI" in model_value and "Custom" not in model_value))
                        show_anthropic_key = not has_anthropic_key and (loop == "ANTHROPIC" or (loop == "OMNI" and model_value and "Claude" in model_value and "Custom" not in model_value))

                        is_custom_openai_api = model_value == "Custom model (OpenAI compatible API)"
                        is_custom_ollama = model_value == "Custom model (ollama)"
                        is_any_custom = is_custom_openai_api or is_custom_ollama

                        model_choice_value = model_value if model_value else ""

                        # Order: 4 model dropdowns, 2 key groups, custom model
                        # textbox, base URL, provider API key, hidden mirror.
                        return [
                            gr.update(visible=openai_visible),
                            gr.update(visible=anthropic_visible),
                            gr.update(visible=omni_visible),
                            gr.update(visible=uitars_visible),
                            gr.update(visible=show_openai_key),
                            gr.update(visible=show_anthropic_key),
                            gr.update(visible=is_any_custom),
                            gr.update(visible=is_custom_openai_api),
                            gr.update(visible=is_custom_openai_api),
                            gr.update(value=model_choice_value)
                        ]

                    # Custom model inputs (visible only for the two
                    # "Custom model ..." choices).
                    custom_model = gr.Textbox(
                        label="Custom Model Name",
                        placeholder="Enter custom model name (e.g., Qwen2.5-VL-7B-Instruct or llama3)",
                        value=initial_custom_model,
                        visible=(initial_model == "Custom model (OpenAI compatible API)" or initial_model == "Custom model (ollama)"),
                        interactive=True,
                    )

                    provider_base_url = gr.Textbox(
                        label="Provider Base URL",
                        placeholder="Enter provider base URL (e.g., http://localhost:1234/v1)",
                        value=initial_provider_base_url,
                        visible=(initial_model == "Custom model (OpenAI compatible API)"),
                        interactive=True,
                    )

                    provider_api_key = gr.Textbox(
                        label="Provider API Key",
                        placeholder="Enter provider API key (if required)",
                        value="",
                        visible=(initial_model == "Custom model (OpenAI compatible API)"),
                        interactive=True,
                        type="password",
                    )

                    # Re-run update_ui whenever the loop or any model
                    # dropdown changes.
                    for dropdown in [agent_loop, omni_model_choice, uitars_model_choice, openai_model_choice, anthropic_model_choice]:
                        dropdown.change(
                            fn=update_ui,
                            inputs=[agent_loop, openai_model_choice, anthropic_model_choice, omni_model_choice, uitars_model_choice],
                            outputs=[
                                openai_model_choice, anthropic_model_choice, omni_model_choice, uitars_model_choice,
                                openai_key_group, anthropic_key_group,
                                custom_model, provider_base_url, provider_api_key,
                                model_choice
                            ],
                            queue=False
                        )

                    save_trajectory = gr.Checkbox(
                        label="Save Trajectory",
                        value=initial_save_trajectory,
                        info="Save the agent's trajectory for debugging",
                        interactive=True,
                    )

                    recent_images = gr.Slider(
                        label="Recent Images",
                        minimum=1,
                        maximum=10,
                        value=initial_recent_images,
                        step=1,
                        info="Number of recent images to keep in context",
                        interactive=True,
                    )

                    # NOTE(review): the info text says "0 = no limit" but the
                    # widget's minimum is -1 and the default is None — confirm
                    # the intended sentinel for "no limit".
                    max_budget = gr.Number(
                        label="Max Budget ($)",
                        value=lambda: None,
                        minimum=-1,
                        maximum=100.0,
                        step=0.1,
                        info="Optional budget limit for trajectory (0 = no limit)",
                        interactive=True,
                    )

            # Right column: chat interface.
            with gr.Column(scale=2):
                gr.Markdown(
                    "Ask me to perform tasks in a virtual environment.<br>Built with <a href='https://github.com/trycua/cua' target='_blank'>github.com/trycua/cua</a>."
                )

                chatbot_history = gr.Chatbot(type="messages")
                msg = gr.Textbox(
                    placeholder="Ask me to perform tasks in a virtual environment"
                )
                clear = gr.Button("Clear")
                cancel_button = gr.Button("Cancel", variant="stop")

                # Example prompts under the input box.
                example_group = gr.Examples(examples=example_messages, inputs=msg)

                # Append the user's message to the history and clear the box.
                def chat_submit(message, history):
                    history.append(gr.ChatMessage(role="user", content=message))
                    return "", history

                # Cancel handler.
                # NOTE(review): this only appends a status message to the
                # chat — it never stops the running agent task. Confirm
                # whether real cancellation was intended here.
                async def cancel_agent_task(history):
                    global global_agent
                    if global_agent:
                        print("DEBUG - Cancelling agent task")
                        history.append(gr.ChatMessage(role="assistant", content="Task cancelled by user", metadata={"title": "❌ Cancelled"}))
                    else:
                        history.append(gr.ChatMessage(role="assistant", content="No active agent task to cancel", metadata={"title": "ℹ️ Info"}))
                    return history

                # Main agent driver: builds the agent from the current UI
                # state, streams its responses, and yields the growing chat
                # history after every agent step.
                async def process_response(
                    history,
                    openai_model_value,
                    anthropic_model_value,
                    omni_model_value,
                    uitars_model_value,
                    custom_model_value,
                    agent_loop_choice,
                    save_traj,
                    recent_imgs,
                    custom_url_value=None,
                    custom_api_key=None,  # NOTE(review): accepted but never used below
                    openai_key_input=None,
                    anthropic_key_input=None,
                    computer_os="linux",
                    computer_provider="cloud",
                    container_name="",
                    cua_cloud_api_key="",
                    max_budget_value=None,
                ):
                    if not history:
                        yield history
                        return

                    # Get the last user message (chat_submit appended it).
                    last_user_message = history[-1]["content"]

                    # Pick the model value for the active agent loop.
                    if agent_loop_choice == "OPENAI":
                        model_choice_value = openai_model_value
                    elif agent_loop_choice == "ANTHROPIC":
                        model_choice_value = anthropic_model_value
                    elif agent_loop_choice == "OMNI":
                        model_choice_value = omni_model_value
                    elif agent_loop_choice == "UITARS":
                        model_choice_value = uitars_model_value
                    else:
                        model_choice_value = "No models available"

                    # Determine if this is a custom model selection.
                    is_custom_model_selected = model_choice_value in ["Custom model (OpenAI compatible API)", "Custom model (ollama)"]

                    # For custom selections the user-typed name is the model.
                    if is_custom_model_selected:
                        model_string_to_analyze = custom_model_value
                    else:
                        model_string_to_analyze = model_choice_value

                    try:
                        # Resolve the provider-qualified model string.
                        model_string = get_model_string(model_string_to_analyze, agent_loop_choice)

                        # Export any keys typed into the UI.
                        if openai_key_input:
                            os.environ["OPENAI_API_KEY"] = openai_key_input
                        if anthropic_key_input:
                            os.environ["ANTHROPIC_API_KEY"] = anthropic_key_input
                        if cua_cloud_api_key:
                            os.environ["CUA_API_KEY"] = cua_cloud_api_key

                        # Persist the current configuration.
                        current_settings = {
                            "agent_loop": agent_loop_choice,
                            "model_choice": model_choice_value,
                            "custom_model": custom_model_value,
                            "provider_base_url": custom_url_value,
                            "save_trajectory": save_traj,
                            "recent_images": recent_imgs,
                            "computer_os": computer_os,
                            "computer_provider": computer_provider,
                            "container_name": container_name,
                        }
                        save_settings(current_settings)

                        # Create agent.
                        # NOTE(review): there is no `global global_agent`
                        # declaration in this function, so this assignment
                        # binds a LOCAL name — the module-level global_agent
                        # (checked by cancel_agent_task) is never updated.
                        # Confirm whether a global declaration is missing.
                        global_agent = create_agent(
                            model_string=model_string,
                            save_trajectory=save_traj,
                            only_n_most_recent_images=recent_imgs,
                            custom_model_name=custom_model_value if is_custom_model_selected else None,
                            computer_os=computer_os,
                            computer_provider=computer_provider,
                            computer_name=container_name,
                            computer_api_key=cua_cloud_api_key,
                            verbosity=logging.DEBUG,
                            max_trajectory_budget=max_budget_value if max_budget_value and max_budget_value > 0 else None,
                        )

                        if global_agent is None:
                            history.append(
                                gr.ChatMessage(
                                    role="assistant",
                                    content="Failed to create agent. Check API keys and configuration.",
                                )
                            )
                            yield history
                            return

                        # Create message list for agent.
                        messages = [{"role": "user", "content": last_user_message}]

                        # Stream responses from the agent.
                        async for result in global_agent.run(messages):
                            print(f"DEBUG - Agent response ------- START")
                            from pprint import pprint
                            pprint(result)
                            print(f"DEBUG - Agent response ------- END")

                            # Translate each output item into a chat message.
                            for item in result.get("output", []):
                                if item.get("type") == "message":
                                    # Plain assistant/user text parts.
                                    content = item.get("content", [])
                                    for content_part in content:
                                        if content_part.get("text"):
                                            history.append(gr.ChatMessage(
                                                role=item.get("role", "assistant"),
                                                content=content_part.get("text", ""),
                                                metadata=content_part.get("metadata", {})
                                            ))
                                elif item.get("type") == "computer_call":
                                    # Computer action: title carries the action
                                    # type (and coordinates when present).
                                    action = item.get("action", {})
                                    action_type = action.get("type", "")
                                    if action_type:
                                        action_title = f"🛠️ Performing {action_type}"
                                        if action.get("x") and action.get("y"):
                                            action_title += f" at ({action['x']}, {action['y']})"
                                        history.append(gr.ChatMessage(
                                            role="assistant",
                                            content=f"```json\n{json.dumps(action)}\n```",
                                            metadata={"title": action_title}
                                        ))
                                elif item.get("type") == "function_call":
                                    function_name = item.get("name", "")
                                    arguments = item.get("arguments", "{}")
                                    history.append(gr.ChatMessage(
                                        role="assistant",
                                        content=f"🔧 Calling function: {function_name}\n```json\n{arguments}\n```",
                                        metadata={"title": f"Function Call: {function_name}"}
                                    ))
                                elif item.get("type") == "function_call_output":
                                    output = item.get("output", "")
                                    history.append(gr.ChatMessage(
                                        role="assistant",
                                        content=f"📤 Function output:\n```\n{output}\n```",
                                        metadata={"title": "Function Output"}
                                    ))

                            # Push the updated history after every step.
                            yield history

                    except Exception as e:
                        # Surface any failure into the chat rather than
                        # crashing the UI callback.
                        import traceback
                        traceback.print_exc()
                        history.append(gr.ChatMessage(role="assistant", content=f"Error: {str(e)}"))
                        yield history

                # Submit wiring: append the user message (unqueued), then
                # stream the agent run (queued).
                submit_event = msg.submit(
                    fn=chat_submit,
                    inputs=[msg, chatbot_history],
                    outputs=[msg, chatbot_history],
                    queue=False,
                ).then(
                    fn=process_response,
                    inputs=[
                        chatbot_history,
                        openai_model_choice,
                        anthropic_model_choice,
                        omni_model_choice,
                        uitars_model_choice,
                        custom_model,
                        agent_loop,
                        save_trajectory,
                        recent_images,
                        provider_base_url,
                        provider_api_key,
                        openai_api_key_input,
                        anthropic_api_key_input,
                        computer_os,
                        computer_provider,
                        container_name,
                        cua_cloud_api_key,
                        max_budget,
                    ],
                    outputs=[chatbot_history],
                    queue=True,
                )

                # Clear button: reset the chat history.
                clear.click(lambda: None, None, chatbot_history, queue=False)

                # Cancel button (see cancel_agent_task note above).
                cancel_button.click(
                    cancel_agent_task,
                    [chatbot_history],
                    [chatbot_history],
                    queue=False
                )

                # Regenerate the Python-code preview from the current
                # configuration and the user messages in the chat history.
                def update_code_display(agent_loop, model_choice_val, custom_model_val, chat_history, recent_images_val, save_trajectory_val, computer_os, computer_provider, container_name, cua_cloud_api_key, max_budget_val):
                    messages = []
                    if chat_history:
                        for msg in chat_history:
                            if isinstance(msg, dict) and msg.get("role") == "user":
                                messages.append(msg.get("content", ""))

                    return generate_python_code(
                        agent_loop,
                        model_choice_val or custom_model_val or "gpt-4o",
                        messages,
                        recent_images_val,
                        save_trajectory_val,
                        computer_os,
                        computer_provider,
                        container_name,
                        cua_cloud_api_key,
                        max_budget_val
                    )

                # Refresh the code preview whenever any relevant input changes.
                for component in [agent_loop, model_choice, custom_model, chatbot_history, recent_images, save_trajectory, computer_os, computer_provider, container_name, cua_cloud_api_key, max_budget]:
                    component.change(
                        update_code_display,
                        inputs=[agent_loop, model_choice, custom_model, chatbot_history, recent_images, save_trajectory, computer_os, computer_provider, container_name, cua_cloud_api_key, max_budget],
                        outputs=[code_display]
                    )

    return demo